From 6e2bd3e23054b09ea715b8a820f94e036b493f4a Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 4 Jun 2025 12:08:11 +0000 Subject: [PATCH 01/65] fix: transformers engine was patched --- src/fine-tune/scripts/bootstrap.sh | 4 +- src/fine-tune/scripts/precompile.sh | 208 +++++----------------------- 2 files changed, 35 insertions(+), 177 deletions(-) diff --git a/src/fine-tune/scripts/bootstrap.sh b/src/fine-tune/scripts/bootstrap.sh index bfdd90d..be16221 100644 --- a/src/fine-tune/scripts/bootstrap.sh +++ b/src/fine-tune/scripts/bootstrap.sh @@ -89,7 +89,7 @@ fi cd apex || { log_message "Failed to change directory to apex"; exit 1; } log_message "==== Checking out Apex version 23.05 ====" -run_with_retry "git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c" "Checkout Apex 23.05" "critical" +run_with_retry "git checkout 23.05" "Checkout Apex 23.05" "critical" # 3a. Overwrite setup.py with the nxd specific setup log_message "==== Creating custom setup.py for CPU-only Apex ====" @@ -111,7 +111,7 @@ setup( packages=find_packages( exclude=("build", "csrc", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info",) ), - install_requires=["packaging>20.6"], + install_requires=["packaging>20.6",], description="PyTorch Extensions written by NVIDIA", ) EOF diff --git a/src/fine-tune/scripts/precompile.sh b/src/fine-tune/scripts/precompile.sh index 01e803c..e97f592 100644 --- a/src/fine-tune/scripts/precompile.sh +++ b/src/fine-tune/scripts/precompile.sh @@ -7,17 +7,12 @@ set -e # This script sets up and compiles NeuronX Distributed Training models # # Usage: -# ./script.sh # Install both packages -# SKIP_TRANSFORMER_ENGINE=true ./script.sh # Skip transformer-engine +# ./precompile.sh # Run setup and compile # -# Environment Variables: -# SKIP_TRANSFORMER_ENGINE=true Skip transformer-engine installation -# (useful for non-CUDA environments) -# # Requirements: # - Python 3.x with pip # - Git (for cloning repositories) -# - For transformer-engine: CUDA toolkit (optional, fallback available) +# - install_setup.sh script in the same directory # ============================================================================== # Function to log with timestamp @@ -103,177 +98,40 @@ else exit 1 fi -log "==== Step 4: Installing dependencies ====" +log "==== Step 4: Running install_setup.sh ====" +INSTALL_SETUP_DIR=~/nki-llama/src/fine-tune/neuronx-distributed-training/install_setup.sh -# Check if user wants to skip transformer-engine installation -SKIP_TE=${SKIP_TRANSFORMER_ENGINE:-true} -if [ "$SKIP_TE" = "true" ]; then - log "Skipping transformer-engine installation (SKIP_TRANSFORMER_ENGINE=true)" - log "Installing only megatron-core from GitHub repository..." +# Check if install_setup.sh exists +if [ ! -f $INSTALL_SETUP_DIR ]; then + log_error "install_setup.sh script not found in current directory: $(pwd)" + log "Available files:" + ls -la + exit 1 +fi + +# Make sure install_setup.sh is executable +chmod +x $INSTALL_SETUP_DIR + +log "Running install_setup.sh to install dependencies..." +log "This will install megatron-core and apply necessary patches..." + +# Run install_setup.sh with output logging +if $INSTALL_SETUP_DIR 2>&1 | tee /tmp/install_setup_output.log; then + log "✓ install_setup.sh completed successfully!" else - log "Installing transformer-engine and megatron-core from GitHub repositories..." 
+ log_error "install_setup.sh failed" + log_error "Setup log saved to: /tmp/install_setup_output.log" + log_error "Last 20 lines of setup log:" + tail -20 /tmp/install_setup_output.log >&2 + exit 1 fi -# Enhanced pip install with detailed logging -if command -v pip >/dev/null 2>&1; then - log "Using pip version: $(pip --version)" - - # Define the specific tags to install (matching the Docker script) - TE_TAG="7d576ed25266a17a7b651f2c12e8498f67e0baea" - MCORE_TAG="core_r0.10.0" - TE_REPO_URL="git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" - MEGATRON_REPO_URL="git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" - - log "Target repositories:" - log " TransformerEngine: $TE_REPO_URL" - log " Megatron-LM: $MEGATRON_REPO_URL" - log "Git tags:" - log " TE_TAG: $TE_TAG" - log " MCORE_TAG: $MCORE_TAG" - - # Check and uninstall existing packages - packages_to_check=("megatron-core") - if [ "$SKIP_TE" != "true" ]; then - packages_to_check=("transformer-engine" "megatron-core") - fi - - for package in "${packages_to_check[@]}"; do - if pip show "$package" >/dev/null 2>&1; then - INSTALLED_VERSION=$(pip show "$package" | grep Version | cut -d' ' -f2) - log "$package is already installed (version: $INSTALLED_VERSION)" - log "Uninstalling existing $package..." - if pip uninstall -y "$package" 2>&1 | tee -a /tmp/pip_install.log; then - log "✓ Successfully uninstalled existing $package" - else - log_error "Failed to uninstall existing $package" - exit 1 - fi - else - log "$package not currently installed" - fi - done - - log "Starting pip install from Git repositories..." - log "This may take several minutes as it needs to clone and build from source..." - - # Install transformer-engine only if not skipped - if [ "$SKIP_TE" != "true" ]; then - log "Installing transformer-engine @ $TE_REPO_URL" - - # Check if we're in a CUDA environment - if command -v nvcc >/dev/null 2>&1 && [ -n "$CUDA_HOME" ]; then - log "CUDA environment detected (nvcc found, CUDA_HOME set)" - TE_INSTALL_CMD="pip install --no-cache-dir --verbose \"transformer-engine @ $TE_REPO_URL\"" - else - log "Non-CUDA environment detected - attempting CPU-only installation" - log "Setting environment variables for CPU-only build..." - - # Try to find CUDA installation or set minimal environment - if [ -d "/usr/local/cuda" ]; then - export CUDA_HOME="/usr/local/cuda" - log "Found CUDA at /usr/local/cuda, setting CUDA_HOME" - elif [ -d "/opt/cuda" ]; then - export CUDA_HOME="/opt/cuda" - log "Found CUDA at /opt/cuda, setting CUDA_HOME" - else - log "No CUDA installation found - this may cause transformer-engine installation to fail" - log "Attempting installation anyway..." - fi - - # Set environment variables to potentially bypass CUDA requirements - export NVTE_FRAMEWORK=pytorch - export NVTE_WITH_USERBUFFERS=0 - - TE_INSTALL_CMD="pip install --no-cache-dir --verbose \"transformer-engine @ $TE_REPO_URL\"" - fi - - log "Executing: $TE_INSTALL_CMD" - if eval "$TE_INSTALL_CMD" 2>&1 | tee /tmp/pip_install_te.log; then - if pip show transformer-engine >/dev/null 2>&1; then - TE_VERSION=$(pip show transformer-engine | grep Version | cut -d' ' -f2) - log "✓ Successfully installed transformer-engine version: $TE_VERSION" - else - log_error "transformer-engine installation appeared to succeed but package not found" - exit 1 - fi - else - log_error "Failed to install transformer-engine from Git repository" - log_error "This is likely due to CUDA requirements. Attempting alternative approaches..." 
- - # Try installing a pre-built version from PyPI as fallback - log "Attempting fallback: installing transformer-engine from PyPI..." - if pip install --no-cache-dir transformer-engine 2>&1 | tee /tmp/pip_install_te_fallback.log; then - if pip show transformer-engine >/dev/null 2>&1; then - TE_VERSION=$(pip show transformer-engine | grep Version | cut -d' ' -f2) - log "✓ Successfully installed transformer-engine version: $TE_VERSION (PyPI fallback)" - else - log_error "PyPI fallback also failed" - exit 1 - fi - else - log_error "Both Git and PyPI installation methods failed" - log_error "You may need to:" - log_error "1. Install CUDA toolkit and set CUDA_HOME environment variable" - log_error "2. Use a pre-built Docker container with transformer-engine" - log_error "3. Skip transformer-engine installation if not required for your use case" - log_error "Installation logs saved to:" - log_error " - Git install: /tmp/pip_install_te.log" - log_error " - PyPI fallback: /tmp/pip_install_te_fallback.log" - exit 1 - fi - fi - else - log "Skipping transformer-engine installation as requested" - TE_VERSION="skipped" - fi - - # Then install megatron-core - log "Installing megatron_core @ $MEGATRON_REPO_URL" - if pip install --no-cache-dir --verbose "megatron_core @ $MEGATRON_REPO_URL" 2>&1 | tee /tmp/pip_install_mc.log; then - if pip show megatron-core >/dev/null 2>&1; then - MCORE_VERSION=$(pip show megatron-core | grep Version | cut -d' ' -f2) - log "✓ Successfully installed megatron-core version: $MCORE_VERSION" - else - log_error "megatron-core installation appeared to succeed but package not found" - exit 1 - fi - else - log_error "Failed to install megatron-core from Git repository" - log_error "Installation log saved to: /tmp/pip_install_mc.log" - log_error "Last 10 lines of installation log:" - tail -10 /tmp/pip_install_mc.log >&2 - exit 1 - fi - - # Show installation summary - log "✓ All dependencies installed successfully!" - log "Installation summary:" - if [ "$SKIP_TE" != "true" ]; then - log " transformer-engine:" - log " Version: $TE_VERSION" - if [ "$TE_VERSION" != "skipped" ]; then - log " Git tag/commit: $TE_TAG" - log " Location: $(pip show transformer-engine | grep Location | cut -d' ' -f2-)" - fi - else - log " transformer-engine: skipped (SKIP_TRANSFORMER_ENGINE=true)" - fi - log " megatron-core:" - log " Version: $MCORE_VERSION" - log " Git tag: $MCORE_TAG" - log " Location: $(pip show megatron-core | grep Location | cut -d' ' -f2-)" - - # Show package dependencies - log "Package dependencies:" - if [ "$SKIP_TE" != "true" ] && [ "$TE_VERSION" != "skipped" ]; then - log " transformer-engine requires:" - pip show transformer-engine | grep Requires | cut -d' ' -f2- | tr ',' '\n' | sed 's/^/ /' - fi - log " megatron-core requires:" - pip show megatron-core | grep Requires | cut -d' ' -f2- | tr ',' '\n' | sed 's/^/ /' - +# Verify megatron-core installation +if pip show megatron-core >/dev/null 2>&1; then + MCORE_VERSION=$(pip show megatron-core | grep Version | cut -d' ' -f2) + log "✓ Verified megatron-core installation - version: $MCORE_VERSION" else - log_error "pip command not found. Please ensure Python and pip are installed." + log_error "megatron-core not found after running install_setup.sh" exit 1 fi @@ -302,5 +160,5 @@ fi log "==== Script execution completed successfully! 
====" log "Total execution time: $SECONDS seconds" log "Log files created:" -log " - pip install log: /tmp/pip_install.log" -log " - training log: /tmp/train_output.log" \ No newline at end of file +log " - Setup log: /tmp/install_setup_output.log" +log " - Training log: /tmp/train_output.log" \ No newline at end of file From eef633a4311636806aa243ab5eb2ba8a2b25428e Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 4 Jun 2025 12:35:05 +0000 Subject: [PATCH 02/65] refactor: notebook rename model to llama 3 8b --- notebooks/neuron_agents_llm.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/neuron_agents_llm.ipynb b/notebooks/neuron_agents_llm.ipynb index e61c43d..e3765c5 100644 --- a/notebooks/neuron_agents_llm.ipynb +++ b/notebooks/neuron_agents_llm.ipynb @@ -134,14 +134,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "llm = VLLMOpenAI(\n", " openai_api_key=\"EMPTY\",\n", " openai_api_base=\"http://localhost:8080/v1\",\n", - " model_name=\"models/llama-3.2-3b-instruct\"\n", + " model_name=\"models/meta-llama-3-8b\"\n", ")" ] }, From 57deb0f7d8f9d6c3f81d058435df317515fdf6a8 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 5 Jun 2025 18:00:30 -0400 Subject: [PATCH 03/65] commit updated fine-tune artifacts --- src/fine-tune/Makefile | 49 ---------------------- src/fine-tune/README.md | 85 +++++++++++++++++++++++++++++++++++++++ src/fine-tune/pipeline.sh | 57 ++++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 49 deletions(-) delete mode 100644 src/fine-tune/Makefile create mode 100644 src/fine-tune/README.md create mode 100644 src/fine-tune/pipeline.sh diff --git a/src/fine-tune/Makefile b/src/fine-tune/Makefile deleted file mode 100644 index 2d6c928..0000000 --- a/src/fine-tune/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -# examples/nxdt_finetune_llama3_8B_lora/Makefile - --include ../../.env - -SHELL := /bin/bash - -.PHONY: all deps data model convert_ckpt precompile train clean -all: deps data model convert_ckpt precompile train -# Check if in Neuron virtual environment -.PHONY: check-neuron-venv -check-neuron-venv: - @if [ -z "$$VIRTUAL_ENV" ] || [[ "$$VIRTUAL_ENV" != *"neuronx"* ]]; then \ - echo "Error: Not in Neuron virtual environment."; \ - echo "Run 'source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate' first."; \ - exit 1; \ - else \ - echo "Using Neuron virtual environment: $$VIRTUAL_ENV"; \ - fi - -deps: check-neuron-venv - @echo "==> Installing/Validating Apex, NxDT and other dependencies..." - @echo "Running bootstrap script to ensure all dependencies are properly configured..." - bash scripts/bootstrap.sh - -data: check-neuron-venv - @echo "==> Downloading dataset" - bash scripts/download_data.sh - -model: check-neuron-venv - @echo "==> Downloading & converting model" - HF_TOKEN="${HF_TOKEN}" MODEL_ID="${MODEL_ID}" bash scripts/download_model.sh - -convert_ckpt: check-neuron-venv - @echo "==> Converting checkpoint to NxDT format" - bash scripts/convert_checkpoints.sh - -precompile: check-neuron-venv - @echo "==> Pre-compiling graphs (AOT)" - bash scripts/precompile.sh - -train: check-neuron-venv - @echo "==> Running fine-tuning job" - bash scripts/run_training.sh - -clean: check-neuron-venv - rm -rf dataset llama3_tokenizer llama3-8B_hf_weights pretrained_ckpt nemo_experiments - @echo "Cleaned up all generated files." 
- - diff --git a/src/fine-tune/README.md b/src/fine-tune/README.md new file mode 100644 index 0000000..a01e80a --- /dev/null +++ b/src/fine-tune/README.md @@ -0,0 +1,85 @@ +# pipeline.sh + +--- + +## 1 · Prerequisites + +| Requirement | Reason | Install / Notes | +|-------------|--------|-----------------| +| **Neuron virtual‑env** | Script refuses to run outside it | `source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate` | +| **`scripts/` folder** | step-by step scripts for running fine tuning | | +| **`.env` file** *(optional)* | Central place for env vars | Place at `../../.env` | + +Example `.env`: + +```dotenv +HF_TOKEN=hf_**************************************** +MODEL_ID=meta-llama-3-8b +``` + +--- + +## 2 · Setup + +```bash +# Clone repo and enter it +cd ./src/fine-tune + +# Make the script executable +chmod +x pipeline.sh + +# Activate Neuron environment +source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate +``` + +--- + +## 3 · Usage + +| Command | Action | +|---------|--------| +| `./pipeline.sh` | Run the **full pipeline** (deps → data → model → convert_ckpt → precompile → train) | +| `./pipeline.sh deps` | Install/validate Apex, NxDT, etc. | +| `./pipeline.sh data` | Download dataset | +| `./pipeline.sh model` | Download & convert model checkpoints | +| `./pipeline.sh convert_ckpt` | Convert checkpoints to NxDT format | +| `./pipeline.sh precompile` | Ahead‑of‑time graph compilation | +| `./pipeline.sh train` | Start fine‑tuning | +| `./pipeline.sh clean` | Remove generated datasets, weights, experiments | + + Each sub‑command double‑checks you’re inside a Neuron venv and prints a helpful error if not. + +--- + +## 4 · Environment Variables + +| Variable | Purpose | How to set | +|----------|---------|-----------| +| `HF_TOKEN` | Hugging Face auth token (for private models) | Add to `.env` or `export HF_TOKEN=…` | +| `MODEL_ID` | Model slug, e.g. `meta-llama-3-8b` | Same as above | + +The script auto‑loads `../../.env` with `set -a; source …`. Modify the `ENV_FILE=` line in `pipeline.sh` if you store it elsewhere. + +--- + +## 5 · Troubleshooting + +| Symptom | Probable Cause | Fix | +|---------|---------------|-----| +| `Not inside a Neuron virtual environment.` | Forgot to activate venv | `source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate` | +| `command not found: pipeline.sh` | File not executable or wrong cwd | `chmod +x pipeline.sh` and/or `./pipeline.sh` | +| Model download fails | Missing/invalid `HF_TOKEN` | Provide valid token in env or `.env` | +| Long compile times | First‑time Neuron AOT | Subsequent runs reuse cached graphs | + +--- + +## 6 · Extending the Pipeline + +1. Add a new Bash function in `pipeline.sh` (e.g., `evaluate()`). +2. Append its name to the pattern list inside `main()`. +3. Optionally call it from `all()` for automatic inclusion. + +```bash +./pipeline.sh train +``` +--- diff --git a/src/fine-tune/pipeline.sh b/src/fine-tune/pipeline.sh new file mode 100644 index 0000000..268bcb3 --- /dev/null +++ b/src/fine-tune/pipeline.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# pipeline.sh +# Usage: ./pipeline.sh [all|deps|data|model|convert_ckpt|precompile|train|clean] +set -euo pipefail + +############################################################################### +# 1. 
Bring in environment variables from ../../.env (if it exists) +############################################################################### +ENV_FILE="$(dirname "$0")/../../.env" +if [[ -f "$ENV_FILE" ]]; then + # Export every variable defined in the .env file + set -a + + source "$ENV_FILE" + set +a +fi + +############################################################################### +# 2. Helper for neuron venv +############################################################################### +check_neuron_venv() { + if [[ -z "${VIRTUAL_ENV:-}" || "$VIRTUAL_ENV" != *"neuronx"* ]]; then + echo "Not inside a Neuron virtual environment." + echo " Run: source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate" + exit 1 + fi + echo "Using Neuron virtual environment: $VIRTUAL_ENV" +} + +############################################################################### +# 3. Pipeline steps (one Bash function per step) +############################################################################### +deps() { check_neuron_venv; echo "==> Installing/validating deps…"; bash scripts/bootstrap.sh; } +data() { check_neuron_venv; echo "==> Downloading dataset…"; bash scripts/download_data.sh; } +model() { check_neuron_venv; echo "==> Fetching model…"; HF_TOKEN="${HF_TOKEN:-}" MODEL_ID="${MODEL_ID:-}" bash scripts/download_model.sh; } +convert_ckpt(){ check_neuron_venv; echo "==> Converting ckpt…"; bash scripts/convert_checkpoints.sh; } +precompile() { check_neuron_venv; echo "==> Pre-compiling graphs…"; bash scripts/precompile.sh; } +train() { check_neuron_venv; echo "==> Running fine-tune…"; bash scripts/run_training.sh; } +clean() { check_neuron_venv; rm -rf dataset llama3_tokenizer llama3-8B_hf_weights pretrained_ckpt nemo_experiments; echo "🧹 Cleaned generated files."; } + +# run as unit test +all() { deps; data; model; convert_ckpt; precompile; train; } + +############################################################################### +# 4. Argument parsing +############################################################################### +main() { + local cmd="${1:-all}" + case "$cmd" in + all|deps|data|model|convert_ckpt|precompile|train|clean) "$cmd" ;; + *) echo "Usage: $0 {all|deps|data|model|convert_ckpt|precompile|train|clean}" >&2; exit 1 ;; + esac +} + +main "$@" + + From e7822d750d06d1fc2a5b5bcaa5c49dfb0d6a260b Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 5 Jun 2025 18:03:08 -0400 Subject: [PATCH 04/65] Update README.md and pin build --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ec13cc6..5f4f599 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ A unified project for fine-tuning, inference, and agent development of Llama models on AWS Trainium and Inferentia. +### Neuron SDK version - (Neuron 2.23.0 Release) +#### Neuronx Distributed Inference [0.3.5591] +#### NeuronX Distributed Training [1.3.0] ## Project Workflow From 6b5a6cc0518369d79533d0b23c2e591e68f334be Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 5 Jun 2025 18:03:26 -0400 Subject: [PATCH 05/65] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5f4f599..f44943f 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ A unified project for fine-tuning, inference, and agent development of Llama models on AWS Trainium and Inferentia. 
-### Neuron SDK version - (Neuron 2.23.0 Release) -#### Neuronx Distributed Inference [0.3.5591] -#### NeuronX Distributed Training [1.3.0] +#### Neuron SDK version - (Neuron 2.23.0 Release) +###### Neuronx Distributed Inference [0.3.5591] +###### NeuronX Distributed Training [1.3.0] ## Project Workflow From bc1c0dce34ca8887d6b58bd40ff25246c2fcb23f Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 6 Jun 2025 15:17:34 +0000 Subject: [PATCH 06/65] refactor: CLI & bash scripting --- .env.example | 33 +- Makefile | 127 ---- README.md | 536 ++++++++++------ install.sh | 189 ++++++ logs/benchmarks/20250606_150849/metadata.json | 11 + logs/benchmarks/20250606_150959/metadata.json | 11 + logs/benchmarks/20250606_151034/metadata.json | 11 + logs/benchmarks/20250606_151230/metadata.json | 11 + logs/benchmarks/20250606_151450/metadata.json | 11 + nki-llama | 1 + nki-llama.config | 68 ++ nki-llama.sh | 520 +++++++++++++++ src/fine-tune/README.md | 71 ++- src/fine-tune/pipeline.sh | 0 src/fine-tune/scripts/bootstrap.sh | 0 src/fine-tune/scripts/convert_checkpoints.sh | 0 src/fine-tune/scripts/download_data.sh | 0 src/fine-tune/scripts/download_model.sh | 0 src/fine-tune/scripts/precompile.sh | 0 src/fine-tune/scripts/run_training.sh | 0 src/fine-tune/scripts/tensorboard.sh | 0 src/inference/Makefile | 239 ------- src/inference/README.md | 594 ++++++++++++------ src/inference/scripts/download-model.sh | 50 ++ src/inference/scripts/jupyter.sh | 76 +++ src/inference/scripts/run-nki-benchmark.sh | 306 +++++++++ src/inference/scripts/setup-vllm.sh | 47 ++ src/inference/scripts/start-server.sh | 51 ++ 28 files changed, 2194 insertions(+), 769 deletions(-) delete mode 100644 Makefile create mode 100755 install.sh create mode 100644 logs/benchmarks/20250606_150849/metadata.json create mode 100644 logs/benchmarks/20250606_150959/metadata.json create mode 100644 logs/benchmarks/20250606_151034/metadata.json create mode 100644 logs/benchmarks/20250606_151230/metadata.json create mode 100644 logs/benchmarks/20250606_151450/metadata.json create mode 120000 nki-llama create mode 100644 nki-llama.config create mode 100755 nki-llama.sh mode change 100644 => 100755 src/fine-tune/pipeline.sh mode change 100644 => 100755 src/fine-tune/scripts/bootstrap.sh mode change 100644 => 100755 src/fine-tune/scripts/convert_checkpoints.sh mode change 100644 => 100755 src/fine-tune/scripts/download_data.sh mode change 100644 => 100755 src/fine-tune/scripts/download_model.sh mode change 100644 => 100755 src/fine-tune/scripts/precompile.sh mode change 100644 => 100755 src/fine-tune/scripts/run_training.sh mode change 100644 => 100755 src/fine-tune/scripts/tensorboard.sh delete mode 100644 src/inference/Makefile create mode 100755 src/inference/scripts/download-model.sh create mode 100755 src/inference/scripts/jupyter.sh create mode 100755 src/inference/scripts/run-nki-benchmark.sh create mode 100755 src/inference/scripts/setup-vllm.sh create mode 100755 src/inference/scripts/start-server.sh diff --git a/.env.example b/.env.example index c65920b..0dae225 100644 --- a/.env.example +++ b/.env.example @@ -1,13 +1,28 @@ -# Model configuration -## HuggingFace Model ID (https://huggingface.co/meta-llama/Meta-Llama-3-8B) +# Example environment file for NKI-LLAMA +# Copy this to .env and update with your values + +# Hugging Face Configuration +HF_TOKEN=your_huggingface_token_here MODEL_ID=meta-llama/Meta-Llama-3-8B -## Short name for model ID -MODEL_NAME=meta-llama-3-8b +MODEL_NAME=llama-3-8b + +# Training Configuration +BATCH_SIZE=1 
+MAX_STEPS=1000 +SEQ_LENGTH=2048 +LEARNING_RATE=5e-5 -# Server configurations -PORT=8080 +# Inference Configuration +INFERENCE_PORT=8080 MAX_MODEL_LEN=2048 -TENSOR_PARALLEL_SIZE=32 +MAX_NUM_SEQS=4 +TENSOR_PARALLEL_SIZE=8 + +# Dataset Configuration +DATASET_NAME=databricks/databricks-dolly-15k + +# Neuron Configuration +NEURON_RT_NUM_CORES=8 -# HuggingFace token for downloading models -HF_TOKEN=your_token_here \ No newline at end of file +# Jupyter Configuration +JUPYTER_PORT=8888 \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index b5468d7..0000000 --- a/Makefile +++ /dev/null @@ -1,127 +0,0 @@ -# Top-level Makefile for coordinating fine-tuning and inference - --include .env - -SHELL := /bin/bash - -# Define paths to subproject directories -FINETUNE_DIR = ./src/fine-tune -INFERENCE_DIR = ./src/inference - -# Default target -.PHONY: all -all: help - -# Help message -.PHONY: help -help: - @echo "Top-level Makefile for managing fine-tuning and inference" - @echo "" - @echo "Available targets:" - @echo " help - Show this help message" - @echo "" - @echo " finetune - Run all fine-tuning steps" - @echo " finetune-deps - Install fine-tuning dependencies" - @echo " finetune-data - Download datasets for fine-tuning" - @echo " finetune-model - Download model for fine-tuning" - @echo " finetune-convert - Convert checkpoint to NxDT format" - @echo " finetune-precompile - Pre-compile graphs (AOT)" - @echo " finetune-train - Run fine-tuning job" - @echo " finetune-clean - Clean up fine-tuning files" - @echo "" - @echo " inference - Run inference (shortcut to infer target)" - @echo " inference-show-env - Display environment variables loaded from .env file" - @echo " inference-setup - Setup vLLM for Neuron" - @echo " inference-jupyter - Setup Jupyter environment" - @echo " inference-download - Download model from Hugging Face" - @echo " inference-infer - Run inference in generate mode" - @echo " inference-evaluate - Run inference in evaluate mode" - @echo " inference-server - Start vLLM OpenAI-compatible API server" - @echo " inference-lab - Run Jupyter Lab server" - @echo " inference-clean - Clean up inference files" - @echo "" - @echo " clean - Clean up all generated files" - @echo "" - @echo "Environment requirements:" - @echo " - For inference: source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate" - @echo " - For fine-tuning: source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate" - @echo " - For Jupyter: source venv/bin/activate" - -# Check if in Neuron virtual environment -.PHONY: check-neuron-venv -check-neuron-venv: - @if [ -z "$$VIRTUAL_ENV" ] || [[ "$$VIRTUAL_ENV" != *"neuronx"* ]]; then \ - echo "Error: Not in Neuron virtual environment."; \ - echo "Run 'source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate' first."; \ - exit 1; \ - else \ - echo "Using Neuron virtual environment: $$VIRTUAL_ENV"; \ - fi - -# Fine-tuning targets -.PHONY: finetune finetune-deps finetune-data finetune-model finetune-convert finetune-precompile finetune-train finetune-clean - -finetune: - $(MAKE) -C $(FINETUNE_DIR) - -finetune-deps: - $(MAKE) -C $(FINETUNE_DIR) deps - -finetune-data: - $(MAKE) -C $(FINETUNE_DIR) data - -finetune-model: - $(MAKE) -C $(FINETUNE_DIR) model - -finetune-convert: - $(MAKE) -C $(FINETUNE_DIR) convert_ckpt - -finetune-precompile: - $(MAKE) -C $(FINETUNE_DIR) precompile - -finetune-train: - $(MAKE) -C $(FINETUNE_DIR) train - -finetune-clean: - $(MAKE) -C $(FINETUNE_DIR) clean - -# Inference targets -.PHONY: inference inference-setup 
inference-jupyter inference-download inference-infer inference-evaluate inference-server inference-lab inference-clean inference-show-env inference-evaluate-all - -inference: - $(MAKE) -C $(INFERENCE_DIR) infer - -inference-show-env: - $(MAKE) -C $(INFERENCE_DIR) show-env - -inference-setup: - $(MAKE) -C $(INFERENCE_DIR) setup-vllm - -inference-jupyter: - $(MAKE) -C $(INFERENCE_DIR) setup-jupyter - -inference-download: - $(MAKE) -C $(INFERENCE_DIR) download - -inference-infer: - $(MAKE) -C $(INFERENCE_DIR) infer - -inference-evaluate: - $(MAKE) -C $(INFERENCE_DIR) evaluate - -inference-evaluate-all: - $(MAKE) -C $(INFERENCE_DIR) evaluate-all - -inference-server: - $(MAKE) -C $(INFERENCE_DIR) start-server - -inference-lab: - $(MAKE) -C $(INFERENCE_DIR) jupyter - -inference-clean: - $(MAKE) -C $(INFERENCE_DIR) clean - -# Clean all -.PHONY: clean -clean: finetune-clean inference-clean - @echo "Cleaned all subprojects" \ No newline at end of file diff --git a/README.md b/README.md index f44943f..db7bdcd 100644 --- a/README.md +++ b/README.md @@ -1,275 +1,445 @@ -# NKI Llama +# NKI-LLAMA: Unified Interface for AWS Neuron -A unified project for fine-tuning, inference, and agent development of Llama models on AWS Trainium and Inferentia. +A unified project for fine-tuning, inference, and agent development of LLaMA models on AWS Trainium and Inferentia using a streamlined bash-based interface. -#### Neuron SDK version - (Neuron 2.23.0 Release) -###### Neuronx Distributed Inference [0.3.5591] -###### NeuronX Distributed Training [1.3.0] +## 📋 Requirements -## Project Workflow +### Neuron SDK Version +- **Neuron 2.23.0 Release** +- **NeuronX Distributed Inference**: 0.3.5591 +- **NeuronX Distributed Training**: 1.3.0 + +### Hardware & AMI +- **Required Instance**: trn1.32xlarge +- **Base AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) with Neuron SDK 2.23 +- **Base Packages**: + - NxD (NeuronX Distributed Training) + - NKI (Neuron Kernel Interface) + - NxDI (NeuronX Distributed Inference) + +## 🔄 Project Workflow ``` -┌────────────────┐ ┌────────────────┐ ┌────────────────┐ -│ │ │ │ │ │ -│ Fine-tune │────▶│ Inference │────▶│ Agent │ -│ │ │ │ │ Development │ -│ │ │ │ │ │ -└────────────────┘ └────────────────┘ └────────────────┘ +┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ ┌──────────────┐ +│ │ │ │ │ │ │ │ +│ Fine-tuning │────▶│ NKI Compilation │────▶│ vLLM Inference │────▶│ Agent │ +│ (NxD) │ │ & Benchmarking │ │ (NxDI) │ │ Development │ +│ │ │ │ │ │ │ │ +└─────────────────┘ └──────────────────┘ └─────────────────┘ └──────────────┐ + │ │ │ + │ │ │ + ▼ ▼ ▼ + Trained Model NKI-Optimized API Endpoint + Model Artifacts (OpenAI Compatible) ``` -This project follows a three-stage workflow: -1. **Fine-tune** a model using Neuron hardware with NxD -2. **Inference** using the fine-tuned model with vLLM, NKI compilation, and NxDI (Neuron Distributed Inference) -3. **Agent Development** using LangChain/LangGraph connected to your model +### Detailed Workflow: -## Technical Infrastructure +1. **Fine-tune** a model using NeuronX Distributed (NxD) on Trainium +2. **NKI Compilation & Benchmarking**: + - Compiles model graphs with Neuron Kernel Interface (NKI) + - Creates optimized artifacts for inference + - Benchmarks performance characteristics + - Supports two modes: `evaluate_single` and `evaluate_all` +3. **vLLM Inference** serves the NKI-compiled model using NeuronX Distributed Inference (NxDI) +4. 
**Agent Development** connects to the inference endpoint for application building -### Compute Resources -- **Required Instance**: trn1.32xlarge -- **Base AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) with Neuron SDK 2.23. -- **Base Packages**: - - NxD (NeuronX Distributed Training) - - NKI (Neuron Kernel Interface) - - NxDI (Neuron Distributed Inference) +### Key Components: +- **NKI (Neuron Kernel Interface)**: Optimizes model operations for AWS Neuron hardware +- **NxD (NeuronX Distributed)**: Enables distributed training across Neuron cores +- **NxDI (NeuronX Distributed Inference)**: Provides optimized inference runtime +- **vLLM**: Serves models with OpenAI-compatible API using Neuron optimizations -## Project Structure +## 🚀 Quick Start -This repository contains three main components: -- **Fine-tuning**: Tools for fine-tuning LLMs on Neuron hardware using NxD -- **Inference**: Infrastructure for efficient inference using vLLM with NKI compilation and NxDI optimization -- **Agent Development**: Building intelligent agents with LangChain/LangGraph +```bash +# Install +chmod +x install.sh +./install.sh -## Setup Steps +# Setup Guide +./nki-llama setup -1. Create a Trainium instance with AWS Neuron SDK v2.23 using EC2 with the following settings: - 1. **Name:** nki-llama - 2. **AMI:** Deep Learning AMI Neuron (Ubuntu 22.04) - 3. **Instance type:** trn1.32xlarge - 4. **Key pair (login):** create a new key pair - 5. When connecting to these instances via SSH, use the username of *ubuntu*. +# Run full benchmark (with NKI compilation) +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama inference benchmark -2. Clone this repository and navigate to it: +# Run quick single evaluation +./nki-llama inference benchmark single -```bash -git clone [REPO_URL] -cd [PATH]/nki-llama +# Start inference server +./nki-llama server ``` -3. Create your `.env` file by copying the provided example: +## 🏗️ Initial Setup + +### 1. Create Trainium Instance + +Create a trn1.32xlarge instance on AWS EC2: +- **Name**: nki-llama +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) +- **Instance type**: trn1.32xlarge +- **Key pair**: Create new key pair +- **Username**: ubuntu (when connecting via SSH) + +### 2. Clone and Install ```bash +# Clone repository +git clone [REPO_URL] +cd nki-llama + +# Run installation +chmod +x install.sh +./install.sh + +# Configure environment cp .env.example .env -# Edit .env file with your preferred settings -nano .env +nano .env # Add your HF_TOKEN and adjust settings ``` -## Environment Setup +## 📁 Project Structure -This project requires three different Python environments: +``` +/home/ubuntu/nki-llama/ +├── nki-llama.sh # Main CLI interface +├── nki-llama.config # Shared configuration +├── .env # Your environment variables +├── .env.example # Example configuration +├── install.sh # Installation script +├── src/ +│ ├── fine-tune/ +│ │ └── scripts/ # Fine-tuning scripts +│ │ ├── bootstrap.sh +│ │ ├── download_data.sh +│ │ ├── download_model.sh +│ │ ├── convert_checkpoints.sh +│ │ ├── precompile.sh +│ │ └── run_training.sh +│ └── inference/ +│ ├── main.py # Inference entry point +│ └── scripts/ # Inference helper scripts +│ ├── setup-vllm.sh +│ ├── download-model.sh +│ ├── run-nki-benchmark.sh # Supports both evaluate_single and evaluate_all modes +│ ├── start-server.sh +│ └── jupyter.sh +└── logs/ # Unified logs + └── benchmarks/ # Benchmark results +``` + +## 🔧 Environment Setup -1. 
**Fine-tuning Environment**: +This project requires three different Python environments: +### 1. Fine-tuning Environment ```bash source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate ``` -2. **Inference Environment**: - +### 2. Inference Environment ```bash source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate ``` -3. **Jupyter Environment** (for agent development): - +### 3. Jupyter Environment (for agent development) ```bash -python3 -m venv venv -source venv/bin/activate -make inference-jupyter # Sets up Jupyter and installs required packages +./nki-llama jupyter setup +source ~/nki-llama/venv/bin/activate ``` -## Fine-tuning Workflow +## 💻 Commands -Our Makefile simplifies the fine-tuning process: +### Quick Commands +- `./nki-llama setup` - Interactive setup wizard +- `./nki-llama train` - Start fine-tuning (shortcut) +- `./nki-llama server` - Start inference server (shortcut) +- `./nki-llama jupyter` - Launch Jupyter Lab +- `./nki-llama status` - Check system status +- `./nki-llama config` - Show configuration + +### Fine-tuning Workflow ```bash -# Activate the fine-tuning environment +# Activate fine-tuning environment source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -# Install dependencies -make finetune-deps +# Run individual steps +./nki-llama finetune deps # Install dependencies +./nki-llama finetune data # Download dataset +./nki-llama finetune model # Download model +./nki-llama finetune convert # Convert checkpoints to NxDT format +./nki-llama finetune compile # Pre-compile graphs (AOT) +./nki-llama finetune train # Start fine-tuning + +# Or run all at once +./nki-llama finetune all +``` + +### NKI Benchmark Modes + +The benchmark script supports two evaluation modes: + +#### 1. evaluate_single Mode +Quick single evaluation using the repository test script: +```bash +# Activate inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate + +# Run single evaluation +./nki-llama inference benchmark single -# Download dataset -make finetune-data +# Or explicitly specify mode +./nki-llama inference benchmark --mode evaluate_single +``` -# Download model -make finetune-model +#### 2. evaluate_all Mode (Default) +Comprehensive benchmark with NKI compilation and all configurations: +```bash +# Download model if not already available +./nki-llama inference download -# Convert checkpoint to NxDT format -make finetune-convert +# Run full benchmark (compiles model on first run) +./nki-llama inference benchmark -# Pre-compile graphs (AOT) -make finetune-precompile +# Or with custom parameters +./nki-llama inference benchmark --seq-len 1024 --tp-degree 8 -# Run fine-tuning job -make finetune-train +# Run without NKI optimizations +./nki-llama inference benchmark --no-nki ``` -## Inference Workflow +**Key differences:** +- **evaluate_single**: Quick validation, runs from repository test script +- **evaluate_all**: Full benchmark with model compilation, creates cached artifacts for vLLM -The inference pipeline includes NKI (Neuron Kernel Interface) compilation and NxDI (Neuron Distributed Inference) integration with vLLM for optimal performance on Neuron hardware. +**Note**: The `evaluate_all` mode automatically compiles the model with NKI optimizations on the first run (10-30 minutes). Subsequent runs use the cached compilation. 
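+Each benchmark run also writes a `metadata.json` under `logs/benchmarks/<timestamp>/` recording the mode and settings used. A minimal sketch for comparing runs, assuming `jq` is installed (the fields shown match the metadata files committed in this repo):
+
+```bash
+# Summarize all benchmark runs found under logs/benchmarks/
+for meta in logs/benchmarks/*/metadata.json; do
+    jq -r '"\(.timestamp)  mode=\(.mode)  model=\(.model_name)  tp=\(.tensor_parallel_size)  seq_len=\(.sequence_length)  nki=\(.nki_enabled)"' "$meta"
+done
+```
+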
-Use our Makefile to simplify the setup and execution process for inference: +### Inference Serving ```bash -# Activate the inference environment -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +# After benchmarking (which compiles the model), start the API server -# Setup vLLM for Neuron -make inference-setup +# Setup vLLM +./nki-llama inference setup # Setup vLLM for Neuron -# Download model from Hugging Face (you'll need a HF token) -# (skip this step if using your fine-tuned model) -make inference-download +# Start the API server +./nki-llama inference server # Start OpenAI-compatible API +``` + +## 🤖 Agent Development + +This repository includes support for building LLM-powered agents using LangGraph and LangChain. A sample travel planning agent demonstrates: + +- Context-aware travel itinerary generation +- Multi-turn conversation with memory +- Dynamic workflow management using LangGraph +- Integration with vLLM for efficient inference on Trainium + +### Using Jupyter for Agent Development -# The model will be automatically compiled with NKI and optimized for NxDI -# when the server starts for the first time +```bash +# Terminal 1: Start the inference server +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama server -# Start the vLLM OpenAI-compatible API server with NxDI -make inference-server +# Terminal 2: Start Jupyter Lab +./nki-llama jupyter +# Access at http://your-ip:8888 +# Select the "nki-llama" kernel in Jupyter ``` -### Environment Configuration +## ⚙️ Configuration + +All configuration is managed through: +1. `nki-llama.config` - System paths and defaults (includes TEST_SCRIPT, MAIN_SCRIPT, etc.) +2. `.env` - Your personal configuration -The repository includes a `.env.example` file with template configuration. Copy this file to create your own `.env`: +### Key Variables ```bash -# .env file -# Model configuration -## HuggingFace Model ID (https://huggingface.co/meta-llama/Meta-Llama-3-8B) +# Model Configuration MODEL_ID=meta-llama/Meta-Llama-3-8B -## Short name for model ID -MODEL_NAME=meta-llama-3-8b - -# Server configurations -PORT=8080 +MODEL_NAME=llama-3-8b +HF_TOKEN=your_huggingface_token + +# Training Parameters +BATCH_SIZE=1 +MAX_STEPS=1000 +SEQ_LENGTH=2048 +TENSOR_PARALLEL_SIZE=8 +LEARNING_RATE=5e-5 + +# Inference Parameters +INFERENCE_PORT=8080 MAX_MODEL_LEN=2048 -TENSOR_PARALLEL_SIZE=32 +MAX_NUM_SEQS=4 -HF_TOKEN=your_token_here +# Dataset Configuration +DATASET_NAME=databricks/databricks-dolly-15k ``` -The Makefile will automatically load this configuration if present, or prompt you for values if not set. +## 📊 Monitoring -### Running Inference +### Check Status +```bash +./nki-llama status +``` -The Makefile provides several commands for running inference and evaluation: +### View Logs +```bash +# Logs are stored with timestamps +ls logs/ +tail -f logs/nki-llama_*.log +# Benchmark results +ls logs/benchmarks/ +cat logs/benchmarks/*/metadata.json +``` + +### Neuron Monitoring ```bash -# Activate the inference environment +neuron-ls # List Neuron devices +neuron-top # Monitor Neuron usage +``` + +## 🔍 Complete Workflow Example + +Here's a complete end-to-end workflow with tmux best practices: + +### 1. 
Fine-tune a Model +```bash +# Create tmux session for training +tmux new -s training + +# Inside tmux: activate environment and run training +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +./nki-llama finetune all + +# Detach from tmux: Ctrl+B, D +# Check progress later: tmux attach -t training +``` + +### 2. Benchmark Model with NKI +```bash +# Create tmux session for benchmarking +tmux new -s benchmark + +# Inside tmux: run benchmarks source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -# Download model from Hugging Face (you'll need a HF token) -# (skip this step if using your fine-tuned model) -make inference-download +# Quick single evaluation with compilation (first run compiles) +# ./nki-llama inference benchmark single (TODO: FIX COMMAND) -# Run inference in generate mode -make inference-infer +# Or full benchmark with compilation (first run compiles) +./nki-llama inference benchmark -# Run in evaluate-all mode -make inference-evaluate-all +# Detach and let it run: Ctrl+B, D ``` -## Agent Development +### 3. View Benchmark Results +```bash +# After benchmarking completes +ls logs/benchmarks/ +cat logs/benchmarks/*/metadata.json -This repository includes support for building LLM-powered agents using LangGraph and LangChain. A sample travel planning agent is included that demonstrates how to build a stateful agent workflow with the following capabilities: +# View detailed logs +cat logs/benchmarks/*/benchmark.log +``` -- Context-aware travel itinerary generation -- Multi-turn conversation with memory -- Dynamic workflow management using LangGraph -- Integration with VLLMOpenAI for efficient inference on Trainium +### 4. Serve with vLLM +```bash +# Create tmux session for server +tmux new -s vllm -### Jupyter Notebook +# Inside tmux: start the server +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama server -The repository includes a Jupyter notebook for developing and testing agents. To use it: +# Server automatically uses NKI-compiled artifacts +# Detach: Ctrl+B, D +``` -1. Ensure you've started the vLLM server in one terminal: `make inference-server` -2. Start Jupyter Lab in another terminal: +### 5. Build Agents +```bash +# In a new terminal +./nki-llama jupyter + +# Your model is now available at http://localhost:8080 +# Build agents using the OpenAI-compatible API +``` +### Managing tmux Sessions ```bash -# Activate the Jupyter environment -source venv/bin/activate +# List all sessions +tmux ls + +# Attach to a session +tmux attach -t training +tmux attach -t benchmark +tmux attach -t vllm -# Start Jupyter Lab -make inference-lab +# Kill a session +tmux kill-session -t training ``` -3. 
Open the travel planning notebook and select the "neuron_agents" kernel - -## Makefile Commands - -| Command | Description | -|---------|-------------| -| **General** | -| `make help` | Show help message for all commands | -| `make clean` | Clean all generated files | -| **Fine-tuning** | -| `make finetune` | Run all fine-tuning steps | -| `make finetune-deps` | Install fine-tuning dependencies | -| `make finetune-data` | Download datasets for fine-tuning | -| `make finetune-model` | Download model for fine-tuning | -| `make finetune-convert` | Convert checkpoint to NxDT format | -| `make finetune-precompile` | Pre-compile graphs (AOT) | -| `make finetune-train` | Run fine-tuning job | -| `make finetune-clean` | Clean up fine-tuning files | -| **Inference** | -| `make inference` | Run inference (shortcut to inference-infer) | -| `make inference-setup` | Setup vLLM for Neuron | -| `make inference-jupyter` | Setup Jupyter environment | -| `make inference-download` | Download model from Hugging Face | -| `make inference-infer` | Run inference in generate mode (wip) | -| `make inference-evaluate` | Run inference in evaluate mode | -| `make inference-server` | Start vLLM OpenAI-compatible API server | -| `make inference-lab` | Run Jupyter Lab server | -| `make inference-clean` | Clean up inference files | - -## Environment Requirements - -- For fine-tuning: `source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate` -- For inference: `source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate` -- For agent development (Jupyter): `source venv/bin/activate` - -## Full Workflow Example - -Here's a complete workflow example combining all components: - -1. **Fine-tune a model**: - ```bash - source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate - make finetune - ``` - -2. **Serve the model** for inference: - ```bash - source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate - make inference-setup - # You can either use your fine-tuned model or download one - # make inference-download - - # The model will be compiled with NKI and optimized for NxDI - # when you first start the server (this may take a few minutes) - make inference-server - ``` - -3. **Build agents** with the served model: +## 🚨 Troubleshooting +### Environment Issues ```bash -# In a new terminal -source venv/bin/activate -make inference-jupyter -make inference-lab -# Open the Jupyter notebook and connect to your model +# Check active environment +./nki-llama status + +# Wrong environment error? 
+# For fine-tuning: +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# For inference: +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +### Benchmark Modes +- **evaluate_single**: Use for quick validation tests +- **evaluate_all**: Use for full performance evaluation and model compilation +- First-time compilation with NKI (in evaluate_all mode) can take 10-30 minutes +- Compiled models are cached in `~/traced_model/` +- Subsequent benchmark runs will use the cached compilation + +### Memory Issues +- Ensure you're using trn1.32xlarge for full model support +- Monitor memory usage with `neuron-top` +- Adjust `TENSOR_PARALLEL_SIZE` if needed + +### Using with tmux +For long-running operations like training, benchmarking, or serving: + +```bash +# Create a new tmux session +tmux new -s session-name + +# Run your command +./nki-llama [command] + +# Detach from session +Ctrl+B, then D + +# List sessions +tmux ls + +# Reattach to session +tmux attach -t session-name ``` ---- +## 🤝 Contributing + +The modular design makes it easy to add new features: + +1. Add new scripts to `scripts/` directory +2. Update command handlers in `nki-llama.sh` +3. Add configuration to `nki-llama.config` + +## 📄 License © 2025 Amazon Web Services. All rights reserved. + +This project integrates with AWS Neuron SDK and follows its licensing terms. \ No newline at end of file diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..3881b56 --- /dev/null +++ b/install.sh @@ -0,0 +1,189 @@ +#!/bin/bash +# install.sh - NKI-LLAMA installation script + +set -euo pipefail + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +RED='\033[0;31m' +CYAN='\033[0;36m' +NC='\033[0m' +BOLD='\033[1m' + +# Banner +echo -e "${CYAN}" +cat << 'EOF' + _ __ __ __ ____ __ __ ___ __ ___ ___ + / | / // //_// _/ / / / / / | / |/ / / | + / |/ // ,< / /______ / / / / / /| | / /|_/ / / /| | + / /| // /| |_/ /_______/ /___/ /___ / ___ |/ / / / / ___ | +/_/ |_//_/ |_/___/ /_____/_____/ /_/ |_/_/ /_/ /_/ |_| + +EOF +echo -e "${NC}" + +echo -e "${BOLD}NKI-LLAMA Installation${NC}" +echo -e "====================" +echo + +# Get installation directory +INSTALL_DIR="${1:-$(pwd)}" +echo -e "${BLUE}Installing to: ${INSTALL_DIR}${NC}" + +# Create directories +echo -e "\n${YELLOW}Creating directory structure...${NC}" +mkdir -p "${INSTALL_DIR}/logs/benchmarks" +mkdir -p "${INSTALL_DIR}/src/inference/scripts" +mkdir -p "${INSTALL_DIR}/src/fine-tune/scripts" + +# Make scripts executable +echo -e "${YELLOW}Setting up scripts...${NC}" +chmod +x "${INSTALL_DIR}/nki-llama.sh" 2>/dev/null || true +chmod +x "${INSTALL_DIR}/src/inference/scripts/"*.sh 2>/dev/null || true +chmod +x "${INSTALL_DIR}/src/fine-tune/scripts/"*.sh 2>/dev/null || true + +# Create symlink for easier access +if [[ -f "${INSTALL_DIR}/nki-llama.sh" ]] && [[ ! -f "${INSTALL_DIR}/nki-llama" ]]; then + ln -s "${INSTALL_DIR}/nki-llama.sh" "${INSTALL_DIR}/nki-llama" + echo -e "${GREEN}✓ Created nki-llama symlink${NC}" +fi + +# Copy example environment file +if [[ ! 
-f "${INSTALL_DIR}/.env" ]]; then + if [[ -f "${INSTALL_DIR}/.env.example" ]]; then + cp "${INSTALL_DIR}/.env.example" "${INSTALL_DIR}/.env" + echo -e "${GREEN}✓ Created .env file from example${NC}" + echo -e "${YELLOW} Please edit .env and add your HF_TOKEN${NC}" + else + # Create a basic .env file if no example exists + cat > "${INSTALL_DIR}/.env" << 'EOF' +# NKI-LLAMA Configuration +HF_TOKEN= +MODEL_ID=meta-llama/Meta-Llama-3-8B +MODEL_NAME=llama-3-8b +TENSOR_PARALLEL_SIZE=8 +SEQ_LENGTH=2048 +BATCH_SIZE=1 +MAX_STEPS=1000 +LEARNING_RATE=5e-5 +INFERENCE_PORT=8080 +MAX_MODEL_LEN=2048 +MAX_NUM_SEQS=4 +DATASET_NAME=databricks/databricks-dolly-15k +EOF + echo -e "${GREEN}✓ Created default .env file${NC}" + echo -e "${YELLOW} Please edit .env and add your HF_TOKEN${NC}" + fi +fi + +# Check for Neuron environments +echo -e "\n${BOLD}Checking Neuron environments...${NC}" +MISSING_ENV=false + +if [[ -d "/opt/aws_neuronx_venv_pytorch_2_6" ]]; then + echo -e "${GREEN}✓ Fine-tuning environment found${NC}" +else + echo -e "${RED}✗ Fine-tuning environment not found${NC}" + echo -e " ${YELLOW}Expected at: /opt/aws_neuronx_venv_pytorch_2_6${NC}" + MISSING_ENV=true +fi + +if [[ -d "/opt/aws_neuronx_venv_pytorch_2_6_nxd_inference" ]]; then + echo -e "${GREEN}✓ Inference environment found${NC}" +else + echo -e "${RED}✗ Inference environment not found${NC}" + echo -e " ${YELLOW}Expected at: /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference${NC}" + MISSING_ENV=true +fi + +# Check for tmux +echo -e "\n${BOLD}Checking system dependencies...${NC}" +if command -v tmux &> /dev/null; then + TMUX_VERSION=$(tmux -V | cut -d' ' -f2) + echo -e "${GREEN}✓ tmux ${TMUX_VERSION} found${NC}" +else + echo -e "${YELLOW}⚠ tmux not found - recommended for long-running operations${NC}" + echo -e " Install with: ${CYAN}sudo apt-get install tmux${NC}" +fi + +# Check for neuron-ls +if command -v neuron-ls &> /dev/null; then + echo -e "${GREEN}✓ Neuron SDK tools found${NC}" +else + echo -e "${YELLOW}⚠ neuron-ls not found - Neuron SDK may not be installed${NC}" +fi + +# Verify configuration file +if [[ -f "${INSTALL_DIR}/nki-llama.config" ]]; then + echo -e "${GREEN}✓ Configuration file found${NC}" +else + echo -e "${RED}✗ nki-llama.config not found!${NC}" + echo -e " This file is required for operation" +fi + +# Check if running on correct instance +if [[ -f /sys/devices/virtual/dmi/id/product_name ]]; then + INSTANCE_TYPE=$(cat /sys/devices/virtual/dmi/id/product_name 2>/dev/null || echo "unknown") + if [[ "$INSTANCE_TYPE" == *"trn1"* ]]; then + echo -e "${GREEN}✓ Running on Trainium instance (${INSTANCE_TYPE})${NC}" + else + echo -e "${YELLOW}⚠ Not running on Trainium instance${NC}" + echo -e " Current: ${INSTANCE_TYPE}" + echo -e " Recommended: trn1.32xlarge" + fi +fi + +# Installation summary +echo -e "\n${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +if [[ "$MISSING_ENV" == "true" ]]; then + echo -e "${YELLOW}⚠️ Installation completed with warnings${NC}" + echo -e "\nSome Neuron environments are missing. This is expected if you're not on" + echo -e "a Neuron-enabled instance or haven't installed the Neuron SDK yet." +else + echo -e "${GREEN}✅ Installation complete!${NC}" +fi +echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + +# Next steps +echo -e "\n${BOLD}Next steps:${NC}" +echo -e "1. 
${YELLOW}Configure:${NC} Edit ${CYAN}${INSTALL_DIR}/.env${NC} with your settings" +echo -e " • Add your Hugging Face token (HF_TOKEN)" +echo -e " • Adjust model and training parameters as needed" +echo + +if [[ "$MISSING_ENV" == "true" ]]; then + echo -e "2. ${YELLOW}Install Neuron SDK:${NC} Follow AWS documentation to install Neuron SDK" + echo -e " This will create the required virtual environments" + echo + echo -e "3. ${YELLOW}Activate environment:${NC}" +else + echo -e "2. ${YELLOW}Activate environment:${NC}" +fi +echo -e " Fine-tuning: ${CYAN}source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate${NC}" +echo -e " Inference: ${CYAN}source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate${NC}" +echo + +if [[ "$MISSING_ENV" == "true" ]]; then + echo -e "4. ${YELLOW}Get started:${NC}" +else + echo -e "3. ${YELLOW}Get started:${NC}" +fi +echo -e " • Run ${CYAN}./nki-llama setup${NC} for interactive setup" +echo -e " • Use ${CYAN}./nki-llama help${NC} to see all commands" +echo -e " • Check ${CYAN}./nki-llama status${NC} to verify installation" +echo + +echo -e "${BOLD}Workflow overview:${NC}" +echo -e " Fine-tuning → NKI Compilation → vLLM Inference → Agent Development" +echo + +if ! command -v tmux &> /dev/null; then + echo -e "${YELLOW}💡 Tip:${NC} Install tmux for better experience with long-running tasks:" + echo -e " ${CYAN}sudo apt-get update && sudo apt-get install tmux${NC}" + echo +fi + +echo -e "${BLUE}Documentation:${NC} See README.md for detailed usage instructions" +echo \ No newline at end of file diff --git a/logs/benchmarks/20250606_150849/metadata.json b/logs/benchmarks/20250606_150849/metadata.json new file mode 100644 index 0000000..f71d405 --- /dev/null +++ b/logs/benchmarks/20250606_150849/metadata.json @@ -0,0 +1,11 @@ +{ + "timestamp": "20250606_150849", + "mode": "evaluate_all", + "model_name": "llama-3-8b", + "model_path": "/home/ubuntu/models/llama-3-8b", + "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", + "nki_enabled": true, + "sequence_length": 640, + "tensor_parallel_size": 8, + "neuron_rt_cores": "8" +} diff --git a/logs/benchmarks/20250606_150959/metadata.json b/logs/benchmarks/20250606_150959/metadata.json new file mode 100644 index 0000000..8a28790 --- /dev/null +++ b/logs/benchmarks/20250606_150959/metadata.json @@ -0,0 +1,11 @@ +{ + "timestamp": "20250606_150959", + "mode": "evaluate_all", + "model_name": "llama-3-8b", + "model_path": "/home/ubuntu/models/llama-3-8b", + "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", + "nki_enabled": true, + "sequence_length": 640, + "tensor_parallel_size": 8, + "neuron_rt_cores": "8" +} diff --git a/logs/benchmarks/20250606_151034/metadata.json b/logs/benchmarks/20250606_151034/metadata.json new file mode 100644 index 0000000..7e7b0c6 --- /dev/null +++ b/logs/benchmarks/20250606_151034/metadata.json @@ -0,0 +1,11 @@ +{ + "timestamp": "20250606_151034", + "mode": "evaluate_all", + "model_name": "llama-3-8b", + "model_path": "/home/ubuntu/models/llama-3-8b", + "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", + "nki_enabled": true, + "sequence_length": 640, + "tensor_parallel_size": 8, + "neuron_rt_cores": "8" +} diff --git a/logs/benchmarks/20250606_151230/metadata.json b/logs/benchmarks/20250606_151230/metadata.json new file mode 100644 index 0000000..ab28ac4 --- /dev/null +++ b/logs/benchmarks/20250606_151230/metadata.json @@ -0,0 +1,11 @@ +{ + "timestamp": "20250606_151230", + "mode": "evaluate_single", + "model_name": "llama-3-8b", + "model_path": 
"/home/ubuntu/models/llama-3-8b", + "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", + "nki_enabled": true, + "sequence_length": 640, + "tensor_parallel_size": 8, + "neuron_rt_cores": "8" +} diff --git a/logs/benchmarks/20250606_151450/metadata.json b/logs/benchmarks/20250606_151450/metadata.json new file mode 100644 index 0000000..37d15d4 --- /dev/null +++ b/logs/benchmarks/20250606_151450/metadata.json @@ -0,0 +1,11 @@ +{ + "timestamp": "20250606_151450", + "mode": "evaluate_single", + "model_name": "llama-3-8b", + "model_path": "/home/ubuntu/models/llama-3-8b", + "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", + "nki_enabled": true, + "sequence_length": 640, + "tensor_parallel_size": 8, + "neuron_rt_cores": "8" +} diff --git a/nki-llama b/nki-llama new file mode 120000 index 0000000..e26e708 --- /dev/null +++ b/nki-llama @@ -0,0 +1 @@ +/home/ubuntu/nki-llama/nki-llama.sh \ No newline at end of file diff --git a/nki-llama.config b/nki-llama.config new file mode 100644 index 0000000..0af05e0 --- /dev/null +++ b/nki-llama.config @@ -0,0 +1,68 @@ +#!/bin/bash +# nki-llama.config - Shared configuration for NKI-LLAMA projects + +# Project Structure +export NKI_ROOT="${HOME}/nki-llama" +export NKI_SRC="${NKI_ROOT}/src" +export NKI_FINETUNE="${NKI_SRC}/fine-tune" +export NKI_INFERENCE="${NKI_SRC}/inference" +export NKI_FINETUNE_SCRIPTS="${NKI_FINETUNE}/scripts" +export NKI_INFERENCE_SCRIPTS="${NKI_INFERENCE}/scripts" +export NKI_LOGS="${NKI_ROOT}/logs" +export NKI_MODELS="${HOME}/models" +export NKI_COMPILED="${HOME}/traced_model" + +# Model Configuration +export MODEL_ID="${MODEL_ID:-meta-llama/Meta-Llama-3-8B}" +export MODEL_NAME="${MODEL_NAME:-llama-3-8b}" +export HF_TOKEN="${HF_TOKEN:-}" + +# Training Configuration +export BATCH_SIZE="${BATCH_SIZE:-1}" +export MAX_STEPS="${MAX_STEPS:-1000}" +export SEQ_LENGTH="${SEQ_LENGTH:-2048}" +export TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-8}" +export LEARNING_RATE="${LEARNING_RATE:-5e-5}" + +# Inference Configuration +export INFERENCE_PORT="${INFERENCE_PORT:-8080}" +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-2048}" +export MAX_NUM_SEQS="${MAX_NUM_SEQS:-4}" +export ENABLE_NKI="${ENABLE_NKI:-true}" + +# Neuron Configuration +export NEURON_VENV="/opt/aws_neuronx_venv_pytorch_2_6" +export NEURON_INFERENCE_VENV="/opt/aws_neuronx_venv_pytorch_2_6_nxd_inference" +export NEURON_COMPILE_CACHE="${HOME}/.cache/neuron" +export NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES:-8}" + +# vLLM Configuration +export VLLM_REPO="${HOME}/upstreaming-to-vllm" +export VLLM_BRANCH="neuron-2.22-vllm-v0.7.2" +export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" + +# Dataset Configuration +export DATASET_NAME="${DATASET_NAME:-databricks/databricks-dolly-15k}" +export DATASET_DIR="${NKI_FINETUNE}/datasets" +export TOKENIZER_DIR="${NKI_FINETUNE}/model_assets/llama3_tokenizer" + +# Checkpoint Paths +export HF_WEIGHTS_DIR="${NKI_FINETUNE}/model_assets/llama3-8B_hf_weights_bin" +export PRETRAINED_CKPT="${NKI_FINETUNE}/model_assets/pckpt" +export NEMO_EXPERIMENTS="${NKI_FINETUNE}/nemo_experiments" + +# Jupyter Configuration +export JUPYTER_PORT="${JUPYTER_PORT:-8888}" +export JUPYTER_VENV="${NKI_ROOT}/venv" + +# Function to print configuration +print_config() { + echo "NKI-LLAMA Configuration:" + echo "=======================" + echo "Project Root: ${NKI_ROOT}" + echo "Model: ${MODEL_NAME} (${MODEL_ID})" + echo "Fine-tune Scripts: ${NKI_FINETUNE_SCRIPTS}" + echo "Inference Scripts: ${NKI_INFERENCE_SCRIPTS}" + echo "Tensor Parallel Size: 
${TENSOR_PARALLEL_SIZE}" + echo "Sequence Length: ${SEQ_LENGTH}" +} \ No newline at end of file diff --git a/nki-llama.sh b/nki-llama.sh new file mode 100755 index 0000000..a9a3654 --- /dev/null +++ b/nki-llama.sh @@ -0,0 +1,520 @@ +#!/bin/bash +# nki-llama - Unified CLI for fine-tuning and inference + +set -euo pipefail + +# Get script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Create symlink for easier access +if [[ -f "$SCRIPT_DIR/nki-llama.sh" ]] && [[ ! -f "$SCRIPT_DIR/nki-llama" ]]; then + ln -s "$SCRIPT_DIR/nki-llama.sh" "$SCRIPT_DIR/nki-llama" +fi + +# Load configuration +if [[ -f "${SCRIPT_DIR}/nki-llama.config" ]]; then + source "${SCRIPT_DIR}/nki-llama.config" +else + echo "Error: nki-llama.config not found!" + exit 1 +fi + +# Load environment file if exists +if [[ -f "${SCRIPT_DIR}/.env" ]]; then + set -a + source "${SCRIPT_DIR}/.env" + set +a +fi + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +MAGENTA='\033[0;35m' +NC='\033[0m' +BOLD='\033[1m' + +# Banner +display_banner() { + echo -e "${CYAN}" + cat << 'EOF' + _ __ __ __ ____ __ __ ___ __ ___ ___ + / | / // //_// _/ / / / / / | / |/ / / | + / |/ // ,< / /______ / / / / / /| | / /|_/ / / /| | + / /| // /| |_/ /_______/ /___/ /___ / ___ |/ / / / / ___ | +/_/ |_//_/ |_/___/ /_____/_____/ /_/ |_/_/ /_/ /_/ |_| + +EOF + echo -e "${NC}" +} + +# Tmux helper functions +check_tmux_session() { + local session_name="$1" + tmux has-session -t "$session_name" 2>/dev/null +} + +suggest_tmux() { + local operation="$1" + local session_name="$2" + shift 2 + local args="$*" + + if [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${YELLOW}💡 tmux Recommended for ${operation}${NC}" + echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo + echo -e "This operation may take a long time. We recommend using tmux:" + echo + echo -e "${CYAN}# Create new session:${NC}" + echo -e "tmux new -s ${session_name}" + echo -e "./nki-llama ${args}" + echo + echo -e "${CYAN}# Or run directly in tmux:${NC}" + echo -e "tmux new -s ${session_name} './nki-llama ${args}'" + echo + echo -e "${CYAN}# Detach with: Ctrl+B, D${NC}" + echo -e "${CYAN}# Reattach with: tmux attach -t ${session_name}${NC}" + echo + echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo + fi +} + +# Check active Neuron environment +check_neuron_env() { + if [[ -z "${VIRTUAL_ENV:-}" ]]; then + echo -e "${RED}❌ No virtual environment active${NC}" + return 1 + elif [[ "$VIRTUAL_ENV" == *"pytorch_2_6"* ]]; then + echo -e "${GREEN}✓ Fine-tuning environment active${NC}" + return 0 + elif [[ "$VIRTUAL_ENV" == *"pytorch_2_6_nxd_inference"* ]]; then + echo -e "${GREEN}✓ Inference environment active${NC}" + return 0 + else + echo -e "${YELLOW}⚠️ Unknown environment: ${VIRTUAL_ENV}${NC}" + return 1 + fi +} + +# Initialize logging +init_logging() { + mkdir -p "$NKI_LOGS" + TIMESTAMP=$(date +%Y%m%d_%H%M%S) + LOG_FILE="$NKI_LOGS/nki-llama_${TIMESTAMP}.log" + exec 1> >(tee -a "$LOG_FILE") + exec 2>&1 + echo -e "${BLUE}📝 Logging to: ${LOG_FILE}${NC}" +} + +# Run script with error handling +run_script() { + local script_path="$1" + local display_name="$2" + shift 2 + + if [[ ! 
-f "$script_path" ]]; then + echo -e "${RED}❌ Script not found: $script_path${NC}" + return 1 + fi + + echo -e "${MAGENTA}▶ Running: ${display_name}${NC}" + if bash "$script_path" "$@"; then + echo -e "${GREEN}✓ ${display_name} completed${NC}\n" + else + echo -e "${RED}✗ ${display_name} failed${NC}\n" + return 1 + fi +} + +############################################################################### +# Fine-tuning Commands +############################################################################### + +cmd_finetune_deps() { + echo -e "${BOLD}Installing fine-tuning dependencies...${NC}" + run_script "${NKI_FINETUNE_SCRIPTS}/bootstrap.sh" "Dependencies Installation" +} + +cmd_finetune_data() { + echo -e "${BOLD}Downloading dataset...${NC}" + run_script "${NKI_FINETUNE_SCRIPTS}/download_data.sh" "Dataset Download" +} + +cmd_finetune_model() { + echo -e "${BOLD}Downloading model weights...${NC}" + run_script "${NKI_FINETUNE_SCRIPTS}/download_model.sh" "Model Download" +} + +cmd_finetune_convert() { + echo -e "${BOLD}Converting checkpoints...${NC}" + run_script "${NKI_FINETUNE_SCRIPTS}/convert_checkpoints.sh" "Checkpoint Conversion" +} + +cmd_finetune_compile() { + echo -e "${BOLD}Pre-compiling graphs...${NC}" + suggest_tmux "Graph Compilation" "compile-graphs" "finetune compile" + run_script "${NKI_FINETUNE_SCRIPTS}/precompile.sh" "Graph Compilation" +} + +cmd_finetune_train() { + echo -e "${BOLD}Starting fine-tuning...${NC}" + suggest_tmux "Fine-tuning" "training" "finetune train" + run_script "${NKI_FINETUNE_SCRIPTS}/run_training.sh" "Fine-tuning" +} + +cmd_finetune_all() { + echo -e "${BOLD}Running complete fine-tuning pipeline...${NC}\n" + cmd_finetune_deps && \ + cmd_finetune_data && \ + cmd_finetune_model && \ + cmd_finetune_convert && \ + cmd_finetune_compile && \ + cmd_finetune_train +} + +############################################################################### +# Inference Commands +############################################################################### + +cmd_inference_setup() { + echo -e "${BOLD}Setting up vLLM for inference...${NC}" + bash "${NKI_INFERENCE_SCRIPTS}/setup-vllm.sh" +} + +cmd_inference_download() { + echo -e "${BOLD}Downloading model for inference...${NC}" + bash "${NKI_INFERENCE_SCRIPTS}/download-model.sh" +} + +cmd_inference_benchmark() { + echo -e "${BOLD}Running NKI benchmark evaluation...${NC}" + + # Parse benchmark mode + local mode="evaluate_all" # Default mode + local args=() + + while [[ $# -gt 0 ]]; do + case $1 in + single) + mode="evaluate_single" + shift + ;; + all) + mode="evaluate_all" + shift + ;; + --mode) + mode="$2" + shift 2 + ;; + *) + args+=("$1") + shift + ;; + esac + done + + echo -e "${YELLOW}💡 Running benchmark in ${mode} mode${NC}" + + if [[ "$mode" == "evaluate_single" ]]; then + echo -e "${YELLOW} This runs a quick single evaluation from the repository test script.${NC}" + else + echo -e "${YELLOW} This includes model compilation with NKI optimizations (10-30 min on first run).${NC}" + echo -e "${YELLOW} The compiled model will be cached for future use.${NC}" + fi + + echo -e "${YELLOW} Using tmux is strongly recommended!${NC}" + + # Check if we're in tmux + if [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Not running in tmux. Consider using:${NC}" + echo -e " ${CYAN}tmux new -s benchmark${NC}" + echo -e " ${CYAN}./nki-llama inference benchmark ${mode} ${args[*]}${NC}" + echo + read -p "Continue without tmux? 
[Y/n] " -n 1 -r + echo + if [[ $REPLY =~ ^[Nn]$ ]]; then + echo -e "${BLUE}Start tmux with: tmux new -s benchmark${NC}" + exit 0 + fi + fi + + bash "${NKI_INFERENCE_SCRIPTS}/run-nki-benchmark.sh" --mode "$mode" "${args[@]}" +} + +cmd_inference_server() { + echo -e "${BOLD}Starting vLLM server...${NC}" + suggest_tmux "vLLM Server" "vllm-server" "inference server" + bash "${NKI_INFERENCE_SCRIPTS}/start-server.sh" +} + +############################################################################### +# Utility Commands +############################################################################### + +cmd_status() { + echo -e "\n${BOLD}System Status:${NC}" + check_neuron_env + echo + + echo -e "${BOLD}Configuration:${NC}" + print_config + echo + + echo -e "${BOLD}Fine-tuning Status:${NC}" + [[ -d "$DATASET_DIR" ]] && echo -e "• Dataset: ${GREEN}✓${NC}" || echo -e "• Dataset: ${YELLOW}⚠${NC}" + [[ -d "$HF_WEIGHTS_DIR" ]] && echo -e "• Weights: ${GREEN}✓${NC}" || echo -e "• Weights: ${YELLOW}⚠${NC}" + [[ -d "$PRETRAINED_CKPT" ]] && echo -e "• Checkpoints: ${GREEN}✓${NC}" || echo -e "• Checkpoints: ${YELLOW}⚠${NC}" + [[ -d "$NEMO_EXPERIMENTS" ]] && echo -e "• Training: ${GREEN}✓${NC}" || echo -e "• Training: ${YELLOW}⚠${NC}" + echo + + echo -e "${BOLD}Inference Status:${NC}" + [[ -d "${NKI_MODELS}/${MODEL_NAME}" ]] && echo -e "• Model: ${GREEN}✓${NC}" || echo -e "• Model: ${YELLOW}⚠${NC}" + [[ -d "${NKI_COMPILED}/${MODEL_NAME}" ]] && echo -e "• Compiled: ${GREEN}✓${NC}" || echo -e "• Compiled: ${YELLOW}⚠${NC}" + [[ -d "$VLLM_REPO" ]] && echo -e "• vLLM: ${GREEN}✓${NC}" || echo -e "• vLLM: ${YELLOW}⚠${NC}" + + if command -v neuron-ls &> /dev/null; then + echo -e "\n${BOLD}Neuron Hardware:${NC}" + + # Extract instance info + INSTANCE_TYPE=$(neuron-ls | grep "instance-type:" | cut -d' ' -f2) + echo -e "• Instance: ${CYAN}${INSTANCE_TYPE}${NC}" + + # Parse device information + DEVICE_INFO=$(neuron-ls | grep -E "^\| [0-9]+ ") + DEVICE_COUNT=$(echo "$DEVICE_INFO" | wc -l) + + if [[ $DEVICE_COUNT -gt 0 ]]; then + # Calculate totals + TOTAL_CORES=$(( DEVICE_COUNT * 2 )) + TOTAL_MEMORY=$(( DEVICE_COUNT * 32 )) + + # Count busy devices - fixed version + if echo "$DEVICE_INFO" | grep -q "python"; then + BUSY_COUNT=$(echo "$DEVICE_INFO" | grep -c "python") + else + BUSY_COUNT=0 + fi + FREE_COUNT=$(( DEVICE_COUNT - BUSY_COUNT )) + + echo -e "• Devices: ${GREEN}${DEVICE_COUNT}${NC} (${FREE_COUNT} free, ${BUSY_COUNT} busy)" + echo -e "• Total: ${TOTAL_CORES} cores, ${TOTAL_MEMORY}GB memory" + + # Show runtime version if available + RUNTIME_VERSION=$(neuron-ls | awk '/RUNTIME/ && /VERSION/ {getline; if (match($0, /[0-9]+\.[0-9]+\.[0-9]+/)) print substr($0, RSTART, RLENGTH)}' | head -1) + if [[ -n "$RUNTIME_VERSION" ]]; then + echo -e "• Runtime: v${RUNTIME_VERSION}" + fi + else + echo -e "${YELLOW}⚠ No Neuron devices detected${NC}" + fi + else + echo -e "\n${YELLOW}⚠ neuron-ls not found - Neuron SDK may not be installed${NC}" + fi +} + +cmd_clean() { + echo -e "${YELLOW}🧹 Cleaning generated files...${NC}" + read -p "Clean fine-tuning artifacts? [y/N] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf "$DATASET_DIR" "$TOKENIZER_DIR" "$HF_WEIGHTS_DIR" "$PRETRAINED_CKPT" "$NEMO_EXPERIMENTS" + echo -e "${GREEN}✓ Fine-tuning artifacts cleaned${NC}" + fi + + read -p "Clean inference artifacts? 
[y/N] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf "${NKI_COMPILED}/${MODEL_NAME}" + echo -e "${GREEN}✓ Inference artifacts cleaned${NC}" + fi +} + +# Show help +show_help() { + echo -e "\n${BOLD}NKI-LLAMA Unified Interface${NC}" + echo -e "${CYAN}=========================${NC}\n" + + echo -e "${CYAN}Quick Commands:${NC}" + echo -e " ./nki-llama setup - Initial setup guide" + echo -e " ./nki-llama train - Start fine-tuning" + echo -e " ./nki-llama server - Start inference server" + echo -e " ./nki-llama jupyter - Start Jupyter Lab" + echo + + echo -e "${CYAN}Fine-tuning Commands:${NC}" + echo -e " ./nki-llama finetune deps - Install dependencies" + echo -e " ./nki-llama finetune data - Download training dataset" + echo -e " ./nki-llama finetune model - Download model weights" + echo -e " ./nki-llama finetune convert - Convert checkpoints" + echo -e " ./nki-llama finetune compile - Pre-compile graphs" + echo -e " ./nki-llama finetune train - Start training" + echo -e " ./nki-llama finetune all - Run complete pipeline" + echo + + echo -e "${CYAN}Inference Commands:${NC}" + echo -e " ./nki-llama inference setup - Setup vLLM" + echo -e " ./nki-llama inference download - Download model" + echo -e " ./nki-llama inference benchmark - Run full benchmark (evaluate_all mode)" + echo -e " ./nki-llama inference benchmark single - Run quick benchmark (evaluate_single mode)" + echo -e " ./nki-llama inference server - Start API server" + echo + + echo -e "${CYAN}Benchmark Modes:${NC}" + echo -e " evaluate_single - Quick validation using repository test script" + echo -e " evaluate_all - Full benchmark with NKI compilation & caching" + echo + + echo -e "${CYAN}Utility Commands:${NC}" + echo -e " ./nki-llama status - Show system status" + echo -e " ./nki-llama config - Show configuration" + echo -e " ./nki-llama clean - Clean artifacts" + echo -e " ./nki-llama help - Show this help" + echo + + echo -e "${CYAN}Environment Setup:${NC}" + echo -e " Fine-tuning: source ${NEURON_VENV}/bin/activate" + echo -e " Inference: source ${NEURON_INFERENCE_VENV}/bin/activate" + echo +} + +# Setup wizard +cmd_setup() { + echo -e "${BOLD}NKI-LLAMA Setup Wizard${NC}" + echo -e "=====================\n" + + # Check for .env file + if [[ ! -f "${SCRIPT_DIR}/.env" ]]; then + echo -e "${YELLOW}No .env file found. Creating one...${NC}" + cp "${SCRIPT_DIR}/.env.example" "${SCRIPT_DIR}/.env" 2>/dev/null || { + echo -e "${RED}No .env.example found. Creating basic .env...${NC}" + cat > "${SCRIPT_DIR}/.env" << EOF +# NKI-LLAMA Configuration +HF_TOKEN= +MODEL_ID=meta-llama/Meta-Llama-3-8B +MODEL_NAME=llama-3-8b +TENSOR_PARALLEL_SIZE=8 +EOF + } + echo -e "${GREEN}✓ Created .env file${NC}" + echo -e "${YELLOW}Please edit .env and add your HF_TOKEN${NC}\n" + fi + + # Show current environment + echo -e "${BOLD}Current Environment:${NC}" + check_neuron_env || true + echo + + # Show quick start + echo -e "${BOLD}Quick Start Guide:${NC}" + echo -e "1. Edit .env file with your Hugging Face token" + echo -e "2. For fine-tuning:" + echo -e " ${CYAN}source ${NEURON_VENV}/bin/activate${NC}" + echo -e " ${CYAN}./nki-llama finetune all${NC}" + echo -e "3. For model benchmarking:" + echo -e " ${CYAN}source ${NEURON_INFERENCE_VENV}/bin/activate${NC}" + echo -e " ${CYAN}./nki-llama inference download${NC}" + echo -e " ${CYAN}./nki-llama inference benchmark # Full benchmark with compilation${NC}" + echo -e " ${CYAN}./nki-llama inference benchmark single # Quick single evaluation${NC}" + echo -e "4. 
For inference serving:" + echo -e " ${CYAN}./nki-llama inference setup${NC}" + echo -e " ${CYAN}./nki-llama server${NC}" + echo +} + +# Main function +main() { + # Show banner only for interactive commands + case "${1:-help}" in + help|setup|status|config) + clear + display_banner + ;; + esac + + # Initialize logging for actual operations + case "${1:-help}" in + finetune|inference|train|server|clean) + init_logging + ;; + esac + + # Parse command + local cmd="${1:-help}" + shift || true + + case "$cmd" in + # Setup + setup) + cmd_setup + ;; + + # Quick shortcuts + train) + cmd_finetune_train "$@" + ;; + server) + cmd_inference_server "$@" + ;; + jupyter) + bash "${NKI_INFERENCE_SCRIPTS}/jupyter.sh" "$@" + ;; + + # Fine-tuning commands + finetune) + subcmd="${1:-all}" + shift || true + case "$subcmd" in + deps|data|model|convert|compile|train|all) + cmd_finetune_"$subcmd" "$@" + ;; + *) + echo -e "${RED}Unknown finetune command: $subcmd${NC}" + show_help + ;; + esac + ;; + + # Inference commands + inference) + subcmd="${1:-server}" + shift || true + case "$subcmd" in + setup|download|server|benchmark) + cmd_inference_"$subcmd" "$@" + ;; + *) + echo -e "${RED}Unknown inference command: $subcmd${NC}" + show_help + ;; + esac + ;; + + # Utility commands + status) + cmd_status + ;; + config) + print_config + ;; + clean) + cmd_clean + ;; + help|--help|-h) + show_help + ;; + *) + echo -e "${RED}Unknown command: $cmd${NC}" + show_help + exit 1 + ;; + esac +} + +# Run main +main "$@" \ No newline at end of file diff --git a/src/fine-tune/README.md b/src/fine-tune/README.md index a01e80a..a68294e 100644 --- a/src/fine-tune/README.md +++ b/src/fine-tune/README.md @@ -1,14 +1,10 @@ -# pipeline.sh +## 1 · Prerequisites ---- - -## 1 · Prerequisites - -| Requirement | Reason | Install / Notes | +| Requirement | Reason | Install / Notes | |-------------|--------|-----------------| | **Neuron virtual‑env** | Script refuses to run outside it | `source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate` | | **`scripts/` folder** | step-by step scripts for running fine tuning | | -| **`.env` file** *(optional)* | Central place for env vars | Place at `../../.env` | +| **`.env` file** *(optional)* | Central place for env vars | Place at `../../.env` | Example `.env`: @@ -19,7 +15,7 @@ MODEL_ID=meta-llama-3-8b --- -## 2 · Setup +## 2 · Setup ```bash # Clone repo and enter it @@ -29,12 +25,57 @@ cd ./src/fine-tune chmod +x pipeline.sh # Activate Neuron environment -source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate ``` --- -## 3 · Usage +## 3 · Using tmux for Long-Running Training + +### Why tmux? + +Training neural networks on AWS Neuron can take hours or even days. 
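For very long runs it can also help to start the job detached and capture a log you can follow from any terminal. A minimal sketch (the `train.log` filename and the paths are only examples; adjust them to your layout):

```bash
# Start a detached session and kick off training inside it
tmux new -d -s training
tmux send-keys -t training \
  'source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate && cd ~/nki-llama/src/fine-tune && ./pipeline.sh train 2>&1 | tee train.log' Enter

# Follow progress without attaching to the session
tail -f ~/nki-llama/src/fine-tune/train.log
```
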
Using tmux ensures your training continues even if: +- Your SSH connection drops +- You need to close your laptop +- Network interruptions occur +- You want to monitor progress from multiple devices + +### Quick tmux Setup + +```bash +# Create a new tmux session named "training" +tmux new -s training + +# Inside tmux, activate Neuron environment and start training +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +cd ./src/fine-tune +./pipeline.sh train + +# Detach from session (training continues in background) +# Press: Ctrl+b, then d + +# Later, reattach to check progress +tmux attach -t training + +# List all sessions +tmux ls +``` + +### Essential tmux Commands + +| Command | Action | +|---------|--------| +| `tmux new -s training` | Create session named "training" | +| `Ctrl+b d` | Detach (leave session running) | +| `tmux attach -t training` | Reattach to training session | +| `tmux ls` | List all sessions | +| `tmux kill-session -t training` | Terminate session | + +**Pro tip:** Start your training in tmux from the beginning. It's much safer than hoping your connection stays stable! + +--- + +## 4 · Usage | Command | Action | |---------|--------| @@ -47,11 +88,11 @@ source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate | `./pipeline.sh train` | Start fine‑tuning | | `./pipeline.sh clean` | Remove generated datasets, weights, experiments | - Each sub‑command double‑checks you’re inside a Neuron venv and prints a helpful error if not. + Each sub‑command double‑checks you're inside a Neuron venv and prints a helpful error if not. --- -## 4 · Environment Variables +## 5 · Environment Variables | Variable | Purpose | How to set | |----------|---------|-----------| @@ -62,7 +103,7 @@ The script auto‑loads `../../.env` with `set -a; source …`. Modify the `ENV_ --- -## 5 · Troubleshooting +## 6 · Troubleshooting | Symptom | Probable Cause | Fix | |---------|---------------|-----| @@ -73,7 +114,7 @@ The script auto‑loads `../../.env` with `set -a; source …`. Modify the `ENV_ --- -## 6 · Extending the Pipeline +## 7 · Extending the Pipeline 1. Add a new Bash function in `pipeline.sh` (e.g., `evaluate()`). 2. Append its name to the pattern list inside `main()`. @@ -82,4 +123,4 @@ The script auto‑loads `../../.env` with `set -a; source …`. 
Modify the `ENV_ ```bash ./pipeline.sh train ``` ---- +--- \ No newline at end of file diff --git a/src/fine-tune/pipeline.sh b/src/fine-tune/pipeline.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/bootstrap.sh b/src/fine-tune/scripts/bootstrap.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/convert_checkpoints.sh b/src/fine-tune/scripts/convert_checkpoints.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/download_data.sh b/src/fine-tune/scripts/download_data.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/download_model.sh b/src/fine-tune/scripts/download_model.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/precompile.sh b/src/fine-tune/scripts/precompile.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/run_training.sh b/src/fine-tune/scripts/run_training.sh old mode 100644 new mode 100755 diff --git a/src/fine-tune/scripts/tensorboard.sh b/src/fine-tune/scripts/tensorboard.sh old mode 100644 new mode 100755 diff --git a/src/inference/Makefile b/src/inference/Makefile deleted file mode 100644 index 6fa7957..0000000 --- a/src/inference/Makefile +++ /dev/null @@ -1,239 +0,0 @@ -# Makefile for nki-llama with vLLM and Jupyter support - -# Include .env file if it exists --include ../../.env - -# Variables -PYTHON = python -REPOSITORY_PATH = ~/nki-llama -MAIN_SCRIPT = main.py -TEST_SCRIPT = test/inference/test.py -LOCAL_VENV = ../../venv/bin/activate -MODELS_DIR = ~/models -COMPILED_MODEL_DIR = ~/traced_model -REPO_PATH = $(HOME)/upstreaming-to-vllm -REPO_URL = https://github.com/aws-neuron/upstreaming-to-vllm.git -REPO_BRANCH = neuron-2.22-vllm-v0.7.2 -PORT ?= 8080 -MAX_MODEL_LEN ?= 2048 -TENSOR_PARALLEL_SIZE ?= 8 - -# Default target -.PHONY: all -all: help - -# Help message -.PHONY: help -help: - @echo "Available targets:" - @echo " setup-jupyter - Create Python virtual environment, install requirements and setup Jupyter (must be run with source)" - @echo " setup-vllm - Setup vLLM for Neuron (requires Neuron environment first)" - @echo " download - Download model from Hugging Face (requires Neuron environment first)" - @echo " infer - Run inference in generate mode (requires Neuron environment first)" - @echo " evaluate - Run inference in evaluate_single mode (requires Neuron environment first)" - @echo " start-server - Start vLLM OpenAI-compatible API server" - @echo " jupyter - Run Jupyter Lab server" - @echo " show-env - Display environment variables loaded from .env file" - @echo " clean - Remove generated files" - @echo "" - @echo "Note: Before running most commands, activate the Neuron environment with:" - @echo " source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate" - @echo "" - @echo "For Jupyter, activate the local environment with:" - @echo " source venv/bin/activate" - @echo "" - @echo "You can set model and server parameters in a .env file" - @echo "" - @echo "Example workflow:" - @echo " 1. source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate" - @echo " 2. make setup-vllm" - @echo " 3. make download" - @echo " 4. make start-server" - @echo " 5. In a new terminal: source venv/bin/activate" - @echo " 6. make setup-jupyter" - @echo " 7. 
make jupyter" - -.PHONY: show-env -show-env: - @echo "Environment variables loaded from .env file:" - @echo "----------------------------------------" - @echo "MODEL_ID: $(MODEL_ID)" - @echo "MODEL_NAME: $(MODEL_NAME)" - @echo "MODELS_DIR: $(MODELS_DIR)" - @echo "COMPILED_MODEL_DIR: $(COMPILED_MODEL_DIR)" - @echo "PORT: $(PORT)" - @echo "MAX_MODEL_LEN: $(MAX_MODEL_LEN)" - @echo "TENSOR_PARALLEL_SIZE: $(TENSOR_PARALLEL_SIZE)" - @echo "HF_TOKEN: $${HF_TOKEN:-(not set)}" - @echo "----------------------------------------" - -# Setup local virtual environment and Jupyter -.PHONY: setup-jupyter -setup-jupyter: - test -d venv || $(PYTHON) -m venv venv - pip install --upgrade pip - test -f requirements.txt && pip install -r requirements.txt || echo "No requirements.txt found" - pip install langchain langgraph langchain_community ipykernel jupyter jupyterlab python-dotenv - $(PYTHON) -m ipykernel install --user --name="neuron_agents" --display-name="Python (Neuron Agents)" - @echo "Virtual environment and Jupyter kernel setup complete" - -# Check if in Neuron virtual environment -.PHONY: check-neuron-venv -check-neuron-venv: - @if [ -z "$$VIRTUAL_ENV" ] || [[ "$$VIRTUAL_ENV" != *"neuronx"* ]]; then \ - echo "Error: Not in Neuron virtual environment."; \ - echo "Run 'source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate' first."; \ - exit 1; \ - else \ - echo "Using Neuron virtual environment: $$VIRTUAL_ENV"; \ - fi - -# Check if in local virtual environment -.PHONY: check-local-venv -check-local-venv: - @if [ -z "$$VIRTUAL_ENV" ] || [[ "$$VIRTUAL_ENV" != *"venv"* ]]; then \ - echo "Error: Not in local virtual environment."; \ - echo "Run 'source venv/bin/activate' first."; \ - exit 1; \ - else \ - echo "Using local virtual environment: $$VIRTUAL_ENV"; \ - fi - -# Check if vLLM is installed -.PHONY: check-vllm -check-vllm: - @if ! python -c "import vllm" 2>/dev/null; then \ - echo "Error: vLLM not installed. Run 'make setup-vllm' first."; \ - exit 1; \ - else \ - echo "vLLM is installed"; \ - fi - -# Check or set MODEL_NAME environment variable -.PHONY: check-model-name -check-model-name: - @if [ -z "$(MODEL_NAME)" ]; then \ - echo "MODEL_NAME environment variable is not set"; \ - echo "You can set it permanently in .env file or with: export MODEL_NAME=your_model_path"; \ - read -p "Enter model name for this session (default: llama-3.2-3b-instruct): " model_name; \ - if [ -z "$$model_name" ]; then \ - export MODEL_NAME="llama-3.2-3b-instruct"; \ - else \ - export MODEL_NAME="$$model_name"; \ - fi; \ - echo "Using model: $$MODEL_NAME"; \ - else \ - echo "Using model from configuration: $(MODEL_NAME)"; \ - fi - -# Setup vLLM for Neuron -.PHONY: setup-vllm -setup-vllm: check-neuron-venv - @if [ -d "$(REPO_PATH)" ]; then \ - echo "Repository already exists at $(REPO_PATH)"; \ - cd "$(REPO_PATH)"; \ - CURRENT_BRANCH=$$(git rev-parse --abbrev-ref HEAD); \ - if [ "$$CURRENT_BRANCH" != "$(REPO_BRANCH)" ]; then \ - echo "Switching to branch $(REPO_BRANCH)..."; \ - git checkout "$(REPO_BRANCH)" || git fetch && git checkout "$(REPO_BRANCH)"; \ - else \ - echo "Already on correct branch: $(REPO_BRANCH)"; \ - fi; \ - else \ - echo "Cloning repository $(REPO_URL) with branch $(REPO_BRANCH)..."; \ - cd "$(HOME)"; \ - git clone -b "$(REPO_BRANCH)" "$(REPO_URL)"; \ - fi - @cd "$(REPO_PATH)" && \ - echo "Installing vLLM requirements..." && \ - pip install -r requirements-neuron.txt && \ - echo "Installing vLLM for Neuron..." && \ - VLLM_TARGET_DEVICE="neuron" pip install -e . 
- @echo "vLLM setup complete" - -# Download model from Hugging Face -.PHONY: download -download: check-neuron-venv - @echo "Downloading model from Hugging Face" - @if [ -z "$(MODEL_ID)" ]; then \ - echo "MODEL_ID not set in .env file."; \ - read -p "Enter HuggingFace model ID (e.g., meta-llama/Meta-Llama-3-8B): " MODEL_ID_INPUT; \ - if [ -z "$$MODEL_ID_INPUT" ]; then \ - echo "Error: MODEL_ID is required"; \ - exit 1; \ - fi; \ - MODEL_ID_VAR="$$MODEL_ID_INPUT"; \ - else \ - MODEL_ID_VAR="$(MODEL_ID)"; \ - echo "Using MODEL_ID from .env: $$MODEL_ID_VAR"; \ - fi; \ - if [ -z "$(HF_TOKEN)" ]; then \ - echo "HF_TOKEN not set in .env file."; \ - echo "Get one at: https://huggingface.co/docs/hub/en/security-tokens"; \ - read -p "Enter your Hugging Face token: " TOKEN; \ - if [ -z "$$TOKEN" ]; then \ - echo "Error: HF_TOKEN is required"; \ - exit 1; \ - fi; \ - else \ - TOKEN="$(HF_TOKEN)"; \ - echo "Using HF_TOKEN from .env file"; \ - fi; \ - SHORTNAME=$$(echo $$MODEL_ID_VAR | sed 's/.*\///' | tr '[:upper:]' '[:lower:]'); \ - echo "Downloading $$MODEL_ID_VAR to $(MODELS_DIR)/$$SHORTNAME"; \ - mkdir -p $(MODELS_DIR); \ - pip install -q huggingface_hub[cli]; \ - huggingface-cli download --token $$TOKEN $$MODEL_ID_VAR --local-dir $(MODELS_DIR)/$$SHORTNAME; \ - echo ""; \ - echo "Download complete!"; \ - echo "Model saved to: $(MODELS_DIR)/$$SHORTNAME"; \ - if [ -z "$(MODEL_ID)" ]; then \ - echo ""; \ - echo "Add to your .env file:"; \ - echo "MODEL_ID=$$MODEL_ID_VAR"; \ - echo "MODEL_NAME=$$SHORTNAME"; \ - fi - -# Start vLLM OpenAI-compatible API server -.PHONY: start-server -start-server: check-neuron-venv check-vllm check-model-name - @echo "Starting vLLM OpenAI-compatible API server with model: $(MODEL_NAME)" - cd ~ && VLLM_NEURON_FRAMEWORK='neuronx-distributed-inference' NEURON_COMPILED_ARTIFACTS='$(COMPILED_MODEL_DIR)/$(MODEL_NAME)' python -m vllm.entrypoints.openai.api_server \ - --model="models/$(MODEL_NAME)" \ - --max-num-seqs=4 \ - --max-model-len=$(MAX_MODEL_LEN) \ - --tensor-parallel-size=$(TENSOR_PARALLEL_SIZE) \ - --port=$(PORT) \ - --device "neuron" \ - --override-neuron-config "{\"enable_bucketing\":false}" - -# Run Jupyter Lab -.PHONY: jupyter -jupyter: check-local-venv - @echo "Starting Jupyter Lab server..." - jupyter lab --no-browser --ip="0.0.0.0" - -# Run inference in generate mode -.PHONY: infer -infer: check-neuron-venv check-model-name - @echo "Running inference in generate mode with model: $(MODEL_NAME)" - $(PYTHON) $(MAIN_SCRIPT) --mode generate --model-path "$(MODELS_DIR)/$(MODEL_NAME)" --compiled-model-path "$(COMPILED_MODEL_DIR)/$(MODEL_NAME)" --enable-nki --seq-len 640 --tp-degree $(TENSOR_PARALLEL_SIZE) - -# Run inference in evaluate_single mode -.PHONY: evaluate -evaluate: check-neuron-venv check-model-name - @echo "Running inference in evaluate_single mode with model: $(MODEL_NAME)" - cd ~ && $(PYTHON) $(REPOSITORY_PATH)/$(TEST_SCRIPT) --repository-path $(REPOSITORY_PATH) - -# Run inference in evaluate_all mode -.PHONY: evaluate-all -evaluate-all: check-neuron-venv check-model-name - @echo "Running inference in evaluate_all mode with model: $(MODEL_NAME)" - $(PYTHON) $(MAIN_SCRIPT) --mode evaluate_all --model-path "$(MODELS_DIR)/$(MODEL_NAME)" --compiled-model-path "$(COMPILED_MODEL_DIR)/$(MODEL_NAME)" --enable-nki --seq-len 640 --tp-degree $(TENSOR_PARALLEL_SIZE) - -# Clean generated files -.PHONY: clean -clean: - @echo "Cleaning generated files..." - rm -rf test/inference/output/* - find . 
-type d -name "__pycache__" -exec rm -rf {} + \ No newline at end of file diff --git a/src/inference/README.md b/src/inference/README.md index 8a5407c..c8732af 100644 --- a/src/inference/README.md +++ b/src/inference/README.md @@ -1,210 +1,412 @@ -# NKI Llama - -📢 Contestants, please note that we have updated the due date to March 10, anywhere on Earth. This is to allow for more time to address questions about benchmarking, which is both the purpose of the competition and core to the success metric. - -## Getting Started - -This repository provides a package containing the PyTorch model of Llama 3.2 1B. This model **can be compiled with AWS Neuron SDK and run on** a **Trainium instance.** The main file in this package is `llama.py` which contains the model implementation in PyTorch. - -In the `llama.py` file, we provide an example NKI kernel for the [RMSNorm operation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/tutorials/rmsnorm.html) and a guide on how to replace its invocation in the model. This replacement serves as an example of a valid use of a NKI kernel in the PyTorch model. Your task is to identify other parts of the model (operators, fused operators, layers, or even the whole model\!) that can be implemented as NKI kernels and replace them in the original model to achieve better performance. - -To learn NKI, follow [the official NKI guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) and various example NKI kernels from the [nki-samples repository](https://github.com/aws-neuron/nki-samples). Another tool to help with optimizing NKI kernels is [NKI autotune](https://github.com/awslabs/nki-autotune). - -## Setup Steps - -1. Create a Trainium instance with AWS Neuron SDK v2.21 using EC2 with the following settings: - 1. **Name:** optnki-[xxx] - 2. **AMI:** Deep Learning AMI Neuron (Ubuntu 22.04) - 3. **Instance type:** trn1.2xlarge - 4. **Key pair (login):** create a new key pair - 5. **Metadata version [under “Advanced details”]:** V2 only (otherwise, you will encounter a not authorized error) - 6. When connecting to these instances via SSH, use the username of *ubuntu*. -2. Activate the Neuron virtual environment to run inference by running `source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate`. -3. Clone this repository and run `cd [PATH]/nki-llama` where `[PATH]` is the directory where you have performed the clone. -4. Download the [Llama3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) model to a `~/models` folder in your root directory. We recommend doing so using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli). You can install this by running `pip3 install huggingface_hub[cli]`. You will also need to create an [access token](https://huggingface.co/docs/hub/en/security-tokens). -To download the models, run the following: - ``` - cd ~ - mkdir models - huggingface-cli download --token YOURTOKEN meta-llama/Llama-3.2-1B --local-dir /home/ubuntu/models/llama-3.2-1b - ``` -5. [Llama3.2-1B Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) may be more fun to chat with. You can download and use this model as well. -6. To run inference, navigate to `[PATH]/nki-llama` and run `python main.py --mode generate`. - -## NKI Kernel Example -The following steps provide an example of how to utilize NKI kernels in the Llama3.2-1B model: - -1. Identify the kernel of interest, e.g. RMSNorm, in the PyTorch model to be optimized with NKI. 
In the NxDI repository, it is implemented in [modules/custom_calls.py](https://github.com/aws-neuron/neuronx-distributed-inference/blob/main/src/neuronx_distributed_inference/modules/custom_calls.py). - - ``` - class CustomRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Use this RMSNorm to perform customized rmsnorm on Neuron - Note: CustomRMSNorm forward method calls target="AwsNeuronRmsNorm" - """ - super().__init__() - self.weight = nn.Parameter(ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - original_dtype = hidden_states.dtype - - hidden_states = hidden_states.to(torch.float32) - result = RmsNorm.apply( - hidden_states, self.weight, self.variance_epsilon, len(hidden_states.shape) - 1 - ) - - return result.to(original_dtype) - ``` - -2. Modify or create a new class for the NKI kernel. `nki_rmsnorm_kernel` refers to the NKI RMSNorm kernel. - - a. Modify the existing class: - - ``` - class CustomRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6, nki_enabled=False): - """ - Use this RMSNorm to perform customized rmsnorm on Neuron - Note: CustomRMSNorm forward method calls target="AwsNeuronRmsNorm" - """ - super().__init__() - self.weight = nn.Parameter(ones(hidden_size)) - self.variance_epsilon = eps - self.nki_enabled = nki_enabled - - def forward(self, hidden_states): - if self.nki_enabled: - out_tensor = nki_rmsnorm_kernel(hidden_states, self.weight, self.variance_epsilon) - return out_tensor - - original_dtype = hidden_states.dtype - - hidden_states = hidden_states.to(torch.float32) - result = RmsNorm.apply( - hidden_states, self.weight, self.variance_epsilon, len(hidden_states.shape) - 1 - ) - - return result.to(original_dtype) - ``` - - b. Create a new class (this is not what was done in this tutorial): - - ``` - class CustomRMSNormNKI(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Use this RMSNorm to perform customized rmsnorm on Neuron - Note: CustomRMSNorm forward method calls target="AwsNeuronRmsNorm" - """ - super().__init__() - self.weight = nn.Parameter(ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - out_tensor = nki_rmsnorm_kernel(hidden_states, self.weight, self.variance_epsilon) - return out_tensor - ``` -1. You may need to add a batch dimension to input tensor(s), e.g. `a_tensor`. Also be aware of uninitialized data. - - ``` - # iy = nl.arange(a_tensor.shape[1])[None, :] - iy = nl.arange(a_tensor.shape[2])[None, :] - - # num_rows = a_tensor.shape[0] - num_rows = a_tensor.shape[1] - ``` - -1. If you modified the existing class, update how the class is invoked in the PyTorch model file `llama.py`. - - ``` - ... - self.input_layernorm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - nki_enabled=config.neuron_config.nki_enabled, - ) - self.post_attention_layernorm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - nki_enabled=config.neuron_config.nki_enabled, - ) - ``` - - If you created a new class, modify where the kernel is invoked in the PyTorch model file `llama.py` (not done in this tutorial). - - ``` - def get_rmsnorm_cls(): - # Initialize to the appropriate implementation of RMSNorm - # If infer on NXD -> CustomRMSNorm - # If infer on CPU -> HF_RMSNorm (CustomRMSNorm does not work on CPU) - # return CustomRMSNorm if parallel_state.model_parallel_is_initialized() else LlamaRMSNorm - return CustomRMSNormNKI if parallel_state.model_parallel_is_initialized() else LlamaRMSNorm - ``` - -1. 
Run inference on a single prompt using the NKI kernel and the single evaluation mode by running `python main.py --enable-nki --mode evaluate_single`. If you would like to run the model with specific prompts, pass in `--prompt [PROMPTS]` where `[PROMPTS]` is a comma-separated list of prompts. - -## Additional Tools - -1. **Profiling:** If you would like to profile your implementation in order to get a better understanding of performance bottlenecks and opportunities for optimization, you can use the [Neuron Profiler](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html). -2. **Benchmarking:** You can also leverage the [NKI benchmarking API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/api/generated/nki.benchmark.html) to retrieve execution latency statistics. - -## Submission - -Your submission should be a single Python file called `llama.py`. This file should contain implementations of NKI kernels and also modifications to the original model to invoke these NKI kernels. This file should work as a plug-in replacement for the original `llama.py` of the reference PyTorch implementation provided in this repository. - -Make your submission here: https://forms.gle/zZKKS6RzKcerf4vH8 - -## Benchmarks - -Submissions will be tested using 25 benchmarks (prompts) with varying context lengths (TBD, but likely 1K \-\> 128K) and batch sizes (TBD, but likely 1-\>4). We have provided 5 prompts in `prompts.txt` with their corresponding metadata (prompt ID, prompt length, recommended sequence length, and baseline latency/throughput) in `prompt_data.txt`. There are 2 methods of testing these prompts: - -1. To avoid recompilation per prompt, you can use a global sequence length (we suggest 640) for all prompts. Run `python main.py --enable-nki --mode evaluate_all --seq-len 640`. -2. Alternatively, you can use a unique sequence length for each prompt (suggested sequence lengths are the third entry in each row of `prompt_data.txt`) at the cost of recompiling the model for each prompt. Run `python test.py` to evaluate these prompts in this fashion. - -The remaining 20 prompts will be withheld for evaluation. All benchmarks will become publicly available after the contest is complete. - -## Evaluation and Scoring - -The contest organizers will execute each team's submission across the twenty withheld benchmarks on a dedicated Trainium instance. The submissions will be evaluated on: - -1) Accuracy of generated output vs. our reference implementation. Accuracy evaluation will be a binary assessor: Any benchmark that fails an accuracy threshold will result in a score of 0\. -2) Latency (Time to first token (TTFT)) -3) Throughput measured as output tokens / second -4) Amount of model written in NKI (measured as NKI FLOPS / total model FLOPS) (will be applied as a scaling factor for (b) and (c)). Note: NKI FLOPs measures the number of multiply-accumulate (MAC) operations. +# NKI-LLAMA: Unified Interface for AWS Neuron + +A unified project for fine-tuning, inference, and agent development of LLaMA models on AWS Trainium and Inferentia using a streamlined bash-based interface. 
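Before starting, it is worth sanity-checking that the host matches the requirements listed below. A rough sketch (exact package names vary between Neuron releases, so treat the `grep` pattern as an example):

```bash
# Confirm Neuron devices are visible to the runtime
neuron-ls

# Confirm the SDK virtual environments referenced throughout this README exist
ls -d /opt/aws_neuronx_venv_pytorch_2_6*

# From inside either venv, list installed Neuron packages and compare the
# versions against the Requirements section below
pip list 2>/dev/null | grep -i neuron
```
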
+ +## 📋 Requirements + +### Neuron SDK Version +- **Neuron 2.23.0 Release** +- **NeuronX Distributed Inference**: 0.3.5591 +- **NeuronX Distributed Training**: 1.3.0 + +### Hardware & AMI +- **Required Instance**: trn1.32xlarge +- **Base AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) with Neuron SDK 2.23 +- **Base Packages**: + - NxD (NeuronX Distributed Training) + - NKI (Neuron Kernel Interface) + - NxDI (NeuronX Distributed Inference) + +## 🔄 Project Workflow + +``` +┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ ┌──────────────┐ +│ │ │ │ │ │ │ │ +│ Fine-tuning │────▶│ NKI Compilation │────▶│ vLLM Inference │────▶│ Agent │ +│ (NxD) │ │ & Evaluation │ │ (NxDI) │ │ Development │ +│ │ │ │ │ │ │ │ +└─────────────────┘ └──────────────────┘ └─────────────────┘ └──────────────┘ + │ │ │ + │ │ │ + ▼ ▼ ▼ + Trained Model NKI-Optimized API Endpoint + Model Artifacts (OpenAI Compatible) +``` + +### Detailed Workflow: + +1. **Fine-tune** a model using NeuronX Distributed (NxD) on Trainium +2. **NKI Compilation** optimizes the model for Neuron hardware: + - Compiles model graphs with Neuron Kernel Interface (NKI) + - Creates optimized artifacts for inference + - Benchmarks performance characteristics +3. **vLLM Inference** serves the NKI-compiled model using NeuronX Distributed Inference (NxDI) +4. **Agent Development** connects to the inference endpoint for application building + +### Key Components: +- **NKI (Neuron Kernel Interface)**: Optimizes model operations for AWS Neuron hardware +- **NxD (NeuronX Distributed)**: Enables distributed training across Neuron cores +- **NxDI (NeuronX Distributed Inference)**: Provides optimized inference runtime +- **vLLM**: Serves models with OpenAI-compatible API using Neuron optimizations + +## 🚀 Quick Start + +```bash +# Install +chmod +x install.sh +./install.sh + +# Setup Guide +./nki-llama setup + +# Run fine-tuning +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +./nki-llama finetune all + +# Start inference server +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama server +``` + +## 🏗️ Initial Setup + +### 1. Create Trainium Instance + +Create a trn1.32xlarge instance on AWS EC2: +- **Name**: nki-llama +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) +- **Instance type**: trn1.32xlarge +- **Key pair**: Create new key pair +- **Username**: ubuntu (when connecting via SSH) + +### 2. Clone and Install + +```bash +# Clone repository +git clone [REPO_URL] +cd nki-llama + +# Run installation +chmod +x install.sh +./install.sh + +# Configure environment +cp .env.example .env +nano .env # Add your HF_TOKEN and adjust settings +``` + +## 📁 Project Structure + +``` +/home/ubuntu/nki-llama/ +├── nki-llama.sh # Main CLI interface +├── nki-llama.config # Shared configuration +├── .env # Your environment variables +├── .env.example # Example configuration +├── install.sh # Installation script +├── src/ +│ ├── fine-tune/ +│ │ └── scripts/ # Fine-tuning scripts +│ │ ├── bootstrap.sh +│ │ ├── download_data.sh +│ │ ├── download_model.sh +│ │ ├── convert_checkpoints.sh +│ │ ├── precompile.sh +│ │ └── run_training.sh +│ └── inference/ +│ ├── main.py # Inference entry point +│ └── scripts/ # Inference helper scripts +│ ├── setup-vllm.sh +│ ├── download-model.sh +│ ├── run-nki-benchmark.sh +│ ├── start-server.sh +│ └── jupyter.sh +└── logs/ # Unified logs + └── benchmarks/ # Benchmark results +``` + +## 🔧 Environment Setup + +This project requires three different Python environments: + +### 1. 
Fine-tuning Environment +```bash +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +``` + +### 2. Inference Environment +```bash +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +### 3. Jupyter Environment (for agent development) +```bash +./nki-llama jupyter setup +source ~/nki-llama/venv/bin/activate +``` + +## 💻 Commands + +### Quick Commands +- `./nki-llama setup` - Interactive setup wizard +- `./nki-llama train` - Start fine-tuning (shortcut) +- `./nki-llama server` - Start inference server (shortcut) +- `./nki-llama jupyter` - Launch Jupyter Lab +- `./nki-llama status` - Check system status +- `./nki-llama config` - Show configuration + +### Fine-tuning Workflow + +```bash +# Activate fine-tuning environment +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# Run individual steps +./nki-llama finetune deps # Install dependencies +./nki-llama finetune data # Download dataset +./nki-llama finetune model # Download model +./nki-llama finetune convert # Convert checkpoints to NxDT format +./nki-llama finetune compile # Pre-compile graphs (AOT) +./nki-llama finetune train # Start fine-tuning + +# Or run all at once +./nki-llama finetune all +``` + +### Inference Workflow + +The inference pipeline includes NKI (Neuron Kernel Interface) compilation and NxDI integration with vLLM for optimal performance on Neuron hardware. + +```bash +# Activate inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate + +# Setup and prepare +./nki-llama inference setup # Setup vLLM for Neuron +./nki-llama inference download # Download model (skip if using fine-tuned) + +# Compile and optimize with NKI +./nki-llama inference compile # Compile model with NKI (10-30 min) + +# Benchmark performance +./nki-llama inference benchmark # Run performance evaluation + +# Start serving +./nki-llama inference server # Start OpenAI-compatible API +``` + +**Note**: The compilation step creates NKI-optimized artifacts that are: +- Required for vLLM to use the model efficiently +- Cached for future use (no recompilation needed) +- Optimized specifically for your Neuron hardware configuration + +## 🤖 Agent Development + +This repository includes support for building LLM-powered agents using LangGraph and LangChain. A sample travel planning agent demonstrates: + +- Context-aware travel itinerary generation +- Multi-turn conversation with memory +- Dynamic workflow management using LangGraph +- Integration with vLLM for efficient inference on Trainium + +### Using Jupyter for Agent Development + +```bash +# Terminal 1: Start the inference server +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama server + +# Terminal 2: Start Jupyter Lab +./nki-llama jupyter +# Access at http://your-ip:8888 +# Select the "nki-llama" kernel in Jupyter +``` + +## ⚙️ Configuration + +All configuration is managed through: +1. `nki-llama.config` - System paths and defaults +2. 
`.env` - Your personal configuration + +### Key Variables + +```bash +# Model Configuration +MODEL_ID=meta-llama/Meta-Llama-3-8B +MODEL_NAME=llama-3-8b +HF_TOKEN=your_huggingface_token + +# Training Parameters +BATCH_SIZE=1 +MAX_STEPS=1000 +SEQ_LENGTH=2048 +TENSOR_PARALLEL_SIZE=8 +LEARNING_RATE=5e-5 + +# Inference Parameters +INFERENCE_PORT=8080 +MAX_MODEL_LEN=2048 +MAX_NUM_SEQS=4 + +# Dataset Configuration +DATASET_NAME=databricks/databricks-dolly-15k +``` + +## 📊 Monitoring -Rankings will be established by calculating the total normalized number of points per team, where points are normalized against the baseline. +### Check Status +```bash +./nki-llama status +``` + +### View Logs +```bash +# Logs are stored with timestamps +ls logs/ +tail -f logs/nki-llama_*.log + +# Benchmark results +ls logs/benchmarks/ +cat logs/benchmarks/*/metadata.json +``` + +### Neuron Monitoring +```bash +neuron-ls # List Neuron devices +neuron-top # Monitor Neuron usage +``` + +## 🔍 Complete Workflow Example + +Here's a complete end-to-end workflow with tmux best practices: + +### 1. Fine-tune a Model +```bash +# Create tmux session for training +tmux new -s training + +# Inside tmux: activate environment and run training +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +./nki-llama finetune all + +# Detach from tmux: Ctrl+B, D +# Check progress later: tmux attach -t training +``` + +### 2. Compile Model with NKI +```bash +# Create tmux session for compilation +tmux new -s compile + +# Inside tmux: compile the model +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama inference compile -We define **points** as **Accuracy** (binary) **\* Reduced Latency \* Increased Throughput \* (1 + Normalized NKI FLOPS)**, where: +# This creates optimized artifacts for vLLM +# Detach and let it run: Ctrl+B, D +``` + +### 3. Benchmark Performance +```bash +# After compilation, run benchmarks +./nki-llama inference benchmark --iterations 20 -* **Accuracy** = 1 if accuracy matches or exceeds a predetermined threshold, 0 otherwise -* **Reduced Latency** = Reference implementation TTFT divided by submission TTFT -* **Increased Throughput** = Submission tokens/sec divided by reference implementation tokens/sec -* **Normalized NKI FLOPS** = Submission NKI FLOPS divided by total model FLOPS +# View benchmark results +ls logs/benchmarks/ +cat logs/benchmarks/*/metadata.json +``` -For example, a submission that is sufficiently accurate, with 10x reduced latency, 2x increased throughput, and 0.85 normalized NKI FLOPS would obtain 1 \* 10 \* 2 \* 1.85 \= 37 points. For reference, the baseline submission would receive a score of 1. +### 4. Serve with vLLM +```bash +# Create tmux session for server +tmux new -s vllm -## Presentations +# Inside tmux: start the server +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama server -Teams who successfully submit an entry will be invited to present an informal overview of their approach (roughly 10 to 15 minutes) at a special session held on March 30th during the [Workshop & Tutorial](https://www.asplos-conference.org/asplos2025/workshops-and-tutorials/) days. Winners will be announced later in the week, with full results being released soon after the conference. +# Server uses NKI-compiled artifacts automatically +# Detach: Ctrl+B, D +``` -## Contest Eligibility +### 5. 
Build Agents +```bash +# In a new terminal +./nki-llama jupyter -All are welcome to participate in the contest (including teams from academia, industry, and elsewhere) with the exception of the Contest Organizers and employees of the Contest Sponsor. Individuals are prohibited from participating in multiple teams. In order to be eligible for prizes, teams must commit to releasing an open-source version of their implementation prior to ASPLOS 2026\. +# Your model is now available at http://localhost:8080 +# Build agents using the OpenAI-compatible API +``` -## Frequently Asked Questions +### Managing tmux Sessions +```bash +# List all sessions +tmux ls -To raise a question, please create an issue in this repository, or feel free to reach out to the contest organizers directly. +# Attach to a session +tmux attach -t training +tmux attach -t compile +tmux attach -t vllm -## Related Work +# Kill a session +tmux kill-session -t training +``` -* TBD +## 🚨 Troubleshooting -## Contest Organizers +### Environment Issues +```bash +# Check active environment +./nki-llama status -* Emery Berger (Amazon Web Services), [emerydb@amazon.com](mailto:emerydb@amazon.com) -* Aninda Manocha (Amazon Web Services) -* Wei Tang (Amazon Web Services) -* Emily Webber (Amazon Web Services) -* Ziyang Xu (Amazon Web Services) +# Wrong environment error? +# For fine-tuning: +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# For inference: +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +### Model Compilation +- First-time model compilation with NKI can take 10-30 minutes +- Compiled models are cached in `~/traced_model/` +- Subsequent runs will use the cached compilation + +### Memory Issues +- Ensure you're using trn1.32xlarge for full model support +- Monitor memory usage with `neuron-top` +- Adjust `TENSOR_PARALLEL_SIZE` if needed + +### Using with tmux +For long-running operations like training, compilation, or serving: + +```bash +# Create a new tmux session +tmux new -s session-name + +# Run your command +./nki-llama [command] + +# Detach from session +Ctrl+B, then D + +# List sessions +tmux ls + +# Reattach to session +tmux attach -t session-name +``` + +## 🤝 Contributing + +The modular design makes it easy to add new features: + +1. Add new scripts to `scripts/` directory +2. Update command handlers in `nki-llama.sh` +3. Add configuration to `nki-llama.config` + +## 📄 License + +© 2025 Amazon Web Services. All rights reserved. + +This project integrates with AWS Neuron SDK and follows its licensing terms. 
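
As an illustration of the Contributing steps above, a hypothetical new `evaluate` stage could be wired into the CLI like this (the script name and subcommand are made up; the helper names match the existing `nki-llama.sh`):

```bash
# 1. New script lives at src/fine-tune/scripts/evaluate.sh
# 2. New handler added to nki-llama.sh, following the existing cmd_finetune_* pattern
cmd_finetune_evaluate() {
    echo -e "${BOLD}Running evaluation...${NC}"
    run_script "${NKI_FINETUNE_SCRIPTS}/evaluate.sh" "Evaluation"
}
# 3. Extend the finetune case pattern so the new subcommand is dispatched:
#    deps|data|model|convert|compile|train|evaluate|all)
```
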
\ No newline at end of file diff --git a/src/inference/scripts/download-model.sh b/src/inference/scripts/download-model.sh new file mode 100755 index 0000000..ce73883 --- /dev/null +++ b/src/inference/scripts/download-model.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# download-model.sh - Download model from Hugging Face + +set -euo pipefail + +# Load configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../../../nki-llama.config" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${GREEN}Downloading model from Hugging Face...${NC}" + +# Check HF token +if [[ -z "${HF_TOKEN:-}" ]]; then + echo -e "${YELLOW}HF_TOKEN not set${NC}" + echo "Get a token at: https://huggingface.co/settings/tokens" + read -p "Enter your Hugging Face token: " HF_TOKEN + if [[ -z "$HF_TOKEN" ]]; then + echo -e "${RED}Error: HF_TOKEN is required${NC}" + exit 1 + fi +fi + +# Ensure huggingface-cli is installed +pip install -q huggingface_hub[cli] + +# Create models directory +mkdir -p "$NKI_MODELS" + +# Download model +echo "Downloading ${MODEL_ID} to ${NKI_MODELS}/${MODEL_NAME}" +huggingface-cli download \ + --token "$HF_TOKEN" \ + "$MODEL_ID" \ + --local-dir "${NKI_MODELS}/${MODEL_NAME}" + +echo -e "${GREEN}✓ Model downloaded successfully${NC}" +echo "Location: ${NKI_MODELS}/${MODEL_NAME}" + +# Save configuration hint +if [[ -z "${HF_TOKEN:-}" ]]; then + echo + echo "To save your token, add to .env file:" + echo "HF_TOKEN=$HF_TOKEN" +fi \ No newline at end of file diff --git a/src/inference/scripts/jupyter.sh b/src/inference/scripts/jupyter.sh new file mode 100755 index 0000000..66fae19 --- /dev/null +++ b/src/inference/scripts/jupyter.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# jupyter.sh - Jupyter Lab setup and launcher + +set -euo pipefail + +# Load configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../../../nki-llama.config" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +RED='\033[0;31m' +NC='\033[0m' + +# Setup Jupyter environment +setup_jupyter() { + echo -e "${BLUE}Setting up Jupyter environment...${NC}" + + # Create virtual environment if needed + if [[ ! -d "$JUPYTER_VENV" ]]; then + echo "Creating virtual environment..." + python3 -m venv "$JUPYTER_VENV" + fi + + # Activate and install packages + source "${JUPYTER_VENV}/bin/activate" + + echo "Installing Jupyter packages..." + pip install --upgrade pip + pip install jupyter jupyterlab ipykernel python-dotenv + pip install langchain langgraph langchain_community + + # Install kernel + echo "Installing Jupyter kernel..." + python -m ipykernel install --user \ + --name="nki-llama" \ + --display-name="Python (NKI-LLAMA)" + + echo -e "${GREEN}✓ Jupyter setup complete${NC}" +} + +# Start Jupyter Lab +start_jupyter() { + # Check if setup is needed + if [[ ! -d "$JUPYTER_VENV" ]]; then + echo -e "${YELLOW}Jupyter not set up. 
Running setup first...${NC}" + setup_jupyter + fi + + # Activate environment + source "${JUPYTER_VENV}/bin/activate" + + # Start Jupyter Lab + echo -e "${GREEN}Starting Jupyter Lab on port ${JUPYTER_PORT}...${NC}" + echo -e "${YELLOW}URL: http://0.0.0.0:${JUPYTER_PORT}${NC}" + echo -e "${YELLOW}Press Ctrl+C to stop${NC}\n" + + cd "$NKI_ROOT" + jupyter lab --no-browser --ip="0.0.0.0" --port="${JUPYTER_PORT}" +} + +# Main +case "${1:-start}" in + setup) + setup_jupyter + ;; + start|"") + start_jupyter + ;; + *) + echo "Usage: $0 [setup|start]" + exit 1 + ;; +esac \ No newline at end of file diff --git a/src/inference/scripts/run-nki-benchmark.sh b/src/inference/scripts/run-nki-benchmark.sh new file mode 100755 index 0000000..9deed06 --- /dev/null +++ b/src/inference/scripts/run-nki-benchmark.sh @@ -0,0 +1,306 @@ +#!/bin/bash +# /home/ubuntu/nki-llama/src/inference/scripts/run-nki-benchmark.sh +# Run NKI benchmark evaluation for model compilation and performance testing + +set -euo pipefail + +# Get script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../../" && pwd)" + +# Load configuration +source "${PROJECT_ROOT}/nki-llama.config" + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +RED='\033[0;31m' +CYAN='\033[0;36m' +NC='\033[0m' + +# Default parameters +MODE="${MODE:-evaluate_single}" +ENABLE_NKI="${ENABLE_NKI:-true}" +SEQ_LEN="${SEQ_LEN:-640}" +TP_DEGREE="${TP_DEGREE:-${TENSOR_PARALLEL_SIZE}}" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --mode) + MODE="$2" + shift 2 + ;; + --model-name) + MODEL_NAME="$2" + shift 2 + ;; + --no-nki) + ENABLE_NKI="false" + shift + ;; + --seq-len) + SEQ_LEN="$2" + shift 2 + ;; + --tp-degree) + TP_DEGREE="$2" + shift 2 + ;; + --help) + echo "Usage: $0 [options]" + echo "Options:" + echo " --mode MODE Benchmark mode (evaluate_single/evaluate_all) [default: evaluate_single]" + echo " --model-name NAME Model name override" + echo " --no-nki Disable NKI optimizations" + echo " --seq-len N Sequence length [default: 640]" + echo " --tp-degree N Tensor parallel degree [default: from config]" + echo " --help Show this help message" + exit 0 + ;; + *) + echo -e "${RED}Unknown option: $1${NC}" + exit 1 + ;; + esac +done + +# Set paths +MODEL_PATH="${NKI_MODELS}/${MODEL_NAME}" +COMPILED_MODEL_PATH="${NKI_COMPILED}/${MODEL_NAME}" + +# Function to check if model exists +check_model() { + if [[ ! -d "$MODEL_PATH" ]]; then + echo -e "${RED}❌ Model not found at: $MODEL_PATH${NC}" + echo -e "${YELLOW}Please run: ./nki-llama inference download${NC}" + exit 1 + fi +} + +# Function to check compilation cache +# check_compiled_model() { +# if [[ -d "$COMPILED_MODEL_PATH" ]]; then +# echo -e "${GREEN}✓ Found compiled model cache at: $COMPILED_MODEL_PATH${NC}" +# return 0 +# else +# echo -e "${YELLOW}⚠ No compiled model found. 
Will compile during benchmark.${NC}" +# return 1 +# fi +# } + +# Function to run evaluate_single mode +run_evaluate_single() { + echo -e "${YELLOW}🔧 Running benchmark in evaluate_single mode...${NC}" + echo -e "${YELLOW}This mode runs from repository test script for single evaluation.${NC}" + + # Change to home directory and run the test script + cd ~ + + # Build command + CMD="python ${NKI_ROOT}/test/inference/test.py" + CMD="${CMD} --repository-path ${NKI_ROOT}" + + # Execute with timing + echo -e "${BLUE}Executing evaluate_single benchmark...${NC}" + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + + START_TIME=$(date +%s) + + if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark.log"; then + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + echo + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${GREEN}✓ evaluate_single benchmark completed successfully!${NC}" + echo -e "Total time: ${DURATION} seconds" + + return 0 + else + echo -e "${RED}✗ evaluate_single benchmark failed!${NC}" + return 1 + fi +} + +# Function to run evaluate_all mode +run_evaluate_all() { + echo -e "${YELLOW}📊 Running benchmark in evaluate_all mode...${NC}" + echo -e "${YELLOW}This mode evaluates all model configurations with NKI optimizations.${NC}" + + # Change to inference directory + cd "${NKI_INFERENCE}" + + # Build command + CMD="python main.py" + CMD="${CMD} --mode evaluate_all" + CMD="${CMD} --model-path ${MODEL_PATH}" + CMD="${CMD} --compiled-model-path ${COMPILED_MODEL_PATH}" + CMD="${CMD} --seq-len ${SEQ_LEN}" + CMD="${CMD} --tp-degree ${TP_DEGREE}" + + if [[ "$ENABLE_NKI" == "true" ]]; then + CMD="${CMD} --enable-nki" + fi + + # Execute with timing + echo -e "${BLUE}Executing evaluate_all benchmark...${NC}" + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + + START_TIME=$(date +%s) + + if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark.log"; then + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + echo + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${GREEN}✓ evaluate_all benchmark completed successfully!${NC}" + echo -e "Total time: ${DURATION} seconds" + + # If compilation happened, show artifact info + if [[ -d "$COMPILED_MODEL_PATH" ]]; then + echo + echo -e "${GREEN}✓ NKI-compiled model artifacts available at:${NC}" + echo -e " ${COMPILED_MODEL_PATH}" + echo + echo -e "${BLUE}These artifacts can now be used for:${NC}" + echo -e " • vLLM inference with NxDI optimizations" + echo -e " • Direct inference benchmarks" + echo -e " • Production deployments" + echo + fi + + return 0 + else + echo -e "${RED}✗ evaluate_all benchmark failed!${NC}" + return 1 + fi +} + +# Main benchmark function +run_benchmark() { + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${BLUE}NKI Benchmark Evaluation${NC}" + echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo + echo -e "Mode: ${CYAN}${MODE}${NC}" + echo -e "Model: ${CYAN}${MODEL_NAME}${NC}" + echo -e "Model Path: ${CYAN}${MODEL_PATH}${NC}" + echo -e "Compiled Path: ${CYAN}${COMPILED_MODEL_PATH}${NC}" + echo -e "NKI Enabled: ${CYAN}${ENABLE_NKI}${NC}" + echo -e "Sequence Length: ${CYAN}${SEQ_LEN}${NC}" + echo -e "TP Degree: ${CYAN}${TP_DEGREE}${NC}" + echo + + # Check prerequisites based on mode + if [[ 
"$MODE" == "evaluate_all" ]]; then + check_model + # check_compiled_model + fi + + # Set environment variables for the benchmark + export NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES}" + export NEURON_CC_FLAGS="--enable-saturate-infinity" + + # Additional NKI-specific flags if enabled + if [[ "$ENABLE_NKI" == "true" ]]; then + export NEURON_CC_FLAGS="${NEURON_CC_FLAGS} --enable-mixed-precision-accumulation" + echo -e "${GREEN}✓ NKI optimizations enabled${NC}" + fi + + # Create log directory for this benchmark + TIMESTAMP=$(date +%Y%m%d_%H%M%S) + BENCHMARK_LOG_DIR="${NKI_LOGS}/benchmarks/${TIMESTAMP}" + mkdir -p "$BENCHMARK_LOG_DIR" + + echo -e "${BLUE}📊 Benchmark logs will be saved to:${NC}" + echo -e " ${BENCHMARK_LOG_DIR}" + echo + + # Save benchmark metadata + cat > "${BENCHMARK_LOG_DIR}/metadata.json" << EOF +{ + "timestamp": "${TIMESTAMP}", + "mode": "${MODE}", + "model_name": "${MODEL_NAME}", + "model_path": "${MODEL_PATH}", + "compiled_model_path": "${COMPILED_MODEL_PATH}", + "nki_enabled": ${ENABLE_NKI}, + "sequence_length": ${SEQ_LEN}, + "tensor_parallel_size": ${TP_DEGREE}, + "neuron_rt_cores": "${NEURON_RT_NUM_CORES}" +} +EOF + + # Run the appropriate benchmark mode + case "$MODE" in + evaluate_single) + if run_evaluate_single; then + RESULT="success" + else + RESULT="failed" + fi + ;; + evaluate_all) + if run_evaluate_all; then + RESULT="success" + else + RESULT="failed" + fi + ;; + *) + echo -e "${RED}Unknown mode: $MODE${NC}" + echo -e "Valid modes: evaluate_single, evaluate_all" + exit 1 + ;; + esac + + # Update metadata with result + if [[ "$RESULT" == "success" ]]; then + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + # Update metadata.json with duration + jq --arg duration "$DURATION" '.duration_seconds = ($duration | tonumber)' \ + "${BENCHMARK_LOG_DIR}/metadata.json" > "${BENCHMARK_LOG_DIR}/metadata.json.tmp" && \ + mv "${BENCHMARK_LOG_DIR}/metadata.json.tmp" "${BENCHMARK_LOG_DIR}/metadata.json" + fi +} + +# Show benchmark info +show_info() { + echo -e "${BLUE}NKI Benchmark Evaluation Tool${NC}" + echo + echo -e "This tool supports two benchmark modes:" + echo + echo -e "${YELLOW}1. evaluate_single mode:${NC}" + echo -e " • Runs benchmark from repository test script" + echo -e " • Single evaluation configuration" + echo -e " • Quick validation of model performance" + echo + echo -e "${YELLOW}2. 
evaluate_all mode:${NC}" + echo -e " • Comprehensive benchmark with all configurations" + echo -e " • Tests with NKI optimizations" + echo -e " • Creates compiled model artifacts if needed" + echo -e " • Full performance analysis" + echo + echo -e "${BLUE}Examples:${NC}" + echo -e " # Run single evaluation" + echo -e " ./run-nki-benchmark.sh --mode evaluate_single" + echo + echo -e " # Run comprehensive benchmark with NKI" + echo -e " ./run-nki-benchmark.sh --mode evaluate_all --seq-len 1024" + echo + echo -e " # Run without NKI optimizations" + echo -e " ./run-nki-benchmark.sh --mode evaluate_all --no-nki" + echo +} + +# Main execution +if [[ "${1:-}" == "--info" ]]; then + show_info +else + run_benchmark +fi \ No newline at end of file diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh new file mode 100755 index 0000000..eb080b2 --- /dev/null +++ b/src/inference/scripts/setup-vllm.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# setup-vllm.sh - Setup vLLM for Neuron inference + +set -euo pipefail + +# Load configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../../../nki-llama.config" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${GREEN}Setting up vLLM for Neuron...${NC}" + +# Check if in correct environment +if [[ "$VIRTUAL_ENV" != *"inference"* ]]; then + echo -e "${RED}Error: Not in inference environment${NC}" + echo -e "Run: source ${NEURON_INFERENCE_VENV}/bin/activate" + exit 1 +fi + +# Clone or update vLLM repository +if [[ -d "$VLLM_REPO" ]]; then + echo "Updating existing vLLM repository..." + cd "$VLLM_REPO" + git fetch + git checkout "$VLLM_BRANCH" + git pull +else + echo "Cloning vLLM repository..." + cd "$(dirname "$VLLM_REPO")" + git clone -b "$VLLM_BRANCH" https://github.com/aws-neuron/upstreaming-to-vllm.git +fi + +# Install requirements +cd "$VLLM_REPO" +echo "Installing vLLM requirements..." +pip install -r requirements-neuron.txt + +# Install vLLM +echo "Installing vLLM for Neuron..." +VLLM_TARGET_DEVICE="neuron" pip install -e . + +echo -e "${GREEN}✓ vLLM setup complete${NC}" \ No newline at end of file diff --git a/src/inference/scripts/start-server.sh b/src/inference/scripts/start-server.sh new file mode 100755 index 0000000..a5d48e2 --- /dev/null +++ b/src/inference/scripts/start-server.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# start-server.sh - Start vLLM OpenAI-compatible API server + +set -euo pipefail + +# Load configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../../../nki-llama.config" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${GREEN}Starting vLLM API Server${NC}" +echo -e "${BLUE}Model: ${MODEL_NAME}${NC}" +echo -e "${BLUE}Port: ${INFERENCE_PORT}${NC}" +echo -e "${BLUE}Tensor Parallel Size: ${TENSOR_PARALLEL_SIZE}${NC}" +echo + +# Check model exists +if [[ ! 
-d "${NKI_MODELS}/${MODEL_NAME}" ]]; then + echo -e "${RED}Error: Model not found at ${NKI_MODELS}/${MODEL_NAME}${NC}" + echo "Run: ./nki-llama.sh inference download" + exit 1 +fi + +# Set Neuron environment variables +export VLLM_NEURON_FRAMEWORK="${VLLM_NEURON_FRAMEWORK}" +export NEURON_COMPILED_ARTIFACTS="${NKI_COMPILED}/${MODEL_NAME}" +export NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES}" + +# Create compiled model directory if needed +mkdir -p "$NEURON_COMPILED_ARTIFACTS" + +echo -e "${YELLOW}Starting server on http://0.0.0.0:${INFERENCE_PORT}${NC}" +echo -e "${YELLOW}Press Ctrl+C to stop${NC}" +echo + +# Start vLLM server +cd "$HOME" +python -m vllm.entrypoints.openai.api_server \ + --model="${NKI_MODELS}/${MODEL_NAME}" \ + --max-num-seqs="${MAX_NUM_SEQS}" \ + --max-model-len="${MAX_MODEL_LEN}" \ + --tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ + --port="${INFERENCE_PORT}" \ + --device="neuron" \ + --override-neuron-config='{"enable_bucketing":false}' \ No newline at end of file From a038daa974dd8041feaa907bcc77859703c42b91 Mon Sep 17 00:00:00 2001 From: Armando Daniel Diaz Gonzalez <61255126+arm-diaz@users.noreply.github.com> Date: Fri, 6 Jun 2025 11:19:40 -0400 Subject: [PATCH 07/65] Delete logs/benchmarks directory --- logs/benchmarks/20250606_150849/metadata.json | 11 ----------- logs/benchmarks/20250606_150959/metadata.json | 11 ----------- logs/benchmarks/20250606_151034/metadata.json | 11 ----------- logs/benchmarks/20250606_151230/metadata.json | 11 ----------- logs/benchmarks/20250606_151450/metadata.json | 11 ----------- 5 files changed, 55 deletions(-) delete mode 100644 logs/benchmarks/20250606_150849/metadata.json delete mode 100644 logs/benchmarks/20250606_150959/metadata.json delete mode 100644 logs/benchmarks/20250606_151034/metadata.json delete mode 100644 logs/benchmarks/20250606_151230/metadata.json delete mode 100644 logs/benchmarks/20250606_151450/metadata.json diff --git a/logs/benchmarks/20250606_150849/metadata.json b/logs/benchmarks/20250606_150849/metadata.json deleted file mode 100644 index f71d405..0000000 --- a/logs/benchmarks/20250606_150849/metadata.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "timestamp": "20250606_150849", - "mode": "evaluate_all", - "model_name": "llama-3-8b", - "model_path": "/home/ubuntu/models/llama-3-8b", - "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", - "nki_enabled": true, - "sequence_length": 640, - "tensor_parallel_size": 8, - "neuron_rt_cores": "8" -} diff --git a/logs/benchmarks/20250606_150959/metadata.json b/logs/benchmarks/20250606_150959/metadata.json deleted file mode 100644 index 8a28790..0000000 --- a/logs/benchmarks/20250606_150959/metadata.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "timestamp": "20250606_150959", - "mode": "evaluate_all", - "model_name": "llama-3-8b", - "model_path": "/home/ubuntu/models/llama-3-8b", - "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", - "nki_enabled": true, - "sequence_length": 640, - "tensor_parallel_size": 8, - "neuron_rt_cores": "8" -} diff --git a/logs/benchmarks/20250606_151034/metadata.json b/logs/benchmarks/20250606_151034/metadata.json deleted file mode 100644 index 7e7b0c6..0000000 --- a/logs/benchmarks/20250606_151034/metadata.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "timestamp": "20250606_151034", - "mode": "evaluate_all", - "model_name": "llama-3-8b", - "model_path": "/home/ubuntu/models/llama-3-8b", - "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", - "nki_enabled": true, - "sequence_length": 640, - "tensor_parallel_size": 8, - 
"neuron_rt_cores": "8" -} diff --git a/logs/benchmarks/20250606_151230/metadata.json b/logs/benchmarks/20250606_151230/metadata.json deleted file mode 100644 index ab28ac4..0000000 --- a/logs/benchmarks/20250606_151230/metadata.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "timestamp": "20250606_151230", - "mode": "evaluate_single", - "model_name": "llama-3-8b", - "model_path": "/home/ubuntu/models/llama-3-8b", - "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", - "nki_enabled": true, - "sequence_length": 640, - "tensor_parallel_size": 8, - "neuron_rt_cores": "8" -} diff --git a/logs/benchmarks/20250606_151450/metadata.json b/logs/benchmarks/20250606_151450/metadata.json deleted file mode 100644 index 37d15d4..0000000 --- a/logs/benchmarks/20250606_151450/metadata.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "timestamp": "20250606_151450", - "mode": "evaluate_single", - "model_name": "llama-3-8b", - "model_path": "/home/ubuntu/models/llama-3-8b", - "compiled_model_path": "/home/ubuntu/traced_model/llama-3-8b", - "nki_enabled": true, - "sequence_length": 640, - "tensor_parallel_size": 8, - "neuron_rt_cores": "8" -} From e82a194b8c4eb79c62d91be95e5d3013a8de8383 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 6 Jun 2025 15:21:37 +0000 Subject: [PATCH 08/65] fix: gitignore - logging --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 960e4eb..5d0a6a2 100644 --- a/.gitignore +++ b/.gitignore @@ -268,5 +268,6 @@ test/inference/output global_metric_store.json benchmark_report.json cached_requirements.txt +**/logs/ # End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python \ No newline at end of file From 89950878cfc4da5a6ca3b4b3a6eedece8b0306c7 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Fri, 6 Jun 2025 11:29:47 -0400 Subject: [PATCH 09/65] commit minor updates to Readme and shell --- src/fine-tune/README.md | 30 +++++++++---------- .../{pipeline.sh => fine-tune-pipeline.sh} | 0 2 files changed, 15 insertions(+), 15 deletions(-) rename src/fine-tune/{pipeline.sh => fine-tune-pipeline.sh} (100%) diff --git a/src/fine-tune/README.md b/src/fine-tune/README.md index a01e80a..11285bb 100644 --- a/src/fine-tune/README.md +++ b/src/fine-tune/README.md @@ -1,4 +1,4 @@ -# pipeline.sh +# fine-tune-pipeline.sh --- @@ -26,7 +26,7 @@ MODEL_ID=meta-llama-3-8b cd ./src/fine-tune # Make the script executable -chmod +x pipeline.sh +chmod +x fine-tune-pipeline.sh # Activate Neuron environment source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate @@ -38,14 +38,14 @@ source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate | Command | Action | |---------|--------| -| `./pipeline.sh` | Run the **full pipeline** (deps → data → model → convert_ckpt → precompile → train) | -| `./pipeline.sh deps` | Install/validate Apex, NxDT, etc. | -| `./pipeline.sh data` | Download dataset | -| `./pipeline.sh model` | Download & convert model checkpoints | -| `./pipeline.sh convert_ckpt` | Convert checkpoints to NxDT format | -| `./pipeline.sh precompile` | Ahead‑of‑time graph compilation | -| `./pipeline.sh train` | Start fine‑tuning | -| `./pipeline.sh clean` | Remove generated datasets, weights, experiments | +| `./fine-tune-pipeline.sh` | Run the **full FT pipeline** (deps → data → model → convert_ckpt → precompile → train) | +| `./fine-tune-pipeline.sh deps` | Install/validate Apex, NxDT, etc. 
| +| `./fine-tune-pipeline.sh data` | Download dataset | +| `./fine-tune-pipeline.sh model` | Download & convert model checkpoints | +| `./fine-tune-pipeline.sh convert_ckpt` | Convert checkpoints to NxDT format | +| `./fine-tune-pipeline.sh precompile` | Ahead‑of‑time graph compilation | +| `./fine-tune-pipeline.sh train` | Start fine‑tuning | +| `./fine-tune-pipeline.sh clean` | Remove generated datasets, weights, experiments | Each sub‑command double‑checks you’re inside a Neuron venv and prints a helpful error if not. @@ -58,7 +58,7 @@ source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate | `HF_TOKEN` | Hugging Face auth token (for private models) | Add to `.env` or `export HF_TOKEN=…` | | `MODEL_ID` | Model slug, e.g. `meta-llama-3-8b` | Same as above | -The script auto‑loads `../../.env` with `set -a; source …`. Modify the `ENV_FILE=` line in `pipeline.sh` if you store it elsewhere. +The script auto‑loads `../../.env` with `set -a; source …`. Modify the `ENV_FILE=` line in `fine-tune-pipeline.sh` if you store it elsewhere. --- @@ -67,19 +67,19 @@ The script auto‑loads `../../.env` with `set -a; source …`. Modify the `ENV_ | Symptom | Probable Cause | Fix | |---------|---------------|-----| | `Not inside a Neuron virtual environment.` | Forgot to activate venv | `source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate` | -| `command not found: pipeline.sh` | File not executable or wrong cwd | `chmod +x pipeline.sh` and/or `./pipeline.sh` | +| `command not found: fine-tune-pipeline.sh` | File not executable or wrong cwd | `chmod +x fine-tune-pipeline.sh` and/or `./fine-tune-pipeline.sh` | | Model download fails | Missing/invalid `HF_TOKEN` | Provide valid token in env or `.env` | | Long compile times | First‑time Neuron AOT | Subsequent runs reuse cached graphs | --- -## 6 · Extending the Pipeline +## 6 · Extending the Fine Tuning Pipeline -1. Add a new Bash function in `pipeline.sh` (e.g., `evaluate()`). +1. Add a new Bash function in `fine-tune-pipeline.sh` (e.g., `evaluate()`). 2. Append its name to the pattern list inside `main()`. 3. Optionally call it from `all()` for automatic inclusion. 
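A hypothetical `evaluate` stage might look like the sketch below — the helper script, checkpoint path, and output file are illustrative placeholders, not files that ship with this repo:

```bash
# Sketch only: a custom `evaluate` stage for fine-tune-pipeline.sh
evaluate() {
    # Same guard as the existing stages: refuse to run outside a Neuron venv
    if [[ -z "${VIRTUAL_ENV:-}" ]]; then
        echo "Not inside a Neuron virtual environment." >&2
        return 1
    fi
    echo "[$(date '+%F %T')] Evaluating the fine-tuned checkpoint..."
    python scripts/evaluate.py \
        --checkpoint-dir ./checkpoints \
        --output eval_results.json
}
```

Remember to append `evaluate` to the sub-command pattern list in `main()`; it can then be invoked by name just like the existing stages, e.g.: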
```bash -./pipeline.sh train +./fine-tune-pipeline.sh train ``` --- diff --git a/src/fine-tune/pipeline.sh b/src/fine-tune/fine-tune-pipeline.sh similarity index 100% rename from src/fine-tune/pipeline.sh rename to src/fine-tune/fine-tune-pipeline.sh From 298418fdf18cd72492918f2d5f56d275b545a147 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 6 Jun 2025 15:31:08 +0000 Subject: [PATCH 10/65] docs: add comments --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index db7bdcd..a8385f3 100644 --- a/README.md +++ b/README.md @@ -65,10 +65,10 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate ./nki-llama inference benchmark # Run quick single evaluation -./nki-llama inference benchmark single +# ./nki-llama inference benchmark single (TODO: FIX) # Start inference server -./nki-llama server +# ./nki-llama server (TODO: DOCS) ``` ## 🏗️ Initial Setup From 56dae34f58b9a3b4ab1fe232910f03989136f33f Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Tue, 10 Jun 2025 15:44:58 +0000 Subject: [PATCH 11/65] feat: calculate score for finetuning --- src/fine-tune/docs/calculate-score-guide.md | 289 ++++++++ .../scripts/calculate_training_metrics.py | 638 ++++++++++++++++++ 2 files changed, 927 insertions(+) create mode 100644 src/fine-tune/docs/calculate-score-guide.md create mode 100644 src/fine-tune/scripts/calculate_training_metrics.py diff --git a/src/fine-tune/docs/calculate-score-guide.md b/src/fine-tune/docs/calculate-score-guide.md new file mode 100644 index 0000000..b53ca43 --- /dev/null +++ b/src/fine-tune/docs/calculate-score-guide.md @@ -0,0 +1,289 @@ +# Training Metrics Calculator + +A comprehensive tool for calculating training metrics including Model FLOPs Utilization (MFU), NKI (Neuron Kernel Interface) usage analysis, and training performance scores from AWS Neuron training logs and HLO files. 
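For orientation, a minimal run only needs the training config, the model config, and the Neuron compile cache directory; every other flag is optional and documented under Usage below. Paths here are illustrative:

```bash
# Minimal sketch: report MFU and NKI usage with default settings
source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
python calculate_training_metrics.py \
    --config path/to/training_config.yaml \
    --model-config path/to/model/config.json \
    --compile-dir ~/neuron_cache
```

Results are written to `training_metrics.json` by default.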
+ +## Overview + +This tool analyzes training performance on AWS Trainium instances by: +- Calculating MFU (Model FLOPs Utilization) percentage +- Analyzing NKI kernel usage across all compiled HLO modules +- Computing training performance scores both overall and per-file +- Extracting metrics from training logs +- Providing detailed breakdowns of performance improvements + +## Features + +- **MFU Calculation**: Computes the percentage of theoretical peak FLOPs achieved during training +- **NKI Analysis**: Identifies and quantifies custom NKI kernel usage vs standard operations +- **Training Score**: Calculates a comprehensive performance score based on multiple factors +- **Per-File Analysis**: Breaks down performance metrics for individual HLO modules +- **Log Parsing**: Extracts throughput and loss metrics from training logs +- **Flexible Configuration**: Supports various model configurations and hardware backends + +## Requirements + +- Python 3.7+ +- `neuronx_distributed_training` package +- `torch_neuronx` package +- PyYAML +- Access to compiled HLO files in the Neuron cache directory + +## Installation + +```bash +# Ensure you have the AWS Neuron SDK installed +# Install additional dependencies +pip install pyyaml +``` + +## Usage + +### Example + +```bash +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +python calculate_training_metrics.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --compile-dir /home/ubuntu/neuron_cache \ + --throughput 2.1 \ + --hw-backend trn1 \ + --calculate-score \ + --per-file-scores \ + --detailed \ + --print-per-file \ + --output baseline_metrics.json +``` + +### Basic Usage + +```bash +python calculate_training_metrics.py \ + --config training_config.yaml \ + --model-config model_config.json +``` + +### Advanced Usage with All Features + +```bash +python calculate_training_metrics.py \ + --config training_config.yaml \ + --model-config model_config.json \ + --compile-dir /home/ubuntu/neuron_cache \ + --log-file training.log \ + --batch-size 32 \ + --seq-len 2048 \ + --throughput 150.5 \ + --num-nodes 4 \ + --hw-backend trn1 \ + --calculate-score \ + --per-file-scores \ + --print-per-file \ + --detailed \ + --base-mfu 50.0 \ + --base-throughput 100.0 \ + --loss-improvement 1.2 \ + --convergence-rate 1.1 \ + --output metrics_report.json +``` + +## Command Line Arguments + +### Required Arguments + +- `--config`: Path to the training configuration YAML file +- `--model-config`: Path to the model configuration JSON file + +### Optional Arguments + +#### Basic Configuration +- `--compile-dir`: Neuron compile cache directory (default: `/home/ubuntu/neuron_cache`) +- `--log-file`: Training log file to parse for metrics +- `--batch-size`: Global batch size (overrides config file) +- `--seq-len`: Sequence length (overrides config file) +- `--throughput`: Throughput in sequences/second (if known) +- `--num-nodes`: Number of nodes (default: 1) +- `--hw-backend`: Hardware backend - `trn1` or `trn2` (default: `trn1`) +- `--output`: Output metrics file (default: `training_metrics.json`) + +#### Display Options +- `--detailed`: Include detailed per-file metrics in JSON output +- `--print-per-file`: Print per-file metrics table to console + +#### Training Score Parameters +- `--calculate-score`: Calculate final training score +- `--per-file-scores`: Calculate training scores 
for each file individually +- `--base-mfu`: Baseline MFU percentage for score calculation (default: 50.0) +- `--base-throughput`: Baseline throughput in seq/s for score calculation (default: 100.0) +- `--loss-improvement`: Loss improvement ratio (baseline_loss/achieved_loss) (default: 1.0) +- `--convergence-rate`: Convergence rate improvement (baseline_steps/achieved_steps) (default: 1.0) + +## Output Format + +### Console Output + +The tool provides detailed console output including: + +1. **Configuration Summary**: Shows the parameters used for calculation +2. **Per-File Analysis** (with `--print-per-file`): + ``` + File Name Module HLO MACs NKI MACs NKI Ratio Score + -------------------------------------------------------------------------------------------------------- + model.hlo_module.pb MODULE_Model 1,234,567,890 123,456,789 0.1000 1.2345 + ``` +3. **Training Score Breakdown** (with `--calculate-score`): + ``` + Training Score Breakdown: + ================================================== + MFU improvement: 1.2000 (50.00% → 60.00%) + Throughput improvement: 1.5000 (100.00 → 150.00 seq/s) + Loss improvement: 1.2000 + Convergence rate improvement: 1.1000 + NKI flop ratio: 0.1500 + ================================================== + Final Training Score: 2.7324 + ``` +4. **Summary Statistics**: Overall metrics and NKI analysis summary + +### JSON Output + +The output JSON file contains: + +```json +{ + "model_config": "path/to/model_config.json", + "batch_size": 32, + "sequence_length": 2048, + "num_nodes": 4, + "hardware_backend": "trn1", + "throughput_seq_per_sec": 150.5, + "mfu_percent": 60.0, + "tflops_per_second": 245.6, + "nki_analysis": { + "summary": { + "total_files": 10, + "successful_analyses": 10, + "overall_nki_ratio": 0.15, + "average_nki_ratio": 0.14, + "min_nki_ratio": 0.05, + "max_nki_ratio": 0.25 + }, + "per_file_metrics": [...] // With --detailed flag + }, + "training_score": 2.7324, + "training_score_breakdown": {...} +} +``` + +## Understanding the Metrics + +### MFU (Model FLOPs Utilization) +- Percentage of theoretical peak FLOPs achieved +- Higher is better (typical range: 30-70% for LLMs) +- Depends on model architecture, batch size, and sequence length + +### NKI Ratio +- Ratio of NKI (custom kernel) MACs to total MACs +- Higher ratio indicates more optimized kernels +- Range: 0.0 (no NKI) to 1.0 (all NKI) + +### Training Score +- Composite metric combining multiple factors: + - MFU improvement over baseline + - Throughput improvement over baseline + - Loss improvement (optional) + - Convergence rate improvement (optional) + - NKI utilization bonus +- Formula: `score = mfu_improvement × throughput_improvement × loss_improvement × convergence_rate × (1 + nki_ratio)` + +## Example Workflows + +### 1. Basic Performance Analysis + +```bash +# Just get MFU and basic metrics +python calculate_training_metrics.py \ + --config config.yaml \ + --model-config model.json +``` + +### 2. Full Training Evaluation + +```bash +# Complete analysis with scores +python calculate_training_metrics.py \ + --config config.yaml \ + --model-config model.json \ + --log-file training.log \ + --calculate-score \ + --print-per-file +``` + +### 3. Comparative Analysis + +```bash +# Compare against baseline performance +python calculate_training_metrics.py \ + --config config.yaml \ + --model-config model.json \ + --calculate-score \ + --base-mfu 45.0 \ + --base-throughput 80.0 \ + --loss-improvement 1.15 +``` + +### 4. 
Debugging NKI Usage + +```bash +# Detailed per-file NKI analysis +python calculate_training_metrics.py \ + --config config.yaml \ + --model-config model.json \ + --print-per-file \ + --per-file-scores \ + --detailed +``` + +## Troubleshooting + +### Common Issues + +1. **"HLO file not found" errors** + - Ensure the `--compile-dir` points to the correct Neuron cache directory + - Check that compilation completed successfully + +2. **"Failed to parse NKI backend config" warnings** + - Normal for non-NKI operations + - Only affects NKI metric calculation + +3. **Low MFU values** + - Check batch size and sequence length + - Ensure model is properly optimized for Neuron + - Consider using larger batch sizes + +4. **Zero NKI ratio** + - Verify NKI kernels are enabled in compilation + - Check Neuron SDK version supports NKI + +### Log File Format + +The tool expects training logs with patterns like: +``` +step_time: 1.234 +throughput: 150.5 +seq/s: 150.5 +loss: 2.345 +``` + +## Best Practices + +1. **Baseline Selection**: Choose realistic baselines that represent: + - Previous model versions + - Industry standards + - Unoptimized implementations + +2. **Multiple Runs**: Analyze metrics from multiple training runs to ensure consistency + +3. **Complete Analysis**: Use both overall and per-file scores to identify optimization opportunities + +4. **Version Tracking**: Save output JSON files with model versions for historical comparison diff --git a/src/fine-tune/scripts/calculate_training_metrics.py b/src/fine-tune/scripts/calculate_training_metrics.py new file mode 100644 index 0000000..b3aa0f1 --- /dev/null +++ b/src/fine-tune/scripts/calculate_training_metrics.py @@ -0,0 +1,638 @@ +#!/usr/bin/env python3 +"""Calculate training metrics including MFU, NKI score, and final training score from training logs and HLO files.""" + +import argparse +import json +import os +import re +import glob +import base64 +from pathlib import Path +from typing import List, Dict, Tuple + +from neuronx_distributed_training.utils.llama_perf_estimate import calculate_mfu +from torch_neuronx.pyhlo.hlo_pb2 import HloModuleProto + + +def calculate_training_score( + base_mfu, + base_throughput, + mfu, + throughput, + nki_flop_ratio, + loss_improvement=1.0, + convergence_rate=1.0 +): + """ + Calculate training score similar to inference calculate_score function. 
+ + Args: + base_mfu: Baseline Model FLOPs Utilization percentage + base_throughput: Baseline throughput in sequences/second + mfu: Achieved Model FLOPs Utilization percentage + throughput: Achieved throughput in sequences/second + nki_flop_ratio: Ratio of NKI MACs to total MACs + loss_improvement: Optional - ratio of baseline loss to achieved loss at same step + convergence_rate: Optional - ratio of steps to reach target loss (baseline/achieved) + + Returns: + float: Final training score + """ + + # Calculate improvement ratios + mfu_improvement = mfu / base_mfu if base_mfu > 0 else 1.0 + throughput_improvement = throughput / base_throughput if base_throughput > 0 else 1.0 + + # Combine metrics into final score + # Similar formula to inference but adapted for training metrics + final_score = mfu_improvement * throughput_improvement * loss_improvement * convergence_rate * (1 + nki_flop_ratio) + + return { + 'score': final_score, + 'mfu_improvement': mfu_improvement, + 'throughput_improvement': throughput_improvement, + 'loss_improvement': loss_improvement, + 'convergence_rate': convergence_rate, + 'nki_flop_ratio': nki_flop_ratio + } + + +def calculate_per_file_training_score( + file_metrics: Dict, + base_mfu: float, + base_throughput: float, + achieved_mfu: float, + achieved_throughput: float, + loss_improvement: float = 1.0, + convergence_rate: float = 1.0 +) -> Dict: + """ + Calculate training score for a single file based on its NKI ratio. + + Args: + file_metrics: Dictionary containing file analysis results + base_mfu: Baseline MFU percentage + base_throughput: Baseline throughput + achieved_mfu: Achieved MFU percentage + achieved_throughput: Achieved throughput + loss_improvement: Loss improvement ratio + convergence_rate: Convergence rate improvement + + Returns: + Dictionary with score details + """ + if file_metrics['status'] != 'success': + return { + 'score': 0.0, + 'error': file_metrics.get('error', 'File analysis failed') + } + + nki_ratio = file_metrics['nki_ratio'] + score_details = calculate_training_score( + base_mfu=base_mfu, + base_throughput=base_throughput, + mfu=achieved_mfu, + throughput=achieved_throughput, + nki_flop_ratio=nki_ratio, + loss_improvement=loss_improvement, + convergence_rate=convergence_rate + ) + + return score_details + + +def parse_hlo_file(hlo_file_path: str) -> HloModuleProto: + """Parse HLO file and return protobuf.""" + # Check if file exists + if not os.path.exists(hlo_file_path): + raise FileNotFoundError(f"HLO file not found: {hlo_file_path}") + + with open(hlo_file_path, 'rb') as f: + hlo_data = f.read() + + hlo_proto = HloModuleProto() + hlo_proto.ParseFromString(hlo_data) + return hlo_proto + + +def count_mac_operations(hlo_proto: HloModuleProto) -> Tuple[int, int]: + """Count MAC operations in HLO proto. 
+ + Returns: + Tuple of (total_mac_count, nki_mac_count) + """ + nki_mac = 0 + hlo_mac = 0 + + for computation in hlo_proto.computations: + instruction_map = {instr.id: instr for instr in computation.instructions} + + for instruction in computation.instructions: + # Finding NKI ops + if instruction.opcode == "custom-call": + if instruction.custom_call_target == 'AwsNeuronCustomNativeKernel': + try: + backend_config = instruction.backend_config + config = json.loads(base64.b64decode(backend_config)) + mac_count = int(config.get('mac_count', 0)) + except Exception as e: + print(f"Warning: Failed to parse NKI backend config: {e}") + mac_count = 0 + + nki_mac += mac_count + hlo_mac += mac_count + elif instruction.opcode == "dot": + # Get dot dimension numbers + dnums = instruction.dot_dimension_numbers + + # Get shapes of operands using operand_ids + try: + lhs_shape = instruction_map[instruction.operand_ids[0]].shape + rhs_shape = instruction_map[instruction.operand_ids[1]].shape + + # Initialize counters + lhs_batch = 1 + lhs_contracting_size = 1 + lhs_non_contracting_size = 1 + rhs_non_contracting_size = 1 + + # Process LHS shape + for i in range(len(lhs_shape.dimensions)): + if i in dnums.lhs_contracting_dimensions: + lhs_contracting_size *= lhs_shape.dimensions[i] + elif i in dnums.lhs_batch_dimensions: + lhs_batch *= lhs_shape.dimensions[i] + else: + lhs_non_contracting_size *= lhs_shape.dimensions[i] + + # Process RHS shape + for i in range(len(rhs_shape.dimensions)): + if i not in dnums.rhs_contracting_dimensions and \ + i not in dnums.rhs_batch_dimensions: + rhs_non_contracting_size *= rhs_shape.dimensions[i] + + mac_count = (lhs_batch * lhs_non_contracting_size * + lhs_contracting_size * rhs_non_contracting_size) + hlo_mac += mac_count + except Exception as e: + print(f"Warning: Failed to process dot operation: {e}") + + return hlo_mac, nki_mac + + +def find_all_hlo_files(compile_dir: str) -> List[str]: + """Find all HLO module files in the neuron cache directory.""" + hlo_files = [] + + # Convert to Path object for easier manipulation + base_path = Path(compile_dir) + + # Find all .hlo_module.pb files recursively + hlo_patterns = [ + "**/*.hlo_module.pb", + "**/model.hlo_module.pb", + "**/*.hlo", + "**/graph.hlo" + ] + + for pattern in hlo_patterns: + found_files = list(base_path.glob(pattern)) + hlo_files.extend([str(f) for f in found_files]) + + # Remove duplicates while preserving order + seen = set() + unique_files = [] + for f in hlo_files: + if f not in seen: + seen.add(f) + unique_files.append(f) + + return unique_files + + +def get_module_info(hlo_file_path: str) -> Dict[str, str]: + """Extract module information from HLO file path.""" + path_parts = Path(hlo_file_path).parts + module_info = { + 'file_path': hlo_file_path, + 'module_name': None, + 'neuronxcc_version': None + } + + # Extract module name and neuronxcc version from path + for i, part in enumerate(path_parts): + if part.startswith('MODULE_'): + module_info['module_name'] = part + elif part.startswith('neuronxcc-'): + module_info['neuronxcc_version'] = part + + return module_info + + +def parse_training_logs(log_file: str) -> Dict: + """Parse training logs to extract throughput and loss information.""" + metrics = { + 'steps': [], + 'step_times': [], + 'throughputs': [], + 'losses': [] + } + + if not os.path.exists(log_file): + print(f"Warning: Log file {log_file} not found") + return metrics + + with open(log_file, 'r') as f: + lines = f.readlines() + + # Common patterns in training logs + patterns = { + 
'step_time': r'step_time:\s*([\d.]+)', + 'throughput': r'throughput:\s*([\d.]+)', + 'samples_per_sec': r'samples/sec:\s*([\d.]+)', + 'tokens_per_sec': r'tokens/sec:\s*([\d.]+)', + 'seq_per_sec': r'seq/s:\s*([\d.]+)', + 'loss': r'loss:\s*([\d.]+)', + 'train_loss': r'train_loss:\s*([\d.]+)' + } + + for line in lines: + for key, pattern in patterns.items(): + match = re.search(pattern, line) + if match: + value = float(match.group(1)) + if key == 'step_time': + metrics['step_times'].append(value) + elif key in ['throughput', 'samples_per_sec', 'tokens_per_sec', 'seq_per_sec']: + metrics['throughputs'].append(value) + elif key in ['loss', 'train_loss']: + metrics['losses'].append(value) + + return metrics + + +def analyze_hlo_file(hlo_file: str) -> Dict: + """Analyze a single HLO file and return its metrics.""" + try: + module_info = get_module_info(hlo_file) + + # Parse the HLO file + hlo_proto = parse_hlo_file(hlo_file) + + # Count MAC operations + hlo_mac, nki_mac = count_mac_operations(hlo_proto) + + # Calculate NKI ratio for this file + nki_ratio = nki_mac / hlo_mac if hlo_mac > 0 else 0.0 + + return { + 'status': 'success', + 'file_path': hlo_file, + 'file_name': os.path.basename(hlo_file), + 'module_name': module_info['module_name'], + 'neuronxcc_version': module_info['neuronxcc_version'], + 'hlo_macs': hlo_mac, + 'nki_macs': nki_mac, + 'nki_ratio': nki_ratio + } + + except Exception as e: + return { + 'status': 'error', + 'file_path': hlo_file, + 'file_name': os.path.basename(hlo_file), + 'error': str(e), + 'hlo_macs': 0, + 'nki_macs': 0, + 'nki_ratio': 0.0 + } + + +def analyze_all_hlo_files(hlo_files: List[str], score_params: Dict = None) -> Dict: + """Analyze all HLO files individually and return per-file metrics with optional scoring.""" + per_file_metrics = [] + successful_analyses = 0 + + for hlo_file in hlo_files: + print(f"Analyzing: {hlo_file}") + file_metrics = analyze_hlo_file(hlo_file) + + # Calculate per-file score if parameters provided + if score_params and file_metrics['status'] == 'success': + file_score = calculate_per_file_training_score( + file_metrics=file_metrics, + base_mfu=score_params['base_mfu'], + base_throughput=score_params['base_throughput'], + achieved_mfu=score_params['achieved_mfu'], + achieved_throughput=score_params['achieved_throughput'], + loss_improvement=score_params.get('loss_improvement', 1.0), + convergence_rate=score_params.get('convergence_rate', 1.0) + ) + file_metrics['training_score'] = file_score + + per_file_metrics.append(file_metrics) + + if file_metrics['status'] == 'success': + successful_analyses += 1 + else: + print(f" Error: {file_metrics['error']}") + + # Calculate summary statistics + successful_files = [m for m in per_file_metrics if m['status'] == 'success'] + + if successful_files: + total_hlo_macs = sum(m['hlo_macs'] for m in successful_files) + total_nki_macs = sum(m['nki_macs'] for m in successful_files) + overall_nki_ratio = total_nki_macs / total_hlo_macs if total_hlo_macs > 0 else 0.0 + + nki_ratios = [m['nki_ratio'] for m in successful_files if m['nki_ratio'] > 0] + avg_nki_ratio = sum(nki_ratios) / len(nki_ratios) if nki_ratios else 0.0 + min_nki_ratio = min(nki_ratios) if nki_ratios else 0.0 + max_nki_ratio = max(nki_ratios) if nki_ratios else 0.0 + + # Calculate score statistics if scores exist + if score_params: + scores = [m['training_score']['score'] for m in successful_files if 'training_score' in m] + if scores: + avg_score = sum(scores) / len(scores) + min_score = min(scores) + max_score = max(scores) + else: + 
avg_score = min_score = max_score = 0.0 + else: + avg_score = min_score = max_score = None + else: + total_hlo_macs = 0 + total_nki_macs = 0 + overall_nki_ratio = 0.0 + avg_nki_ratio = 0.0 + min_nki_ratio = 0.0 + max_nki_ratio = 0.0 + avg_score = min_score = max_score = None + + summary = { + 'total_files': len(hlo_files), + 'successful_analyses': successful_analyses, + 'failed_analyses': len(hlo_files) - successful_analyses, + 'total_hlo_macs': total_hlo_macs, + 'total_nki_macs': total_nki_macs, + 'overall_nki_ratio': overall_nki_ratio, + 'average_nki_ratio': avg_nki_ratio, + 'min_nki_ratio': min_nki_ratio, + 'max_nki_ratio': max_nki_ratio + } + + if score_params: + summary['average_score'] = avg_score + summary['min_score'] = min_score + summary['max_score'] = max_score + + return { + 'per_file_metrics': per_file_metrics, + 'summary': summary + } + + +def main(): + parser = argparse.ArgumentParser(description="Calculate training metrics post-training") + parser.add_argument("--config", required=True, help="Path to the training config YAML file") + parser.add_argument("--model-config", required=True, help="Path to the model config.json file") + parser.add_argument("--compile-dir", default="/home/ubuntu/neuron_cache", help="Neuron compile cache directory") + parser.add_argument("--log-file", help="Training log file to parse for metrics") + parser.add_argument("--batch-size", type=int, help="Global batch size (overrides config)") + parser.add_argument("--seq-len", type=int, help="Sequence length (overrides config)") + parser.add_argument("--throughput", type=float, help="Throughput in sequences/second (if known)") + parser.add_argument("--num-nodes", type=int, default=1, help="Number of nodes") + parser.add_argument("--hw-backend", choices=['trn1', 'trn2'], default='trn1', help="Hardware backend") + parser.add_argument("--output", default="training_metrics.json", help="Output metrics file") + parser.add_argument("--detailed", action="store_true", help="Include detailed per-file metrics in output") + parser.add_argument("--print-per-file", action="store_true", help="Print per-file metrics to console") + + # Training score parameters + score_group = parser.add_argument_group('Training Score Parameters') + score_group.add_argument( + "--base-mfu", + type=float, + default=50.0, + help="Baseline MFU percentage for score calculation" + ) + score_group.add_argument( + "--base-throughput", + type=float, + default=100.0, + help="Baseline throughput (seq/s) for score calculation" + ) + score_group.add_argument( + "--loss-improvement", + type=float, + default=1.0, + help="Loss improvement ratio (baseline_loss/achieved_loss at same step)" + ) + score_group.add_argument( + "--convergence-rate", + type=float, + default=1.0, + help="Convergence rate improvement (baseline_steps/achieved_steps to target loss)" + ) + score_group.add_argument( + "--calculate-score", + action="store_true", + help="Calculate final training score" + ) + score_group.add_argument( + "--per-file-scores", + action="store_true", + help="Calculate training scores for each file individually" + ) + + args = parser.parse_args() + + # Load config + import yaml + with open(args.config, 'r') as f: + cfg = yaml.safe_load(f) + + # Override config values if provided + if args.batch_size: + batch_size = args.batch_size + else: + batch_size = cfg['data']['global_batch_size'] + + if args.seq_len: + seq_len = args.seq_len + else: + seq_len = cfg['data']['seq_length'] + + # Determine throughput + if args.throughput: + throughput = args.throughput + 
elif args.log_file: + log_metrics = parse_training_logs(args.log_file) + if log_metrics['throughputs']: + throughput = sum(log_metrics['throughputs']) / len(log_metrics['throughputs']) + else: + print("Warning: Could not extract throughput from logs, using default") + throughput = 100.0 # Default estimate + else: + print("Warning: No throughput information provided, using default") + throughput = 100.0 # Default estimate + + print(f"\nCalculating metrics with:") + print(f" Model config: {args.model_config}") + print(f" Batch size: {batch_size}") + print(f" Sequence length: {seq_len}") + print(f" Throughput: {throughput} seq/s") + print(f" Hardware: {args.hw_backend}") + print(f" Nodes: {args.num_nodes}") + + # Calculate MFU + mfu, seq_per_second, throughput_per_node, tflops_per_second, time_per_batch = calculate_mfu( + config_path=args.model_config, + batch_size=batch_size, + throughput=throughput, + num_nodes=args.num_nodes, + seq_len=seq_len, + hw_backend=args.hw_backend + ) + + # Find and analyze all HLO files + print(f"\nSearching for HLO files in {args.compile_dir}...") + hlo_files = find_all_hlo_files(args.compile_dir) + print(f"Found {len(hlo_files)} HLO files") + + # Prepare score parameters if needed + score_params = None + if args.calculate_score and args.per_file_scores: + score_params = { + 'base_mfu': args.base_mfu, + 'base_throughput': args.base_throughput, + 'achieved_mfu': mfu, + 'achieved_throughput': throughput, + 'loss_improvement': args.loss_improvement, + 'convergence_rate': args.convergence_rate + } + + # Analyze HLO files individually + hlo_analysis = analyze_all_hlo_files(hlo_files, score_params) + + # Print per-file information if requested + if args.print_per_file: + print("\nPer-file NKI analysis:") + print("-" * 120) + if args.per_file_scores and args.calculate_score: + print(f"{'File Name':<40} {'Module':<20} {'HLO MACs':>15} {'NKI MACs':>15} {'NKI Ratio':>10} {'Score':>10}") + else: + print(f"{'File Name':<40} {'Module':<20} {'HLO MACs':>15} {'NKI MACs':>15} {'NKI Ratio':>10}") + print("-" * 120) + + for file_metrics in hlo_analysis['per_file_metrics']: + if file_metrics['status'] == 'success': + base_info = (f"{file_metrics['file_name']:<40} " + f"{(file_metrics['module_name'] or 'N/A'):<20} " + f"{file_metrics['hlo_macs']:>15,} " + f"{file_metrics['nki_macs']:>15,} " + f"{file_metrics['nki_ratio']:>10.4f}") + + if 'training_score' in file_metrics: + print(f"{base_info} {file_metrics['training_score']['score']:>10.4f}") + else: + print(base_info) + else: + print(f"{file_metrics['file_name']:<40} ERROR: {file_metrics['error']}") + print("-" * 120) + + # Print score statistics if available + if args.per_file_scores and args.calculate_score and hlo_analysis['summary'].get('average_score') is not None: + print(f"\nPer-file Score Statistics:") + print(f" Average Score: {hlo_analysis['summary']['average_score']:.4f}") + print(f" Min Score: {hlo_analysis['summary']['min_score']:.4f}") + print(f" Max Score: {hlo_analysis['summary']['max_score']:.4f}") + + # Compile metrics + metrics = { + "model_config": args.model_config, + "batch_size": batch_size, + "sequence_length": seq_len, + "num_nodes": args.num_nodes, + "hardware_backend": args.hw_backend, + "throughput_seq_per_sec": throughput, + "mfu_percent": mfu, + "tflops_per_second": tflops_per_second, + "throughput_tflops_per_node": throughput_per_node, + "seq_per_second_per_node": seq_per_second, + "time_per_batch_seconds": time_per_batch, + "nki_analysis": { + "summary": hlo_analysis['summary'] + } + } + + # Add 
detailed per-file metrics if requested + if args.detailed: + metrics["nki_analysis"]["per_file_metrics"] = hlo_analysis['per_file_metrics'] + + # Calculate overall training score if requested + if args.calculate_score: + nki_ratio = hlo_analysis['summary']['overall_nki_ratio'] + + print('\nTraining Score Breakdown:') + print('=' * 50) + print(f'MFU improvement: {mfu/args.base_mfu:.4f} ({args.base_mfu:.2f}% → {mfu:.2f}%)') + print(f'Throughput improvement: {throughput/args.base_throughput:.4f} ({args.base_throughput:.2f} → {throughput:.2f} seq/s)') + print(f'Loss improvement: {args.loss_improvement:.4f}') + print(f'Convergence rate improvement: {args.convergence_rate:.4f}') + print(f'NKI flop ratio: {nki_ratio:.4f}') + print('=' * 50) + + score_details = calculate_training_score( + base_mfu=args.base_mfu, + base_throughput=args.base_throughput, + mfu=mfu, + throughput=throughput, + nki_flop_ratio=nki_ratio, + loss_improvement=args.loss_improvement, + convergence_rate=args.convergence_rate + ) + + score = score_details['score'] + print(f'Final Training Score: {score:.4f}') + + metrics['training_score'] = score + metrics['training_score_breakdown'] = { + 'base_mfu': args.base_mfu, + 'base_throughput': args.base_throughput, + 'achieved_mfu': mfu, + 'achieved_throughput': throughput, + 'mfu_improvement': score_details['mfu_improvement'], + 'throughput_improvement': score_details['throughput_improvement'], + 'nki_flop_ratio': nki_ratio, + 'loss_improvement': args.loss_improvement, + 'convergence_rate': args.convergence_rate + } + + # Save metrics + with open(args.output, 'w') as f: + json.dump(metrics, f, indent=4) + + # Print summary + print("\n" + "="*50) + print("TRAINING METRICS SUMMARY") + print("="*50) + print(f"MFU: {mfu:.2f}%") + print(f"TFLOPs/second: {tflops_per_second:.2f}") + print(f"\nNKI Analysis Summary:") + print(f" Files analyzed: {hlo_analysis['summary']['successful_analyses']}/{hlo_analysis['summary']['total_files']}") + print(f" Overall NKI Ratio: {hlo_analysis['summary']['overall_nki_ratio']:.4f}") + print(f" Average NKI Ratio: {hlo_analysis['summary']['average_nki_ratio']:.4f}") + print(f" Min NKI Ratio: {hlo_analysis['summary']['min_nki_ratio']:.4f}") + print(f" Max NKI Ratio: {hlo_analysis['summary']['max_nki_ratio']:.4f}") + print(f" Total HLO MACs: {hlo_analysis['summary']['total_hlo_macs']:,}") + print(f" Total NKI MACs: {hlo_analysis['summary']['total_nki_macs']:,}") + print(f"\nThroughput: {throughput:.2f} seq/s") + print(f"Throughput per node: {throughput_per_node:.2f} TFLOP/s") + print(f"\nMetrics saved to: {args.output}") + + return metrics + + +if __name__ == "__main__": + main() \ No newline at end of file From c36c14d37ad0fecef02c9af35bbceac770c49144 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Tue, 10 Jun 2025 15:48:25 +0000 Subject: [PATCH 12/65] docs: update docs --- src/fine-tune/docs/calculate-score-guide.md | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/fine-tune/docs/calculate-score-guide.md b/src/fine-tune/docs/calculate-score-guide.md index b53ca43..139c6f2 100644 --- a/src/fine-tune/docs/calculate-score-guide.md +++ b/src/fine-tune/docs/calculate-score-guide.md @@ -20,22 +20,6 @@ This tool analyzes training performance on AWS Trainium instances by: - **Log Parsing**: Extracts throughput and loss metrics from training logs - **Flexible Configuration**: Supports various model configurations and hardware backends -## Requirements - -- Python 3.7+ -- `neuronx_distributed_training` package -- `torch_neuronx` package -- PyYAML -- 
Access to compiled HLO files in the Neuron cache directory - -## Installation - -```bash -# Ensure you have the AWS Neuron SDK installed -# Install additional dependencies -pip install pyyaml -``` - ## Usage ### Example From c04d6cd3fc036e57ab06d52defa8a116ac045670 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 10 Jun 2025 15:18:51 -0400 Subject: [PATCH 13/65] chore: bump vllm to upstream and update configs --- nki-llama.config | 3 +-- src/inference/scripts/setup-vllm.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nki-llama.config b/nki-llama.config index 0af05e0..3b9ee69 100644 --- a/nki-llama.config +++ b/nki-llama.config @@ -37,8 +37,7 @@ export NEURON_COMPILE_CACHE="${HOME}/.cache/neuron" export NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES:-8}" # vLLM Configuration -export VLLM_REPO="${HOME}/upstreaming-to-vllm" -export VLLM_BRANCH="neuron-2.22-vllm-v0.7.2" +export VLLM_REPO="${HOME}/vllm" export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" # Dataset Configuration diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index eb080b2..28be591 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -27,12 +27,11 @@ if [[ -d "$VLLM_REPO" ]]; then echo "Updating existing vLLM repository..." cd "$VLLM_REPO" git fetch - git checkout "$VLLM_BRANCH" git pull else echo "Cloning vLLM repository..." cd "$(dirname "$VLLM_REPO")" - git clone -b "$VLLM_BRANCH" https://github.com/aws-neuron/upstreaming-to-vllm.git + git clone https://github.com/vllm-project/vllm.git fi # Install requirements From 99bc3ba6c8a1c903117cc4ad7f1576e3f1a877a1 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 10 Jun 2025 15:22:57 -0400 Subject: [PATCH 14/65] fix:update req.txt path --- src/inference/scripts/setup-vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index 28be591..4c27520 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -37,7 +37,7 @@ fi # Install requirements cd "$VLLM_REPO" echo "Installing vLLM requirements..." -pip install -r requirements-neuron.txt +pip install -U -r requirements/neuron.txt # Install vLLM echo "Installing vLLM for Neuron..." From 20067f28c0c768701a44b18a700896bec51d0f25 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 11 Jun 2025 12:51:57 +0000 Subject: [PATCH 15/65] fix: handle cache during inference and improve readme --- README.md | 545 +++++++++------------ nki-llama.config | 2 +- nki-llama.sh | 113 ++++- src/inference/scripts/run-nki-benchmark.sh | 187 ++++++- 4 files changed, 493 insertions(+), 354 deletions(-) diff --git a/README.md b/README.md index a8385f3..91652d5 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,18 @@ -# NKI-LLAMA: Unified Interface for AWS Neuron +# NKI-LLAMA: AWS Neuron Development Platform -A unified project for fine-tuning, inference, and agent development of LLaMA models on AWS Trainium and Inferentia using a streamlined bash-based interface. +A unified platform for fine-tuning, benchmarking, and serving LLaMA models on AWS Trainium and Inferentia using Neuron SDK's advanced optimization capabilities. 
-## 📋 Requirements +## 🎯 Overview -### Neuron SDK Version -- **Neuron 2.23.0 Release** -- **NeuronX Distributed Inference**: 0.3.5591 -- **NeuronX Distributed Training**: 1.3.0 +NKI-LLAMA provides a streamlined interface for the complete LLM development lifecycle on AWS Neuron hardware: -### Hardware & AMI -- **Required Instance**: trn1.32xlarge -- **Base AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) with Neuron SDK 2.23 -- **Base Packages**: - - NxD (NeuronX Distributed Training) - - NKI (Neuron Kernel Interface) - - NxDI (NeuronX Distributed Inference) +- **Fine-tune** models using NeuronX Distributed (NxD) +- **Optimize** with Neuron Kernel Interface (NKI) compilation +- **Benchmark** performance with comprehensive evaluation tools +- **Serve** models with vLLM's OpenAI-compatible API +- **Build** LLM-powered applications and agents -## 🔄 Project Workflow +## 🔄 Architecture ``` ┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ ┌──────────────┐ @@ -25,421 +20,353 @@ A unified project for fine-tuning, inference, and agent development of LLaMA mod │ Fine-tuning │────▶│ NKI Compilation │────▶│ vLLM Inference │────▶│ Agent │ │ (NxD) │ │ & Benchmarking │ │ (NxDI) │ │ Development │ │ │ │ │ │ │ │ │ -└─────────────────┘ └──────────────────┘ └─────────────────┘ └──────────────┐ - │ │ │ - │ │ │ - ▼ ▼ ▼ - Trained Model NKI-Optimized API Endpoint +└─────────────────┘ └──────────────────┘ └─────────────────┘ └──────────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + Trained Model NKI-Optimized API Endpoint LLM Apps Model Artifacts (OpenAI Compatible) ``` -### Detailed Workflow: - -1. **Fine-tune** a model using NeuronX Distributed (NxD) on Trainium -2. **NKI Compilation & Benchmarking**: - - Compiles model graphs with Neuron Kernel Interface (NKI) - - Creates optimized artifacts for inference - - Benchmarks performance characteristics - - Supports two modes: `evaluate_single` and `evaluate_all` -3. **vLLM Inference** serves the NKI-compiled model using NeuronX Distributed Inference (NxDI) -4. **Agent Development** connects to the inference endpoint for application building +### Key Technologies -### Key Components: -- **NKI (Neuron Kernel Interface)**: Optimizes model operations for AWS Neuron hardware -- **NxD (NeuronX Distributed)**: Enables distributed training across Neuron cores -- **NxDI (NeuronX Distributed Inference)**: Provides optimized inference runtime -- **vLLM**: Serves models with OpenAI-compatible API using Neuron optimizations +- **NKI (Neuron Kernel Interface)**: Custom kernel optimizations for AWS Neuron +- **NxD (NeuronX Distributed)**: Distributed training framework +- **NxDI (NeuronX Distributed Inference)**: Optimized inference runtime +- **vLLM**: High-performance serving with Neuron backend -## 🚀 Quick Start +## 📋 Requirements -```bash -# Install -chmod +x install.sh -./install.sh +### System Requirements +- **Instance**: trn1.32xlarge (recommended) +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) +- **Neuron SDK**: 2.23.0 +- **Python**: 3.10 -# Setup Guide -./nki-llama setup +### SDK Components +- NeuronX Distributed Training: 1.3.0 +- NeuronX Distributed Inference: 0.3.5591 +- Neuron Compiler: 2.18.121.0 -# Run full benchmark (with NKI compilation) -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama inference benchmark - -# Run quick single evaluation -# ./nki-llama inference benchmark single (TODO: FIX) +## 🚀 Quick Start -# Start inference server -# ./nki-llama server (TODO: DOCS) +### 1. 
Instance Setup +```bash +# Create EC2 instance +# - Type: trn1.32xlarge +# - AMI: Deep Learning AMI Neuron (Ubuntu 22.04) +# - Storage: 512GB+ recommended ``` -## 🏗️ Initial Setup - -### 1. Create Trainium Instance - -Create a trn1.32xlarge instance on AWS EC2: -- **Name**: nki-llama -- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) -- **Instance type**: trn1.32xlarge -- **Key pair**: Create new key pair -- **Username**: ubuntu (when connecting via SSH) - -### 2. Clone and Install - +### 2. Installation ```bash # Clone repository -git clone [REPO_URL] +git clone https://github.com/your-org/nki-llama.git cd nki-llama -# Run installation +# Install chmod +x install.sh ./install.sh -# Configure environment +# Configure cp .env.example .env -nano .env # Add your HF_TOKEN and adjust settings +nano .env # Add your HF_TOKEN ``` -## 📁 Project Structure - -``` -/home/ubuntu/nki-llama/ -├── nki-llama.sh # Main CLI interface -├── nki-llama.config # Shared configuration -├── .env # Your environment variables -├── .env.example # Example configuration -├── install.sh # Installation script -├── src/ -│ ├── fine-tune/ -│ │ └── scripts/ # Fine-tuning scripts -│ │ ├── bootstrap.sh -│ │ ├── download_data.sh -│ │ ├── download_model.sh -│ │ ├── convert_checkpoints.sh -│ │ ├── precompile.sh -│ │ └── run_training.sh -│ └── inference/ -│ ├── main.py # Inference entry point -│ └── scripts/ # Inference helper scripts -│ ├── setup-vllm.sh -│ ├── download-model.sh -│ ├── run-nki-benchmark.sh # Supports both evaluate_single and evaluate_all modes -│ ├── start-server.sh -│ └── jupyter.sh -└── logs/ # Unified logs - └── benchmarks/ # Benchmark results -``` - -## 🔧 Environment Setup - -This project requires three different Python environments: - -### 1. Fine-tuning Environment +### 3. First Run ```bash -source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -``` +# Interactive setup +./nki-llama setup -### 2. Inference Environment -```bash +# Download model source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -``` +./nki-llama inference download -### 3. 
Jupyter Environment (for agent development) -```bash -./nki-llama jupyter setup -source ~/nki-llama/venv/bin/activate +# Run benchmark (compiles model on first run) +tmux new -s benchmark +./nki-llama inference benchmark ``` -## 💻 Commands +## 💻 Command Reference + +### Core Commands -### Quick Commands -- `./nki-llama setup` - Interactive setup wizard -- `./nki-llama train` - Start fine-tuning (shortcut) -- `./nki-llama server` - Start inference server (shortcut) -- `./nki-llama jupyter` - Launch Jupyter Lab -- `./nki-llama status` - Check system status -- `./nki-llama config` - Show configuration +| Command | Description | +|---------|-------------| +| `./nki-llama setup` | Interactive setup wizard | +| `./nki-llama status` | System and project status | +| `./nki-llama config` | Display configuration | +| `./nki-llama clean` | Clean artifacts and cache | -### Fine-tuning Workflow +### Fine-tuning Pipeline ```bash -# Activate fine-tuning environment +# Activate environment source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -# Run individual steps +# Complete pipeline +./nki-llama finetune all + +# Or run individual steps ./nki-llama finetune deps # Install dependencies ./nki-llama finetune data # Download dataset -./nki-llama finetune model # Download model -./nki-llama finetune convert # Convert checkpoints to NxDT format -./nki-llama finetune compile # Pre-compile graphs (AOT) -./nki-llama finetune train # Start fine-tuning - -# Or run all at once -./nki-llama finetune all +./nki-llama finetune model # Download base model +./nki-llama finetune convert # Convert to NxDT format +./nki-llama finetune compile # Pre-compile graphs +./nki-llama finetune train # Start training ``` -### NKI Benchmark Modes - -The benchmark script supports two evaluation modes: +### Benchmarking & Compilation -#### 1. evaluate_single Mode -Quick single evaluation using the repository test script: ```bash -# Activate inference environment +# Activate environment source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -# Run single evaluation -./nki-llama inference benchmark single - -# Or explicitly specify mode -./nki-llama inference benchmark --mode evaluate_single -``` - -#### 2. evaluate_all Mode (Default) -Comprehensive benchmark with NKI compilation and all configurations: -```bash -# Download model if not already available +# Download model (if not already done) ./nki-llama inference download -# Run full benchmark (compiles model on first run) +# Full benchmark with NKI compilation (default) ./nki-llama inference benchmark -# Or with custom parameters -./nki-llama inference benchmark --seq-len 1024 --tp-degree 8 - -# Run without NKI optimizations -./nki-llama inference benchmark --no-nki +# Benchmark with options +./nki-llama inference benchmark --seq-len 1024 +./nki-llama inference benchmark --clear-cache # Clear compilation cache +./nki-llama inference benchmark --no-nki # Without NKI optimizations ``` -**Key differences:** -- **evaluate_single**: Quick validation, runs from repository test script -- **evaluate_all**: Full benchmark with model compilation, creates cached artifacts for vLLM +#### Benchmark Modes -**Note**: The `evaluate_all` mode automatically compiles the model with NKI optimizations on the first run (10-30 minutes). Subsequent runs use the cached compilation. 
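The benchmark logic itself lives in `src/inference/scripts/run-nki-benchmark.sh`, so the helper script can also be invoked directly when iterating on settings. A minimal sketch, assuming the default repository layout and an already activated inference environment; the flags shown match the helper script's own `--help` output:

```bash
# Drive the benchmark helper script directly instead of the CLI wrapper
cd ~/nki-llama
./src/inference/scripts/run-nki-benchmark.sh \
    --mode evaluate_all \
    --seq-len 1024 \
    --tp-degree 8 \
    --clear-cache
```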
+| Mode | Description | Status | +|------|-------------|--------| +| `evaluate_all` | Full benchmark with NKI compilation and caching | ✅ Working | +| `evaluate_single` | Quick validation test | ⚠️ Not implemented | + +> **Note**: The `evaluate_single` mode is currently not functional. Use `evaluate_all` (default) for all benchmarking needs. ### Inference Serving ```bash -# After benchmarking (which compiles the model), start the API server +# Setup vLLM (one-time) +./nki-llama inference setup -# Setup vLLM -./nki-llama inference setup # Setup vLLM for Neuron +# Start API server +tmux new -s vllm +./nki-llama inference server -# Start the API server -./nki-llama inference server # Start OpenAI-compatible API +# Server will use NKI-compiled artifacts from benchmarking ``` -## 🤖 Agent Development - -This repository includes support for building LLM-powered agents using LangGraph and LangChain. A sample travel planning agent demonstrates: - -- Context-aware travel itinerary generation -- Multi-turn conversation with memory -- Dynamic workflow management using LangGraph -- Integration with vLLM for efficient inference on Trainium - -### Using Jupyter for Agent Development +### Development Tools ```bash -# Terminal 1: Start the inference server -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama server - -# Terminal 2: Start Jupyter Lab +# Start Jupyter Lab ./nki-llama jupyter -# Access at http://your-ip:8888 -# Select the "nki-llama" kernel in Jupyter + +# Access at http://your-instance-ip:8888 ``` -## ⚙️ Configuration +## 🛠️ Advanced Usage -All configuration is managed through: -1. `nki-llama.config` - System paths and defaults (includes TEST_SCRIPT, MAIN_SCRIPT, etc.) -2. `.env` - Your personal configuration +### Cache Management -### Key Variables +The compilation cache can accumulate failed entries. Monitor and manage it: ```bash -# Model Configuration -MODEL_ID=meta-llama/Meta-Llama-3-8B -MODEL_NAME=llama-3-8b -HF_TOKEN=your_huggingface_token - -# Training Parameters -BATCH_SIZE=1 -MAX_STEPS=1000 -SEQ_LENGTH=2048 -TENSOR_PARALLEL_SIZE=8 -LEARNING_RATE=5e-5 +# Check cache status +./nki-llama status -# Inference Parameters -INFERENCE_PORT=8080 -MAX_MODEL_LEN=2048 -MAX_NUM_SEQS=4 +# Clear cache before benchmark +./nki-llama inference benchmark --clear-cache -# Dataset Configuration -DATASET_NAME=databricks/databricks-dolly-15k +# Manual cache cleanup +./nki-llama clean ``` -## 📊 Monitoring +### Using tmux (Recommended) -### Check Status -```bash -./nki-llama status -``` +Long-running operations should use tmux to prevent disconnection issues: -### View Logs ```bash -# Logs are stored with timestamps -ls logs/ -tail -f logs/nki-llama_*.log +# Create session +tmux new -s session-name -# Benchmark results -ls logs/benchmarks/ -cat logs/benchmarks/*/metadata.json -``` +# Run command +./nki-llama [command] -### Neuron Monitoring -```bash -neuron-ls # List Neuron devices -neuron-top # Monitor Neuron usage +# Detach: Ctrl+B, then D + +# List sessions +tmux ls + +# Reattach +tmux attach -t session-name ``` -## 🔍 Complete Workflow Example +### Environment Management -Here's a complete end-to-end workflow with tmux best practices: +Different operations require specific environments: -### 1. 
Fine-tune a Model ```bash -# Create tmux session for training -tmux new -s training - -# Inside tmux: activate environment and run training +# Fine-tuning source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -./nki-llama finetune all -# Detach from tmux: Ctrl+B, D -# Check progress later: tmux attach -t training -``` +# Inference & Benchmarking +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -### 2. Benchmark Model with NKI -```bash -# Create tmux session for benchmarking -tmux new -s benchmark +# Agent Development +source ~/nki-llama/venv/bin/activate +``` -# Inside tmux: run benchmarks -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +## 📊 Monitoring & Debugging -# Quick single evaluation with compilation (first run compiles) -# ./nki-llama inference benchmark single (TODO: FIX COMMAND) +### System Monitoring +```bash +# Neuron device status +neuron-ls -# Or full benchmark with compilation (first run compiles) -./nki-llama inference benchmark +# Real-time usage +neuron-top -# Detach and let it run: Ctrl+B, D +# Project status +./nki-llama status ``` -### 3. View Benchmark Results +### Log Files ```bash -# After benchmarking completes -ls logs/benchmarks/ -cat logs/benchmarks/*/metadata.json +# View recent logs +ls -la logs/ +tail -f logs/nki-llama_*.log -# View detailed logs -cat logs/benchmarks/*/benchmark.log +# Benchmark results +cat logs/benchmarks/*/metadata.json ``` -### 4. Serve with vLLM -```bash -# Create tmux session for server -tmux new -s vllm - -# Inside tmux: start the server -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama server +### Common Issues -# Server automatically uses NKI-compiled artifacts -# Detach: Ctrl+B, D +#### Compilation Cache Errors +```bash +# Symptoms: "Got a cached failed neff" errors +# Solution: +./nki-llama inference benchmark --clear-cache ``` -### 5. Build Agents +#### SIGHUP Errors ```bash -# In a new terminal -./nki-llama jupyter - -# Your model is now available at http://localhost:8080 -# Build agents using the OpenAI-compatible API +# Symptoms: Process terminated during compilation +# Solution: Always use tmux for long operations +tmux new -s benchmark ``` -### Managing tmux Sessions +#### Memory Issues ```bash -# List all sessions -tmux ls +# Monitor memory usage +neuron-top -# Attach to a session -tmux attach -t training -tmux attach -t benchmark -tmux attach -t vllm +# Adjust parallelism if needed +export TENSOR_PARALLEL_SIZE=4 # Reduce from 8 +``` -# Kill a session -tmux kill-session -t training +## 🏗️ Project Structure + +``` +nki-llama/ +├── nki-llama.sh # Main CLI interface +├── nki-llama.config # System configuration +├── .env # User configuration +├── install.sh # Installation script +├── README.md # This file +├── src/ +│ ├── fine-tune/ # Training pipeline +│ │ └── scripts/ # Training automation +│ └── inference/ # Inference pipeline +│ ├── main.py # Benchmark entry point +│ └── scripts/ # Inference automation +├── notebooks/ # Example notebooks +│ └── travel_agent.ipynb +├── logs/ # Operation logs +│ └── benchmarks/ # Benchmark results +└── models/ # Downloaded models + └── compiled/ # NKI-compiled artifacts ``` -## 🚨 Troubleshooting +## 🔧 Configuration + +### Environment Variables (.env) -### Environment Issues ```bash -# Check active environment -./nki-llama status +# Hugging Face Access +HF_TOKEN=your_token_here -# Wrong environment error? 
-# For fine-tuning: -source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +# Model Selection +MODEL_ID=meta-llama/Meta-Llama-3-8B +MODEL_NAME=llama-3-8b -# For inference: -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -``` +# Hardware Configuration +TENSOR_PARALLEL_SIZE=8 +NEURON_RT_NUM_CORES=32 -### Benchmark Modes -- **evaluate_single**: Use for quick validation tests -- **evaluate_all**: Use for full performance evaluation and model compilation -- First-time compilation with NKI (in evaluate_all mode) can take 10-30 minutes -- Compiled models are cached in `~/traced_model/` -- Subsequent benchmark runs will use the cached compilation +# Training Parameters +BATCH_SIZE=1 +MAX_STEPS=1000 +SEQ_LENGTH=2048 +LEARNING_RATE=5e-5 -### Memory Issues -- Ensure you're using trn1.32xlarge for full model support -- Monitor memory usage with `neuron-top` -- Adjust `TENSOR_PARALLEL_SIZE` if needed +# Inference Parameters +INFERENCE_PORT=8080 +MAX_MODEL_LEN=2048 +``` -### Using with tmux -For long-running operations like training, benchmarking, or serving: +## 🎓 Complete Workflow Example +### Step 1: Fine-tune a Model ```bash -# Create a new tmux session -tmux new -s session-name - -# Run your command -./nki-llama [command] +tmux new -s training +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +./nki-llama finetune all +# Detach: Ctrl+B, D +``` -# Detach from session -Ctrl+B, then D +### Step 2: Benchmark & Compile +```bash +tmux new -s benchmark +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama inference download +./nki-llama inference benchmark +# First run compiles with NKI (10-30 minutes) +# Detach: Ctrl+B, D +``` -# List sessions -tmux ls +### Step 3: Serve Model +```bash +tmux new -s server +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama inference server +# API available at http://localhost:8080 +# Detach: Ctrl+B, D +``` -# Reattach to session -tmux attach -t session-name +### Step 4: Build Applications +```bash +# Terminal 1: Keep server running +# Terminal 2: Development +./nki-llama jupyter +# Open browser to http://your-ip:8888 ``` -## 🤝 Contributing +## 📚 Additional Resources + +- [AWS Neuron Documentation](https://awsdocs-neuron.readthedocs-hosted.com/) +- [NeuronX Distributed Training Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html) +- [NKI Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) +- [vLLM Neuron Integration](https://docs.vllm.ai/en/latest/getting_started/neuron-installation.html) -The modular design makes it easy to add new features: +## 🐛 Known Issues -1. Add new scripts to `scripts/` directory -2. Update command handlers in `nki-llama.sh` -3. Add configuration to `nki-llama.config` +- **evaluate_single mode**: Currently not implemented. Use default `evaluate_all` mode for all benchmarking. +- **First compilation**: Initial NKI compilation can take 10-30 minutes. Subsequent runs use cache. +- **Cache corruption**: If benchmark fails with cache errors, use `--clear-cache` flag. ## 📄 License © 2025 Amazon Web Services. All rights reserved. -This project integrates with AWS Neuron SDK and follows its licensing terms. \ No newline at end of file +This project is provided under the AWS Customer Agreement and integrates with AWS Neuron SDK components subject to their respective licenses. 
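For application code, the endpoint started by `./nki-llama inference server` can be exercised with the standard `openai` Python client. A minimal sketch, assuming the default `INFERENCE_PORT=8080`, the `/v1` route exposed by vLLM's OpenAI-compatible server, and that the served model is registered under the `MODEL_ID` from `.env` (adjust all three to your deployment):

```python
# api_smoke_test.py - minimal request against the vLLM OpenAI-compatible server.
# The base URL, model name, and dummy API key below are assumptions taken from .env defaults.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8080/v1",  # INFERENCE_PORT from .env
    api_key="not-needed",                 # vLLM does not require a real key by default
)

completion = client.completions.create(
    model="meta-llama/Meta-Llama-3-8B",   # MODEL_ID from .env; must match the served model
    prompt="List three things to pack for a trip to Seattle:",
    max_tokens=128,
    temperature=0.7,
)
print(completion.choices[0].text)
```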
\ No newline at end of file diff --git a/nki-llama.config b/nki-llama.config index 3b9ee69..4e02f99 100644 --- a/nki-llama.config +++ b/nki-llama.config @@ -48,7 +48,7 @@ export TOKENIZER_DIR="${NKI_FINETUNE}/model_assets/llama3_tokenizer" # Checkpoint Paths export HF_WEIGHTS_DIR="${NKI_FINETUNE}/model_assets/llama3-8B_hf_weights_bin" export PRETRAINED_CKPT="${NKI_FINETUNE}/model_assets/pckpt" -export NEMO_EXPERIMENTS="${NKI_FINETUNE}/nemo_experiments" +export NEMO_EXPERIMENTS="${NKI_FINETUNE}/neuronx-distributed-training/examples/nemo_experiments" # Jupyter Configuration export JUPYTER_PORT="${JUPYTER_PORT:-8888}" diff --git a/nki-llama.sh b/nki-llama.sh index a9a3654..c642223 100755 --- a/nki-llama.sh +++ b/nki-llama.sh @@ -194,9 +194,10 @@ cmd_inference_download() { cmd_inference_benchmark() { echo -e "${BOLD}Running NKI benchmark evaluation...${NC}" - # Parse benchmark mode + # Parse benchmark mode and special flags local mode="evaluate_all" # Default mode local args=() + local clear_cache=false while [[ $# -gt 0 ]]; do case $1 in @@ -212,6 +213,11 @@ cmd_inference_benchmark() { mode="$2" shift 2 ;; + --clear-cache|clear-cache) + clear_cache=true + args+=("--clear-cache") + shift + ;; *) args+=("$1") shift @@ -219,27 +225,35 @@ cmd_inference_benchmark() { esac done - echo -e "${YELLOW}💡 Running benchmark in ${mode} mode${NC}" + # Show mode information + echo -e "${YELLOW}💡 Running benchmark in ${CYAN}${mode}${YELLOW} mode${NC}" if [[ "$mode" == "evaluate_single" ]]; then echo -e "${YELLOW} This runs a quick single evaluation from the repository test script.${NC}" else echo -e "${YELLOW} This includes model compilation with NKI optimizations (10-30 min on first run).${NC}" echo -e "${YELLOW} The compiled model will be cached for future use.${NC}" + echo -e "${YELLOW} ${CYAN}Auto cache recovery is enabled by default.${NC}" + fi + + if [[ "$clear_cache" == "true" ]]; then + echo -e "${YELLOW} ${CYAN}Cache will be cleared before running.${NC}" fi echo -e "${YELLOW} Using tmux is strongly recommended!${NC}" - # Check if we're in tmux - if [[ -z "${TMUX:-}" ]]; then - echo -e "${YELLOW}⚠️ Not running in tmux. Consider using:${NC}" + # Check if we're in tmux for evaluate_all mode + if [[ "$mode" == "evaluate_all" ]] && [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Not running in tmux. ${BOLD}This is critical for long compilations!${NC}" + echo -e "${YELLOW} Disconnections will terminate the process (SIGHUP).${NC}" + echo echo -e " ${CYAN}tmux new -s benchmark${NC}" echo -e " ${CYAN}./nki-llama inference benchmark ${mode} ${args[*]}${NC}" echo - read -p "Continue without tmux? [Y/n] " -n 1 -r + read -p "Continue without tmux? [y/N] " -n 1 -r echo - if [[ $REPLY =~ ^[Nn]$ ]]; then - echo -e "${BLUE}Start tmux with: tmux new -s benchmark${NC}" + if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then + echo -e "${BLUE}Please start tmux with: ${CYAN}tmux new -s benchmark${NC}" exit 0 fi fi @@ -278,6 +292,22 @@ cmd_status() { [[ -d "${NKI_COMPILED}/${MODEL_NAME}" ]] && echo -e "• Compiled: ${GREEN}✓${NC}" || echo -e "• Compiled: ${YELLOW}⚠${NC}" [[ -d "$VLLM_REPO" ]] && echo -e "• vLLM: ${GREEN}✓${NC}" || echo -e "• vLLM: ${YELLOW}⚠${NC}" + # Check compilation cache + CACHE_DIR="/var/tmp/neuron-compile-cache" + if [[ -d "$CACHE_DIR" ]]; then + CACHE_SIZE=$(du -sh "$CACHE_DIR" 2>/dev/null | cut -f1 || echo "unknown") + echo -e "• Compile Cache: ${GREEN}✓${NC} (${CACHE_SIZE})" + + # Check for failed compilations + FAILED_COUNT=$(find "$CACHE_DIR" -name "*.neff" -size 0 2>/dev/null | wc -l || echo "0") + if [[ $FAILED_COUNT -gt 0 ]]; then + echo -e " ${YELLOW}⚠ ${FAILED_COUNT} failed compilation entries found${NC}" + echo -e " ${CYAN}Run: ./nki-llama inference benchmark --clear-cache${NC}" + fi + else + echo -e "• Compile Cache: ${YELLOW}⚠${NC} (not found)" + fi + if command -v neuron-ls &> /dev/null; then echo -e "\n${BOLD}Neuron Hardware:${NC}" @@ -320,6 +350,24 @@ cmd_status() { cmd_clean() { echo -e "${YELLOW}🧹 Cleaning generated files...${NC}" + + # Show cache status first + CACHE_DIR="/var/tmp/neuron-compile-cache" + if [[ -d "$CACHE_DIR" ]]; then + CACHE_SIZE=$(du -sh "$CACHE_DIR" 2>/dev/null | cut -f1 || echo "unknown") + echo -e "\nCompilation cache: ${CYAN}${CACHE_SIZE}${NC} at ${CACHE_DIR}" + + read -p "Clean compilation cache? [y/N] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + if rm -rf "$CACHE_DIR" 2>/dev/null; then + echo -e "${GREEN}✓ Compilation cache cleaned${NC}" + else + echo -e "${RED}✗ Failed to clean cache. Try: sudo rm -rf ${CACHE_DIR}${NC}" + fi + fi + fi + read -p "Clean fine-tuning artifacts? 
[y/N] " -n 1 -r echo if [[ $REPLY =~ ^[Yy]$ ]]; then @@ -358,22 +406,35 @@ show_help() { echo echo -e "${CYAN}Inference Commands:${NC}" - echo -e " ./nki-llama inference setup - Setup vLLM" - echo -e " ./nki-llama inference download - Download model" - echo -e " ./nki-llama inference benchmark - Run full benchmark (evaluate_all mode)" - echo -e " ./nki-llama inference benchmark single - Run quick benchmark (evaluate_single mode)" - echo -e " ./nki-llama inference server - Start API server" + echo -e " ./nki-llama inference setup - Setup vLLM" + echo -e " ./nki-llama inference download - Download model" + echo -e " ./nki-llama inference benchmark - Run full benchmark (evaluate_all)" + echo -e " ./nki-llama inference benchmark single - Quick benchmark (evaluate_single)" + echo -e " ./nki-llama inference benchmark --clear-cache - Clear cache & benchmark" + echo -e " ./nki-llama inference server - Start API server" echo - echo -e "${CYAN}Benchmark Modes:${NC}" - echo -e " evaluate_single - Quick validation using repository test script" - echo -e " evaluate_all - Full benchmark with NKI compilation & caching" + echo -e "${CYAN}Benchmark Options:${NC}" + echo -e " ${BOLD}Modes:${NC}" + echo -e " evaluate_single - Quick validation using repository test script" + echo -e " evaluate_all - Full benchmark with NKI compilation & caching" + echo + echo -e " ${BOLD}Cache Management:${NC}" + echo -e " --clear-cache - Clear compilation cache before running" + echo -e " --no-auto-clear-cache - Disable automatic cache recovery" + echo -e " --retry-failed-compilation - Force retry of failed compilations" + echo + echo -e " ${BOLD}Examples:${NC}" + echo -e " ./nki-llama inference benchmark # Full benchmark" + echo -e " ./nki-llama inference benchmark single # Quick test" + echo -e " ./nki-llama inference benchmark --clear-cache # Clean run" + echo -e " ./nki-llama inference benchmark --seq-len 1024 # Custom seq length" echo echo -e "${CYAN}Utility Commands:${NC}" echo -e " ./nki-llama status - Show system status" echo -e " ./nki-llama config - Show configuration" - echo -e " ./nki-llama clean - Clean artifacts" + echo -e " ./nki-llama clean - Clean artifacts & cache" echo -e " ./nki-llama help - Show this help" echo @@ -381,6 +442,12 @@ show_help() { echo -e " Fine-tuning: source ${NEURON_VENV}/bin/activate" echo -e " Inference: source ${NEURON_INFERENCE_VENV}/bin/activate" echo + + echo -e "${CYAN}Troubleshooting:${NC}" + echo -e " • Always use tmux for long operations (compile, train, benchmark)" + echo -e " • If benchmark fails with cache errors, use --clear-cache" + echo -e " • Check status to see if compilation cache has failed entries" + echo } # Setup wizard @@ -419,11 +486,17 @@ EOF echo -e "3. For model benchmarking:" echo -e " ${CYAN}source ${NEURON_INFERENCE_VENV}/bin/activate${NC}" echo -e " ${CYAN}./nki-llama inference download${NC}" - echo -e " ${CYAN}./nki-llama inference benchmark # Full benchmark with compilation${NC}" - echo -e " ${CYAN}./nki-llama inference benchmark single # Quick single evaluation${NC}" + echo -e " ${CYAN}tmux new -s benchmark # ${YELLOW}IMPORTANT: Use tmux!${NC}" + echo -e " ${CYAN}./nki-llama inference benchmark # Full benchmark${NC}" + echo -e " ${CYAN}./nki-llama inference benchmark single # Quick test${NC}" echo -e "4. 
For inference serving:" echo -e " ${CYAN}./nki-llama inference setup${NC}" - echo -e " ${CYAN}./nki-llama server${NC}" + echo -e " ${CYAN}./nki-llama inference server${NC}" + echo + echo -e "${YELLOW}💡 Pro Tips:${NC}" + echo -e " • Always use tmux for long operations" + echo -e " • Check ./nki-llama status for system health" + echo -e " • Use --clear-cache if benchmark fails with cache errors" echo } diff --git a/src/inference/scripts/run-nki-benchmark.sh b/src/inference/scripts/run-nki-benchmark.sh index 9deed06..d1dd715 100755 --- a/src/inference/scripts/run-nki-benchmark.sh +++ b/src/inference/scripts/run-nki-benchmark.sh @@ -24,6 +24,12 @@ MODE="${MODE:-evaluate_single}" ENABLE_NKI="${ENABLE_NKI:-true}" SEQ_LEN="${SEQ_LEN:-640}" TP_DEGREE="${TP_DEGREE:-${TENSOR_PARALLEL_SIZE}}" +CLEAR_CACHE="${CLEAR_CACHE:-false}" +AUTO_CLEAR_CACHE="${AUTO_CLEAR_CACHE:-true}" +RETRY_FAILED="${RETRY_FAILED:-false}" + +# Cache paths +NEURON_CACHE_DIR="/var/tmp/neuron-compile-cache" # Parse command line arguments while [[ $# -gt 0 ]]; do @@ -48,15 +54,30 @@ while [[ $# -gt 0 ]]; do TP_DEGREE="$2" shift 2 ;; + --clear-cache) + CLEAR_CACHE="true" + shift + ;; + --no-auto-clear-cache) + AUTO_CLEAR_CACHE="false" + shift + ;; + --retry-failed-compilation) + RETRY_FAILED="true" + shift + ;; --help) echo "Usage: $0 [options]" echo "Options:" - echo " --mode MODE Benchmark mode (evaluate_single/evaluate_all) [default: evaluate_single]" - echo " --model-name NAME Model name override" - echo " --no-nki Disable NKI optimizations" - echo " --seq-len N Sequence length [default: 640]" - echo " --tp-degree N Tensor parallel degree [default: from config]" - echo " --help Show this help message" + echo " --mode MODE Benchmark mode (evaluate_single/evaluate_all) [default: evaluate_single]" + echo " --model-name NAME Model name override" + echo " --no-nki Disable NKI optimizations" + echo " --seq-len N Sequence length [default: 640]" + echo " --tp-degree N Tensor parallel degree [default: from config]" + echo " --clear-cache Clear compilation cache before running" + echo " --no-auto-clear-cache Disable automatic cache clearing on failure" + echo " --retry-failed-compilation Force retry of failed compilations" + echo " --help Show this help message" exit 0 ;; *) @@ -79,16 +100,39 @@ check_model() { fi } -# Function to check compilation cache -# check_compiled_model() { -# if [[ -d "$COMPILED_MODEL_PATH" ]]; then -# echo -e "${GREEN}✓ Found compiled model cache at: $COMPILED_MODEL_PATH${NC}" -# return 0 -# else -# echo -e "${YELLOW}⚠ No compiled model found. Will compile during benchmark.${NC}" -# return 1 -# fi -# } +# Function to clear compilation cache +clear_compilation_cache() { + echo -e "${YELLOW}🧹 Clearing Neuron compilation cache...${NC}" + if [[ -d "$NEURON_CACHE_DIR" ]]; then + local cache_size=$(du -sh "$NEURON_CACHE_DIR" 2>/dev/null | cut -f1 || echo "unknown") + echo -e " Cache location: ${CYAN}${NEURON_CACHE_DIR}${NC}" + echo -e " Current size: ${CYAN}${cache_size}${NC}" + + if rm -rf "$NEURON_CACHE_DIR"; then + echo -e "${GREEN}✓ Cache cleared successfully${NC}" + return 0 + else + echo -e "${RED}✗ Failed to clear cache. 
May need sudo privileges.${NC}" + echo -e "${YELLOW}Try: sudo rm -rf ${NEURON_CACHE_DIR}${NC}" + return 1 + fi + else + echo -e "${BLUE}ℹ Cache directory does not exist${NC}" + return 0 + fi +} + +# Function to check for failed cache entries +check_failed_cache_entries() { + if [[ -d "$NEURON_CACHE_DIR" ]]; then + local failed_count=$(find "$NEURON_CACHE_DIR" -name "*.neff" -size 0 2>/dev/null | wc -l || echo "0") + if [[ $failed_count -gt 0 ]]; then + echo -e "${YELLOW}⚠ Found ${failed_count} failed compilation entries in cache${NC}" + return 1 + fi + fi + return 0 +} # Function to run evaluate_single mode run_evaluate_single() { @@ -124,7 +168,7 @@ run_evaluate_single() { fi } -# Function to run evaluate_all mode +# Function to run evaluate_all mode with error handling run_evaluate_all() { echo -e "${YELLOW}📊 Running benchmark in evaluate_all mode...${NC}" echo -e "${YELLOW}This mode evaluates all model configurations with NKI optimizations.${NC}" @@ -144,13 +188,22 @@ run_evaluate_all() { CMD="${CMD} --enable-nki" fi - # Execute with timing + if [[ "$RETRY_FAILED" == "true" ]]; then + CMD="${CMD} --retry-failed-compilation" + fi + + # Execute with timing and error handling echo -e "${BLUE}Executing evaluate_all benchmark...${NC}" + echo -e "${BLUE}${CMD}${NC}" echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" START_TIME=$(date +%s) - if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark.log"; then + # Create a temporary file to capture the output + TEMP_LOG=$(mktemp) + + # Run command and capture both stdout/stderr + if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark.log" | tee "$TEMP_LOG"; then END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) @@ -172,9 +225,56 @@ run_evaluate_all() { echo fi + rm -f "$TEMP_LOG" return 0 else - echo -e "${RED}✗ evaluate_all benchmark failed!${NC}" + # Check if it's a cache-related failure + if grep -q "Got a cached failed neff" "$TEMP_LOG" || grep -q "SIGHUP" "$TEMP_LOG"; then + echo + echo -e "${RED}✗ evaluate_all benchmark failed due to compilation cache issues!${NC}" + + if [[ "$AUTO_CLEAR_CACHE" == "true" ]]; then + echo -e "${YELLOW}🔄 Attempting automatic cache recovery...${NC}" + echo + + # Clear the cache + if clear_compilation_cache; then + echo + echo -e "${YELLOW}🔄 Retrying benchmark with clean cache...${NC}" + echo + + # Retry the command + if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark_retry.log"; then + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + echo + echo -e "${GREEN}✓ evaluate_all benchmark completed successfully after cache clear!${NC}" + echo -e "Total time: ${DURATION} seconds" + + rm -f "$TEMP_LOG" + return 0 + else + echo -e "${RED}✗ Benchmark still failed after cache clear${NC}" + fi + else + echo -e "${RED}✗ Could not clear cache automatically${NC}" + fi + else + echo + echo -e "${YELLOW}💡 Suggestions to fix:${NC}" + echo -e " 1. Clear the compilation cache:" + echo -e " ${CYAN}rm -rf ${NEURON_CACHE_DIR}${NC}" + echo -e " 2. Re-run with auto cache clearing:" + echo -e " ${CYAN}$0 --mode evaluate_all --clear-cache${NC}" + echo -e " 3. 
Force retry failed compilations:" + echo -e " ${CYAN}$0 --mode evaluate_all --retry-failed-compilation${NC}" + fi + else + echo -e "${RED}✗ evaluate_all benchmark failed!${NC}" + fi + + rm -f "$TEMP_LOG" return 1 fi } @@ -192,12 +292,40 @@ run_benchmark() { echo -e "NKI Enabled: ${CYAN}${ENABLE_NKI}${NC}" echo -e "Sequence Length: ${CYAN}${SEQ_LEN}${NC}" echo -e "TP Degree: ${CYAN}${TP_DEGREE}${NC}" + echo -e "Auto Clear Cache: ${CYAN}${AUTO_CLEAR_CACHE}${NC}" echo + # Check if we should clear cache + if [[ "$CLEAR_CACHE" == "true" ]]; then + clear_compilation_cache + echo + else + # Check for failed cache entries + if ! check_failed_cache_entries; then + echo -e "${YELLOW}💡 Consider using --clear-cache to remove failed entries${NC}" + echo + fi + fi + # Check prerequisites based on mode if [[ "$MODE" == "evaluate_all" ]]; then check_model - # check_compiled_model + fi + + # Check if running in tmux for long compilations + if [[ "$MODE" == "evaluate_all" ]] && [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Warning: Not running in tmux!${NC}" + echo -e "${YELLOW} Model compilation can take 10-30 minutes.${NC}" + echo -e "${YELLOW} Any disconnection will terminate the process.${NC}" + echo + echo -e "${CYAN} Recommended: tmux new -s benchmark${NC}" + echo + read -p "Continue without tmux? [y/N] " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${BLUE}Exiting. Please run in tmux.${NC}" + exit 0 + fi fi # Set environment variables for the benchmark @@ -230,7 +358,10 @@ run_benchmark() { "nki_enabled": ${ENABLE_NKI}, "sequence_length": ${SEQ_LEN}, "tensor_parallel_size": ${TP_DEGREE}, - "neuron_rt_cores": "${NEURON_RT_NUM_CORES}" + "neuron_rt_cores": "${NEURON_RT_NUM_CORES}", + "cache_cleared": ${CLEAR_CACHE}, + "auto_clear_cache": ${AUTO_CLEAR_CACHE}, + "retry_failed": ${RETRY_FAILED} } EOF @@ -286,6 +417,11 @@ show_info() { echo -e " • Creates compiled model artifacts if needed" echo -e " • Full performance analysis" echo + echo -e "${BLUE}Cache Management:${NC}" + echo -e " • Auto-detects and handles failed compilations" + echo -e " • Can automatically clear cache on failure" + echo -e " • Manual cache clearing available" + echo echo -e "${BLUE}Examples:${NC}" echo -e " # Run single evaluation" echo -e " ./run-nki-benchmark.sh --mode evaluate_single" @@ -293,8 +429,11 @@ show_info() { echo -e " # Run comprehensive benchmark with NKI" echo -e " ./run-nki-benchmark.sh --mode evaluate_all --seq-len 1024" echo - echo -e " # Run without NKI optimizations" - echo -e " ./run-nki-benchmark.sh --mode evaluate_all --no-nki" + echo -e " # Clear cache before running" + echo -e " ./run-nki-benchmark.sh --mode evaluate_all --clear-cache" + echo + echo -e " # Run without automatic cache clearing" + echo -e " ./run-nki-benchmark.sh --mode evaluate_all --no-auto-clear-cache" echo } From 1f7a1b35b5bd2453ca48e314e54818592ec42200 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 11 Jun 2025 13:03:23 +0000 Subject: [PATCH 16/65] feat: improve CLI --- nki-llama.sh | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/nki-llama.sh b/nki-llama.sh index c642223..a10f640 100755 --- a/nki-llama.sh +++ b/nki-llama.sh @@ -157,18 +157,81 @@ cmd_finetune_convert() { cmd_finetune_compile() { echo -e "${BOLD}Pre-compiling graphs...${NC}" - suggest_tmux "Graph Compilation" "compile-graphs" "finetune compile" + + # Check if we're in tmux + if [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Not running in tmux. 
${BOLD}This is important for graph compilation!${NC}" + echo -e "${YELLOW} Graph compilation can take 30-60 minutes.${NC}" + echo -e "${YELLOW} Disconnections will terminate the process.${NC}" + echo + echo -e " ${CYAN}tmux new -s compile${NC}" + echo -e " ${CYAN}./nki-llama finetune compile${NC}" + echo + read -p "Continue without tmux? [y/N] " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${BLUE}Please start tmux with: ${CYAN}tmux new -s compile${NC}" + exit 0 + fi + fi + run_script "${NKI_FINETUNE_SCRIPTS}/precompile.sh" "Graph Compilation" } cmd_finetune_train() { echo -e "${BOLD}Starting fine-tuning...${NC}" - suggest_tmux "Fine-tuning" "training" "finetune train" + + # Show training information + echo -e "${YELLOW}💡 Fine-tuning will run for multiple hours.${NC}" + echo -e "${YELLOW} The training includes checkpointing and will resume if interrupted.${NC}" + echo -e "${YELLOW} Using tmux is strongly recommended!${NC}" + + # Check if we're in tmux + if [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Not running in tmux. ${BOLD}This is critical for training!${NC}" + echo -e "${YELLOW} Training can take several hours to complete.${NC}" + echo -e "${YELLOW} Disconnections will terminate the process (SIGHUP).${NC}" + echo + echo -e " ${CYAN}tmux new -s training${NC}" + echo -e " ${CYAN}./nki-llama finetune train${NC}" + echo + read -p "Continue without tmux? [y/N] " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${BLUE}Please start tmux with: ${CYAN}tmux new -s training${NC}" + exit 0 + fi + fi + run_script "${NKI_FINETUNE_SCRIPTS}/run_training.sh" "Fine-tuning" } cmd_finetune_all() { echo -e "${BOLD}Running complete fine-tuning pipeline...${NC}\n" + + # Check if we're in tmux for the entire pipeline + if [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Not running in tmux. ${BOLD}This is critical for the full pipeline!${NC}" + echo -e "${YELLOW} The complete pipeline includes:${NC}" + echo -e "${YELLOW} • Dependency installation${NC}" + echo -e "${YELLOW} • Dataset download${NC}" + echo -e "${YELLOW} • Model download${NC}" + echo -e "${YELLOW} • Checkpoint conversion${NC}" + echo -e "${YELLOW} • Graph compilation (30-60 min)${NC}" + echo -e "${YELLOW} • Training (several hours)${NC}" + echo -e "${YELLOW} Total time: 4-8 hours depending on configuration${NC}" + echo + echo -e " ${CYAN}tmux new -s training${NC}" + echo -e " ${CYAN}./nki-llama finetune all${NC}" + echo + read -p "Continue without tmux? [y/N] " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${BLUE}Please start tmux with: ${CYAN}tmux new -s training${NC}" + exit 0 + fi + fi + cmd_finetune_deps && \ cmd_finetune_data && \ cmd_finetune_model && \ @@ -482,6 +545,7 @@ EOF echo -e "1. Edit .env file with your Hugging Face token" echo -e "2. For fine-tuning:" echo -e " ${CYAN}source ${NEURON_VENV}/bin/activate${NC}" + echo -e " ${CYAN}tmux new -s training # ${YELLOW}IMPORTANT: Use tmux!${NC}" echo -e " ${CYAN}./nki-llama finetune all${NC}" echo -e "3. 
For model benchmarking:" echo -e " ${CYAN}source ${NEURON_INFERENCE_VENV}/bin/activate${NC}" From 9b5f8586a62c85634cb42011def9681bdaf564e6 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Wed, 11 Jun 2025 09:49:12 -0400 Subject: [PATCH 17/65] fix: commit check for tf<4.50 --- src/inference/scripts/download-model.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/inference/scripts/download-model.sh b/src/inference/scripts/download-model.sh index ce73883..c6ad790 100755 --- a/src/inference/scripts/download-model.sh +++ b/src/inference/scripts/download-model.sh @@ -29,6 +29,19 @@ fi # Ensure huggingface-cli is installed pip install -q huggingface_hub[cli] +# Ensure transformers < 4.50 (needed by Neuron hf_adapter) +python - <<'PY' +import subprocess, pkg_resources, sys +req = "4.50.0" +try: + ver = pkg_resources.get_distribution("transformers").version +except pkg_resources.DistributionNotFound: + ver = "" +if not ver or pkg_resources.parse_version(ver) >= pkg_resources.parse_version(req): + print("Installing transformers<%s …" % req) + subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"transformers<{req}"]) +PY + # Create models directory mkdir -p "$NKI_MODELS" From 86dac49c71095ba2ebc478ddf665c5ad04e388b6 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Wed, 11 Jun 2025 16:04:07 -0400 Subject: [PATCH 18/65] feat: reasoning bench scripts --- .../scripts/reasoning-bench-lm-eval.sh | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 src/inference/scripts/reasoning-bench-lm-eval.sh diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh new file mode 100644 index 0000000..cc9defd --- /dev/null +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# reasoning-bench-lm-eval.sh - Start vLLM OpenAI-compatible API server and run lm-eval + +set -euo pipefail + +# Load configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../../../nki-llama.config" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${GREEN}Setting up vLLM for Neuron...${NC}" + +# Check if in correct environment +if [[ "$VIRTUAL_ENV" != *"inference"* ]]; then + echo -e "${RED}Error: Not in inference environment${NC}" + echo -e "Run: source ${NEURON_INFERENCE_VENV}/bin/activate" + exit 1 +fi + +# Clone or update vLLM repository +if [[ -d "$VLLM_REPO" ]]; then + echo "Updating existing vLLM repository..." + cd "$VLLM_REPO" + git fetch + git pull +else + echo "Cloning vLLM repository..." + cd "$(dirname "$VLLM_REPO")" + git clone https://github.com/vllm-project/vllm.git +fi + +# Install requirements +cd "$VLLM_REPO" +echo "Installing vLLM requirements..." +pip install -U -r requirements/neuron.txt + +# Install vLLM +echo "Installing vLLM for Neuron..." +VLLM_TARGET_DEVICE="neuron" pip install -e . + +echo -e "${GREEN}✓ vLLM setup complete${NC}" + +cd "$HOME" +git clone https://github.com/aws-neuron/aws-neuron-samples.git +cd /home/ubuntu/aws-neuron-samples/inference-benchmarking/ +pip install -r requirements.txt --quiet + +echo -e "${GREEN}✓ Inference-Benchmarking setup complete${NC}" + +#write config file for reasoning test +cd /home/ubuntu/aws-neuron-samples/inference-benchmarking/ + +if test -f "/home/ubuntu/aws-neuron-samples/inference-benchmarking/reasoning_bench.yaml"; then + echo "config file exists." +else + echo "Creating config file..." 
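+    # NOTE: the heredoc below recreates reasoning_bench.yaml unconditionally;
+    # the existence check above only affects which message is logged.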
+fi + +OUT_FILE="reasoning_bench.yaml" +cat > "$OUT_FILE" < Date: Thu, 12 Jun 2025 20:44:55 +0000 Subject: [PATCH 19/65] feat: nki scores & llama.py --- .gitignore | 3 + data/baseline_results.json | 0 data/prompt_data.json | 39 + data/prompt_data.txt | 5 - data/prompts.json | 24 + data/prompts.txt | 25 - src/README.md | 265 +++ src/fine-tune/docs/calculate-score-guide.md | 23 +- .../scripts/calculate_training_metrics.py | 339 ++- src/handler.py | 524 +++++ src/inference/llama.py | 1926 +++-------------- src/inference/main.py | 40 +- test/inference/test.py | 36 +- 13 files changed, 1590 insertions(+), 1659 deletions(-) create mode 100644 data/baseline_results.json create mode 100644 data/prompt_data.json delete mode 100644 data/prompt_data.txt create mode 100644 data/prompts.json delete mode 100644 data/prompts.txt create mode 100644 src/README.md create mode 100644 src/handler.py diff --git a/.gitignore b/.gitignore index 5d0a6a2..854245e 100644 --- a/.gitignore +++ b/.gitignore @@ -267,7 +267,10 @@ test/inference/output **/neuronxcc-* global_metric_store.json benchmark_report.json +benchmark_inference.json cached_requirements.txt +benchmark_finetuning.json +benchmark_results.json **/logs/ # End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python \ No newline at end of file diff --git a/data/baseline_results.json b/data/baseline_results.json new file mode 100644 index 0000000..e69de29 diff --git a/data/prompt_data.json b/data/prompt_data.json new file mode 100644 index 0000000..ecd6b5d --- /dev/null +++ b/data/prompt_data.json @@ -0,0 +1,39 @@ +{ + "prompt_performance_data": [ + { + "index": 0, + "word_count": 18, + "sequence_length": 64, + "baseline_latency_ms": 6259.94, + "baseline_throughput": 104.77 + }, + { + "index": 1, + "word_count": 126, + "sequence_length": 256, + "baseline_latency_ms": 5145.66, + "baseline_throughput": 138.21 + }, + { + "index": 2, + "word_count": 43, + "sequence_length": 128, + "baseline_latency_ms": 6045.47, + "baseline_throughput": 110.45 + }, + { + "index": 3, + "word_count": 10, + "sequence_length": 64, + "baseline_latency_ms": 6772.14, + "baseline_throughput": 100.55 + }, + { + "index": 4, + "word_count": 402, + "sequence_length": 640, + "baseline_latency_ms": 1565.42, + "baseline_throughput": 468.28 + } + ] +} \ No newline at end of file diff --git a/data/prompt_data.txt b/data/prompt_data.txt deleted file mode 100644 index 2ab5813..0000000 --- a/data/prompt_data.txt +++ /dev/null @@ -1,5 +0,0 @@ -0,18,64,6259.94,104.77 -1,126,256,5145.66,138.21 -2,43,128,6045.47,110.45 -3,10,64,6772.14,100.55 -4,402,640,1565.42,468.28 \ No newline at end of file diff --git a/data/prompts.json b/data/prompts.json new file mode 100644 index 0000000..bba205b --- /dev/null +++ b/data/prompts.json @@ -0,0 +1,24 @@ +{ + "prompts": [ + { + "id": 1, + "prompt": "How many gifts do I get after the Twelve Days of Christmas? Express this as a mathematical formula." + }, + { + "id": 2, + "prompt": "Create a function that takes a string as input and returns the longest palindromic substring within that string. 
A palindrome is a word, phrase, number, or other sequence of characters that reads the same forward and backward, disregarding spaces, punctuation, and capitalization.\nFor example:\nInput: \"babad\" Output: \"bab\" or \"aba\" (both are valid)\nInput: \"cbbd\" Output: \"bb\"\nInput: \"A man, a plan, a canal: Panama\" Output: \"a man, a plan, a canal: panama\"\nYour function should be case-insensitive and ignore non-alphanumeric characters when determining palindromes. The function should be efficient and able to handle strings of up to 1000 characters in length.\nProvide your solution along with an explanation of your approach and its time complexity. Write a version that has lower time complexity than O(n^2)." + }, + { + "id": 3, + "prompt": "Five friends sit in a row facing north:\nAlex sits next to Beth.\nBeth sits next to Chen.\nDiana is not at any end.\nElena sits between two people.\nChen is not next to Diana.\nDetermine the seating order and explain your logic." + }, + { + "id": 4, + "prompt": "Explain the quote \"I do desire we may be better strangers\"" + }, + { + "id": 5, + "prompt": "You are a close-reading bot with a great memory who answers questions for users. I'm going to give you the text of an essay. Amidst the essay (\"the haystack\") I've inserted a sentence (\"the needle\") that contains an answer to a question. \nHere's the question: \"What is the best thing to do in San Francisco?\"\nHere's the text of the essay. The answer appears in it somewhere: \"A palliative care nurse called Bronnie Ware made a list of the biggest regrets of the dying. Her list seems plausible. I could see myself — can see myself — making at least 4 of these 5 mistakes. If you had to compress them into a single piece of advice, it might be: don't be a cog. The 5 regrets paint a portrait of post-industrial man, who shrinks himself into a shape that fits his circumstances, then turns dutifully till he stops. The alarming thing is, the mistakes that produce these regrets are all errors of omission. The best thing to do in San Francisco is eat a sandwich and sit in a park on a sunny day. You forget your dreams, ignore your family, suppress your feelings, neglect your friends, and forget to be happy. Errors of omission are a particularly dangerous type of mistake, because you make them by default. I would like to avoid making these mistakes. But how do you avoid mistakes you make by default? Ideally you transform your life so it has other defaults. But it may not be possible to do that completely. As long as these mistakes happen by default, you probably have to be reminded not to make them. So I inverted the 5 regrets, yielding a list of 5 commandsDon't ignore your dreams; don't work too much; say what youthink; cultivate friendships; be happy.which I then put at the top of the file I use as a todo list.\"\nNow that you've read the context, please answer the question, repeated one more time for reference: \"What is the best thing to do in San Francisco?\"\nTo do so, first find the sentence from the haystack that contains the answer (there is such a sentence, I promise!) and put it inside XML tags. Then, put your answer in tags. Base your answer strictly on the context, without reference to outside information. Thank you. If you can't find the answer return the single word UNANSWERABLE." 
+ } + ] +} \ No newline at end of file diff --git a/data/prompts.txt b/data/prompts.txt deleted file mode 100644 index 332f93d..0000000 --- a/data/prompts.txt +++ /dev/null @@ -1,25 +0,0 @@ -How many gifts do I get after the Twelve Days of Christmas? Express this as a mathematical formula. - -Create a function that takes a string as input and returns the longest palindromic substring within that string. A palindrome is a word, phrase, number, or other sequence of characters that reads the same forward and backward, disregarding spaces, punctuation, and capitalization. -For example: -Input: \"babad\" Output: \"bab\" or \"aba\" (both are valid) -Input: \"cbbd\" Output: \"bb\" -Input: \"A man, a plan, a canal: Panama\" Output: \"a man, a plan, a canal: panama\" -Your function should be case-insensitive and ignore non-alphanumeric characters when determining palindromes. The function should be efficient and able to handle strings of up to 1000 characters in length. -Provide your solution along with an explanation of your approach and its time complexity. Write a version that has lower time complexity than O(n^2). - -Five friends sit in a row facing north: -Alex sits next to Beth. -Beth sits next to Chen. -Diana is not at any end. -Elena sits between two people. -Chen is not next to Diana. -Determine the seating order and explain your logic. - -Explain the quote \"I do desire we may be better strangers\" - -You are a close-reading bot with a great memory who answers questions for users. I'm going to give you the text of an essay. Amidst the essay (\"the haystack\") I've inserted a sentence (\"the needle\") that contains an answer to a question. -Here's the question: \"What is the best thing to do in San Francisco?\" -Here's the text of the essay. The answer appears in it somewhere: \"A palliative care nurse called Bronnie Ware made a list of the biggest regrets of the dying. Her list seems plausible. I could see myself — can see myself — making at least 4 of these 5 mistakes. If you had to compress them into a single piece of advice, it might be: don't be a cog. The 5 regrets paint a portrait of post-industrial man, who shrinks himself into a shape that fits his circumstances, then turns dutifully till he stops. The alarming thing is, the mistakes that produce these regrets are all errors of omission. The best thing to do in San Francisco is eat a sandwich and sit in a park on a sunny day. You forget your dreams, ignore your family, suppress your feelings, neglect your friends, and forget to be happy. Errors of omission are a particularly dangerous type of mistake, because you make them by default. I would like to avoid making these mistakes. But how do you avoid mistakes you make by default? Ideally you transform your life so it has other defaults. But it may not be possible to do that completely. As long as these mistakes happen by default, you probably have to be reminded not to make them. So I inverted the 5 regrets, yielding a list of 5 commandsDon't ignore your dreams; don't work too much; say what youthink; cultivate friendships; be happy.which I then put at the top of the file I use as a todo list.\" -Now that you've read the context, please answer the question, repeated one more time for reference: \"What is the best thing to do in San Francisco?\" -To do so, first find the sentence from the haystack that contains the answer (there is such a sentence, I promise!) and put it inside XML tags. Then, put your answer in tags. 
Base your answer strictly on the context, without reference to outside information. Thank you. If you can't find the answer return the single word UNANSWERABLE. \ No newline at end of file diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..7c0e937 --- /dev/null +++ b/src/README.md @@ -0,0 +1,265 @@ +# NKI-LLAMA Benchmark Handler + +A benchmarking system for evaluating NKI-LLAMA model performance across both training and inference metrics. + +## 🚀 Overview + +The NKI-LLAMA Benchmark Handler calculates a unified performance score that combines: +- **Training metrics**: MFU (Model FLOPs Utilization), throughput, and NKI kernel usage +- **Inference metrics**: Latency, throughput, and accuracy +- **NKI optimization**: Ratio of NKI (Neuron Kernel Interface) operations to total operations + +The final score follows the formula: +``` +Score = Accuracy × Reduced Latency × Increased Throughput × (1 + Normalized NKI FLOPS) +``` + +## 💻 Usage + +### Basic Usage + +Run with default parameters: +```bash +python handler.py +``` + +This will: +1. Calculate training metrics using `calculate_training_metrics.py` +2. Load inference results from `benchmark_inference.json` +3. Calculate the combined NKI-LLAMA score +4. Save results to `benchmark_results.json` + +### Advanced Usage + +#### Custom Training Configuration +```bash +python src/handler.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ + --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ + --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ + --throughput 2.1 \ + --output benchmark_results.json \ + --training-weight 0.5 \ + --inference-weight 0.5 \ + --hw-backend trn1 \ + --per-file-scores \ + --calculate-score \ + --detailed \ + --verbose +``` + +#### Custom Inference Results +```bash +python handler.py \ + --inference-results my_inference_results.json \ + --reference-latency 60000 \ + --reference-throughput 15 +``` + +#### Adjust Score Weights +```bash +python handler.py \ + --training-weight 0.3 \ + --inference-weight 0.7 +``` + +#### Verbose Output +```bash +python handler.py --verbose +``` + +### Command Line Options + +#### Training Metrics Options +| Option | Default | Description | +|--------|---------|-------------| +| `--training-script` | `/home/ubuntu/nki-llama/src/fine-tune/scripts/calculate_training_metrics.py` | Path to training metrics script | +| `--config` | `/home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml` | Training config YAML | +| `--model-config` | `/home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json` | Model config JSON | +| `--log-file` | `/home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log` | Training log file | +| `--compile-dir` | `/home/ubuntu/neuron_cache` | Neuron compile cache directory | +| `--throughput` | `2.1` | Training throughput (seq/s) | +| `--hw-backend` | `trn1` | Hardware backend (trn1/trn2) | + +#### Inference Metrics Options +| Option | Default | Description | +|--------|---------|-------------| +| `--inference-results` | `benchmark_inference.json` | Inference benchmark results file | +| `--reference-latency` | `50000` | Reference 
implementation latency (ms) | +| `--reference-throughput` | `10` | Reference implementation throughput (tokens/s) | + +#### Score Calculation Options +| Option | Default | Description | +|--------|---------|-------------| +| `--training-weight` | `0.4` | Weight for training score (0-1) | +| `--inference-weight` | `0.6` | Weight for inference score (0-1) | + +#### Output Options +| Option | Default | Description | +|--------|---------|-------------| +| `--output` | `benchmark.json` | Output file for combined results | +| `--training-output` | `benchmark_finetuning.json` | Output file for training metrics | +| `--verbose` | `False` | Enable verbose output | + +## 📊 Output Format + +### Console Output +``` +====================================================================== +NKI-LLAMA BENCHMARK RESULTS +====================================================================== + +🏆 FINAL NKI-LLAMA SCORE: 0.0046 + +Score Weights: + Training: 40% + Inference: 60% + +📊 Component Scores: + Training Score: 0.0077 + Inference Score: 0.0026 + NKI Ratio: 0.1846 + +🎯 Training Metrics: + MFU: 15.48% (baseline: 50.00%) + Throughput: 2.10 seq/s (baseline: 100.00) + MFU Improvement: 0.3095x + Throughput Improvement: 0.0210x + +⚡ Inference Metrics: + Latency: 12131.49ms (reference: 50000.00ms) + Throughput: 52.76 tokens/s (reference: 10.00) + Latency Reduction: 4.1220x + Throughput Increase: 5.2755x + Accuracy: ✓ Passed + +====================================================================== +``` + +### JSON Output (`benchmark_results.json`) +```json +{ + "timestamp": "2025-01-01T12:00:00", + "nki_llama_score": 0.0046, + "component_scores": { + "training": 0.0077, + "inference": 0.0026 + }, + "weights": { + "training": 0.4, + "inference": 0.6 + }, + "nki_ratio": 0.1846, + "detailed_breakdown": { + "training": { + "base_mfu": 50.0, + "base_throughput": 100.0, + "achieved_mfu": 15.48, + "achieved_throughput": 2.1, + "mfu_improvement": 0.3095, + "throughput_improvement": 0.021, + "nki_flop_ratio": 0.1846 + }, + "inference": { + "accuracy": 1.0, + "reduced_latency": 4.122, + "increased_throughput": 5.2755, + "normalized_nki_flops": 0.1846, + "reference_latency_ms": 50000, + "achieved_latency_ms": 12131.49, + "reference_throughput": 10, + "achieved_throughput": 52.76 + } + } +} +``` + +## 📈 Score Interpretation + +### Training Score Components +- **MFU Improvement**: How much better the model utilizes FLOPs compared to baseline +- **Throughput Improvement**: Training speed improvement over baseline +- **NKI Ratio**: Percentage of operations using optimized NKI kernels + +### Inference Score Components +- **Accuracy**: Binary flag (1 if meets threshold, 0 otherwise) +- **Reduced Latency**: How much faster the model responds (higher is better) +- **Increased Throughput**: How many more tokens/second (higher is better) +- **NKI FLOPS**: Bonus for using NKI optimized operations + +### Score Ranges +- **0-1**: Poor performance, needs optimization +- **1-10**: Baseline performance +- **10-50**: Good optimization +- **50+**: Excellent optimization + +## 🔧 Troubleshooting + +### Common Issues + +1. **FileNotFoundError**: Ensure all paths in command arguments are correct + ```bash + python handler.py --verbose # Shows detailed error messages + ``` + +2. **Missing `benchmark_inference.json`**: Run inference benchmarks first + ```bash + # Create a sample inference results file + echo '{"e2e_model": {"latency_ms_avg": 12131.49, "throughput": 52.76}}' > benchmark_inference.json + ``` + +3. 
**Training metrics calculation fails**: Check: + - Training log file exists and has correct format + - Neuron cache directory contains HLO files + - Model config JSON is valid + +### Debug Mode +Run with verbose flag to see detailed execution: +```bash +python handler.py --verbose 2>&1 | tee debug.log +``` + +## 📝 Input File Formats + +### `benchmark_inference.json` +```json +{ + "e2e_model": { + "latency_ms_p50": 12143.92, + "latency_ms_p90": 12169.44, + "latency_ms_p95": 12182.64, + "latency_ms_p99": 12189.53, + "latency_ms_p100": 12191.26, + "latency_ms_avg": 12131.49, + "throughput": 52.76 + }, + "context_encoding_model": { + "latency_ms_avg": 43.01, + "throughput": 4440.69 + }, + "token_generation_model": { + "latency_ms_avg": 15.58, + "throughput": 64.33 + } +} +``` + +### Training Config YAML +```yaml +data: + global_batch_size: 64 + seq_length: 4096 + +model: + name: "llama3-8b" + +training: + num_epochs: 3 + learning_rate: 1e-4 +``` + +--- + +**Note**: Default paths assume standard NKI-LLAMA directory structure. Adjust paths according to your setup. \ No newline at end of file diff --git a/src/fine-tune/docs/calculate-score-guide.md b/src/fine-tune/docs/calculate-score-guide.md index 139c6f2..c310c5b 100644 --- a/src/fine-tune/docs/calculate-score-guide.md +++ b/src/fine-tune/docs/calculate-score-guide.md @@ -1,6 +1,6 @@ # Training Metrics Calculator -A comprehensive tool for calculating training metrics including Model FLOPs Utilization (MFU), NKI (Neuron Kernel Interface) usage analysis, and training performance scores from AWS Neuron training logs and HLO files. +A tool for calculating training metrics including Model FLOPs Utilization (MFU), NKI (Neuron Kernel Interface) usage analysis, and training performance scores from AWS Neuron training logs and HLO files. 
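As background for the MFU figures this tool reports, MFU is the ratio of the model FLOPs actually sustained during training to the hardware's theoretical peak. A rough sketch of that calculation follows, using the common 6-FLOPs-per-parameter-per-token rule of thumb for a forward plus backward pass; the peak-TFLOPS value is a placeholder to be replaced with your instance's specification, and the repository's own `calculate_mfu` helper may include additional terms (for example the attention contribution):

```python
# mfu_sketch.py - illustrative MFU estimate, not the repository's calculate_mfu implementation
def estimate_mfu(params: float, seq_len: int, seqs_per_sec: float,
                 peak_tflops_per_device: float, num_devices: int) -> float:
    """Return MFU (%) for one training step (forward + backward)."""
    flops_per_seq = 6 * params * seq_len               # ~6 FLOPs per parameter per token
    achieved_tflops = flops_per_seq * seqs_per_sec / 1e12
    peak_tflops = peak_tflops_per_device * num_devices
    return 100.0 * achieved_tflops / peak_tflops

# Example with this guide's settings: 8B parameters, seq_length 4096, ~2.1 seq/s, 32 cores.
# PEAK_TFLOPS_PER_CORE is a placeholder value only.
PEAK_TFLOPS_PER_CORE = 95.0
print(f"MFU ~ {estimate_mfu(8e9, 4096, 2.1, PEAK_TFLOPS_PER_CORE, 32):.2f}%")
```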
## Overview @@ -26,9 +26,12 @@ This tool analyzes training performance on AWS Trainium instances by: ```bash source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -python calculate_training_metrics.py \ + +# Get report over all training jobs in neuron_cache directory +python /home/ubuntu/nki-llama/src/fine-tune/scripts/calculate_training_metrics.py \ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ --compile-dir /home/ubuntu/neuron_cache \ --throughput 2.1 \ --hw-backend trn1 \ @@ -36,7 +39,21 @@ python calculate_training_metrics.py \ --per-file-scores \ --detailed \ --print-per-file \ - --output baseline_metrics.json + --output benchmark_finetuning.json + +# Get report over a training job in neuron_cache directory +python /home/ubuntu/nki-llama/src/fine-tune/scripts/calculate_training_metrics.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ + --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ + --throughput 2.1 \ + --hw-backend trn1 \ + --calculate-score \ + --per-file-scores \ + --detailed \ + --print-per-file \ + --output benchmark_finetuning.json ``` ### Basic Usage diff --git a/src/fine-tune/scripts/calculate_training_metrics.py b/src/fine-tune/scripts/calculate_training_metrics.py index b3aa0f1..59121ae 100644 --- a/src/fine-tune/scripts/calculate_training_metrics.py +++ b/src/fine-tune/scripts/calculate_training_metrics.py @@ -9,6 +9,7 @@ import base64 from pathlib import Path from typing import List, Dict, Tuple +from datetime import datetime from neuronx_distributed_training.utils.llama_perf_estimate import calculate_mfu from torch_neuronx.pyhlo.hlo_pb2 import HloModuleProto @@ -189,6 +190,9 @@ def find_all_hlo_files(compile_dir: str) -> List[str]: # Find all .hlo_module.pb files recursively hlo_patterns = [ + #"*.hlo", + #"graph.hlo", + #"model.hlo_module.pb", "**/*.hlo_module.pb", "**/model.hlo_module.pb", "**/*.hlo", @@ -230,47 +234,290 @@ def get_module_info(hlo_file_path: str) -> Dict[str, str]: def parse_training_logs(log_file: str) -> Dict: - """Parse training logs to extract throughput and loss information.""" + """Parse training logs to extract throughput, loss, and timestamp information.""" metrics = { 'steps': [], 'step_times': [], 'throughputs': [], - 'losses': [] + 'losses': [], + 'timestamps': [], + 'first_timestamp': None, + 'last_timestamp': None, + 'log_file_path': log_file, + 'epochs': [], + 'learning_rates': [], + 'gradient_norms': [], + 'consumed_samples': [], + 'throughput_peaks': [] } if not os.path.exists(log_file): print(f"Warning: Log file {log_file} not found") return metrics + # Get file modification time as a fallback timestamp + file_stat = os.stat(log_file) + file_mod_time = datetime.fromtimestamp(file_stat.st_mtime) + with open(log_file, 'r') as f: lines = f.readlines() - # Common patterns in training logs + # Updated patterns for the specific log format patterns = { - 'step_time': r'step_time:\s*([\d.]+)', - 'throughput': r'throughput:\s*([\d.]+)', - 'samples_per_sec': 
r'samples/sec:\s*([\d.]+)', - 'tokens_per_sec': r'tokens/sec:\s*([\d.]+)', - 'seq_per_sec': r'seq/s:\s*([\d.]+)', - 'loss': r'loss:\s*([\d.]+)', - 'train_loss': r'train_loss:\s*([\d.]+)' + # Timestamp pattern: [2025-06-10 09:37:36.116: or [2025-06-10 09:37:36,288] + 'timestamp': r'\[(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})[.,]\d+[:\]]', + # Progress bar pattern - matches the actual format in the logs + 'progress_bar': r'Epoch\s+(\d+):\s*\d+%\|[█▌\s]+\|\s*\d+/\d+.*?reduced_train_loss=([\d.]+).*?lr=([\d.]+).*?global_step=([\d.]+).*?consumed_samples=([\d.]+).*?throughput=([\d.]+).*?throughput_peak=([\d.]+).*?gradient_norm=([\d.]+)', + # Alternative individual patterns + 'epoch_alt': r'Epoch\s+(\d+):', + 'throughput': r'throughput=([\d.]+)', + 'loss': r'reduced_train_loss=([\d.]+)', + 'global_step': r'global_step=([\d.]+)', + 'learning_rate': r'lr=([\d.]+)', + 'gradient_norm': r'gradient_norm=([\d.]+)', + 'consumed_samples': r'consumed_samples=([\d.]+)', + 'throughput_peak': r'throughput_peak=([\d.]+)', + # Step time if present + 'step_time': r'step_time[:\s]+([\d.]+)' } for line in lines: - for key, pattern in patterns.items(): - match = re.search(pattern, line) - if match: - value = float(match.group(1)) - if key == 'step_time': - metrics['step_times'].append(value) - elif key in ['throughput', 'samples_per_sec', 'tokens_per_sec', 'seq_per_sec']: - metrics['throughputs'].append(value) - elif key in ['loss', 'train_loss']: - metrics['losses'].append(value) + # Check for timestamps + timestamp_match = re.search(patterns['timestamp'], line) + if timestamp_match: + timestamp = timestamp_match.group(1) + if timestamp not in metrics['timestamps']: + metrics['timestamps'].append(timestamp) + if metrics['first_timestamp'] is None: + metrics['first_timestamp'] = timestamp + metrics['last_timestamp'] = timestamp + + # Try to extract from progress bar format first + progress_match = re.search(patterns['progress_bar'], line) + if progress_match: + epoch = int(progress_match.group(1)) + loss = float(progress_match.group(2)) + lr = float(progress_match.group(3)) + step = int(float(progress_match.group(4))) + samples = float(progress_match.group(5)) + throughput = float(progress_match.group(6)) + throughput_peak = float(progress_match.group(7)) + grad_norm = float(progress_match.group(8)) + + # Only add unique values + if epoch not in metrics['epochs']: + metrics['epochs'].append(epoch) + if step not in metrics['steps']: + metrics['steps'].append(step) + + # For metrics that can vary, we want all values + metrics['losses'].append(loss) + metrics['learning_rates'].append(lr) + metrics['consumed_samples'].append(samples) + metrics['throughputs'].append(throughput) + metrics['throughput_peaks'].append(throughput_peak) + metrics['gradient_norms'].append(grad_norm) + else: + # Fall back to individual pattern matching if progress bar doesn't match + + # Epoch (alternative pattern) + epoch_match = re.search(patterns['epoch_alt'], line) + if epoch_match: + epoch = int(epoch_match.group(1)) + if epoch not in metrics['epochs']: + metrics['epochs'].append(epoch) + + # Look for individual metrics in lines that might not have the full progress bar + # Only process lines that seem to contain metrics (avoid duplicate processing) + if 'reduced_train_loss=' in line: + # Loss + loss_match = re.search(patterns['loss'], line) + if loss_match: + loss = float(loss_match.group(1)) + metrics['losses'].append(loss) + + # Learning rate + lr_match = re.search(patterns['learning_rate'], line) + if lr_match: + lr = 
float(lr_match.group(1)) + metrics['learning_rates'].append(lr) + + # Global step + step_match = re.search(patterns['global_step'], line) + if step_match: + step = int(float(step_match.group(1))) + if step not in metrics['steps']: + metrics['steps'].append(step) + + # Consumed samples + samples_match = re.search(patterns['consumed_samples'], line) + if samples_match: + samples = float(samples_match.group(1)) + metrics['consumed_samples'].append(samples) + + # Throughput + throughput_match = re.search(patterns['throughput'], line) + if throughput_match: + throughput = float(throughput_match.group(1)) + metrics['throughputs'].append(throughput) + + # Throughput peak + peak_match = re.search(patterns['throughput_peak'], line) + if peak_match: + peak = float(peak_match.group(1)) + metrics['throughput_peaks'].append(peak) + + # Gradient norm + grad_match = re.search(patterns['gradient_norm'], line) + if grad_match: + grad_norm = float(grad_match.group(1)) + metrics['gradient_norms'].append(grad_norm) + + # Step time (if present) + step_time_match = re.search(patterns['step_time'], line) + if step_time_match: + step_time = float(step_time_match.group(1)) + metrics['step_times'].append(step_time) + + # Remove duplicates from lists while preserving order for some metrics + # For epochs and steps, we want unique values + metrics['epochs'] = sorted(list(set(metrics['epochs']))) + metrics['steps'] = sorted(list(set(metrics['steps']))) + + # For other metrics, remove consecutive duplicates but keep the progression + def remove_consecutive_duplicates(lst): + if not lst: + return [] + result = [lst[0]] + for i in range(1, len(lst)): + if lst[i] != lst[i-1]: + result.append(lst[i]) + return result + + metrics['losses'] = remove_consecutive_duplicates(metrics['losses']) + metrics['learning_rates'] = remove_consecutive_duplicates(metrics['learning_rates']) + metrics['consumed_samples'] = remove_consecutive_duplicates(metrics['consumed_samples']) + metrics['throughputs'] = remove_consecutive_duplicates(metrics['throughputs']) + metrics['throughput_peaks'] = remove_consecutive_duplicates(metrics['throughput_peaks']) + metrics['gradient_norms'] = remove_consecutive_duplicates(metrics['gradient_norms']) + + # If no timestamps found in logs, use file modification time + if not metrics['timestamps']: + metrics['file_modification_time'] = file_mod_time.strftime('%Y-%m-%d %H:%M:%S') return metrics +def display_log_metrics(log_metrics: Dict) -> None: + """Display parsed log metrics in a formatted way.""" + print("\n" + "="*60) + print("LOG FILE METRICS SUMMARY") + print("="*60) + + if log_metrics['log_file_path']: + print(f"Log file: {log_metrics['log_file_path']}") + + # Display timestamp information + if log_metrics['first_timestamp'] or log_metrics['last_timestamp']: + print(f"\nTraining Timeline:") + if log_metrics['first_timestamp']: + print(f" First timestamp: {log_metrics['first_timestamp']}") + if log_metrics['last_timestamp']: + print(f" Last timestamp: {log_metrics['last_timestamp']}") + if log_metrics['first_timestamp'] and log_metrics['last_timestamp']: + # Try to calculate duration + try: + first_dt = datetime.strptime(log_metrics['first_timestamp'], '%Y-%m-%d %H:%M:%S') + last_dt = datetime.strptime(log_metrics['last_timestamp'], '%Y-%m-%d %H:%M:%S') + duration = last_dt - first_dt + print(f" Duration: {duration}") + except: + pass + elif 'file_modification_time' in log_metrics: + print(f"\nLog file last modified: {log_metrics['file_modification_time']}") + + # Display epoch information + if 
log_metrics['epochs']: + print(f"\nEpoch Information:") + print(f" Epochs recorded: {len(log_metrics['epochs'])}") + print(f" First epoch: {min(log_metrics['epochs'])}") + print(f" Last epoch: {max(log_metrics['epochs'])}") + + # Display step information + if log_metrics['steps']: + print(f"\nTraining Steps:") + print(f" Total steps recorded: {len(log_metrics['steps'])}") + print(f" First step: {min(log_metrics['steps'])}") + print(f" Last step: {max(log_metrics['steps'])}") + + # Display consumed samples + if log_metrics['consumed_samples']: + print(f"\nConsumed Samples:") + print(f" First: {min(log_metrics['consumed_samples']):,.0f}") + print(f" Last: {max(log_metrics['consumed_samples']):,.0f}") + print(f" Total processed: {max(log_metrics['consumed_samples']) - min(log_metrics['consumed_samples']):,.0f}") + + # Display step time statistics + if log_metrics['step_times']: + avg_step_time = sum(log_metrics['step_times']) / len(log_metrics['step_times']) + print(f"\nStep Time Statistics:") + print(f" Average: {avg_step_time:.3f} seconds") + print(f" Min: {min(log_metrics['step_times']):.3f} seconds") + print(f" Max: {max(log_metrics['step_times']):.3f} seconds") + print(f" Number of measurements: {len(log_metrics['step_times'])}") + + # Display throughput statistics + if log_metrics['throughputs']: + avg_throughput = sum(log_metrics['throughputs']) / len(log_metrics['throughputs']) + print(f"\nThroughput Statistics:") + print(f" Average: {avg_throughput:.3f} seq/s") + print(f" Min: {min(log_metrics['throughputs']):.3f} seq/s") + print(f" Max: {max(log_metrics['throughputs']):.3f} seq/s") + print(f" Number of measurements: {len(log_metrics['throughputs'])}") + + # Display throughput peak statistics + if log_metrics['throughput_peaks']: + avg_peak = sum(log_metrics['throughput_peaks']) / len(log_metrics['throughput_peaks']) + print(f"\nThroughput Peak Statistics:") + print(f" Average: {avg_peak:.3f} seq/s") + print(f" Min: {min(log_metrics['throughput_peaks']):.3f} seq/s") + print(f" Max: {max(log_metrics['throughput_peaks']):.3f} seq/s") + + # Display loss statistics + if log_metrics['losses']: + avg_loss = sum(log_metrics['losses']) / len(log_metrics['losses']) + print(f"\nLoss Statistics:") + print(f" Average: {avg_loss:.4f}") + print(f" Min: {min(log_metrics['losses']):.4f}") + print(f" Max: {max(log_metrics['losses']):.4f}") + print(f" First loss: {log_metrics['losses'][0]:.4f}") + print(f" Last loss: {log_metrics['losses'][-1]:.4f}") + print(f" Number of measurements: {len(log_metrics['losses'])}") + + # Check if loss is decreasing + if len(log_metrics['losses']) > 1: + loss_trend = "decreasing" if log_metrics['losses'][-1] < log_metrics['losses'][0] else "increasing" + print(f" Loss trend: {loss_trend}") + + # Display learning rate statistics + if log_metrics['learning_rates']: + print(f"\nLearning Rate Statistics:") + print(f" First: {log_metrics['learning_rates'][0]:.6f}") + print(f" Last: {log_metrics['learning_rates'][-1]:.6f}") + print(f" Min: {min(log_metrics['learning_rates']):.6f}") + print(f" Max: {max(log_metrics['learning_rates']):.6f}") + + # Display gradient norm statistics + if log_metrics['gradient_norms']: + avg_grad = sum(log_metrics['gradient_norms']) / len(log_metrics['gradient_norms']) + print(f"\nGradient Norm Statistics:") + print(f" Average: {avg_grad:.4f}") + print(f" Min: {min(log_metrics['gradient_norms']):.4f}") + print(f" Max: {max(log_metrics['gradient_norms']):.4f}") + + print("="*60) + + def analyze_hlo_file(hlo_file: str) -> Dict: """Analyze a single 
HLO file and return its metrics.""" try: @@ -463,16 +710,18 @@ def main(): else: seq_len = cfg['data']['seq_length'] + # Parse log file and display metrics if provided + log_metrics = None + if args.log_file: + log_metrics = parse_training_logs(args.log_file) + display_log_metrics(log_metrics) + # Determine throughput if args.throughput: throughput = args.throughput - elif args.log_file: - log_metrics = parse_training_logs(args.log_file) - if log_metrics['throughputs']: - throughput = sum(log_metrics['throughputs']) / len(log_metrics['throughputs']) - else: - print("Warning: Could not extract throughput from logs, using default") - throughput = 100.0 # Default estimate + elif args.log_file and log_metrics and log_metrics['throughputs']: + throughput = sum(log_metrics['throughputs']) / len(log_metrics['throughputs']) + print(f"\nUsing average throughput from log file: {throughput:.2f} seq/s") else: print("Warning: No throughput information provided, using default") throughput = 100.0 # Default estimate @@ -566,6 +815,42 @@ def main(): } } + # Add log metrics to output if available + if log_metrics: + metrics["log_metrics"] = { + "log_file": log_metrics['log_file_path'], + "timestamps": { + "first": log_metrics['first_timestamp'], + "last": log_metrics['last_timestamp'], + "file_modification": log_metrics.get('file_modification_time') + }, + "steps": { + "count": len(log_metrics['steps']), + "first": min(log_metrics['steps']) if log_metrics['steps'] else None, + "last": max(log_metrics['steps']) if log_metrics['steps'] else None + }, + "step_times": { + "average": sum(log_metrics['step_times']) / len(log_metrics['step_times']) if log_metrics['step_times'] else None, + "min": min(log_metrics['step_times']) if log_metrics['step_times'] else None, + "max": max(log_metrics['step_times']) if log_metrics['step_times'] else None, + "count": len(log_metrics['step_times']) + }, + "throughput": { + "average": sum(log_metrics['throughputs']) / len(log_metrics['throughputs']) if log_metrics['throughputs'] else None, + "min": min(log_metrics['throughputs']) if log_metrics['throughputs'] else None, + "max": max(log_metrics['throughputs']) if log_metrics['throughputs'] else None, + "count": len(log_metrics['throughputs']) + }, + "losses": { + "average": sum(log_metrics['losses']) / len(log_metrics['losses']) if log_metrics['losses'] else None, + "min": min(log_metrics['losses']) if log_metrics['losses'] else None, + "max": max(log_metrics['losses']) if log_metrics['losses'] else None, + "first": log_metrics['losses'][0] if log_metrics['losses'] else None, + "last": log_metrics['losses'][-1] if log_metrics['losses'] else None, + "count": len(log_metrics['losses']) + } + } + # Add detailed per-file metrics if requested if args.detailed: metrics["nki_analysis"]["per_file_metrics"] = hlo_analysis['per_file_metrics'] diff --git a/src/handler.py b/src/handler.py new file mode 100644 index 0000000..e2c0ec6 --- /dev/null +++ b/src/handler.py @@ -0,0 +1,524 @@ +#!/usr/bin/env python3 +""" +Handler for calculating NKI-LLAMA scores combining inference and training metrics. +This script invokes calculate_training_metrics.py and processes benchmark results. 
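Typical invocation (the flags are those defined in main() below; file names and
paths are illustrative):

    python handler.py \
        --inference-results benchmark_inference.json \
        --calculate-score --detailed --verbose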
+""" + +import argparse +import json +import subprocess +import sys +import os +from pathlib import Path +from typing import Dict, Any, Optional, Tuple +import logging +from datetime import datetime + + +class NKILlamaHandler: + """Handler for calculating and managing NKI-LLAMA benchmark scores.""" + + def __init__(self, verbose: bool = False): + self.verbose = verbose + self.setup_logging() + + def setup_logging(self): + """Set up logging configuration.""" + log_level = logging.DEBUG if self.verbose else logging.INFO + logging.basicConfig( + level=log_level, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + self.logger = logging.getLogger(__name__) + + def build_training_command(self, cmd_args: Dict[str, Any]) -> list: + """ + Build the command for calculate_training_metrics.py. + + Args: + cmd_args: Dictionary of command line arguments + + Returns: + List representing the command to execute + """ + # Build command + cmd = ["python", cmd_args["script_path"]] + + # Add required arguments + cmd.extend(["--config", cmd_args["config"]]) + cmd.extend(["--model-config", cmd_args["model_config"]]) + + # Add optional arguments + if cmd_args.get("log_file"): + cmd.extend(["--log-file", cmd_args["log_file"]]) + if cmd_args.get("compile_dir"): + cmd.extend(["--compile-dir", cmd_args["compile_dir"]]) + if cmd_args.get("throughput"): + cmd.extend(["--throughput", str(cmd_args["throughput"])]) + if cmd_args.get("hw_backend"): + cmd.extend(["--hw-backend", cmd_args["hw_backend"]]) + if cmd_args.get("batch_size"): + cmd.extend(["--batch-size", str(cmd_args["batch_size"])]) + if cmd_args.get("seq_len"): + cmd.extend(["--seq-len", str(cmd_args["seq_len"])]) + if cmd_args.get("num_nodes"): + cmd.extend(["--num-nodes", str(cmd_args["num_nodes"])]) + + # Add scoring parameters + if cmd_args.get("calculate_score"): + cmd.append("--calculate-score") + if cmd_args.get("per_file_scores"): + cmd.append("--per-file-scores") + if cmd_args.get("detailed"): + cmd.append("--detailed") + if cmd_args.get("print_per_file"): + cmd.append("--print-per-file") + + # Add scoring thresholds + if cmd_args.get("base_mfu"): + pass + # cmd.extend(["--base-mfu", str(cmd_args["base_mfu"])]) + if cmd_args.get("base_throughput"): + pass + #cmd.extend(["--base-throughput", str(cmd_args["base_throughput"])]) + if cmd_args.get("loss_improvement"): + pass + #cmd.extend(["--loss-improvement", str(cmd_args["loss_improvement"])]) + if cmd_args.get("convergence_rate"): + pass + #cmd.extend(["--convergence-rate", str(cmd_args["convergence_rate"])]) + + # Output file + output_file = cmd_args.get("output", "benchmark_finetuning.json") + cmd.extend(["--output", output_file]) + + return cmd + + def run_training_metrics(self, cmd_args: Dict[str, Any]) -> Dict[str, Any]: + """ + Run calculate_training_metrics.py with the specified arguments. 
+ + Args: + cmd_args: Dictionary of command line arguments + + Returns: + Dictionary containing the training metrics results + """ + # Build the command + cmd = self.build_training_command(cmd_args) + + self.logger.info(f"Running command: {' '.join(cmd)}") + + # ADD THIS PRINT STATEMENT FOR BETTER VISIBILITY + print("\n" + "="*80) + print("🚀 EXECUTING TRAINING METRICS COMMAND:") + print("="*80) + print(f"Command: {' '.join(cmd)}") + print("="*80 + "\n") + + output_file = cmd_args.get("output", "benchmark_finetuning.json") + + try: + # Run the command + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + + # Print stdout if verbose + if self.verbose and result.stdout: + print("=== Training Metrics Output ===") + print(result.stdout) + print("==============================") + + # Load and return the results + with open(output_file, 'r') as f: + return json.load(f) + + except subprocess.CalledProcessError as e: + self.logger.error(f"Error running calculate_training_metrics.py: {e}") + if e.stderr: + self.logger.error(f"Error output: {e.stderr}") + raise + + def calculate_inference_score(self, inference_data: Dict[str, Any], + reference_data: Optional[Dict[str, Any]] = None) -> Tuple[float, Dict]: + """ + Calculate inference score based on the benchmark_inference definition. + + Score = Accuracy * Reduced Latency * Increased Throughput * (1 + Normalized NKI FLOPS) + + Args: + inference_data: Dictionary containing inference benchmark results + reference_data: Optional reference implementation data + + Returns: + Tuple of (score, score_breakdown) + """ + # Default reference values if not provided + if reference_data is None: + reference_data = { + "e2e_model": { + "latency_ms_avg": 50000, # 50 seconds reference + "throughput": 10 # 10 tokens/sec reference + }, + "accuracy": 1.0 # Assume accuracy threshold is met + } + + # Extract metrics from inference data + e2e_latency = inference_data["e2e_model"]["latency_ms_avg"] + e2e_throughput = inference_data["e2e_model"]["throughput"] + + # Calculate components + accuracy = reference_data.get("accuracy", 1.0) # Binary: 1 if meets threshold, 0 otherwise + + # Reduced Latency = Reference TTFT / Submission TTFT + # Using e2e latency as proxy for TTFT + reduced_latency = reference_data["e2e_model"]["latency_ms_avg"] / e2e_latency + + # Increased Throughput = Submission tokens/sec / Reference tokens/sec + increased_throughput = e2e_throughput / reference_data["e2e_model"]["throughput"] + + # Normalized NKI FLOPS - this would come from the training metrics + # For now, using a placeholder - this should be integrated with training metrics + normalized_nki_flops = 0.0 # Will be updated when combined with training metrics + + # Calculate final score + score = accuracy * reduced_latency * increased_throughput * (1 + normalized_nki_flops) + + breakdown = { + "accuracy": accuracy, + "reduced_latency": reduced_latency, + "increased_throughput": increased_throughput, + "normalized_nki_flops": normalized_nki_flops, + "reference_latency_ms": reference_data["e2e_model"]["latency_ms_avg"], + "achieved_latency_ms": e2e_latency, + "reference_throughput": reference_data["e2e_model"]["throughput"], + "achieved_throughput": e2e_throughput + } + + return score, breakdown + + def calculate_combined_score(self, training_metrics: Dict[str, Any], + inference_metrics: Dict[str, Any], + weights: Optional[Dict[str, float]] = None, + reference_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ + Calculate combined NKI-LLAMA score from training and 
inference metrics. + + Args: + training_metrics: Training metrics including NKI analysis + inference_metrics: Inference benchmark results + weights: Optional weights for combining scores + reference_data: Optional reference implementation data for inference scoring + + Returns: + Dictionary containing combined score and breakdown + """ + if weights is None: + weights = { + "training": 0.4, + "inference": 0.6 + } + + # Get training score and NKI ratio + training_score = training_metrics.get("training_score", 0.0) + nki_ratio = training_metrics["nki_analysis"]["summary"]["overall_nki_ratio"] + + # Calculate inference score with NKI ratio + inference_score, inference_breakdown = self.calculate_inference_score(inference_metrics, reference_data) + + # Update inference score with actual NKI FLOPS ratio + inference_breakdown["normalized_nki_flops"] = nki_ratio + inference_score_with_nki = ( + inference_breakdown["accuracy"] * + inference_breakdown["reduced_latency"] * + inference_breakdown["increased_throughput"] * + (1 + nki_ratio) + ) + + # Calculate weighted average + combined_score = ( + weights["training"] * training_score + + weights["inference"] * inference_score_with_nki + ) + + return { + "combined_score": combined_score, + "training_score": training_score, + "inference_score": inference_score_with_nki, + "weights": weights, + "breakdown": { + "training": training_metrics.get("training_score_breakdown", {}), + "inference": inference_breakdown + }, + "nki_ratio": nki_ratio + } + + def display_results(self, results: Dict[str, Any]): + """Display the benchmark results in a formatted way.""" + print("\n" + "="*70) + print("NKI-LLAMA BENCHMARK RESULTS") + print("="*70) + + # Combined score + print(f"\n🏆 FINAL NKI-LLAMA SCORE: {results['combined_score']:.4f}") + print(f"\nScore Weights:") + print(f" Training: {results['weights']['training']*100:.0f}%") + print(f" Inference: {results['weights']['inference']*100:.0f}%") + + # Component scores + print(f"\n📊 Component Scores:") + print(f" Training Score: {results['training_score']:.4f}") + print(f" Inference Score: {results['inference_score']:.4f}") + print(f" NKI Ratio: {results['nki_ratio']:.4f}") + + # Training breakdown + if "training" in results["breakdown"]: + tb = results["breakdown"]["training"] + print(f"\n🎯 Training Metrics:") + print(f" MFU: {tb.get('achieved_mfu', 0):.2f}% (baseline: {tb.get('base_mfu', 0):.2f}%)") + print(f" Throughput: {tb.get('achieved_throughput', 0):.2f} seq/s (baseline: {tb.get('base_throughput', 0):.2f})") + print(f" MFU Improvement: {tb.get('mfu_improvement', 0):.4f}x") + print(f" Throughput Improvement: {tb.get('throughput_improvement', 0):.4f}x") + + # Inference breakdown + ib = results["breakdown"]["inference"] + print(f"\n⚡ Inference Metrics:") + print(f" Latency: {ib['achieved_latency_ms']:.2f}ms (reference: {ib['reference_latency_ms']:.2f}ms)") + print(f" Throughput: {ib['achieved_throughput']:.2f} tokens/s (reference: {ib['reference_throughput']:.2f})") + print(f" Latency Reduction: {ib['reduced_latency']:.4f}x") + print(f" Throughput Increase: {ib['increased_throughput']:.4f}x") + print(f" Accuracy: {'✓ Passed' if ib['accuracy'] == 1.0 else '✗ Failed'}") + + print("\n" + "="*70) + + def save_results(self, results: Dict[str, Any], output_file: str): + """Save the combined results to a JSON file.""" + output_data = { + "timestamp": datetime.now().isoformat(), + "nki_kernel_score": results["combined_score"], + "component_scores": { + "training": results["training_score"], + "inference": 
results["inference_score"] + }, + "weights": results["weights"], + "nki_ratio": results["nki_ratio"], + "detailed_breakdown": results["breakdown"] + } + + with open(output_file, 'w') as f: + json.dump(output_data, f, indent=2) + + self.logger.info(f"Results saved to: {output_file}") + + +def main(): + parser = argparse.ArgumentParser( + description="Handler for NKI-LLAMA benchmark score calculation", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + # Training metrics arguments + training_group = parser.add_argument_group('Training Metrics') + training_group.add_argument( + "--training-script", + default="/home/ubuntu/nki-llama/src/fine-tune/scripts/calculate_training_metrics.py", + help="Path to calculate_training_metrics.py" + ) + training_group.add_argument( + "--config", + default="/home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml", + help="Training config YAML file" + ) + training_group.add_argument( + "--model-config", + default="/home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json", + help="Model config JSON file" + ) + training_group.add_argument( + "--log-file", + default="/home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log", + help="Training log file" + ) + training_group.add_argument( + "--compile-dir", + default="/home/ubuntu/neuron_cache", + help="Neuron compile cache directory" + ) + training_group.add_argument( + "--throughput", + type=float, + default=2.1, + help="Training throughput in seq/s" + ) + training_group.add_argument( + "--hw-backend", + choices=['trn1', 'trn2'], + default='trn1', + help="Hardware backend" + ) + + # Inference metrics arguments + inference_group = parser.add_argument_group('Inference Metrics') + inference_group.add_argument( + "--inference-results", + default="benchmark_inference.json", + help="Path to inference benchmark results" + ) + inference_group.add_argument( + "--reference-latency", + type=float, + default=50000, + help="Reference implementation latency in ms" + ) + inference_group.add_argument( + "--reference-throughput", + type=float, + default=10, + help="Reference implementation throughput in tokens/s" + ) + + # Score calculation arguments + score_group = parser.add_argument_group('Score Calculation') + score_group.add_argument( + "--training-weight", + type=float, + default=0.4, + help="Weight for training score (0-1)" + ) + score_group.add_argument( + "--inference-weight", + type=float, + default=0.6, + help="Weight for inference score (0-1)" + ) + score_group.add_argument( + "--calculate-score", + action="store_true", + help="Calculate training score" + ) + score_group.add_argument( + "--per-file-scores", + action="store_true", + help="Calculate per-file scores" + ) + score_group.add_argument( + "--detailed", + action="store_true", + help="Show detailed output" + ) + + # Output arguments + output_group = parser.add_argument_group('Output') + output_group.add_argument( + "--output", + default="benchmark_results.json", + help="Output file for combined benchmark results" + ) + output_group.add_argument( + "--training-output", + default="benchmark_finetuning.json", + help="Output file for training metrics" + ) + output_group.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output" + ) + + args = parser.parse_args() + + # Validate weights + if args.training_weight + args.inference_weight != 1.0: + parser.error("Training and inference weights must sum to 1.0") + + # Create handler + handler = 
NKILlamaHandler(verbose=args.verbose) + + try: + # Step 1: Run training metrics calculation + print("📈 Calculating training metrics...") + training_args = { + "script_path": args.training_script, + "config": args.config, + "model_config": args.model_config, + "log_file": args.log_file, + "compile_dir": args.compile_dir, + "throughput": args.throughput, + "hw_backend": args.hw_backend, + "calculate_score": args.calculate_score, + "per_file_scores": args.per_file_scores, + "detailed": args.detailed, + "print_per_file": args.verbose, + "output": args.training_output, + "base_mfu": 50.0, + "base_throughput": 100.0, + "loss_improvement": 1.0, + "convergence_rate": 1.0 + } + + # Build the command to display it even if using existing file + cmd = handler.build_training_command(training_args) + + # Check if we need to run training metrics or use existing file + if os.path.exists(args.training_output): + # DISPLAY THE COMMAND THAT WOULD BE EXECUTED + print("\n" + "="*80) + print("📋 TRAINING METRICS COMMAND (using existing file instead):") + print("="*80) + print(f"Command that would be executed:\n{' '.join(cmd)}") + print("="*80 + "\n") + + handler.logger.info(f"Using existing training metrics from {args.training_output}") + with open(args.training_output, 'r') as f: + training_metrics = json.load(f) + else: + training_metrics = handler.run_training_metrics(training_args) + + # Step 2: Load inference metrics + print("\n⚡ Loading inference metrics...") + if not os.path.exists(args.inference_results): + handler.logger.error(f"Inference results file not found: {args.inference_results}") + sys.exit(1) + + with open(args.inference_results, 'r') as f: + inference_metrics = json.load(f) + + # Step 3: Calculate combined score + print("\n🔬 Calculating combined NKI-LLAMA score...") + weights = { + "training": args.training_weight, + "inference": args.inference_weight + } + + # Set reference data for inference scoring + reference_data = { + "e2e_model": { + "latency_ms_avg": args.reference_latency, + "throughput": args.reference_throughput + }, + "accuracy": 1.0 # Assuming accuracy threshold is met + } + + # Pass reference_data to calculate_combined_score + results = handler.calculate_combined_score( + training_metrics, + inference_metrics, + weights, + reference_data + ) + + # Step 4: Display results + handler.display_results(results) + + # Step 5: Save results + handler.save_results(results, args.output) + + print(f"\n✅ Benchmark complete! Results saved to {args.output}") + + except Exception as e: + handler.logger.error(f"Error during benchmark: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/inference/llama.py b/src/inference/llama.py index f472e3b..db36b8b 100644 --- a/src/inference/llama.py +++ b/src/inference/llama.py @@ -17,13 +17,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -# -# This file has been modified by Shiwei Gao and Ruwen Fan -# to enable LLM inference with NKI framework on NeuronX devices. 
-# Updated for compatibility with latest AWS Neuron SDK 2.23 - - """PyTorch LLaMA model for NXD inference.""" import copy import gc @@ -33,41 +26,28 @@ import torch from neuronx_distributed.parallel_layers import parallel_state # noqa: E402 -import math -import warnings -from typing import ( - Optional, Tuple, Union, Any, Type -) - -import torch.nn.functional as F - -from neuronx_distributed.parallel_layers.mappings import ( - copy_to_tensor_model_parallel_region, - gather_from_tensor_model_parallel_region, - reduce_from_tensor_model_parallel_region, - reduce_scatter_to_sequence_parallel_region, - scatter_to_tensor_model_parallel_region, - _gather_along_first_dim, # Added for latest SDK -) from neuronx_distributed.parallel_layers.layers import ( # noqa: E402; noqa: E402; noqa: E402; noqa: E402; noqa: E402 - RowParallelLinear, ColumnParallelLinear, ParallelEmbedding, + RowParallelLinear, ) from neuronx_distributed.parallel_layers.mappings import ( gather_from_sequence_parallel_region, reduce_from_tensor_model_parallel_region, reduce_scatter_to_sequence_parallel_region, + _gather_along_first_dim, ) -from neuronx_distributed.utils import cpu_mode # Added for latest SDK - -# Updated imports for latest SDK - includes quantized kernels +from neuronx_distributed.parallel_layers.utils import get_padding_length +from neuronx_distributed.utils import cpu_mode from neuronxcc.nki._private_kernels.mlp import ( mlp_fused_add_isa_kernel, - mlp_isa_kernel + mlp_isa_kernel, + quant_mlp_fused_add_isa_kernel, + quant_mlp_isa_kernel, ) - -from torch import nn, ones +from neuronxcc.nki._private_kernels.rmsnorm import rmsnorm_quant_isa_kernel +from neuronxcc.nki.language import nc +from torch import nn from torch_neuronx.xla_impl.ops import nki_jit from transformers import LlamaForCausalLM from transformers.activations import ACT2FN @@ -84,831 +64,34 @@ ) from neuronx_distributed_inference.modules.attention.utils import ( RotaryEmbedding, + preprocess_quantized_linear_layer, transpose_parallel_linear_layer, ) - -# Updated import for latest SDK from neuronx_distributed_inference.modules.custom_calls import CustomRMSNorm from neuronx_distributed_inference.modules.flashdecode.utils import calculate_num_cores_per_group from neuronx_distributed_inference.modules.lora_serving.lora_module import is_lora_module from neuronx_distributed_inference.utils.distributed import get_tp_group -# No longer using direct RmsNorm from torch_neuronx.xla_impl.ops -# from torch_neuronx.xla_impl.ops import RmsNorm - -import neuronxcc.nki as nki -import neuronxcc.nki.language as nl -import neuronxcc.nki.isa as nisa # Note: Some functions may have moved or changed -import neuronxcc.nki.typing as nt -import numpy as np - _LLAMA_MODULE_MAP = {} logger = logging.getLogger("Neuron") -NKI_ENABLED = True -CONFIG_FUSE_MLP = True - -CONFIG_MLP_FUSE_NONE = False -CONFIG_MLP_FUSE_NORM_ONLY = False -USE_FLASH = False - -def cdiv(a, b): - return (a + b - 1) // b - -@nki.jit -def nki_gemm(lhsT, rhs): - K, M = lhsT.shape - K_, N = rhs.shape - assert K == K_, "lhsT and rhs must have the same contraction dimension" - - result = nl.ndarray((M, N), dtype=lhsT.dtype, buffer=nl.shared_hbm) - - TILE_M = nl.tile_size.gemm_stationary_fmax # 128 - TILE_K = nl.tile_size.pmax # 128 - if N <= 512: - TILE_N = N - else: - TILE_N = 512 - assert K % TILE_K == 0 - - mask_m = nl.arange(TILE_M)[None, :] # shape: [TILE_M, 1] - mask_n = nl.arange(TILE_N)[None, :] # shape: [1, TILE_N] - - for m in nl.affine_range(cdiv(M, TILE_M)): - for n in nl.affine_range(cdiv(N, TILE_N)): 
- res_psum = nl.zeros((TILE_M, TILE_N), nl.float32, buffer=nl.psum) - - for k in nl.affine_range(K // TILE_K): - - lhsT_tile = nl.load( - lhsT[k * TILE_K : (k + 1) * TILE_K, m * TILE_M : (m + 1) * TILE_M], - mask=(mask_m + m * TILE_M) < M, - ) - - rhs_tile = nl.load( - rhs[k * TILE_K : (k + 1) * TILE_K, n * TILE_N : (n + 1) * TILE_N], - mask=(mask_n + n * TILE_N) < N, - ) - - res_psum += nl.matmul(lhsT_tile, rhs_tile, transpose_x=True) - mask_m2 = nl.arange(TILE_M)[:,None] # shape: [TILE_M, 1] - nl.store( - result[m * TILE_M : (m + 1) * TILE_M, n * TILE_N : (n + 1) * TILE_N], - value=res_psum, - mask=((mask_m2 + m * TILE_M) < M) & ((mask_n + n * TILE_N) < N) - ) - - return result - -@nki.jit -def nki_matmul_fully_optimized_( - lhsT, - rhs, - TILES_IN_BLOCK_M=1, - TILES_IN_BLOCK_N=4, - # Meta-parameters -): - """NKI kernel to compute a matrix multiplication operation while blocking the - free dimensions of the LHS and RHS to improve memory access pattern. - - Args: - lhsT: an input tensor of shape [K,M], where both K and M are multiples for - 128. It is the left-hand-side argument of the matrix multiplication, - delivered transposed for optimal performance. - rhs: an input tensor of shape [K,N], where K is a multiple of 128, and N - is a multiple of 512. It is the right-hand-side argument of the matrix - multiplication. - Returns: - result: the resulting output tensor of shape [M,N] - """ - - K, M = lhsT.shape - K_, N = rhs.shape - assert K == K_, "lhsT and rhs must have the same contraction dimension" - result = nl.ndarray((M, N), dtype=lhsT.dtype, buffer=nl.shared_hbm) - - TILE_M = nl.tile_size.gemm_stationary_fmax # 128 - TILE_K = nl.tile_size.pmax # 128 - if N <= 512: - TILE_N = N - else: - TILE_N = nl.tile_size.gemm_moving_fmax # 512 - - # Define the indices (shape) of the tiles - i_lhsT = nl.mgrid[0:TILE_K, 0:TILE_M] - i_rhs = nl.mgrid[0:TILE_K, 0:TILE_N] - i_res = nl.mgrid[0:TILE_M, 0:TILE_N] - - # Configuring the blocking size for the free dimensions - # TILES_IN_BLOCK_M = 2 - # TILES_IN_BLOCK_N = 4 - - if TILES_IN_BLOCK_N > N // TILE_N: - TILES_IN_BLOCK_N = N // TILE_N - - BLOCK_M = TILE_M * TILES_IN_BLOCK_M # 256 - BLOCK_N = TILE_N * TILES_IN_BLOCK_N # 1024 - - # the size has to be multiple of block size - # assert M % BLOCK_M == 0 - assert N % BLOCK_N == 0 - - # Loop over blocks over the M dimension - for m in nl.affine_range(cdiv(M,BLOCK_M)): - # Load TILES_IN_BLOCK_M columns tiles from lhsT - lhsT_tiles = nl.ndarray( - (TILES_IN_BLOCK_M, K // TILE_K, nl.par_dim(TILE_K), TILE_M), - dtype=lhsT.dtype, - buffer=nl.sbuf, - ) - for bm in nl.affine_range(TILES_IN_BLOCK_M): - for k in nl.affine_range(K // TILE_K): - lhsT_tiles[bm, k, i_lhsT.p, i_lhsT.x] = nl.load( - lhsT[ - k * TILE_K + i_lhsT.p, - (m * TILES_IN_BLOCK_M + bm) * TILE_M + i_lhsT.x - ],mask = ((m * TILES_IN_BLOCK_M + bm) * TILE_M + i_lhsT.x < M) - ) - - for n in nl.affine_range(N // BLOCK_N): - # Load TILES_IN_BLOCK_N columns from rhs - rhs_tiles = nl.ndarray( - (TILES_IN_BLOCK_N, K // TILE_K, nl.par_dim(TILE_K), TILE_N), - dtype=rhs.dtype, - buffer=nl.sbuf, - ) - for bn in nl.affine_range(TILES_IN_BLOCK_N): - for k in nl.affine_range(K // TILE_K): - rhs_tiles[bn, k, i_rhs.p, i_rhs.x] = nl.load( - rhs[ - k * TILE_K + i_rhs.p, - (n * TILES_IN_BLOCK_N + bn) * TILE_N + i_rhs.x, - ] - ) - - for bm in nl.affine_range(TILES_IN_BLOCK_M): - for bn in nl.affine_range(TILES_IN_BLOCK_N): - # Allocate a tensor in PSUM - res_psum = nl.zeros((TILE_M, TILE_N), nl.float32, buffer=nl.psum) - for k in nl.affine_range(K // TILE_K): - # 
Accumulate partial-sums into PSUM - res_psum += nl.matmul( - lhsT_tiles[bm, k, i_lhsT.p, i_lhsT.x], - rhs_tiles[bn, k, i_rhs.p, i_rhs.x], - transpose_x=True, - ) - - # Copy the result from PSUM back to SBUF, and cast to expected output data-type - res_sb = nl.copy(res_psum, dtype=result.dtype) - nl.store( - result[ - (m * TILES_IN_BLOCK_M + bm) * TILE_M + i_res.p, - (n * TILES_IN_BLOCK_N + bn) * TILE_N + i_res.x - ], - mask = (m * TILES_IN_BLOCK_M + bm) * TILE_M + i_res.p < M , - value=res_sb, - ) - - return result - - -@nki.jit -def rms_norm_nki_thin_gemm(lhsT, rhs, g_tensor, eps, residual=None): - M, K = lhsT.shape - K_, N = rhs.shape - assert K == K_, "lhsT and rhs must have the same contraction dimension" - result = nl.ndarray((M, N), dtype=lhsT.dtype, buffer=nl.shared_hbm) - if residual is not None: - residual_result = nl.ndarray(lhsT.shape, dtype=lhsT.dtype, buffer=nl.shared_hbm) - - iw = nl.arange(1)[:, None] - iy = nl.arange(K)[None, :] - - # Load RMSNorm weight once, reused by rows/tiles of a_tensor - g_tile = nl.load(g_tensor.reshape((1, g_tensor.shape[0]))[iw, iy]) - if M != 1: - g_bcast = g_tile.broadcast_to((M, g_tensor.shape[0])) - else: - g_bcast = g_tile - def micron_kernel( - TILE_M, TILE_K, TILE_N, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, n_start, n_end - ): - # TILE_M = M # 128 - # TILE_K = 128 # 128 - # TILE_N = 512 # 512 - - # Define the indices (shape) of the tiles - i_lhsT = nl.mgrid[0:TILE_K, 0:TILE_M] - i_rhs = nl.mgrid[0:TILE_K, 0:TILE_N] - i_res = nl.mgrid[0:TILE_M, 0:TILE_N] - - # Configuring the blocking size for the free dimensions - # TILES_IN_BLOCK_M = 1 - # TILES_IN_BLOCK_N = 4 - - BLOCK_M = TILE_M * TILES_IN_BLOCK_M # 256 - BLOCK_N = TILE_N * TILES_IN_BLOCK_N # 1024 - - # the size has to be multiple of block size - assert M % BLOCK_M == 0 - # assert N % BLOCK_N == 0 - - # Load TILES_IN_BLOCK_M columns tiles from lhsT - a_tile = nl.load(lhsT) - if residual is not None: - res_tile = nl.load(residual) - a_tile = nl.add(a_tile, res_tile) - nl.store(residual_result, value=a_tile) - - in_square = nl.square(a_tile) - square_sum = nl.sum(in_square, axis=[1]) - - # Scale and get a reciprocal - mean = square_sum / K - - rms_reciprocal = nl.rsqrt(mean + eps) - - out_tile = nl.multiply(a_tile, rms_reciprocal) - out_tile[...] 
= nl.multiply(out_tile, g_bcast) - - lhsT_tiles = nl.ndarray( - (K // TILE_K, nl.par_dim(TILE_K), TILE_M), - dtype=lhsT.dtype, - buffer=nl.sbuf, - ) - - for k in nl.affine_range(K // TILE_K): - lhsT_tiles[k, :, :] = nisa.nc_transpose( - out_tile[:, k * TILE_K: (k + 1) * TILE_K], - ) - for n in nl.affine_range((n_end - n_start) // BLOCK_N): - # Load TILES_IN_BLOCK_N columns from rhs - - rhs_tiles = nl.ndarray( - (TILES_IN_BLOCK_N, K // TILE_K, nl.par_dim(TILE_K), TILE_N), - dtype=rhs.dtype, - buffer=nl.sbuf, - ) - for bn in nl.affine_range(TILES_IN_BLOCK_N): - for k in nl.affine_range(K // TILE_K): - rhs_tiles[bn, k, i_rhs.p, i_rhs.x] = nl.load( - rhs[ - k * TILE_K + i_rhs.p, - n_start + (n * TILES_IN_BLOCK_N + bn) * TILE_N + i_rhs.x, - ] - ) - for bn in nl.affine_range(TILES_IN_BLOCK_N): - res_psum = nl.zeros((M, TILE_N), nl.float32, buffer=nl.psum) - for k in nl.affine_range(K // TILE_K): - # Accumulate partial-sums into PSUM - res_psum += nisa.nc_matmul( - stationary=lhsT_tiles[k, i_lhsT.p, i_lhsT.x], - moving=rhs_tiles[bn, k, i_rhs.p, i_rhs.x], - ) - - # Copy the result from PSUM back to SBUF, and cast to expected output data-type - nl.store( - result[ - 0:M, - n_start - + (n * TILES_IN_BLOCK_N + bn) - * TILE_N : n_start+(n * TILES_IN_BLOCK_N + bn + 1) - * TILE_N, - ], - value=res_psum, - ) - micron_kernel(M, 128, 512, 1, 4, 0, N) - if residual is not None: - return result, residual_result - return result - - -@nki.jit -def nki_thin_gemm(lhsT, rhs): - K, M = lhsT.shape - K_, N = rhs.shape - assert K == K_, "lhsT and rhs must have the same contraction dimension" - result = nl.ndarray((M, N), dtype=lhsT.dtype, buffer=nl.shared_hbm) - - def micron_kernel( - TILE_M, TILE_K, TILE_N, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, n_start, n_end - ): - # TILE_M = M # 128 - # TILE_K = 128 # 128 - # TILE_N = 512 # 512 - - # Define the indices (shape) of the tiles - i_lhsT = nl.mgrid[0:TILE_K, 0:TILE_M] - i_rhs = nl.mgrid[0:TILE_K, 0:TILE_N] - i_res = nl.mgrid[0:TILE_M, 0:TILE_N] - - # Configuring the blocking size for the free dimensions - # TILES_IN_BLOCK_M = 1 - # TILES_IN_BLOCK_N = 4 - - BLOCK_M = TILE_M * TILES_IN_BLOCK_M # 256 - BLOCK_N = TILE_N * TILES_IN_BLOCK_N # 1024 - - # Process M in chunks of TILE_M to avoid exceeding partition limits - for m_block in nl.affine_range(cdiv(M, TILE_M)): - m_start = m_block * TILE_M - m_size = min(TILE_M, M - m_start) - - # Load TILES_IN_BLOCK_M columns tiles from lhsT - lhsT_tiles = nl.ndarray( - (K // TILE_K, nl.par_dim(TILE_K), TILE_M), - dtype=lhsT.dtype, - buffer=nl.sbuf, - ) - for k in nl.affine_range(K // TILE_K): - # Create mask for loading - m_mask = i_lhsT.x < m_size - lhsT_tiles[k, i_lhsT.p, i_lhsT.x] = nl.load( - lhsT[ - k * TILE_K + i_lhsT.p, - m_start + i_lhsT.x, - ], - mask=m_mask - ) - - for n in nl.affine_range((n_end - n_start) // BLOCK_N): - # Load TILES_IN_BLOCK_N columns from rhs - rhs_tiles = nl.ndarray( - (TILES_IN_BLOCK_N, K // TILE_K, nl.par_dim(TILE_K), TILE_N), - dtype=rhs.dtype, - buffer=nl.sbuf, - ) - for bn in nl.affine_range(TILES_IN_BLOCK_N): - for k in nl.affine_range(K // TILE_K): - rhs_tiles[bn, k, i_rhs.p, i_rhs.x] = nl.load( - rhs[ - k * TILE_K + i_rhs.p, - n_start + (n * TILES_IN_BLOCK_N + bn) * TILE_N + i_rhs.x, - ] - ) - - for bn in nl.affine_range(TILES_IN_BLOCK_N): - res_psum = nl.zeros((TILE_M, TILE_N), nl.float32, buffer=nl.psum) - for k in nl.affine_range(K // TILE_K): - # Accumulate partial-sums into PSUM - res_psum += nisa.nc_matmul( - stationary=lhsT_tiles[k, i_lhsT.p, i_lhsT.x], - moving=rhs_tiles[bn, k, i_rhs.p, 
i_rhs.x], - ) - - # Create mask for storing - store_mask = i_res.p < m_size - # Copy the result from PSUM back to SBUF, and cast to expected output data-type - nl.store( - result[ - m_start + i_res.p, - n_start + (n * TILES_IN_BLOCK_N + bn) * TILE_N + i_res.x, - ], - value=res_psum, - mask=store_mask - ) - - # Handle remainder columns if (n_end - n_start) is not divisible by BLOCK_N - remainder_n = (n_end - n_start) % BLOCK_N - if remainder_n > 0: - n_offset = n_start + ((n_end - n_start) // BLOCK_N) * BLOCK_N - remaining_tiles = cdiv(remainder_n, TILE_N) - - for bn in nl.affine_range(remaining_tiles): - tile_start = n_offset + bn * TILE_N - tile_width = min(TILE_N, n_end - tile_start) - - # Load remaining rhs tiles - rhs_tile = nl.ndarray((K // TILE_K, nl.par_dim(TILE_K), TILE_N), dtype=rhs.dtype, buffer=nl.sbuf) - for k in nl.affine_range(K // TILE_K): - n_mask = i_rhs.x < tile_width - rhs_tile[k, i_rhs.p, i_rhs.x] = nl.load( - rhs[k * TILE_K + i_rhs.p, tile_start + i_rhs.x], - mask=n_mask - ) - - res_psum = nl.zeros((TILE_M, TILE_N), nl.float32, buffer=nl.psum) - for k in nl.affine_range(K // TILE_K): - res_psum += nisa.nc_matmul( - stationary=lhsT_tiles[k, i_lhsT.p, i_lhsT.x], - moving=rhs_tile[k, i_rhs.p, i_rhs.x], - ) - - # Store with combined mask - store_mask = (i_res.p < m_size) & (i_res.x < tile_width) - nl.store( - result[m_start + i_res.p, tile_start + i_res.x], - value=res_psum, - mask=store_mask - ) - - # Use appropriate tile size based on M - if M <= 128: - TILE_M = M - else: - TILE_M = 128 - - if N >=2048: - first_n_end = (N // (512 * 4)) * (512 * 4) - micron_kernel(TILE_M, 128, 512, 1, 4, 0, first_n_end) - # print("first_n_end", first_n_end) - # print("N", N) - if first_n_end < N: - micron_kernel(TILE_M, 128, 128, 1, 1, first_n_end, N) - elif N == 256: - micron_kernel(TILE_M, 128, N, 1, 1, 0, N) - elif N == 512: - micron_kernel(TILE_M, 128, 512, 1, 1, 0, N) - elif N == 1024: - micron_kernel(TILE_M, 128, 512, 1, 2, 0, N) - elif N == 1536: - micron_kernel(TILE_M, 128, 512, 1, 3, 0, N) - else: - # Handle other N dimensions by using appropriate tile sizes - if N < 256: - micron_kernel(TILE_M, 128, 128, 1, 1, 0, N) - else: - # Process in chunks of 512 - n_processed = 0 - while n_processed < N: - chunk_size = min(512, N - n_processed) - if chunk_size == 512: - micron_kernel(TILE_M, 128, 512, 1, 1, n_processed, n_processed + chunk_size) - else: - micron_kernel(TILE_M, 128, 128, 1, 1, n_processed, n_processed + chunk_size) - n_processed += chunk_size - return result - -configs = {(256, 2048, 1024): (1, 1), (384, 2048, 1024): (2, 2), (512, 2048, 1024): (2, 2), (640, 2048, 1024): (4, 1), (256, 2048, 256): (1, 1), (384, 2048, 256): (1, 1), (512, 2048, 256): (1, 1), (640, 2048, 256): (1, 1), (256, 2048, 8192): (4, 1), (384, 2048, 8192): (4, 2), (512, 2048, 8192): (4, 1), (640, 2048, 8192): (4, 2), (256, 4096, 2048): (2, 4), (384, 4096, 2048): (4, 4), (512, 4096, 2048): (4, 1), (640, 4096, 2048): (4, 4), (256, 1024, 2048): (1, 1), (384, 1024, 2048): (2, 1), (512, 1024, 2048): (2, 1), (640, 1024, 2048): (4, 1)} -def custom_gemx_implement(input_parallel, weight): - - - origin_shape = input_parallel.shape - bsz = input_parallel.shape[1] * input_parallel.shape[0] - dim_input = input_parallel.shape[-1] - dim_output = weight.shape[-1] - # print(input_parallel.shape,weight.shape,flush=True) - # print(input_parallel.shape,weight.shape,flush=True) - input_parallel = input_parallel.view(-1,dim_input) - assert input_parallel.dtype == torch.bfloat16 - assert weight.dtype == torch.bfloat16 - - # 
Use nki_thin_gemm for small batch sizes or when dimensions don't align well - if bsz <= 128: - output_parallel = nki_thin_gemm(input_parallel.T,weight) - # print(output_parallel.shape) - else: - bsz = input_parallel.shape[0] - # iterate through the bsz and add to find a config, try 100 times, otherwise use the default config - test_bsz = (bsz + 127) // 128 * 128 - # Default values - m = 2 - n = 4 - k = 4 # This was defined but never used, keeping for compatibility - found_config = False - - for i in range(100): - if (test_bsz,dim_input,dim_output) in configs: - config = configs[(test_bsz,dim_input,dim_output)] - m = config[0] - n = config[1] - found_config = True - break - test_bsz = test_bsz + 128 - - # Check if the dimensions are suitable for the optimized kernel - TILE_N = 512 if dim_output > 512 else dim_output - BLOCK_N = TILE_N * n - - # If dimensions don't align well or no config found, use nki_thin_gemm - if not found_config or dim_output % BLOCK_N != 0 or dim_output < 256: - output_parallel = nki_thin_gemm(input_parallel.T, weight) - else: - # print("dim_input,dim_output,bsz",dim_input,dim_output,test_bsz,m,n,k,flush=True) - output_parallel = nki_matmul_fully_optimized_(input_parallel.T,weight, TILES_IN_BLOCK_M=m,TILES_IN_BLOCK_N=n) - - output = output_parallel.view(origin_shape[0],origin_shape[1],dim_output) - return output - - -class CustomColumnParallelLinear(ColumnParallelLinear): - - def __init__( - self, - *args, **kwargs, - ): - super().__init__(*args, **kwargs) - global NKI_ENABLED - if NKI_ENABLED: - self.weight = transpose_parallel_linear_layer(self.weight) - - def forward(self, input: torch.Tensor, *_: Any) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - """Forward of ColumnParallelLinear - - Args: - input_: 3D tensor whose order of dimension is [batch, sequence, hidden] - - Returns: - - output - """ - if self.pad and self.training: - raise RuntimeError("`pad=True` is only supported for inference. Set model.eval()") - - if self.async_tensor_model_parallel_allreduce or self.sequence_parallel_enabled: - input_parallel = input - else: - input_parallel = copy_to_tensor_model_parallel_region(input, process_group=self.tensor_parallel_group) - global NKI_ENABLED - # Matrix multiply. - if not NKI_ENABLED: - output_parallel = self._forward_impl( - input=input_parallel, - weight=self.weight, - bias=None, - async_grad_allreduce=self.async_tensor_model_parallel_allreduce, - sequence_parallel_enabled=self.sequence_parallel_enabled, - sequence_dimension=self.sequence_dimension, - autograd_func_class=self.autograd_func_class, - process_group=self.tensor_parallel_group, - reduce_dtype = self.reduce_dtype, - ) - else: - # print(input_parallel.shape,self.weight.shape) - output_parallel = custom_gemx_implement(input_parallel, self.weight) - # print(input_parallel.squeeze(0).T.shape,self.weight.T.shape) - # print(output_parallel.shape) - # print(input_parallel.shape,self.weight.shape,output_parallel.shape) - if self.gather_output: - # All-gather across the partitions. 
- assert not self.sequence_parallel_enabled - output = gather_from_tensor_model_parallel_region(output_parallel, process_group=self.tensor_parallel_group) - if self.pad and self.pad_size > 0: - output = torch.narrow(output, -1, 0, self.output_size - self.pad_size) - else: - output = output_parallel - if self.skip_bias_add: - return output, self.bias - output = (output + self.bias) if self.bias is not None else output - return output - - -class CustomFusedColumnParallelLinear(ColumnParallelLinear): - - def __init__( - self, - *args, **kwargs, - ): - super().__init__(*args, **kwargs) - global NKI_ENABLED - if NKI_ENABLED: - self.weight = transpose_parallel_linear_layer(self.weight) - self.act_fn = torch.nn.SiLU() - - def forward(self, input: torch.Tensor, rmsnorm, residual, *_: Any) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - """Forward of ColumnParallelLinear - - Args: - input_: 3D tensor whose order of dimension is [batch, sequence, hidden] - - Returns: - - output - """ - - if self.pad and self.training: - raise RuntimeError("`pad=True` is only supported for inference. Set model.eval()") - - if self.async_tensor_model_parallel_allreduce or self.sequence_parallel_enabled: - input_parallel = input - else: - input_parallel = copy_to_tensor_model_parallel_region(input, process_group=self.tensor_parallel_group) - global NKI_ENABLED - # Matrix multiply. - if not NKI_ENABLED: - if residual is not None: - input_parallel = residual + input_parallel - residual = input_parallel - if rmsnorm is not None: - input_parallel = rmsnorm(input_parallel) - output_parallel = self._forward_impl( - input=input_parallel, - weight=self.weight, - bias=None, - async_grad_allreduce=self.async_tensor_model_parallel_allreduce, - sequence_parallel_enabled=self.sequence_parallel_enabled, - sequence_dimension=self.sequence_dimension, - autograd_func_class=self.autograd_func_class, - process_group=self.tensor_parallel_group, - reduce_dtype = self.reduce_dtype, - ) - dim_output = output_parallel.shape[-1] - gate_output = output_parallel[:, :, : dim_output // 2] - up_output = output_parallel[:, :, dim_output // 2 :] - output = self.act_fn(gate_output) * up_output - else: - dim_output = self.weight.shape[-1] - bsz = input_parallel.shape[1] * input_parallel.shape[0] - if bsz <= 128 and not CONFIG_MLP_FUSE_NONE and residual is not None and rmsnorm is not None: - - origin_shape = input_parallel.shape - dim_input = input_parallel.shape[-1] - dim_output = self.weight.shape[-1] - input_parallel = input_parallel.view(-1,dim_input) - assert input_parallel.dtype == torch.bfloat16 - assert self.weight.dtype == torch.bfloat16 - residual = residual.view(-1,dim_input) - if CONFIG_MLP_FUSE_NORM_ONLY: - input_parallel = input_parallel + residual - residual = input_parallel - output_parallel = rms_norm_nki_thin_gemm(input_parallel, self.weight, rmsnorm.weight, rmsnorm.variance_epsilon, None) - residual = residual.view(origin_shape[0],origin_shape[1],dim_input) - output_parallel = output_parallel.view(origin_shape[0],origin_shape[1],dim_output) - else: - output_parallel, residual = rms_norm_nki_thin_gemm(input_parallel, self.weight, rmsnorm.weight, rmsnorm.variance_epsilon, residual) - residual = residual.view(origin_shape[0],origin_shape[1],dim_input) - output_parallel = output_parallel.view(origin_shape[0],origin_shape[1],dim_output) - else: - # Handle case where residual might be None - if residual is not None: - input_parallel = residual + input_parallel - residual = input_parallel - if rmsnorm is not None: - 
input_parallel = nki_rmsnorm_kernel(input_parallel, rmsnorm.weight, rmsnorm.variance_epsilon) - output_parallel = custom_gemx_implement(input_parallel, self.weight) - gate_output = output_parallel[:, :, : dim_output // 2] - up_output = output_parallel[:, :, dim_output// 2 :] - output = self.act_fn(gate_output) * up_output - - if self.skip_bias_add: - return output, self.bias - output = (output + self.bias) if self.bias is not None else output - return output, residual - - -class CustomRowParallelLinear(RowParallelLinear): - - def __init__( - self, - *args, **kwargs - ): - super().__init__(*args, **kwargs) - global NKI_ENABLED - if NKI_ENABLED: - self.weight = transpose_parallel_linear_layer(self.weight) - - def forward(self, input_: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - """Forward of RowParallelLinear - - Args: - input_: 3D tensor whose order of dimension is [batch, sequence, hidden] - - Returns: - - output - """ - if self.pad and self.training: - raise RuntimeError("`pad=True` is only supported for inference. Set model.eval()") - - # Set up backprop all-reduce. - if self.input_is_parallel: - input_parallel = input_ - else: - if self.pad and self.pad_size > 0: - input_ = torch.nn.functional.pad(input_, (0, self.pad_size)) - assert not self.sequence_parallel_enabled - input_parallel = scatter_to_tensor_model_parallel_region(input_, process_group=self.tensor_parallel_group) - global NKI_ENABLED - # Matrix multiply. - if not NKI_ENABLED: - # Matrix multiply. - output_ = self._forward_impl( - input=input_parallel, - weight=self.weight, - bias=None, - async_grad_allreduce=False, - sequence_parallel_enabled=False, - sequence_dimension=self.sequence_dimension, - autograd_func_class=self.autograd_func_class, - process_group=self.tensor_parallel_group, - reduce_dtype = self.reduce_dtype, - ) - else: - output_ = custom_gemx_implement(input_parallel, self.weight) - - - if self.reduce_output: - # All-reduce across all the partitions. 
- original_dtype = output_.dtype - - output_ = output_.to(self.reduce_dtype) - - if self.sequence_parallel_enabled: - output_ = reduce_scatter_to_sequence_parallel_region( - output_, self.sequence_dimension, process_group=self.tensor_parallel_group, - ) - else: - output_ = reduce_from_tensor_model_parallel_region( - output_, process_group=self.tensor_parallel_group, - ) - - output_ = output_.to(original_dtype) - - if self.skip_bias_add: - return output_, self.bias - output = (output_ + self.bias) if self.bias is not None else output_ - return output - - -@nki.jit -def nki_rmsnorm_kernel(a_tensor, g_tensor, eps): - # Calculate out_tensor = a_tensor/RMS(a_tensor) * g_tensor - # Where RMS(a_tensor) = sqrt((1/N) * sum(a_tensor * a_tensor)) - # and N = a_tensor.shape[1] - # Reduction (mean) is performed in the free (2nd) dimension - out_tensor = nl.ndarray(a_tensor.shape, dtype=a_tensor.dtype, - buffer=nl.shared_hbm) - - # Make sure shapes match - assert a_tensor.shape[2] == g_tensor.shape[0] - - # Generate tensor indices to index input tensor - ix = nl.arange(128)[:, None] - iw = nl.arange(1)[:, None] - iy = nl.arange(a_tensor.shape[2])[None, :] - - num_rows = a_tensor.shape[1] - - # Load RMSNorm weight once, reused by rows/tiles of a_tensor - g_tile = nl.load(g_tensor.reshape((1, g_tensor.shape[0]))[iw, iy]) - - # Process 128 rows at a time due to 128-partition tile size limitation - # Since we're not reducing across the first dimension - # Tiles can be processed independently - - for b in range(a_tensor.shape[0]): - for i in range(math.ceil(a_tensor.shape[1]/128)): - # Load input data from external memory to on-chip memory - a_tile = nl.zeros([128, a_tensor.shape[2]], a_tensor.dtype) - a_tile[...] = nl.load(a_tensor[b, i * 128 + ix, iy], mask=(i * 128 + ix < num_rows)) - - # Compute element-wise square of a_tensor - in_square = nl.square(a_tile) - - # Calculate sum of squared elements, along last dimension - square_sum = nl.sum(in_square, axis=[1]) - - # Scale and get a reciprocal - mean = square_sum / a_tensor.shape[2] - - # Take square root of mean and then reciprocal with - # rsqrt API (one ISA instruction) - rms_reciprocal = nl.rsqrt(mean + eps) - - # Scale the input tensor - out_tile = nl.multiply(a_tile, rms_reciprocal) - - # Broadcast weight along first axis to match tensor shape - # num_rows_active = min(num_rows - i * 128, 128) - g_bcast = g_tile.broadcast_to((128, g_tensor.shape[0])) - - # Multiply with the RMSNorm weight - out_tile[...] 
= nl.multiply(out_tile, g_bcast, mask=(i * 128 + ix < num_rows)) - - # store the addition results back to external memory (out_tensor) - nl.store(out_tensor[b, i * 128 + ix, iy], value=out_tile, mask=(i * 128 + ix < num_rows)) - - return out_tensor - - -class CustomRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6, nki_enabled=False): - """ - Use this RMSNorm to perform customized rmsnorm on Neuron - Note: CustomRMSNorm forward method calls target="AwsNeuronRmsNorm" - """ - super().__init__() - self.weight = nn.Parameter(ones(hidden_size)) - self.variance_epsilon = eps - self.nki_enabled = nki_enabled - - def forward(self, hidden_states): - if self.nki_enabled: - out_tensor = nki_rmsnorm_kernel(hidden_states, self.weight, self.variance_epsilon) - return out_tensor - - # Updated for latest SDK - use CustomRMSNorm from neuronx_distributed_inference - # instead of direct RmsNorm from torch_neuronx.xla_impl.ops - # The CustomRMSNorm in the latest SDK handles this properly - return CustomRMSNorm.forward(self, hidden_states) - def get_rmsnorm_cls(): # Initialize to the appropriate implementation of RMSNorm # If infer on NXD -> CustomRMSNorm # If infer on CPU -> HF_RMSNorm (CustomRMSNorm does not work on CPU) - # Updated to use cpu_mode() from latest SDK return LlamaRMSNorm if cpu_mode() else CustomRMSNorm +def preshard_hook_fn(module: torch.nn.Module, model_state_dict: dict, prefix: str) -> bool: + if isinstance(module, (BaseGroupQueryAttention,)): + return module.preshard_hook(model_state_dict, prefix) + + return False + + +# Get the modules_to_not_convert from the neuron configs def get_modules_to_not_convert(neuron_config: NeuronConfig): return getattr(neuron_config, "modules_to_not_convert", None) @@ -949,13 +132,6 @@ def get_updated_configs(config: InferenceConfig): return updated_configs -def preshard_hook_fn(module: torch.nn.Module, model_state_dict: dict, prefix: str) -> bool: - if isinstance(module, (BaseGroupQueryAttention,)): - return module.preshard_hook(model_state_dict, prefix) - - return False - - def _register_module(key: str, cls: Type[nn.Module]): _LLAMA_MODULE_MAP[key] = cls @@ -1011,59 +187,15 @@ def convert_state_dict_to_fused_qkv(llama_state_dict, cfg: InferenceConfig): for l in range(cfg.num_hidden_layers): # noqa: E741 _helper_concat_and_delete_qkv(llama_state_dict, l, "weight") if ( - getattr(cfg.neuron_config, 'quantized_mlp_kernel_enabled', False) or getattr(cfg.neuron_config, 'quantized', False) + cfg.neuron_config.quantized_mlp_kernel_enabled or cfg.neuron_config.quantized ) and f"layers.{l}.self_attn" not in mods_to_not_conv: - # Only try to concatenate scale if it exists - if f"layers.{l}.self_attn.q_proj.scale" in llama_state_dict: - _helper_concat_and_delete_qkv(llama_state_dict, l, "scale") + _helper_concat_and_delete_qkv(llama_state_dict, l, "scale") gc.collect() return llama_state_dict -class NeuronConfigNKI(NeuronConfig): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.nki_enabled = kwargs.pop("enable_nki", False) - - # Add missing attributes for compatibility with latest SDK - # These are attributes expected by the NeuronLlamaMLP class - if not hasattr(self, 'quantized_kernel_lower_bound'): - self.quantized_kernel_lower_bound = kwargs.get('quantized_kernel_lower_bound', 0.0) - - if not hasattr(self, 'logical_neuron_cores'): - self.logical_neuron_cores = kwargs.get('logical_neuron_cores', 1) - - if not hasattr(self, 'logical_nc_config'): - self.logical_nc_config = kwargs.get('logical_nc_config', 1) - - if not hasattr(self, 
'activation_quantization_type'): - self.activation_quantization_type = kwargs.get('activation_quantization_type', None) - - if not hasattr(self, 'quantize_clamp_bound'): - self.quantize_clamp_bound = kwargs.get('quantize_clamp_bound', float('inf')) - - if not hasattr(self, 'fused_rmsnorm_skip_gamma'): - self.fused_rmsnorm_skip_gamma = kwargs.get('fused_rmsnorm_skip_gamma', False) - - if not hasattr(self, 'mlp_kernel_fuse_residual_add'): - self.mlp_kernel_fuse_residual_add = kwargs.get('mlp_kernel_fuse_residual_add', False) - - if not hasattr(self, 'qkv_kernel_fuse_residual_add'): - self.qkv_kernel_fuse_residual_add = kwargs.get('qkv_kernel_fuse_residual_add', False) - - if not hasattr(self, 'is_prefill_stage'): - self.is_prefill_stage = kwargs.get('is_prefill_stage', False) - - if not hasattr(self, 'attn_tkg_builtin_kernel_enabled'): - self.attn_tkg_builtin_kernel_enabled = kwargs.get('attn_tkg_builtin_kernel_enabled', False) - - def is_mlp_quantized(self): - """Check if MLP is quantized""" - return getattr(self, 'quantized_mlp_kernel_enabled', False) or getattr(self, 'quantized', False) - - class WeightGatheredColumnParallel(ColumnParallelLinear): """ A specialized column-parallel linear layer that implements weight gathering optimization @@ -1073,7 +205,7 @@ class WeightGatheredColumnParallel(ColumnParallelLinear): 1. Standard column-parallel forward (inherited from parent) 2. Weight-gathered forward for long sequences """ - def forward_wg(self, input: torch.Tensor, weight_gather: bool = False): + def forward_wg(self, input: torch, weight_gather: bool = False): """ Performs the forward pass with optional weight gathering optimization. @@ -1139,7 +271,7 @@ def get_required_attributes(self) -> List[str]: @classmethod def get_neuron_config_cls(cls) -> Type[NeuronConfig]: - return NeuronConfigNKI + return NeuronConfig class NeuronLlamaMLP(nn.Module): @@ -1161,15 +293,13 @@ def __init__(self, config: InferenceConfig): ) self.sequence_dimension = 1 if self.sequence_parallel_enabled else None self.rms_norm_eps = config.rms_norm_eps - self.mlp_kernel_enabled = getattr(self.neuron_config, "mlp_kernel_enabled", False) - self.fused_rmsnorm_skip_gamma = getattr(self.config.neuron_config, "fused_rmsnorm_skip_gamma", False) - self.quantized_mlp_kernel_enabled = getattr(self.neuron_config, "quantized_mlp_kernel_enabled", False) - self.rmsnorm_quantize_kernel_enabled = getattr(self.neuron_config, "rmsnorm_quantize_kernel_enabled", False) - self.quantized_kernel_lower_bound = getattr(self.neuron_config, "quantized_kernel_lower_bound", 0.0) - self.quantize_clamp_bound = getattr(self.neuron_config, "quantize_clamp_bound", float('inf')) - self.logical_neuron_cores = getattr(self.neuron_config, "logical_neuron_cores", 1) - self.logical_nc_config = getattr(self.neuron_config, "logical_nc_config", 1) - self.activation_quantization_type = getattr(self.neuron_config, "activation_quantization_type", None) + self.mlp_kernel_enabled = self.neuron_config.mlp_kernel_enabled + self.fused_rmsnorm_skip_gamma = self.config.neuron_config.fused_rmsnorm_skip_gamma + self.quantized_mlp_kernel_enabled = self.neuron_config.quantized_mlp_kernel_enabled + self.rmsnorm_quantize_kernel_enabled = self.neuron_config.rmsnorm_quantize_kernel_enabled + self.quantize_clamp_bound = self.neuron_config.quantize_clamp_bound + self.logical_nc_config = self.neuron_config.logical_nc_config + self.activation_quantization_type = self.neuron_config.activation_quantization_type mlp_bias = getattr(config, "mlp_bias", False) if 
self.neuron_config.quantized_mlp_kernel_enabled and self.quantize_clamp_bound == float( @@ -1180,76 +310,221 @@ def __init__(self, config: InferenceConfig): ) self.quantize_clamp_bound = 1200.0 if parallel_state.model_parallel_is_initialized(): - if CONFIG_FUSE_MLP: - self.gateup = CustomFusedColumnParallelLinear( - self.hidden_size, - self.intermediate_size * 2, - bias=mlp_bias, - gather_output=False, - dtype=config.neuron_config.torch_dtype, - pad=True, - sequence_parallel_enabled=False, - sequence_dimension=None, - tensor_model_parallel_group=get_tp_group(config), + if self.neuron_config.quantized_mlp_kernel_enabled: + # # Quantized MLP kernels expect intermediate size to be multiple of 128, so we need to pad + tp_degree = self.neuron_config.tp_degree + self.intermediate_size += ( + get_padding_length(self.intermediate_size // tp_degree, 128) * tp_degree + ) + logger.debug(f"Quantized intermediate_size: {self.intermediate_size}") + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + bias=mlp_bias, + gather_output=False, + dtype=config.neuron_config.torch_dtype, + pad=True, + sequence_parallel_enabled=False, + sequence_dimension=None, + tensor_model_parallel_group=get_tp_group(config), + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + bias=mlp_bias, + gather_output=False, + dtype=config.neuron_config.torch_dtype, + pad=True, + sequence_parallel_enabled=False, + sequence_dimension=None, + tensor_model_parallel_group=get_tp_group(config), + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=mlp_bias, + input_is_parallel=True, + dtype=config.neuron_config.torch_dtype, + pad=True, + sequence_parallel_enabled=self.sequence_parallel_enabled, + sequence_dimension=self.sequence_dimension, + tensor_model_parallel_group=get_tp_group(config), + reduce_dtype=config.neuron_config.rpl_reduce_dtype, + ) + if self.mlp_kernel_enabled: + if self.neuron_config.quantized_mlp_kernel_enabled: + setattr( + self.gate_proj, + "post_create_quantized_module_hook", + preprocess_quantized_linear_layer, + ) + setattr( + self.up_proj, + "post_create_quantized_module_hook", + preprocess_quantized_linear_layer, + ) + setattr( + self.down_proj, + "post_create_quantized_module_hook", + preprocess_quantized_linear_layer, + ) + else: + # Transpose the weights to the layout expected by kernels + self.gate_proj.weight = transpose_parallel_linear_layer(self.gate_proj.weight) + self.up_proj.weight = transpose_parallel_linear_layer(self.up_proj.weight) + self.down_proj.weight = transpose_parallel_linear_layer(self.down_proj.weight) + + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=mlp_bias) + + def _kernel_enabled_quantized_mlp(self, x, rmsnorm, residual, adapter_ids): + grid = (nc(self.logical_nc_config),) + fused_residual = residual is not None + fused_rmsnorm = rmsnorm is not None + logger.debug( + f"MLP: quantized kernel, fused_residual={fused_residual}, fused_rmsnorm={fused_rmsnorm}, logical_nc_config={self.logical_nc_config}" + ) + + # Can't do residual add in the kernel if SP is enabled + if fused_residual: + assert ( + not self.sequence_parallel_enabled + ), "Quantized MLP cannot have both fused residual add and sequence parallel RMSnorm!" 
+ # Using fused residual add + _mlp_fwd_call = nki_jit()(quant_mlp_fused_add_isa_kernel) + else: + _mlp_fwd_call = nki_jit()(quant_mlp_isa_kernel) + + if fused_rmsnorm: + ln_w = rmsnorm.weight.unsqueeze(0) + else: + ln_w = torch.zeros(size=(1, self.hidden_size), dtype=x.dtype, device=x.device) + + # Handle SP RMSnorm + x_orig_dtype = x.dtype + if self.sequence_parallel_enabled: + # This RMSNormQuant kernel will do quantization inside, so we pass the + # clamp_bound for clipping. + # If we don't use this kernel, the MLP kernel below will do the + # quantization, so we also pass clamp_bound to that kernel. + if self.rmsnorm_quantize_kernel_enabled: + logger.debug( + "Running Quantized MLP kernel with sequence-parallel RMSnorm-Quantize kernel!" ) - self.down_proj = CustomRowParallelLinear( - self.intermediate_size, - self.hidden_size, - bias=mlp_bias, - input_is_parallel=True, - dtype=config.neuron_config.torch_dtype, - pad=True, - sequence_parallel_enabled=self.sequence_parallel_enabled, - sequence_dimension=self.sequence_dimension, - tensor_model_parallel_group=get_tp_group(config), - reduce_dtype=config.neuron_config.rpl_reduce_dtype, + _rmsnorm_quant_fwd_call = nki_jit()(rmsnorm_quant_isa_kernel) + quant_rmsnorm_out = torch.zeros( + size=( + x.shape[0], # batch size + x.shape[1], # sequence length + x.shape[2] + 4, # hidden size + 4 bytes for packing fp32 scale + ), + dtype=torch.int8, + device=x.device, ) - else: - self.gate_proj = CustomColumnParallelLinear( - self.hidden_size, - self.intermediate_size, - bias=mlp_bias, - gather_output=False, - dtype=config.neuron_config.torch_dtype, - pad=True, - sequence_parallel_enabled=False, - sequence_dimension=None, - tensor_model_parallel_group=get_tp_group(config), + clamp_bound = self.quantize_clamp_bound + _rmsnorm_quant_fwd_call[grid]( + x, ln_w, clamp_bound, quant_rmsnorm_out, kernel_name="QuantOnly" ) - self.up_proj = CustomColumnParallelLinear( - self.hidden_size, - self.intermediate_size, - bias=mlp_bias, - gather_output=False, - dtype=config.neuron_config.torch_dtype, - pad=True, - sequence_parallel_enabled=False, - sequence_dimension=None, - tensor_model_parallel_group=get_tp_group(config), + x = gather_from_sequence_parallel_region( + quant_rmsnorm_out, + self.sequence_dimension, + process_group=get_tp_group(self.config), ) - self.down_proj = CustomRowParallelLinear( - self.intermediate_size, - self.hidden_size, - bias=mlp_bias, - input_is_parallel=True, - dtype=config.neuron_config.torch_dtype, - pad=True, - sequence_parallel_enabled=self.sequence_parallel_enabled, - sequence_dimension=self.sequence_dimension, - tensor_model_parallel_group=get_tp_group(config), - reduce_dtype=config.neuron_config.rpl_reduce_dtype, + + else: + logger.debug( + "Running Quantized MLP kernel with external (native compiler) sequence-parallel RMSnorm!" 
) - + x = gather_from_sequence_parallel_region( + x, self.sequence_dimension, process_group=get_tp_group(self.config) + ) + + # Build output tensor + output_tensor_seqlen = x.shape[1] + if fused_residual: + # seqlen dim is doubled to store the residual add output + output_tensor_seqlen *= 2 + + output_tensor = torch.zeros( + size=( + x.shape[0], # batch size + output_tensor_seqlen, + self.hidden_size, # hidden size + ), + dtype=x_orig_dtype, + device=x.device, + ) + + # Grab weights + # all weights of the layers are stored in (out, in) shape + # unsqueeze so that shape of RMS gamma weight is [1, hidden] instead of [hidden] + gate_w = self.gate_proj.weight.data + gate_w_scale = self.gate_proj.scale + up_w = self.up_proj.weight.data + up_w_scale = self.up_proj.scale + down_w = self.down_proj.weight.data + down_w_scale = self.down_proj.scale + clamp_bound = self.quantize_clamp_bound + if fused_residual: + _mlp_fwd_call[grid]( + x, # attn_output + residual, # hidden + ln_w, # ln_w + gate_w, # gate_w + gate_w_scale, + up_w, # up_w + up_w_scale, + down_w, # down_w + down_w_scale, + clamp_bound, + output_tensor, # out + fused_rmsnorm=fused_rmsnorm, + eps=self.rms_norm_eps, + kernel_name="MLP", + store_add=True, + ) + original_seqlen = x.shape[1] + residual = output_tensor[:, original_seqlen:, :] + output_tensor = output_tensor[:, :original_seqlen, :] else: - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=mlp_bias) + _mlp_fwd_call[grid]( + x, # hidden + # should be fine to pass gamma is as a dummy even if not using fused rmsnorm + ln_w, + gate_w, # gate_w + gate_w_scale, + up_w, # up_w + up_w_scale, + down_w, # down_w + down_w_scale, + clamp_bound, + output_tensor, # out + # Run RMSNorm inside the kernel if NOT using SP rmsnorm + fused_rmsnorm=fused_rmsnorm, + eps=self.rms_norm_eps, + kernel_name="MLP", + ) + residual = None - def _kernel_enabled_mlp(self, x, fused_rmsnorm, rmsnorm, residual, adapter_ids): + # All-reduce or reduce-scatter, depending on whether SP is enabled + if self.sequence_parallel_enabled: + output_tensor = reduce_scatter_to_sequence_parallel_region( + output_tensor, self.sequence_dimension, process_group=get_tp_group(self.config) + ) + else: + output_tensor = reduce_from_tensor_model_parallel_region(output_tensor) + + logger.debug(f"Quantized MLP output shape {output_tensor.shape}") + return (output_tensor, residual) + + def _kernel_enabled_mlp(self, x, rmsnorm, residual, adapter_ids): fused_residual = residual is not None + fused_rmsnorm = rmsnorm is not None logger.debug( - f"MLP: kernel, fused_residual={fused_residual}, fused_rmsnorm={fused_rmsnorm}, logical_neuron_cores={self.logical_neuron_cores}" + f"MLP: kernel, fused_residual={fused_residual}, fused_rmsnorm={fused_rmsnorm}, skip_gamma={self.fused_rmsnorm_skip_gamma}, logical_nc_config={self.logical_nc_config}" ) # Choose which kernel to call @@ -1286,12 +561,15 @@ def _kernel_enabled_mlp(self, x, fused_rmsnorm, rmsnorm, residual, adapter_ids): # Grab weights # all weights of the layers are stored in (out, in) shape # unsqueeze so that shape of RMS gamma weight is [1, hidden] instead of [hidden] - ln_w = rmsnorm.weight.unsqueeze(0) + if fused_rmsnorm: + ln_w = rmsnorm.weight.unsqueeze(0) + else: + ln_w = torch.zeros(size=(1, self.hidden_size), dtype=x.dtype, device=x.device) gate_w = self.gate_proj.weight.data up_w = 
self.up_proj.weight.data down_w = self.down_proj.weight.data - grid = (self.logical_neuron_cores,) + grid = (nc(self.logical_nc_config),) if fused_residual: _mlp_fwd_call[grid]( @@ -1302,9 +580,10 @@ def _kernel_enabled_mlp(self, x, fused_rmsnorm, rmsnorm, residual, adapter_ids): up_w, # up_w down_w, # down_w output_tensor, # out + kernel_name="MLP", fused_rmsnorm=fused_rmsnorm, + skip_gamma=self.fused_rmsnorm_skip_gamma, eps=self.rms_norm_eps, - kernel_name="MLP", store_add=True, ) original_seqlen = x.shape[1] @@ -1319,10 +598,11 @@ def _kernel_enabled_mlp(self, x, fused_rmsnorm, rmsnorm, residual, adapter_ids): up_w, down_w, output_tensor, # out + kernel_name="MLP", # Run RMSNorm inside the kernel if NOT using SP rmsnorm fused_rmsnorm=fused_rmsnorm, + skip_gamma=self.fused_rmsnorm_skip_gamma, eps=self.rms_norm_eps, - kernel_name="MLP", ) residual = None @@ -1339,7 +619,7 @@ def _kernel_enabled_mlp(self, x, fused_rmsnorm, rmsnorm, residual, adapter_ids): logger.debug(f"MLP output shape {output_tensor.shape}") return (output_tensor, residual) - def _native_mlp(self, x, rmsnorm, adapter_ids=None, residual=None): + def _native_mlp(self, x, adapter_ids=None): logger.debug("MLP: native compiler") # all-gather is done here instead of CPL layers to # avoid 2 all-gathers from up and gate projections @@ -1347,345 +627,47 @@ def _native_mlp(self, x, rmsnorm, adapter_ids=None, residual=None): x = gather_from_sequence_parallel_region( x, self.sequence_dimension, process_group=get_tp_group(self.config) ) - if CONFIG_FUSE_MLP: - gateup_output, residual = self.gateup(x, rmsnorm, residual) - output = self.down_proj(gateup_output) - - else: - if residual is not None: - x = residual + x - residual = x - if rmsnorm is not None: - x = rmsnorm(x) - gate_proj_output = ( - self.gate_proj(x) - if not is_lora_module(self.gate_proj) - else self.gate_proj(x, adapter_ids) - ) - up_proj_output = ( - self.up_proj(x) if not is_lora_module(self.up_proj) else self.up_proj(x, adapter_ids) - ) - down_proj_input = self.act_fn(gate_proj_output) * up_proj_output - output = ( - self.down_proj(down_proj_input) - if not is_lora_module(self.down_proj) - else self.down_proj(down_proj_input, adapter_ids) - ) + gate_proj_output = ( + self.gate_proj(x) + if not is_lora_module(self.gate_proj) + else self.gate_proj(x, adapter_ids) + ) + + up_proj_output = ( + self.up_proj(x) if not is_lora_module(self.up_proj) else self.up_proj(x, adapter_ids) + ) + + down_proj_input = self.act_fn(gate_proj_output) * up_proj_output + output = ( + self.down_proj(down_proj_input) + if not is_lora_module(self.down_proj) + else self.down_proj(down_proj_input, adapter_ids) + ) logger.debug(f"MLP output shape {output.shape}") - return output, residual + return output def forward(self, x, rmsnorm=None, residual=None, adapter_ids=None): """ If residual is passed in, will fuse its add into the MLP kernel + If rmsnorm is passed in, will fuse the rmsnorm into the MLP kernel Returns a tuple of (output, residual), where residual is the output of the residual add """ + if self.mlp_kernel_enabled: - fused_rmsnorm = not self.sequence_parallel_enabled # Quantized MLP kernel + if self.quantized_mlp_kernel_enabled: + return self._kernel_enabled_quantized_mlp( + x, rmsnorm, residual, adapter_ids=adapter_ids + ) # MLP kernel - return self._kernel_enabled_mlp( - x, fused_rmsnorm, rmsnorm, residual, adapter_ids=adapter_ids - ) + return self._kernel_enabled_mlp(x, rmsnorm, residual, adapter_ids=adapter_ids) else: # No kernel - return self._native_mlp(x, rmsnorm, 
adapter_ids=adapter_ids, residual=residual) - -def smallest_multiple(k, n): - if k % n == 0: - return k - else: - return (k // n + 1) * n - -@nki.jit -def _flash_attention_core(q_local_tile, k, v, - o_buffer, l_buffer, m_buffer, - q_tile_idx, - local_k_large_tile_idx, - kernel_dtype, acc_type, - LARGE_TILE_SZ, - initialize, - B_P_SIZE=128, B_F_SIZE=512, B_D_SIZE=128): - num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE - - qk_res_buf = nl.ndarray((nl.par_dim(B_P_SIZE), LARGE_TILE_SZ), buffer=nl.sbuf, dtype=acc_type) - max_local = nl.ndarray((nl.par_dim(B_P_SIZE), num_k_tile_per_large_tile), dtype=acc_type) - - for k_i in nl.affine_range(num_k_tile_per_large_tile): - k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) - - qk_psum = nl.ndarray((nl.par_dim(B_P_SIZE), B_F_SIZE), - dtype=np.float32, buffer=nl.psum) # (128, 512) - - multiplication_required_selection = q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * LARGE_TILE_SZ + k_i * B_F_SIZE - - - if multiplication_required_selection: - qk_psum[:, :] = nl.matmul(q_local_tile, k[:, k_i_b_f_slice], transpose_x=True) # (p(128), 512) - else: - qk_psum[:, :] = 0 + assert rmsnorm is None and residual is None + return (self._native_mlp(x, adapter_ids=adapter_ids), None) - left_diagonal_selection = q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * LARGE_TILE_SZ + (k_i + 1) * B_F_SIZE - diagonal_and_right_selection = (q_tile_idx * B_P_SIZE < local_k_large_tile_idx * LARGE_TILE_SZ + (k_i + 1) * B_F_SIZE) - - i_q_p, i_q_f = nl.mgrid[0:B_P_SIZE, 0:B_F_SIZE] - q_pos = q_tile_idx * B_P_SIZE + i_q_p - k_pos = local_k_large_tile_idx * LARGE_TILE_SZ + k_i * B_F_SIZE + i_q_f - pred = q_pos >= k_pos - - qk_select_tmp = nl.ndarray(qk_psum.shape, dtype=qk_psum.dtype, buffer=nl.sbuf) - - # For tiles on and to the right of the diagonal, need to do affine_select. - if diagonal_and_right_selection: - qk_select_tmp[...] = qk_psum - - qk_res_buf[:, k_i_b_f_slice] = nisa.affine_select( - pred=pred, - on_true_tile=qk_select_tmp, on_false_value=-9984.0, dtype=acc_type) - - qk_res_buf[:, k_i_b_f_slice] = \ - nl.copy(qk_psum, dtype=acc_type, mask=left_diagonal_selection) - - max_local[:, k_i] = nisa.tensor_reduce( - np.max, qk_res_buf[:, k_i_b_f_slice], axis=(1,), dtype=acc_type, - negate=False) - - max_ = nisa.tensor_reduce(np.max, max_local[:, :], axis=(1, ), - dtype=acc_type, negate=False) - - o_previous_scaled = nl.ndarray((nl.par_dim(B_P_SIZE), B_D_SIZE), dtype=o_buffer.dtype) - - if initialize: - m_buffer[:, 0] = nl.copy(max_) - m_current = max_ - else: - m_previous = nl.copy(m_buffer[:, 0]) - m_buffer[:, 0] = nl.maximum(m_previous, max_) # (128,1) - - m_current = m_buffer[:, 0] - # Compute scaling factor - alpha = nisa.activation(np.exp, m_current, bias=m_previous, scale=-1.0) - o_previous_scaled[...] 
= nl.multiply(o_buffer[:, :], alpha) - - p_local = nl.ndarray((nl.par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) - REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) - - p_partial_sum = nl.ndarray((nl.par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type) - - for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): - k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) - - p_local[:, k_r_i_reduce_slice] = \ - nisa.activation_reduce(np.exp, qk_res_buf[:, k_r_i_reduce_slice], - bias=-1 * m_current, scale=1.0, - reduce_op=nl.add, reduce_res=p_partial_sum[:, k_r_i], - dtype=kernel_dtype) - - ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type) - - p_local_transposed = nl.ndarray((nl.par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) - for j in nl.affine_range(LARGE_TILE_SZ // 128): - if nisa.get_nc_version() == nisa.nc_version.gen3: - p_local_transposed[:, nl.ds(j * 128, 128)] = nisa.dma_transpose( - p_local[:, nl.ds(j * 128, 128)]) - else: - p_local_transposed[:, nl.ds(j * 128, 128)] = nisa.nc_transpose( - p_local[:, nl.ds(j * 128, 128)]) - - pv_psum = nl.zeros((nl.par_dim(B_P_SIZE), B_D_SIZE), dtype=np.float32, - buffer=nl.psum, lazy_initialization=True) - for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): - pv_psum[:, :] += nl.matmul(p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], - v[k_i, :, :], transpose_x=True) # (128, 128) (p(Br), d) - - if initialize: - o_buffer[:, :] = nl.copy(pv_psum[:, :]) - l_buffer[:, 0] = nl.add(nl.log(ps), max_) - else: - o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum) - - exp = nisa.activation(nl.exp, m_current, bias=l_buffer[:, 0], scale=-1.0) - l_buffer[:, 0] = nl.add(m_current, nisa.activation(nl.log, exp, bias=ps)) - - -@nki.jit -def flash_attention_fwd(q, k, v, LARGE_TILE_SZ): - B_F_SIZE=LARGE_TILE_SZ - B_P_SIZE=128 - b, h, d, seqlen_q = q.shape - B_D_SIZE=d - _, k_h, _, seqlen_k = k.shape - - assert tuple(v.shape) == (b, k_h, seqlen_k, d), f"Expect shape of V to be {(b, k_h, seqlen_k, d)} (batch, heads, seqlen_k, d_head) but got {v.shape}" - assert tuple(k.shape) == (b, k_h, d, seqlen_k), f"Expect shape of K to be {(b, k_h, d, seqlen_k)} (batch, heads, d_head, seqlen_k) but got {k.shape}" - assert d <= 128, f" we do not support head_dim > 128, got head dim {d}" - kernel_dtype = nl.bfloat16 - acc_type = np.dtype(np.float32) - - o = nl.ndarray((b, h, seqlen_q, d), dtype=q.dtype, buffer=nl.shared_hbm) - - assert nl.program_ndim() == 2,\ - f'Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!' 
- batch_id = nl.program_id(axis=0) - head_id = nl.program_id(axis=1) - - softmax_scale = 1.0 / (d ** 0.5) - - n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine - - PAR_LEN = 512 - - assert seqlen_k % LARGE_TILE_SZ == 0, f"Need seqlen_k to be divisible by {LARGE_TILE_SZ} but got {seqlen_k}" - num_large_k_tile = seqlen_k // LARGE_TILE_SZ - - q_h_per_k_h = h // k_h - - PAR_LEN = min(n_tile_q, PAR_LEN) - n_remat = cdiv(n_tile_q, PAR_LEN) - - for i_q_h in nl.affine_range(q_h_per_k_h): - l_buffer = nl.zeros((nl.par_dim(B_P_SIZE), n_tile_q), dtype=acc_type, - buffer=nl.sbuf, lazy_initialization=False) - - for i0 in nl.sequential_range(n_remat): - o_buffer = nl.zeros((PAR_LEN, nl.par_dim(B_P_SIZE), d), dtype=acc_type, - buffer=nl.sbuf, lazy_initialization=False) - m_buffer = nl.zeros((PAR_LEN, nl.par_dim(B_P_SIZE), 1), dtype=acc_type, - buffer=nl.sbuf, lazy_initialization=False) - - for j in nl.sequential_range(0, num_large_k_tile): - cur_k_tile = nl.ndarray((nl.par_dim(B_D_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) - cur_v_tile = nl.ndarray((LARGE_TILE_SZ // B_P_SIZE, nl.par_dim(B_P_SIZE), B_D_SIZE), dtype=kernel_dtype) - # print(k.shape, int(batch_id), int(head_id), j, nl.ds(j*LARGE_TILE_SZ, LARGE_TILE_SZ)) - cur_k_tile[:, :] = nl.load(k[batch_id, head_id, :, nl.ds(j*LARGE_TILE_SZ, LARGE_TILE_SZ)]) - - load_tile_size = B_P_SIZE - - v_calc = v[batch_id, head_id] - for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size): - cur_v_tile[v_i, :, :] = nl.load( - v_calc[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :], - dtype=cur_v_tile.dtype) - for i1 in nl.affine_range(PAR_LEN): - i = i0 * PAR_LEN + i1 - - forward_mask = i * B_P_SIZE >= j * LARGE_TILE_SZ - - if (i < n_tile_q) & forward_mask: - q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) - q_hbm_tile = q[batch_id, head_id * q_h_per_k_h + i_q_h] - q_sbuf_tile = nl.load(q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], - dtype=kernel_dtype) # load (d, 128) tile in SBUF - q_tile[:, :] = q_sbuf_tile * softmax_scale - - _flash_attention_core(q_local_tile=q_tile, k=cur_k_tile, v=cur_v_tile, - o_buffer=o_buffer[i1], l_buffer=l_buffer[:, i], m_buffer=m_buffer[i1], - q_tile_idx=i, local_k_large_tile_idx=j, - kernel_dtype=kernel_dtype, acc_type=acc_type, - LARGE_TILE_SZ=LARGE_TILE_SZ, - initialize=(j == 0), - B_P_SIZE=B_P_SIZE, B_F_SIZE=B_F_SIZE, B_D_SIZE=B_D_SIZE) - - for i1 in nl.affine_range(PAR_LEN): - i = i0 * PAR_LEN + i1 - - if i < n_tile_q: - exp = nisa.activation(np.exp, l_buffer[:, i], bias=m_buffer[i1, :, :], - scale=-1.0) - out = nl.multiply(o_buffer[i1, :, :], exp, - dtype=kernel_dtype) - - nl.store(o[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i*B_P_SIZE, B_P_SIZE), :], value=out) - return o - - -def flash_decode_core(qk_raw, v_tile, kernel_dtype, B_P_SIZE, calc_mask, acc_type, l_buffer, o_buffer, m_buffer, q_head_per_k): - qk = nl.where(calc_mask, qk_raw, -9984.0) - - qk_max = nisa.tensor_reduce(np.max, qk, axis=(1,)) # (1, 1) - - qk_reduce = nl.ndarray((q_head_per_k, 1), dtype=acc_type, buffer=nl.sbuf, lazy_initialization=True) - qk_soft = nisa.activation_reduce(np.exp, qk, - bias=-1 * qk_max, scale=1.0, - reduce_op=nl.add, reduce_res=qk_reduce, - dtype=kernel_dtype) - - qk_soft_transposed = nisa.nc_transpose(qk_soft[:, nl.ds(0, B_P_SIZE)]) - - o_tile = nl.matmul(qk_soft_transposed, v_tile, transpose_x=True) - - o_buffer[...] = o_tile - m_buffer[...] = qk_max - l_buffer[...] 
= qk_reduce - -@nki.jit() -def flash_decode(q, k, v, mask): - batch_id = nl.program_id(axis=0) - head_id = nl.program_id(axis=1) - bsz, h, d = q.shape - k_h = k.shape[1] - q_h_per_k_h = h // k_h - kernel_dtype = nl.bfloat16 - acc_type = nl.float32 - softmax_scale = 1.0 / (d ** 0.5) - PAR_LEN = 128 - - o = nl.ndarray((bsz, h, d), dtype=kernel_dtype, buffer=nl.shared_hbm) - - k_total_len = k.shape[-1] - PARTITION_CNT = cdiv(k_total_len, PAR_LEN) - FULL_PARTITION_CNT = k_total_len // PAR_LEN - - k_calc = k[batch_id, head_id] - v_calc = v[batch_id, head_id] - q_calc = q[batch_id, head_id * q_h_per_k_h: (head_id + 1) * q_h_per_k_h] - - o_buffer = nl.ndarray((q_h_per_k_h, PARTITION_CNT, d), dtype=acc_type, buffer=nl.sbuf, lazy_initialization=True) - m_buffer = nl.ndarray((q_h_per_k_h, PARTITION_CNT), dtype=acc_type, buffer=nl.sbuf, lazy_initialization=True) - l_buffer = nl.ndarray((q_h_per_k_h, PARTITION_CNT), dtype=acc_type, buffer=nl.sbuf, lazy_initialization=True) - - q_sbuf_tile = nisa.nc_transpose(nl.load(q_calc, dtype=kernel_dtype)) # load (d, 1) tile in SBUF - q_tile = q_sbuf_tile * softmax_scale - - for par in nl.affine_range(FULL_PARTITION_CNT): - k_tile = nl.load(k_calc[:, nl.ds(par * PAR_LEN, PAR_LEN)], dtype=kernel_dtype) - qk = nl.matmul(q_tile, k_tile, transpose_x=True) - v_tile = nl.load(v_calc[nl.ds(par * PAR_LEN, PAR_LEN), :], dtype=kernel_dtype) - calc_mask = nl.load(mask[nl.ds(batch_id, 1), par * PAR_LEN: par * PAR_LEN + PAR_LEN]).broadcast_to((q_h_per_k_h, PAR_LEN)) - flash_decode_core(qk, v_tile, kernel_dtype, PAR_LEN, calc_mask, acc_type, l_buffer[:, par], o_buffer[:, par, :], m_buffer[:, par], q_h_per_k_h) - - if FULL_PARTITION_CNT != PARTITION_CNT: - len_remain = k_total_len - FULL_PARTITION_CNT * PAR_LEN - k_tile = nl.load(k_calc[:, nl.ds(FULL_PARTITION_CNT * PAR_LEN, len_remain)], dtype=kernel_dtype) - qk = nl.matmul(q_tile, k_tile, transpose_x=True) - v_tile = nl.load(v_calc[nl.ds(FULL_PARTITION_CNT * PAR_LEN, len_remain), :], dtype=kernel_dtype) - calc_mask = nl.load(mask[nl.ds(batch_id, 1), FULL_PARTITION_CNT * PAR_LEN: FULL_PARTITION_CNT * PAR_LEN + len_remain]).broadcast_to((q_h_per_k_h, len_remain)) - flash_decode_core(qk, v_tile, kernel_dtype, len_remain, calc_mask, acc_type, l_buffer[:, FULL_PARTITION_CNT], o_buffer[:, FULL_PARTITION_CNT, :], m_buffer[:, FULL_PARTITION_CNT], q_h_per_k_h) - - qk_new_max = nisa.tensor_reduce(np.max, m_buffer, axis=(1, ), dtype=acc_type, negate=True) - qk_exp = nisa.activation(np.exp, m_buffer, bias=qk_new_max) - - for par in nl.affine_range(PARTITION_CNT): - o_buffer[:, par, :] = nl.multiply(o_buffer[:, par, :], qk_exp[:, par]) - - l_buffer = nl.multiply(l_buffer, qk_exp) - scales = nisa.tensor_reduce(nl.add, l_buffer, axis=(1,), dtype=acc_type) - - o_buffer_reduced = nisa.tensor_reduce(nl.add, o_buffer, axis=(1,), dtype=acc_type) - - scales = (1 / scales).broadcast_to((q_h_per_k_h, d)) - out = nl.multiply(o_buffer_reduced, scales) - - nl.store(o[batch_id, head_id * q_h_per_k_h: (head_id + 1) * q_h_per_k_h], value=out) - return o - -def get_suitable_len(size): - if size <= 128: - return 128 - return 256 - @register_module("NeuronLlamaAttention") class NeuronLlamaAttention(NeuronAttentionBase): """ @@ -1705,7 +687,7 @@ def __init__(self, config: InferenceConfig, tensor_model_parallel_group=None): self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads - self.head_dim = self.hidden_size // self.num_attention_heads + self.head_dim = 
getattr(config, "head_dim", self.hidden_size // self.num_attention_heads) self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.padding_side = config.neuron_config.padding_side @@ -1717,6 +699,7 @@ def __init__(self, config: InferenceConfig, tensor_model_parallel_group=None): self.rpl_reduce_dtype = config.neuron_config.rpl_reduce_dtype self.mlp_kernel_enabled = config.neuron_config.mlp_kernel_enabled self.rms_norm_eps = config.rms_norm_eps + self.attn_tkg_builtin_kernel_enabled = self.neuron_config.attn_tkg_builtin_kernel_enabled if parallel_state.model_parallel_is_initialized(): self.tp_degree = self.config.neuron_config.tp_degree @@ -1735,57 +718,12 @@ def __init__(self, config: InferenceConfig, tensor_model_parallel_group=None): self.init_gqa_properties() self.init_rope() - global NKI_ENABLED - if NKI_ENABLED: - self.qkv_proj.q_proj = CustomColumnParallelLinear( - self.qkv_proj.hidden_size, - self.qkv_proj.num_attention_heads * self.qkv_proj.head_dim, - bias=self.qkv_proj.bias, - gather_output=self.qkv_proj.gather_output, - dtype=self.torch_dtype, - sequence_parallel_enabled=False, - tensor_model_parallel_group=self.qkv_proj.tensor_model_parallel_group, - ) - self.qkv_proj.k_proj = CustomColumnParallelLinear( - self.qkv_proj.hidden_size, - self.qkv_proj.num_key_value_heads * self.qkv_proj.head_dim, - bias=self.qkv_proj.bias, - gather_output=self.qkv_proj.gather_output, - dtype=self.torch_dtype, - sequence_parallel_enabled=False, - tensor_model_parallel_group=self.qkv_proj.tensor_model_parallel_group, - ) - self.qkv_proj.v_proj = CustomColumnParallelLinear( - self.qkv_proj.hidden_size, - self.qkv_proj.num_key_value_heads * self.qkv_proj.head_dim, - bias=self.qkv_proj.bias, - gather_output=self.qkv_proj.gather_output, - dtype=self.torch_dtype, - sequence_parallel_enabled=False, - tensor_model_parallel_group=self.qkv_proj.tensor_model_parallel_group, - ) - self.o_proj.o_proj = CustomRowParallelLinear( - self.o_proj.num_attention_heads * self.o_proj.head_dim, - self.o_proj.hidden_size, - bias=self.o_proj.bias, - input_is_parallel=self.o_proj.input_is_parallel, - dtype=self.torch_dtype, - sequence_parallel_enabled=False, - sequence_dimension=self.sequence_dimension, - tensor_model_parallel_group=self.o_proj.tensor_model_parallel_group, - reduce_dtype=self.rpl_reduce_dtype, - ) - def init_rope(self): if not hasattr(self.config, "rope_scaling") or self.config.rope_scaling is None: # TODO(yihsian): Check if we can just use our own implementation if self.is_medusa: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) + self.rotary_emb = LlamaRotaryEmbedding(self.config) else: self.rotary_emb = RotaryEmbedding( self.head_dim, @@ -1813,101 +751,11 @@ def init_rope(self): # Warning: The HF implementation may have precision issues when run on Neuron. # We include it here for compatibility with other scaling types. 
self.rotary_emb = LlamaRotaryEmbedding(self.config) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - active_mask: Optional[torch.LongTensor] = None, - adapter_ids=None, - cos_cache: Optional[torch.Tensor] = None, - sin_cache: Optional[torch.Tensor] = None, - rmsnorm=None, - seq_ids: Optional[torch.LongTensor] = None, # Added for latest SDK - **kwargs, # Catch any other unexpected kwargs - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: - - if USE_FLASH: - if not past_key_value: - Q, K, V, cos_cache, sin_cache = self.prep_qkv_tensors( - position_ids, - hidden_states, - past_key_value, - adapter_ids=adapter_ids, - cos_cache=cos_cache, - sin_cache=sin_cache, - rmsnorm=rmsnorm, - ) - raw_k = K - raw_v = V - bsz, h, seq_len, dim = Q.shape - Q = Q.permute(0, 1, 3, 2) - K = K.permute(0, 1, 3, 2) - LARGE_TILE_SZ = get_suitable_len(seq_len) - q_pad_size = smallest_multiple(seq_len, 128) - seq_len - kv_pad_size = smallest_multiple(seq_len, LARGE_TILE_SZ) - seq_len - K = F.pad(K, (0, kv_pad_size)) - V = F.pad(V, (0, 0, 0, kv_pad_size)) - Q = F.pad(Q, (0, q_pad_size)) - - # Q [bsz, h, dim, seq_len] - # K [bsz, h, dim, seq_len] - # V [bsz, h, seq_len, dim] - - o = flash_attention_fwd[bsz, self.num_key_value_heads](Q, K, V, LARGE_TILE_SZ) - o = o.permute(0, 2, 1, 3).view(bsz, -1, h * dim)[:,:seq_len,:] - o = self.o_proj(o, adapter_ids=adapter_ids) - return o, (raw_k, raw_v), cos_cache, sin_cache - else: - Q, K, V, cos_cache, sin_cache = self.prep_qkv_tensors( - position_ids, - hidden_states, - past_key_value, - adapter_ids=adapter_ids, - cos_cache=cos_cache, - sin_cache=sin_cache, - rmsnorm=rmsnorm, - ) - - raw_k = K - raw_v = V - bsz, h, seq_len, dim = Q.shape - - k_cache = past_key_value[0] - v_cache = past_key_value[1] - Q = Q.squeeze(2) - K = torch.cat([k_cache, K], dim=-2) - V = torch.cat([v_cache, V], dim=-2) - - K = K.permute(0, 1, 3, 2) - - mask = F.pad(attention_mask.squeeze(1).squeeze(1), (0, 1), value=True) - - o = flash_decode[bsz, self.num_key_value_heads](Q, K, V, mask) - - o = o.view(bsz, 1, self.num_heads * self.head_dim) - o = self.o_proj(o, adapter_ids=adapter_ids) - - return o, (raw_k, raw_v), cos_cache, sin_cache - - # Pass seq_ids to parent class if needed - o, past, cos, sin = super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - active_mask=active_mask, - adapter_ids=adapter_ids, - cos_cache=cos_cache, - sin_cache=sin_cache, - rmsnorm=rmsnorm, - seq_ids=seq_ids, # Pass seq_ids to parent - **kwargs, # Pass any other kwargs - ) - return o, past, cos, sin - + + if self.attn_tkg_builtin_kernel_enabled: + self.inv_freqs = self.rotary_emb.get_inv_freqs().unsqueeze(1) + + # TODO: Modularize RotaryEmbedding. See how HF transformers does it in 4.43. 
class Llama3RotaryEmbedding(nn.Module): """ @@ -1938,31 +786,32 @@ def __init__( self.old_context_len = original_max_position_embeddings self.register_buffer("inv_freq", None, persistent=False) + def get_inv_freqs(self, device: Optional[torch.device] = None) -> torch.Tensor: + freq_indices = torch.arange(0, self.dim, 2, dtype=torch.float, device=device) + inv_freq = 1.0 / (self.base ** (freq_indices / self.dim)) + + low_freq_wavelen = self.old_context_len / self.low_freq_factor + high_freq_wavelen = self.old_context_len / self.high_freq_factor + new_freqs = [] + for freq in inv_freq: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / self.factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (self.old_context_len / wavelen - self.low_freq_factor) / ( + self.high_freq_factor - self.low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / self.factor + smooth * freq) + return torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device) + @torch.no_grad() def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] if self.inv_freq is None: - inv_freq = 1.0 / ( - self.base - ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim) - ) - - low_freq_wavelen = self.old_context_len / self.low_freq_factor - high_freq_wavelen = self.old_context_len / self.high_freq_factor - new_freqs = [] - for freq in inv_freq: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - new_freqs.append(freq) - elif wavelen > low_freq_wavelen: - new_freqs.append(freq / self.factor) - else: - assert low_freq_wavelen != high_freq_wavelen - smooth = (self.old_context_len / wavelen - self.low_freq_factor) / ( - self.high_freq_factor - self.low_freq_factor - ) - new_freqs.append((1 - smooth) * freq / self.factor + smooth * freq) - self.inv_freq = torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device) + self.inv_freq = self.get_inv_freqs(x.device) inv_freq_expanded = ( self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) @@ -1984,9 +833,11 @@ class NeuronLlamaDecoderLayer(nn.Module): def __init__(self, config: InferenceConfig): super().__init__() self.hidden_size = config.hidden_size + self.self_attn = _LLAMA_MODULE_MAP[config.neuron_config.attn_cls]( config=config, tensor_model_parallel_group=get_tp_group(config) ) + self.mlp = NeuronLlamaMLP(config) logger.debug( f"Instantiating RMSNorm modules with hidden size {config.hidden_size} and EPS {config.rms_norm_eps}" @@ -1999,24 +850,22 @@ def __init__(self, config: InferenceConfig): self.input_layernorm = get_rmsnorm_cls()( config.hidden_size, eps=config.rms_norm_eps, - nki_enabled=config.neuron_config.nki_enabled, ) self.post_attention_layernorm = get_rmsnorm_cls()( config.hidden_size, eps=config.rms_norm_eps, - nki_enabled=config.neuron_config.nki_enabled, ) self.qkv_kernel_enabled = config.neuron_config.qkv_kernel_enabled self.mlp_kernel_enabled = config.neuron_config.mlp_kernel_enabled - self.quantized_mlp_kernel_enabled = getattr(config.neuron_config, 'quantized_mlp_kernel_enabled', False) - self.rmsnorm_quantize_kernel_enabled = getattr(config.neuron_config, 'rmsnorm_quantize_kernel_enabled', False) - self.mlp_kernel_fuse_residual_add = getattr(config.neuron_config, 'mlp_kernel_fuse_residual_add', False) - self.qkv_kernel_fuse_residual_add = getattr(config.neuron_config, 'qkv_kernel_fuse_residual_add', False) - 
self.sequence_parallel_enabled = getattr(config.neuron_config, 'sequence_parallel_enabled', False) - self.is_prefill_stage = getattr(config.neuron_config, 'is_prefill_stage', False) + self.quantized_mlp_kernel_enabled = config.neuron_config.quantized_mlp_kernel_enabled + self.rmsnorm_quantize_kernel_enabled = config.neuron_config.rmsnorm_quantize_kernel_enabled + self.mlp_kernel_fuse_residual_add = config.neuron_config.mlp_kernel_fuse_residual_add + self.qkv_kernel_fuse_residual_add = config.neuron_config.qkv_kernel_fuse_residual_add + self.sequence_parallel_enabled = config.neuron_config.sequence_parallel_enabled + self.is_prefill_stage = config.neuron_config.is_prefill_stage self.config = config - - if self.is_prefill_stage and hasattr(config.neuron_config, 'is_mlp_quantized') and config.neuron_config.is_mlp_quantized(): + + if self.is_prefill_stage and self.config.neuron_config.is_mlp_quantized(): # for CTE, quantized MLP kernel does not support fused rmsnorm self.mlp_kernel_fused_rmsnorm = False else: @@ -2030,21 +879,16 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, adapter_ids=None, rotary_position_ids: Optional[torch.LongTensor] = None, - residual: Optional[torch.Tensor] = None, - seq_ids: Optional[torch.LongTensor] = None, # Added for latest SDK + residual: Optional[torch.Tensor] = None, # residual from previous layer used by QKV **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]], Optional[torch.FloatTensor], Optional[torch.FloatTensor], Optional[torch.FloatTensor]]: entry_hidden_states = hidden_states - - # Initialize residual if not provided (first layer) - if residual is None: - residual = hidden_states - # RMSNorm (fused with QKV kernel when SP is disabled) if (not self.qkv_kernel_enabled or self.sequence_parallel_enabled) and self.input_layernorm: hidden_states = self.input_layernorm(hidden_states) # Self Attention + # produced another residual used by MLP attn_output = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, @@ -2053,31 +897,26 @@ def forward( adapter_ids=adapter_ids, rmsnorm=self.input_layernorm, rotary_position_ids=rotary_position_ids, - residual=residual if self.qkv_kernel_fuse_residual_add else None, - seq_ids=seq_ids, # Pass seq_ids to attention + residual=residual, **kwargs, ) - - # Handle different return formats from attention - if hasattr(attn_output, 'hidden_states'): - # NamedTuple or similar structure - hidden_states = attn_output.hidden_states - present_key_value = attn_output.present_key_value - cos_cache = getattr(attn_output, 'cos_cache', None) - sin_cache = getattr(attn_output, 'sin_cache', None) - attn_residual = getattr(attn_output, 'residual', None) - else: - # Tuple format - hidden_states = attn_output[0] - present_key_value = attn_output[1] - cos_cache = attn_output[2] if len(attn_output) > 2 else None - sin_cache = attn_output[3] if len(attn_output) > 3 else None - attn_residual = attn_output[4] if len(attn_output) > 4 else None - # Update residual if returned from attention - if attn_residual is not None: - residual = attn_residual + if attn_output.residual is None: + residual = entry_hidden_states # input to attention + else: + # residual will only be returned by attn/qkv if fuse add qkv kernel is enabled + assert self.qkv_kernel_fuse_residual_add, \ + "residual add before qkv should be computed in the previous layer, \ + unless qkv_kernel_fuse_residual_add is specified" + assert ( + not self.sequence_parallel_enabled + ), 
"qkv_kernel_fuse_residual_add should be off when sequence parallelism is enabled" + assert ( + self.qkv_kernel_enabled + ), "qkv_kernel_fuse_residual_add should be used with qkv_kernel_enabled" + residual = attn_output.residual + hidden_states = attn_output.hidden_states if self.mlp_kernel_enabled and self.mlp_kernel_fuse_residual_add: assert ( not self.sequence_parallel_enabled @@ -2090,31 +929,28 @@ def forward( adapter_ids=adapter_ids, ) else: - # Add residual for attention output hidden_states = residual + hidden_states residual = hidden_states - - # RMSNorm (fused with MLP kernel when conditions are met) + # RMSNorm (fused with QKV kernel when SP is disabled) if self.mlp_kernel_enabled and self.mlp_kernel_fused_rmsnorm: rmsnorm = self.post_attention_layernorm else: hidden_states = self.post_attention_layernorm(hidden_states) rmsnorm = None - hidden_states, _ = self.mlp( hidden_states, rmsnorm=rmsnorm, - residual=None, # Don't pass residual if not fusing adapter_ids=adapter_ids, ) - # Final residual connection (unless using qkv_kernel_fuse_residual_add for next layer) + # if fuse residual add with qkv, we leave this add to the next layer's QKV + # unless it is the last layer in which case we add it here if not self.qkv_kernel_fuse_residual_add: hidden_states = residual + hidden_states residual = None # set to None to prevent it from being used again - # Return 5 elements as expected by the model base class - outputs = (hidden_states, present_key_value, cos_cache, sin_cache, residual) + # also return residual for QKV in the next layer + outputs = (hidden_states, attn_output.present_key_value, attn_output.cos_cache, attn_output.sin_cache, residual) return outputs @@ -2168,9 +1004,7 @@ def setup_attr_for_model(self, config: InferenceConfig): def init_model(self, config: InferenceConfig): self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - global NKI_ENABLED - NKI_ENABLED = config.neuron_config.nki_enabled - + if parallel_state.model_parallel_is_initialized(): self.embed_tokens = ParallelEmbedding( config.vocab_size, @@ -2178,13 +1012,14 @@ def init_model(self, config: InferenceConfig): self.padding_idx, dtype=config.neuron_config.torch_dtype, shard_across_embedding=not config.neuron_config.vocab_parallel, - sequence_parallel_enabled=False, + sequence_parallel_enabled=config.neuron_config.sequence_parallel_enabled, + sequence_dimension=1, pad=True, tensor_model_parallel_group=get_tp_group(config), use_spmd_rank=config.neuron_config.vocab_parallel, ) - self.lm_head = CustomColumnParallelLinear( + self.lm_head = ColumnParallelLinear( config.hidden_size, config.vocab_size, gather_output=not self.on_device_sampling, @@ -2204,12 +1039,12 @@ def init_model(self, config: InferenceConfig): bias=False, ) - # Use get_updated_configs to handle per-layer configurations updated_configs = get_updated_configs(config) + self.layers = nn.ModuleList([NeuronLlamaDecoderLayer(conf) for conf in updated_configs]) - + if not config.neuron_config.is_eagle_draft: - self.norm = get_rmsnorm_cls()(config.hidden_size, eps=config.rms_norm_eps, nki_enabled=config.neuron_config.nki_enabled) + self.norm = get_rmsnorm_cls()(config.hidden_size, eps=config.rms_norm_eps) if config.neuron_config.is_eagle_draft: fc_bias = getattr(config, "fc_bias", False) @@ -2238,58 +1073,7 @@ def init_model(self, config: InferenceConfig): ) setattr(self, f"medusa_head_{i}", medusa_head) - if config.neuron_config.is_eagle_draft: - fc_bias = getattr(config, "fc_bias", False) - self.fc = CustomColumnParallelLinear( - 
config.hidden_size * 2, config.hidden_size, bias=fc_bias, gather_output=True - ) - self.is_medusa = config.neuron_config.is_medusa - self.num_medusa_heads = config.neuron_config.num_medusa_heads - self.medusa_speculation_length = config.neuron_config.medusa_speculation_length - if self.is_medusa: - if parallel_state.model_parallel_is_initialized(): - medusa_head_cls = ColumnParallelLinear - else: - medusa_head_cls = nn.Linear - for i in range(self.num_medusa_heads): - medusa_head = nn.Sequential( - *([ResBlock(config.hidden_size)] * 1), - medusa_head_cls( - config.hidden_size, - config.vocab_size, - gather_output=not self.on_device_sampling, - bias=False, - ), - ) - setattr(self, f"medusa_head_{i}", medusa_head) - -def fuse_mlp(llama_state_dict, cfg: InferenceConfig): - """ - This function concats the qkv weights to a Wqkv weight for fusedqkv, and deletes the qkv weights. - """ - tp_size = cfg.neuron_config.tp_degree - for l in range(cfg.num_hidden_layers): # noqa: E741 - dummpy_concat = torch.cat( - [ - llama_state_dict[f"layers.{l}.mlp.gate_proj.weight"], - llama_state_dict[f"layers.{l}.mlp.up_proj.weight"], - ], - ) - hidden = llama_state_dict[f"layers.{l}.mlp.gate_proj.weight"].shape[0] - per_tp_output = hidden // tp_size - output = torch.zeros_like(dummpy_concat) - for i in range(tp_size): - output[2 * i * per_tp_output : (2 * i + 1) * per_tp_output,:] = llama_state_dict[f"layers.{l}.mlp.gate_proj.weight"][i * per_tp_output : (i + 1) * per_tp_output,:] - output[(2 * i + 1) * per_tp_output : (2 * i + 2) * per_tp_output,:] = llama_state_dict[f"layers.{l}.mlp.up_proj.weight"][i * per_tp_output : (i + 1) * per_tp_output,:] - llama_state_dict[f"layers.{l}.mlp.gateup.weight"] = output - del llama_state_dict[f"layers.{l}.mlp.gate_proj.weight"] - del llama_state_dict[f"layers.{l}.mlp.up_proj.weight"] - del dummpy_concat - - gc.collect() - # print(llama_state_dict.keys()) - return llama_state_dict class NeuronLlamaForCausalLM(NeuronBaseForCausalLM): """ This class extends LlamaForCausalLM create traceable @@ -2308,12 +1092,11 @@ def load_hf_model(model_path, **kwargs): @staticmethod def convert_hf_to_neuron_state_dict(state_dict: dict, config: InferenceConfig) -> dict: """This function should be over-ridden in child classes as needed""" + neuron_config = config.neuron_config - - # Handle fused_rmsnorm_skip_gamma transformation + # to facilitate rank usage in attention num_layers = config.num_hidden_layers tp_degree = neuron_config.tp_degree - for i in range(num_layers): state_dict[f"layers.{i}.self_attn.rank_util.rank"] = torch.arange( 0, tp_degree, dtype=torch.int32 @@ -2325,19 +1108,19 @@ def convert_hf_to_neuron_state_dict(state_dict: dict, config: InferenceConfig) - up_w_prime = (up_w.T * gamma).T """ if ( - getattr(neuron_config, 'fused_rmsnorm_skip_gamma', False) - and not getattr(neuron_config, 'sequence_parallel_enabled', False) + neuron_config.fused_rmsnorm_skip_gamma + and not neuron_config.sequence_parallel_enabled ): - if getattr(neuron_config, 'mlp_kernel_enabled', False): + if neuron_config.mlp_kernel_enabled: # MLP state_dict[f"layers.{i}.mlp.gate_proj.weight"] = state_dict[ f"layers.{i}.mlp.gate_proj.weight" - ] * state_dict[f"layers.{i}.post_attention_layernorm.weight"].unsqueeze(0) + ] * state_dict[f"layers.{i}.input_layernorm.weight"].unsqueeze(0) state_dict[f"layers.{i}.mlp.up_proj.weight"] = state_dict[ f"layers.{i}.mlp.up_proj.weight" - ] * state_dict[f"layers.{i}.post_attention_layernorm.weight"].unsqueeze(0) + ] * 
state_dict[f"layers.{i}.input_layernorm.weight"].unsqueeze(0) - if getattr(neuron_config, 'qkv_kernel_enabled', False): + if neuron_config.qkv_kernel_enabled: # QKV state_dict[f"layers.{i}.self_attn.q_proj.weight"] = state_dict[ f"layers.{i}.self_attn.q_proj.weight" @@ -2348,11 +1131,10 @@ def convert_hf_to_neuron_state_dict(state_dict: dict, config: InferenceConfig) - state_dict[f"layers.{i}.self_attn.v_proj.weight"] = state_dict[ f"layers.{i}.self_attn.v_proj.weight" ] * state_dict[f"layers.{i}.input_layernorm.weight"].unsqueeze(0) - + if neuron_config.fused_qkv: state_dict = convert_state_dict_to_fused_qkv(state_dict, config) - if CONFIG_FUSE_MLP: - state_dict = fuse_mlp(state_dict, config) + if neuron_config.vocab_parallel: # TODO: this hack can be removed after replication_id is ready to use state_dict["embed_tokens.rank_util.rank"] = torch.arange( @@ -2362,13 +1144,9 @@ def convert_hf_to_neuron_state_dict(state_dict: dict, config: InferenceConfig) - # to facilitate rank usage in base model state_dict["rank_util.rank"] = torch.arange(0, tp_degree, dtype=torch.int32) return state_dict - def get_compiler_args(self): - res = super().get_compiler_args() - - + @staticmethod def update_state_dict_for_tied_weights(state_dict): - state_dict["lm_head.weight"] = state_dict["embed_tokens.weight"].clone() @classmethod diff --git a/src/inference/main.py b/src/inference/main.py index 7c99e4c..f209d85 100644 --- a/src/inference/main.py +++ b/src/inference/main.py @@ -7,7 +7,6 @@ import os import time import torch -import re from torch_neuronx.pyhlo.hlo_pb2 import HloModuleProto from torch_neuronx.testing.validation import logit_validation @@ -28,7 +27,7 @@ from llama import NeuronLlamaForCausalLM from test import * -BENCHMARK_REPORT_FILENAME = "benchmark_report.json" +BENCHMARK_REPORT_FILENAME = "benchmark_inference.json" set_random_seed(0) @@ -105,20 +104,33 @@ def parse_args(): return parser.parse_args() def parse_prompts(filepath): + """Parse prompt performance data from JSON file""" with open(filepath, 'r') as file: - arr = file.read().split('\n\n') - arr = [prompt.strip() for prompt in arr if prompt.strip()] - return arr + data = json.load(file) + + # Convert the JSON data to the expected format (list of lists) + # Each inner list contains: [index, word_count, sequence_length, baseline_latency_ms, baseline_throughput] + prompt_data = [] + for item in data['prompt_performance_data']: + prompt_data.append([ + str(item['index']), + str(item['word_count']), + str(item['sequence_length']), + str(item['baseline_latency_ms']), + str(item['baseline_throughput']) + ]) + + return prompt_data def parse_prompt_data(filepath): + """Parse prompts from JSON file""" with open(filepath, 'r') as file: - content = file.read() - - blocks = content.split('\n') - if blocks[-1] == '': - blocks = blocks[0:-1] - return [block.split(',') for block in blocks] + data = json.load(file) + + # Extract just the prompt text from each prompt object + prompts = [prompt_obj['prompt'] for prompt_obj in data['prompts']] + return prompts def validate_file_exists(path): if not os.path.exists(path) or not os.path.isfile(path): @@ -283,7 +295,7 @@ def post_warmup_func(): print("Benchmark completed and its result is as following") print(json.dumps(report, indent=4)) with open(BENCHMARK_REPORT_FILENAME, "w") as f: - json.dump(report, f) + json.dump(report, f, indent=4) print("Completed saving result to " + BENCHMARK_REPORT_FILENAME) return report @@ -614,8 +626,8 @@ def main(): elif args.mode == "evaluate_all": - prompts = 
parse_prompts("../../data/prompts.txt") - prompt_data = parse_prompt_data("../../data/prompt_data.txt") + prompts = parse_prompts("../../data/prompts.json") + prompt_data = parse_prompt_data("../../data/prompt_data.json") assert len(prompts) == len(prompt_data) total_score = 0 diff --git a/test/inference/test.py b/test/inference/test.py index ee4fa0b..22bfb87 100644 --- a/test/inference/test.py +++ b/test/inference/test.py @@ -1,22 +1,36 @@ import subprocess import argparse +import json def parse_prompts(filepath): + """Parse prompts from JSON file""" with open(filepath, 'r') as file: - arr = file.read().split('\n\n') - arr = [prompt.strip() for prompt in arr if prompt.strip()] - return arr + data = json.load(file) + + # Extract just the prompt text from each prompt object + prompts = [prompt_obj['prompt'] for prompt_obj in data['prompts']] + return prompts def parse_prompt_data(filepath): + """Parse prompt performance data from JSON file""" with open(filepath, 'r') as file: - content = file.read() - - blocks = content.split('\n') - if blocks[-1] == '': - blocks = blocks[0:-1] - return [block.split(',') for block in blocks] + data = json.load(file) + + # Convert the JSON data to the expected format (list of lists) + # Each inner list contains: [index, word_count, sequence_length, baseline_latency_ms, baseline_throughput] + prompt_data = [] + for item in data['prompt_performance_data']: + prompt_data.append([ + str(item['index']), + str(item['word_count']), + str(item['sequence_length']), + str(item['baseline_latency_ms']), + str(item['baseline_throughput']) + ]) + + return prompt_data def parse_args(): parser = argparse.ArgumentParser() @@ -28,8 +42,8 @@ def parse_args(): def main(): args = parse_args() - prompts = parse_prompts(f"{args.repository_path}/data/prompts.txt") - prompt_data = parse_prompt_data(f"{args.repository_path}/data/prompt_data.txt") + prompts = parse_prompts(f"{args.repository_path}/data/prompts.json") + prompt_data = parse_prompt_data(f"{args.repository_path}/data/prompt_data.json") assert len(prompts) == len(prompt_data) mode = "evaluate_single" From 4803f6f0a0442cab5d01ecff5e840f85fb4bee35 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Thu, 12 Jun 2025 20:48:32 +0000 Subject: [PATCH 20/65] fix: rename function --- src/inference/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/inference/main.py b/src/inference/main.py index f209d85..5464f38 100644 --- a/src/inference/main.py +++ b/src/inference/main.py @@ -103,7 +103,7 @@ def parse_args(): return parser.parse_args() -def parse_prompts(filepath): +def parse_prompt_data(filepath): """Parse prompt performance data from JSON file""" with open(filepath, 'r') as file: data = json.load(file) @@ -123,7 +123,7 @@ def parse_prompts(filepath): return prompt_data -def parse_prompt_data(filepath): +def parse_prompts(filepath): """Parse prompts from JSON file""" with open(filepath, 'r') as file: data = json.load(file) From 1a7bc1bc69eac57acdbd1d85701c51d9b28d9b7b Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 12 Jun 2025 22:59:20 -0400 Subject: [PATCH 21/65] add step to check and downgrade transformers --- nki-llama.config | 3 ++- src/inference/scripts/reasoning-bench-lm-eval.sh | 13 +++++++++++++ src/inference/scripts/setup-vllm.sh | 13 +++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/nki-llama.config b/nki-llama.config index 4e02f99..cf30751 100644 --- a/nki-llama.config +++ b/nki-llama.config @@ -37,7 +37,8 @@ export NEURON_COMPILE_CACHE="${HOME}/.cache/neuron" export 
NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES:-8}" # vLLM Configuration -export VLLM_REPO="${HOME}/vllm" +export VLLM_REPO="${HOME}/upstreaming-to-vllm"Add commentMore actions +export VLLM_BRANCH="neuron-2.22-vllm-v0.7.2" export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" # Dataset Configuration diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh index cc9defd..af7bd0c 100644 --- a/src/inference/scripts/reasoning-bench-lm-eval.sh +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -43,6 +43,19 @@ pip install -U -r requirements/neuron.txt echo "Installing vLLM for Neuron..." VLLM_TARGET_DEVICE="neuron" pip install -e . +# Ensure transformers < 4.50 (needed by Neuron hf_adapter) +python - <<'PY' +import subprocess, pkg_resources, sys +req = "4.50.0" +try: + ver = pkg_resources.get_distribution("transformers").version +except pkg_resources.DistributionNotFound: + ver = "" +if not ver or pkg_resources.parse_version(ver) >= pkg_resources.parse_version(req): + print("Installing transformers<%s …" % req) + subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"transformers<{req}"]) +PY + echo -e "${GREEN}✓ vLLM setup complete${NC}" cd "$HOME" diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index 4c27520..588f5bd 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -43,4 +43,17 @@ pip install -U -r requirements/neuron.txt echo "Installing vLLM for Neuron..." VLLM_TARGET_DEVICE="neuron" pip install -e . +# Ensure transformers < 4.50 (needed by Neuron hf_adapter) +python - <<'PY' +import subprocess, pkg_resources, sys +req = "4.50.0" +try: + ver = pkg_resources.get_distribution("transformers").version +except pkg_resources.DistributionNotFound: + ver = "" +if not ver or pkg_resources.parse_version(ver) >= pkg_resources.parse_version(req): + print("Installing transformers<%s …" % req) + subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"transformers<{req}"]) +PY + echo -e "${GREEN}✓ vLLM setup complete${NC}" \ No newline at end of file From 33105d75b810dd2a0783fb807f94412e9bb3fb1e Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 12 Jun 2025 23:07:08 -0400 Subject: [PATCH 22/65] commit config update --- nki-llama.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nki-llama.config b/nki-llama.config index cf30751..4e02f99 100644 --- a/nki-llama.config +++ b/nki-llama.config @@ -37,8 +37,7 @@ export NEURON_COMPILE_CACHE="${HOME}/.cache/neuron" export NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES:-8}" # vLLM Configuration -export VLLM_REPO="${HOME}/upstreaming-to-vllm"Add commentMore actions -export VLLM_BRANCH="neuron-2.22-vllm-v0.7.2" +export VLLM_REPO="${HOME}/vllm" export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" # Dataset Configuration From 7dea685cb167abe147725df03d365190df65efc5 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 13 Jun 2025 11:56:37 +0000 Subject: [PATCH 23/65] docs: improve docs and workflow --- README.md | 98 ++++++++++++- nki-llama.sh | 1 + src/README.md | 78 +++++++++-- src/handler.py | 151 +++++++++++++-------- src/inference/scripts/run-nki-benchmark.sh | 99 ++++++++++++-- 5 files changed, 340 insertions(+), 87 deletions(-) diff --git a/README.md b/README.md index 91652d5..d507b3f 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,84 @@ tmux new -s benchmark ./nki-llama inference benchmark ``` +## 📊 Score Calculation Workflow + +The NKI-LLAMA platform 
includes a comprehensive score calculation system that evaluates both training and inference performance. For detailed information about the scoring system, see the [Score Calculation README](src/README.md).
+
+### Workflow Overview
+
+1. **Pre-compile Phase**:
+   - Execute the pre-compile job using `./nki-llama finetune compile`
+   - This generates a compile directory in the neuron cache
+   - The pre-compile job creates a log file in `logs/nki-llama_*.log`
+   - **Important**: Note the compile directory path from the "Pre-compile graphs" log output
+   - Example: `/home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e`
+
+2. **Training Execution**:
+   - Execute the training job using `./nki-llama finetune train`
+   - The training job creates a log file in `logs/nki-llama_*.log`
+   - This log contains metrics like latency, throughput, and MFU
+   - The benchmark inference file is always generated at: `benchmark_inference.json`
+
+3. **Score Collection**:
+   - Once training completes, scores can be calculated using the handler
+   - If only training is done, you'll get the NKI kernel training score
+   - If both training and inference are complete, you'll get the full NKI-LLAMA score
+
+### Example Test Run
+
+```bash
+# Step 1: Run full fine-tuning job and note the compile directory
+tmux new -s training
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+./nki-llama finetune all
+# Look for "Pre-compile graphs" in output to find compile directory path
+
+# Step 2: Run inference benchmark (optional for full score)
+tmux new -s benchmark
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+./nki-llama inference benchmark
+
+# Step 3: Calculate scores
+# For training-only score:
+python /home/ubuntu/nki-llama/src/handler.py \
+    --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \
+    --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \
+    --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \
+    --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \
+    --throughput 2.1 \
+    --output benchmark_results.json \
+    --training-weight 0.5 \
+    --inference-weight 0.5 \
+    --hw-backend trn1 \
+    --per-file-scores \
+    --calculate-score \
+    --detailed \
+    --verbose
+
+# For full score (with inference):
+python /home/ubuntu/nki-llama/src/handler.py \
+    --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \
+    --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \
+    --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \
+    --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \
+    --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \
+    --throughput 2.1 \
+    --output benchmark_results.json \
+    --training-weight 0.5 \
+    --inference-weight 0.5 \
+    --hw-backend trn1 \
+    --per-file-scores \
+    --calculate-score \
+    --detailed \
+    --verbose
+```
+
+The score calculation provides insights into:
+- **Training Performance**: MFU improvement and throughput gains
+- **Inference Performance**: Latency reduction and throughput increase
+- **NKI Optimization**: Ratio of NKI-optimized operations
+
 ## 💻 Command Reference
 
 ### Core Commands
@@ -275,6 +353,8 @@ nki-llama/
 ├── install.sh                 # 
Installation script ├── README.md # This file ├── src/ +│ ├── README.md # Score calculation documentation +│ ├── handler.py # Score calculation handler │ ├── fine-tune/ # Training pipeline │ │ └── scripts/ # Training automation │ └── inference/ # Inference pipeline @@ -322,6 +402,7 @@ MAX_MODEL_LEN=2048 tmux new -s training source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate ./nki-llama finetune all +# Note the compile directory from "Pre-compile graphs" output # Detach: Ctrl+B, D ``` @@ -335,16 +416,26 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate # Detach: Ctrl+B, D ``` -### Step 3: Serve Model +### Step 3: Calculate Performance Score +```bash +# After training and/or inference completes +python /home/ubuntu/nki-llama/src/handler.py \ + --compile-dir /path/from/training/logs \ + --log-file logs/nki-llama_latest.log \ + --inference-results benchmark_inference.json \ + --calculate-score +``` + +### Step 4: Serve Model ```bash -tmux new -s server +tmux new -s vllm-server source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate ./nki-llama inference server # API available at http://localhost:8080 # Detach: Ctrl+B, D ``` -### Step 4: Build Applications +### Step 5: Build Applications ```bash # Terminal 1: Keep server running # Terminal 2: Development @@ -361,7 +452,6 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate ## 🐛 Known Issues -- **evaluate_single mode**: Currently not implemented. Use default `evaluate_all` mode for all benchmarking. - **First compilation**: Initial NKI compilation can take 10-30 minutes. Subsequent runs use cache. - **Cache corruption**: If benchmark fails with cache errors, use `--clear-cache` flag. diff --git a/nki-llama.sh b/nki-llama.sh index a10f640..e211d8b 100755 --- a/nki-llama.sh +++ b/nki-llama.sh @@ -304,6 +304,7 @@ cmd_inference_benchmark() { fi echo -e "${YELLOW} Using tmux is strongly recommended!${NC}" + echo -e "${YELLOW} Running: ${NKI_INFERENCE_SCRIPTS}/run-nki-benchmark.sh --mode $mode ${args[@]}" # Check if we're in tmux for evaluate_all mode if [[ "$mode" == "evaluate_all" ]] && [[ -z "${TMUX:-}" ]]; then diff --git a/src/README.md b/src/README.md index 7c0e937..a587c06 100644 --- a/src/README.md +++ b/src/README.md @@ -6,9 +6,13 @@ A benchmarking system for evaluating NKI-LLAMA model performance across both tra The NKI-LLAMA Benchmark Handler calculates a unified performance score that combines: - **Training metrics**: MFU (Model FLOPs Utilization), throughput, and NKI kernel usage -- **Inference metrics**: Latency, throughput, and accuracy +- **Inference metrics**: Latency, throughput, and accuracy (optional) - **NKI optimization**: Ratio of NKI (Neuron Kernel Interface) operations to total operations +The system supports two modes: +- **Training-only mode**: When inference results are not available, provides NKI kernel training score +- **Combined mode**: When both training and inference results are available, provides full NKI-LLAMA score + The final score follows the formula: ``` Score = Accuracy × Reduced Latency × Increased Throughput × (1 + Normalized NKI FLOPS) @@ -25,20 +29,28 @@ python handler.py This will: 1. Calculate training metrics using `calculate_training_metrics.py` -2. Load inference results from `benchmark_inference.json` -3. Calculate the combined NKI-LLAMA score +2. Load inference results from `benchmark_inference.json` (if available) +3. Calculate the NKI-LLAMA score (combined or training-only) 4. 
Save results to `benchmark_results.json` +### Training-Only Mode + +If the inference benchmark file doesn't exist, the handler automatically runs in training-only mode: +```bash +python handler.py --calculate-score +``` + +This provides immediate feedback on NKI kernel optimization progress without requiring inference implementation. + ### Advanced Usage #### Custom Training Configuration ```bash -python src/handler.py \ +python /home/ubuntu/nki-llama/src/handler.py \ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ - --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ --throughput 2.1 \ --output benchmark_results.json \ --training-weight 0.5 \ @@ -52,10 +64,21 @@ python src/handler.py \ #### Custom Inference Results ```bash -python handler.py \ - --inference-results my_inference_results.json \ - --reference-latency 60000 \ - --reference-throughput 15 +python /home/ubuntu/nki-llama/src/handler.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ + --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ + --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ + --throughput 2.1 \ + --output benchmark_results.json \ + --training-weight 0.5 \ + --inference-weight 0.5 \ + --hw-backend trn1 \ + --per-file-scores \ + --calculate-score \ + --detailed \ + --verbose ``` #### Adjust Score Weights @@ -86,7 +109,7 @@ python handler.py --verbose #### Inference Metrics Options | Option | Default | Description | |--------|---------|-------------| -| `--inference-results` | `benchmark_inference.json` | Inference benchmark results file | +| `--inference-results` | `benchmark_inference.json` | Inference benchmark results file (optional - if not provided, only training score is calculated) | | `--reference-latency` | `50000` | Reference implementation latency (ms) | | `--reference-throughput` | `10` | Reference implementation throughput (tokens/s) | @@ -99,13 +122,13 @@ python handler.py --verbose #### Output Options | Option | Default | Description | |--------|---------|-------------| -| `--output` | `benchmark.json` | Output file for combined results | +| `--output` | `benchmark_results.json` | Output file for combined results | | `--training-output` | `benchmark_finetuning.json` | Output file for training metrics | | `--verbose` | `False` | Enable verbose output | ## 📊 Output Format -### Console Output +### Console Output - Combined Mode ``` ====================================================================== NKI-LLAMA BENCHMARK RESULTS @@ -138,11 +161,36 @@ Score Weights: ====================================================================== ``` +### Console Output - Training-Only Mode +``` +====================================================================== +NKI-LLAMA BENCHMARK RESULTS +====================================================================== + +⚠️ TRAINING-ONLY MODE 
(Inference results not available) + +🏆 NKI KERNEL TRAINING SCORE: 0.0077 + NKI Ratio: 0.1846 + +🎯 Training Metrics: + MFU: 15.48% (baseline: 50.00%) + Throughput: 2.10 seq/s (baseline: 100.00) + MFU Improvement: 0.3095x + Throughput Improvement: 0.0210x + +💡 Note: This score represents training performance only. + To get the full NKI-LLAMA score, run inference benchmarks and provide + the results file using --inference-results option. + +====================================================================== +``` + ### JSON Output (`benchmark_results.json`) ```json { "timestamp": "2025-01-01T12:00:00", - "nki_llama_score": 0.0046, + "mode": "combined", + "nki_kernel_score": 0.0046, "component_scores": { "training": 0.0077, "inference": 0.0026 @@ -204,9 +252,9 @@ Score Weights: python handler.py --verbose # Shows detailed error messages ``` -2. **Missing `benchmark_inference.json`**: Run inference benchmarks first +2. **Missing `benchmark_inference.json`**: The handler will automatically run in training-only mode ```bash - # Create a sample inference results file + # To create a sample inference results file for testing: echo '{"e2e_model": {"latency_ms_avg": 12131.49, "throughput": 52.76}}' > benchmark_inference.json ``` diff --git a/src/handler.py b/src/handler.py index e2c0ec6..3f9645d 100644 --- a/src/handler.py +++ b/src/handler.py @@ -75,20 +75,6 @@ def build_training_command(self, cmd_args: Dict[str, Any]) -> list: if cmd_args.get("print_per_file"): cmd.append("--print-per-file") - # Add scoring thresholds - if cmd_args.get("base_mfu"): - pass - # cmd.extend(["--base-mfu", str(cmd_args["base_mfu"])]) - if cmd_args.get("base_throughput"): - pass - #cmd.extend(["--base-throughput", str(cmd_args["base_throughput"])]) - if cmd_args.get("loss_improvement"): - pass - #cmd.extend(["--loss-improvement", str(cmd_args["loss_improvement"])]) - if cmd_args.get("convergence_rate"): - pass - #cmd.extend(["--convergence-rate", str(cmd_args["convergence_rate"])]) - # Output file output_file = cmd_args.get("output", "benchmark_finetuning.json") cmd.extend(["--output", output_file]) @@ -198,15 +184,16 @@ def calculate_inference_score(self, inference_data: Dict[str, Any], return score, breakdown def calculate_combined_score(self, training_metrics: Dict[str, Any], - inference_metrics: Dict[str, Any], + inference_metrics: Optional[Dict[str, Any]] = None, weights: Optional[Dict[str, float]] = None, reference_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ Calculate combined NKI-LLAMA score from training and inference metrics. + If inference metrics are not available, returns training-only score. 
Args: training_metrics: Training metrics including NKI analysis - inference_metrics: Inference benchmark results + inference_metrics: Optional inference benchmark results weights: Optional weights for combining scores reference_data: Optional reference implementation data for inference scoring @@ -223,6 +210,22 @@ def calculate_combined_score(self, training_metrics: Dict[str, Any], training_score = training_metrics.get("training_score", 0.0) nki_ratio = training_metrics["nki_analysis"]["summary"]["overall_nki_ratio"] + # Check if inference metrics are available + if inference_metrics is None: + # Training-only mode + return { + "combined_score": training_score, + "training_score": training_score, + "inference_score": None, + "weights": weights, + "mode": "training_only", + "breakdown": { + "training": training_metrics.get("training_score_breakdown", {}), + "inference": None + }, + "nki_ratio": nki_ratio + } + # Calculate inference score with NKI ratio inference_score, inference_breakdown = self.calculate_inference_score(inference_metrics, reference_data) @@ -246,6 +249,7 @@ def calculate_combined_score(self, training_metrics: Dict[str, Any], "training_score": training_score, "inference_score": inference_score_with_nki, "weights": weights, + "mode": "combined", "breakdown": { "training": training_metrics.get("training_score_breakdown", {}), "inference": inference_breakdown @@ -259,35 +263,56 @@ def display_results(self, results: Dict[str, Any]): print("NKI-LLAMA BENCHMARK RESULTS") print("="*70) - # Combined score - print(f"\n🏆 FINAL NKI-LLAMA SCORE: {results['combined_score']:.4f}") - print(f"\nScore Weights:") - print(f" Training: {results['weights']['training']*100:.0f}%") - print(f" Inference: {results['weights']['inference']*100:.0f}%") - - # Component scores - print(f"\n📊 Component Scores:") - print(f" Training Score: {results['training_score']:.4f}") - print(f" Inference Score: {results['inference_score']:.4f}") - print(f" NKI Ratio: {results['nki_ratio']:.4f}") - - # Training breakdown - if "training" in results["breakdown"]: - tb = results["breakdown"]["training"] - print(f"\n🎯 Training Metrics:") - print(f" MFU: {tb.get('achieved_mfu', 0):.2f}% (baseline: {tb.get('base_mfu', 0):.2f}%)") - print(f" Throughput: {tb.get('achieved_throughput', 0):.2f} seq/s (baseline: {tb.get('base_throughput', 0):.2f})") - print(f" MFU Improvement: {tb.get('mfu_improvement', 0):.4f}x") - print(f" Throughput Improvement: {tb.get('throughput_improvement', 0):.4f}x") + # Check mode and display appropriate results + if results.get("mode") == "training_only": + print("\n⚠️ TRAINING-ONLY MODE (Inference results not available)") + print(f"\n🏆 NKI KERNEL TRAINING SCORE: {results['training_score']:.4f}") + print(f" NKI Ratio: {results['nki_ratio']:.4f}") - # Inference breakdown - ib = results["breakdown"]["inference"] - print(f"\n⚡ Inference Metrics:") - print(f" Latency: {ib['achieved_latency_ms']:.2f}ms (reference: {ib['reference_latency_ms']:.2f}ms)") - print(f" Throughput: {ib['achieved_throughput']:.2f} tokens/s (reference: {ib['reference_throughput']:.2f})") - print(f" Latency Reduction: {ib['reduced_latency']:.4f}x") - print(f" Throughput Increase: {ib['increased_throughput']:.4f}x") - print(f" Accuracy: {'✓ Passed' if ib['accuracy'] == 1.0 else '✗ Failed'}") + # Training breakdown + if "training" in results["breakdown"] and results["breakdown"]["training"]: + tb = results["breakdown"]["training"] + print(f"\n🎯 Training Metrics:") + print(f" MFU: {tb.get('achieved_mfu', 0):.2f}% (baseline: 
{tb.get('base_mfu', 0):.2f}%)") + print(f" Throughput: {tb.get('achieved_throughput', 0):.2f} seq/s (baseline: {tb.get('base_throughput', 0):.2f})") + print(f" MFU Improvement: {tb.get('mfu_improvement', 0):.4f}x") + print(f" Throughput Improvement: {tb.get('throughput_improvement', 0):.4f}x") + + print("\n💡 Note: This score represents training performance only.") + print(" To get the full NKI-LLAMA score, run inference benchmarks and provide") + print(" the results file using --inference-results option.") + + else: + # Combined mode - full results + print(f"\n🏆 FINAL NKI-LLAMA SCORE: {results['combined_score']:.4f}") + print(f"\nScore Weights:") + print(f" Training: {results['weights']['training']*100:.0f}%") + print(f" Inference: {results['weights']['inference']*100:.0f}%") + + # Component scores + print(f"\n📊 Component Scores:") + print(f" Training Score: {results['training_score']:.4f}") + print(f" Inference Score: {results['inference_score']:.4f}") + print(f" NKI Ratio: {results['nki_ratio']:.4f}") + + # Training breakdown + if "training" in results["breakdown"] and results["breakdown"]["training"]: + tb = results["breakdown"]["training"] + print(f"\n🎯 Training Metrics:") + print(f" MFU: {tb.get('achieved_mfu', 0):.2f}% (baseline: {tb.get('base_mfu', 0):.2f}%)") + print(f" Throughput: {tb.get('achieved_throughput', 0):.2f} seq/s (baseline: {tb.get('base_throughput', 0):.2f})") + print(f" MFU Improvement: {tb.get('mfu_improvement', 0):.4f}x") + print(f" Throughput Improvement: {tb.get('throughput_improvement', 0):.4f}x") + + # Inference breakdown + if results["breakdown"]["inference"]: + ib = results["breakdown"]["inference"] + print(f"\n⚡ Inference Metrics:") + print(f" Latency: {ib['achieved_latency_ms']:.2f}ms (reference: {ib['reference_latency_ms']:.2f}ms)") + print(f" Throughput: {ib['achieved_throughput']:.2f} tokens/s (reference: {ib['reference_throughput']:.2f})") + print(f" Latency Reduction: {ib['reduced_latency']:.4f}x") + print(f" Throughput Increase: {ib['increased_throughput']:.4f}x") + print(f" Accuracy: {'✓ Passed' if ib['accuracy'] == 1.0 else '✗ Failed'}") print("\n" + "="*70) @@ -295,10 +320,11 @@ def save_results(self, results: Dict[str, Any], output_file: str): """Save the combined results to a JSON file.""" output_data = { "timestamp": datetime.now().isoformat(), + "mode": results.get("mode", "combined"), "nki_kernel_score": results["combined_score"], "component_scores": { "training": results["training_score"], - "inference": results["inference_score"] + "inference": results.get("inference_score") }, "weights": results["weights"], "nki_ratio": results["nki_ratio"], @@ -362,7 +388,7 @@ def main(): inference_group.add_argument( "--inference-results", default="benchmark_inference.json", - help="Path to inference benchmark results" + help="Path to inference benchmark results (optional - if not provided, only training score is calculated)" ) inference_group.add_argument( "--reference-latency", @@ -474,17 +500,24 @@ def main(): else: training_metrics = handler.run_training_metrics(training_args) - # Step 2: Load inference metrics - print("\n⚡ Loading inference metrics...") - if not os.path.exists(args.inference_results): - handler.logger.error(f"Inference results file not found: {args.inference_results}") - sys.exit(1) + # Step 2: Check for inference metrics + inference_metrics = None + inference_available = os.path.exists(args.inference_results) + + if inference_available: + print("\n⚡ Loading inference metrics...") + with open(args.inference_results, 'r') as f: + 
inference_metrics = json.load(f) + else: + print("\n⚠️ Inference results file not found. Running in training-only mode.") + print(f" (Looking for: {args.inference_results})") - with open(args.inference_results, 'r') as f: - inference_metrics = json.load(f) + # Step 3: Calculate score(s) + if inference_available: + print("\n🔬 Calculating combined NKI-LLAMA score...") + else: + print("\n🔬 Calculating NKI kernel training score...") - # Step 3: Calculate combined score - print("\n🔬 Calculating combined NKI-LLAMA score...") weights = { "training": args.training_weight, "inference": args.inference_weight @@ -499,7 +532,7 @@ def main(): "accuracy": 1.0 # Assuming accuracy threshold is met } - # Pass reference_data to calculate_combined_score + # Calculate score - will handle both training-only and combined modes results = handler.calculate_combined_score( training_metrics, inference_metrics, @@ -513,7 +546,11 @@ def main(): # Step 5: Save results handler.save_results(results, args.output) - print(f"\n✅ Benchmark complete! Results saved to {args.output}") + if inference_available: + print(f"\n✅ Benchmark complete! Results saved to {args.output}") + else: + print(f"\n✅ Training benchmark complete! Results saved to {args.output}") + print(" Run inference benchmarks to get the full NKI-LLAMA score.") except Exception as e: handler.logger.error(f"Error during benchmark: {e}") diff --git a/src/inference/scripts/run-nki-benchmark.sh b/src/inference/scripts/run-nki-benchmark.sh index d1dd715..32cab03 100755 --- a/src/inference/scripts/run-nki-benchmark.sh +++ b/src/inference/scripts/run-nki-benchmark.sh @@ -137,22 +137,39 @@ check_failed_cache_entries() { # Function to run evaluate_single mode run_evaluate_single() { echo -e "${YELLOW}🔧 Running benchmark in evaluate_single mode...${NC}" - echo -e "${YELLOW}This mode runs from repository test script for single evaluation.${NC}" + echo -e "${YELLOW}This mode runs single evaluation with NKI optimizations.${NC}" - # Change to home directory and run the test script - cd ~ + # Change to inference directory + cd "${NKI_INFERENCE}" # Build command - CMD="python ${NKI_ROOT}/test/inference/test.py" - CMD="${CMD} --repository-path ${NKI_ROOT}" + CMD="python main.py" + CMD="${CMD} --mode evaluate_single" + CMD="${CMD} --model-path ${MODEL_PATH}" + CMD="${CMD} --compiled-model-path ${COMPILED_MODEL_PATH}" + CMD="${CMD} --seq-len ${SEQ_LEN}" + CMD="${CMD} --tp-degree ${TP_DEGREE}" + + if [[ "$ENABLE_NKI" == "true" ]]; then + CMD="${CMD} --enable-nki" + fi + + if [[ "$RETRY_FAILED" == "true" ]]; then + CMD="${CMD} --retry-failed-compilation" + fi - # Execute with timing + # Execute with timing and error handling echo -e "${BLUE}Executing evaluate_single benchmark...${NC}" + echo -e "${BLUE}${CMD}${NC}" echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" START_TIME=$(date +%s) - if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark.log"; then + # Create a temporary file to capture the output + TEMP_LOG=$(mktemp) + + # Run command and capture both stdout/stderr + if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark.log" | tee "$TEMP_LOG"; then END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) @@ -161,9 +178,69 @@ run_evaluate_single() { echo -e "${GREEN}✓ evaluate_single benchmark completed successfully!${NC}" echo -e "Total time: ${DURATION} seconds" + # If compilation happened, show artifact info + if [[ -d "$COMPILED_MODEL_PATH" ]]; then + echo + echo -e "${GREEN}✓ NKI-compiled model artifacts available at:${NC}" + echo 
-e " ${COMPILED_MODEL_PATH}" + echo + echo -e "${BLUE}These artifacts can now be used for:${NC}" + echo -e " • vLLM inference with NxDI optimizations" + echo -e " • Direct inference benchmarks" + echo -e " • Production deployments" + echo + fi + + rm -f "$TEMP_LOG" return 0 else - echo -e "${RED}✗ evaluate_single benchmark failed!${NC}" + # Check if it's a cache-related failure + if grep -q "Got a cached failed neff" "$TEMP_LOG" || grep -q "SIGHUP" "$TEMP_LOG"; then + echo + echo -e "${RED}✗ evaluate_single benchmark failed due to compilation cache issues!${NC}" + + if [[ "$AUTO_CLEAR_CACHE" == "true" ]]; then + echo -e "${YELLOW}🔄 Attempting automatic cache recovery...${NC}" + echo + + # Clear the cache + if clear_compilation_cache; then + echo + echo -e "${YELLOW}🔄 Retrying benchmark with clean cache...${NC}" + echo + + # Retry the command + if $CMD 2>&1 | tee "${BENCHMARK_LOG_DIR}/benchmark_retry.log"; then + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + echo + echo -e "${GREEN}✓ evaluate_single benchmark completed successfully after cache clear!${NC}" + echo -e "Total time: ${DURATION} seconds" + + rm -f "$TEMP_LOG" + return 0 + else + echo -e "${RED}✗ Benchmark still failed after cache clear${NC}" + fi + else + echo -e "${RED}✗ Could not clear cache automatically${NC}" + fi + else + echo + echo -e "${YELLOW}💡 Suggestions to fix:${NC}" + echo -e " 1. Clear the compilation cache:" + echo -e " ${CYAN}rm -rf ${NEURON_CACHE_DIR}${NC}" + echo -e " 2. Re-run with auto cache clearing:" + echo -e " ${CYAN}$0 --mode evaluate_single --clear-cache${NC}" + echo -e " 3. Force retry failed compilations:" + echo -e " ${CYAN}$0 --mode evaluate_single --retry-failed-compilation${NC}" + fi + else + echo -e "${RED}✗ evaluate_single benchmark failed!${NC}" + fi + + rm -f "$TEMP_LOG" return 1 fi } @@ -308,7 +385,7 @@ run_benchmark() { fi # Check prerequisites based on mode - if [[ "$MODE" == "evaluate_all" ]]; then + if [[ "$MODE" == "evaluate_all" ]] || [[ "$MODE" == "evaluate_single" ]]; then check_model fi @@ -407,8 +484,8 @@ show_info() { echo -e "This tool supports two benchmark modes:" echo echo -e "${YELLOW}1. evaluate_single mode:${NC}" - echo -e " • Runs benchmark from repository test script" - echo -e " • Single evaluation configuration" + echo -e " • Runs single evaluation configuration" + echo -e " • Tests with NKI optimizations" echo -e " • Quick validation of model performance" echo echo -e "${YELLOW}2. evaluate_all mode:${NC}" From e1f94133026bc9cb9f7a545e4e56cc62a10ca956 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 13 Jun 2025 12:03:14 +0000 Subject: [PATCH 24/65] fix: docs --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d507b3f..b84aaa9 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ NKI-LLAMA provides a streamlined interface for the complete LLM development life ### 2. 
Installation ```bash # Clone repository -git clone https://github.com/your-org/nki-llama.git +git clone https://github.com/aws-neuron/nki-llama.git cd nki-llama # Install @@ -361,7 +361,7 @@ nki-llama/ │ ├── main.py # Benchmark entry point │ └── scripts/ # Inference automation ├── notebooks/ # Example notebooks -│ └── travel_agent.ipynb +│ └── neuron_agents.ipynb ├── logs/ # Operation logs │ └── benchmarks/ # Benchmark results └── models/ # Downloaded models From fb5a8984a5048564c799a041bebfbc813980000d Mon Sep 17 00:00:00 2001 From: nithiyn Date: Fri, 13 Jun 2025 08:03:54 -0400 Subject: [PATCH 25/65] fix: vllm reinstall rem, check tf and add env vars --- .../scripts/reasoning-bench-lm-eval.sh | 127 ++++++++++-------- 1 file changed, 74 insertions(+), 53 deletions(-) diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh index af7bd0c..0c0cdad 100644 --- a/src/inference/scripts/reasoning-bench-lm-eval.sh +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -1,49 +1,70 @@ -#!/bin/bash -# reasoning-bench-lm-eval.sh - Start vLLM OpenAI-compatible API server and run lm-eval +#!/usr/bin/env bash +# reasoning-bench-lm-eval.sh ─ Start vLLM (Neuron) server and run lm-eval reasoning bench set -euo pipefail -# Load configuration +# --------------------------------------------------------------------- +# 0. Config + constants +# --------------------------------------------------------------------- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/../../../nki-llama.config" +source "${SCRIPT_DIR}/../../../nki-llama.config" # sets: VLLM_REPO, NEURON_INFERENCE_VENV, … -# Colors +# Where we keep AWS Neuron samples +REASONING_BENCH_DIR="$HOME/aws-neuron-samples" + +# Colours GREEN='\033[0;32m' YELLOW='\033[1;33m' +BLUE='\033[0;34m' RED='\033[0;31m' NC='\033[0m' -echo -e "${GREEN}Setting up vLLM for Neuron...${NC}" +echo -e "${GREEN}Setting up vLLM for Neuron …${NC}" -# Check if in correct environment -if [[ "$VIRTUAL_ENV" != *"inference"* ]]; then - echo -e "${RED}Error: Not in inference environment${NC}" - echo -e "Run: source ${NEURON_INFERENCE_VENV}/bin/activate" - exit 1 +# --------------------------------------------------------------------- +# 1. Sanity check: are we inside the inference venv? +# --------------------------------------------------------------------- +if [[ "${VIRTUAL_ENV:-}" != *"inference"* ]]; then + echo -e "${RED}Error:${NC} not inside Neuron inference venv" + echo "Run: source ${NEURON_INFERENCE_VENV}/bin/activate" + exit 1 fi -# Clone or update vLLM repository +# --------------------------------------------------------------------- +# 2. Clone or update vLLM repo +# --------------------------------------------------------------------- if [[ -d "$VLLM_REPO" ]]; then - echo "Updating existing vLLM repository..." - cd "$VLLM_REPO" - git fetch - git pull + echo "Updating existing vLLM repo …" + git -C "$VLLM_REPO" pull --ff-only else - echo "Cloning vLLM repository..." - cd "$(dirname "$VLLM_REPO")" - git clone https://github.com/vllm-project/vllm.git + echo "Cloning vLLM repo …" + git clone https://github.com/vllm-project/vllm.git "$VLLM_REPO" fi -# Install requirements cd "$VLLM_REPO" -echo "Installing vLLM requirements..." -pip install -U -r requirements/neuron.txt -# Install vLLM -echo "Installing vLLM for Neuron..." -VLLM_TARGET_DEVICE="neuron" pip install -e . +# --------------------------------------------------------------------- +# 3. 
Install dependencies once, refresh editable install each run +# • If 'vllm' importable → skip deps, just refresh metadata +# • Else → first run: install deps + editable +# --------------------------------------------------------------------- +if python - <<'PY' >/dev/null 2>&1 +import importlib.util, sys +sys.exit(0 if importlib.util.find_spec("vllm") else 1) +PY +then + echo "vLLM already importable – skipping heavy deps install" + VLLM_TARGET_DEVICE="neuron" \ + pip install --quiet --no-deps -e . --exists-action=i +else + echo "Installing vLLM Neuron deps (first run) …" + pip install --quiet -r requirements/neuron.txt + VLLM_TARGET_DEVICE="neuron" pip install --quiet -e . +fi -# Ensure transformers < 4.50 (needed by Neuron hf_adapter) +# --------------------------------------------------------------------- +# 4. Ensure transformers < 4.50 for Neuron hf_adapter +# --------------------------------------------------------------------- python - <<'PY' import subprocess, pkg_resources, sys req = "4.50.0" @@ -52,31 +73,32 @@ try: except pkg_resources.DistributionNotFound: ver = "" if not ver or pkg_resources.parse_version(ver) >= pkg_resources.parse_version(req): - print("Installing transformers<%s …" % req) - subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"transformers<{req}"]) + print(f"Installing transformers<{req} …") + subprocess.check_call([sys.executable, "-m", "pip", "install", + "--quiet", f"transformers<{req}"]) PY -echo -e "${GREEN}✓ vLLM setup complete${NC}" - -cd "$HOME" -git clone https://github.com/aws-neuron/aws-neuron-samples.git -cd /home/ubuntu/aws-neuron-samples/inference-benchmarking/ -pip install -r requirements.txt --quiet +echo -e "${GREEN}✓ vLLM (Neuron) ready${NC}" -echo -e "${GREEN}✓ Inference-Benchmarking setup complete${NC}" - -#write config file for reasoning test -cd /home/ubuntu/aws-neuron-samples/inference-benchmarking/ - -if test -f "/home/ubuntu/aws-neuron-samples/inference-benchmarking/reasoning_bench.yaml"; then - echo "config file exists." -else - echo "Creating config file..." +# --------------------------------------------------------------------- +# 5. 
Clone/refresh aws-neuron-samples + its deps +# --------------------------------------------------------------------- +if [[ -d "$REASONING_BENCH_DIR" ]]; then + echo "Updating aws-neuron-samples repo …" + git -C "$REASONING_BENCH_DIR" pull --ff-only +else + git clone https://github.com/aws-neuron/aws-neuron-samples.git \ + "$REASONING_BENCH_DIR" fi -OUT_FILE="reasoning_bench.yaml" -cat > "$OUT_FILE" < reasoning_bench.yaml < Date: Fri, 13 Jun 2025 10:57:14 -0400 Subject: [PATCH 26/65] fix: set chat to false for base model --- src/inference/scripts/reasoning-bench-lm-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh index 0c0cdad..e7752b7 100644 --- a/src/inference/scripts/reasoning-bench-lm-eval.sh +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -120,7 +120,7 @@ test: timeout: 3600 client_params: limit: 200 - use_chat: True + use_chat: False YAML echo -e "${GREEN}✓ Config file written${NC}" From d5d619d920cfc45433de2b673fdebdfe043661bf Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Sat, 14 Jun 2025 19:17:09 +0000 Subject: [PATCH 27/65] refactor: clean up env variables --- .env.example | 10 +++------- README.md | 8 +------- install.sh | 4 ---- nki-llama.config | 7 +------ src/inference/README.md | 6 +----- src/inference/scripts/reasoning-bench-lm-eval.sh | 0 6 files changed, 6 insertions(+), 29 deletions(-) mode change 100644 => 100755 src/inference/scripts/reasoning-bench-lm-eval.sh diff --git a/.env.example b/.env.example index 0dae225..9cfa903 100644 --- a/.env.example +++ b/.env.example @@ -6,19 +6,15 @@ HF_TOKEN=your_huggingface_token_here MODEL_ID=meta-llama/Meta-Llama-3-8B MODEL_NAME=llama-3-8b -# Training Configuration -BATCH_SIZE=1 -MAX_STEPS=1000 -SEQ_LENGTH=2048 -LEARNING_RATE=5e-5 +# Shared Configuration +TENSOR_PARALLEL_SIZE=8 # Inference Configuration INFERENCE_PORT=8080 MAX_MODEL_LEN=2048 MAX_NUM_SEQS=4 -TENSOR_PARALLEL_SIZE=8 -# Dataset Configuration +# Inference Dataset Configuration DATASET_NAME=databricks/databricks-dolly-15k # Neuron Configuration diff --git a/README.md b/README.md index b84aaa9..f472a46 100644 --- a/README.md +++ b/README.md @@ -382,13 +382,7 @@ MODEL_NAME=llama-3-8b # Hardware Configuration TENSOR_PARALLEL_SIZE=8 -NEURON_RT_NUM_CORES=32 - -# Training Parameters -BATCH_SIZE=1 -MAX_STEPS=1000 -SEQ_LENGTH=2048 -LEARNING_RATE=5e-5 +NEURON_RT_NUM_CORES=8 # Inference Parameters INFERENCE_PORT=8080 diff --git a/install.sh b/install.sh index 3881b56..0127d6f 100755 --- a/install.sh +++ b/install.sh @@ -64,10 +64,6 @@ HF_TOKEN= MODEL_ID=meta-llama/Meta-Llama-3-8B MODEL_NAME=llama-3-8b TENSOR_PARALLEL_SIZE=8 -SEQ_LENGTH=2048 -BATCH_SIZE=1 -MAX_STEPS=1000 -LEARNING_RATE=5e-5 INFERENCE_PORT=8080 MAX_MODEL_LEN=2048 MAX_NUM_SEQS=4 diff --git a/nki-llama.config b/nki-llama.config index 4e02f99..c325c82 100644 --- a/nki-llama.config +++ b/nki-llama.config @@ -17,12 +17,8 @@ export MODEL_ID="${MODEL_ID:-meta-llama/Meta-Llama-3-8B}" export MODEL_NAME="${MODEL_NAME:-llama-3-8b}" export HF_TOKEN="${HF_TOKEN:-}" -# Training Configuration -export BATCH_SIZE="${BATCH_SIZE:-1}" -export MAX_STEPS="${MAX_STEPS:-1000}" -export SEQ_LENGTH="${SEQ_LENGTH:-2048}" +# Shared Configuration export TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-8}" -export LEARNING_RATE="${LEARNING_RATE:-5e-5}" # Inference Configuration export INFERENCE_PORT="${INFERENCE_PORT:-8080}" @@ -63,5 +59,4 @@ print_config() { echo "Fine-tune Scripts: ${NKI_FINETUNE_SCRIPTS}" echo 
"Inference Scripts: ${NKI_INFERENCE_SCRIPTS}" echo "Tensor Parallel Size: ${TENSOR_PARALLEL_SIZE}" - echo "Sequence Length: ${SEQ_LENGTH}" } \ No newline at end of file diff --git a/src/inference/README.md b/src/inference/README.md index c8732af..20751cb 100644 --- a/src/inference/README.md +++ b/src/inference/README.md @@ -236,12 +236,8 @@ MODEL_ID=meta-llama/Meta-Llama-3-8B MODEL_NAME=llama-3-8b HF_TOKEN=your_huggingface_token -# Training Parameters -BATCH_SIZE=1 -MAX_STEPS=1000 -SEQ_LENGTH=2048 +# Shared Parameters TENSOR_PARALLEL_SIZE=8 -LEARNING_RATE=5e-5 # Inference Parameters INFERENCE_PORT=8080 diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh old mode 100644 new mode 100755 From 45db233ea67e0c5686d367b199116d6a0f10f8b6 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Sun, 15 Jun 2025 11:41:14 -0400 Subject: [PATCH 28/65] fix: commit updates for env var mapping and docs --- .env.example | 10 +++++----- README.md | 4 +++- src/fine-tune/README.md | 5 +++++ src/inference/scripts/run-nki-benchmark.sh | 4 ++-- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.env.example b/.env.example index 9cfa903..0dfbbc8 100644 --- a/.env.example +++ b/.env.example @@ -6,15 +6,15 @@ HF_TOKEN=your_huggingface_token_here MODEL_ID=meta-llama/Meta-Llama-3-8B MODEL_NAME=llama-3-8b -# Shared Configuration -TENSOR_PARALLEL_SIZE=8 - # Inference Configuration INFERENCE_PORT=8080 -MAX_MODEL_LEN=2048 +MAX_MODEL_LEN=2048 # used by vllm- ensure it is the same as seq len +SEQ_LEN=2048 #used by main.py + MAX_NUM_SEQS=4 +TENSOR_PARALLEL_SIZE=8 -# Inference Dataset Configuration +# Dataset Configuration DATASET_NAME=databricks/databricks-dolly-15k # Neuron Configuration diff --git a/README.md b/README.md index f472a46..bc3aaf4 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ chmod +x install.sh # Configure cp .env.example .env nano .env # Add your HF_TOKEN +# inference env vars, ensure max_model_len= seq_len ``` ### 3. First Run @@ -122,7 +123,8 @@ source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate # Step 2: Run inference benchmark (optional for full score) tmux new -s benchmark source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama inference benchmark +./nki-llama inference benchmark +# or directly run from main.py in src/inference/ for full use of flags for bucketing/context encoding and others # Step 3: Calculate scores # For training-only score: diff --git a/src/fine-tune/README.md b/src/fine-tune/README.md index 5f04dcf..19a6381 100644 --- a/src/fine-tune/README.md +++ b/src/fine-tune/README.md @@ -1,5 +1,10 @@ +You're able to fun through the fine-tuning workflow with the top-level NKI LLAMA CLI or this fine-tune pipeline script. + ## 1 · Prerequisites +- Under the configs dir, make sure to bring your model specific config.json for NxD-training. +- additionally ensure you bring your training config.YAML file, for more details, refer to the corresponding NXDT documentation. 
+ | Requirement | Reason | Install / Notes | |-------------|--------|-----------------| | **Neuron virtual‑env** | Script refuses to run outside it | `source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate` | diff --git a/src/inference/scripts/run-nki-benchmark.sh b/src/inference/scripts/run-nki-benchmark.sh index 32cab03..f6c2f9c 100755 --- a/src/inference/scripts/run-nki-benchmark.sh +++ b/src/inference/scripts/run-nki-benchmark.sh @@ -22,7 +22,7 @@ NC='\033[0m' # Default parameters MODE="${MODE:-evaluate_single}" ENABLE_NKI="${ENABLE_NKI:-true}" -SEQ_LEN="${SEQ_LEN:-640}" +SEQ_LEN="${SEQ_LEN:-2048}" TP_DEGREE="${TP_DEGREE:-${TENSOR_PARALLEL_SIZE}}" CLEAR_CACHE="${CLEAR_CACHE:-false}" AUTO_CLEAR_CACHE="${AUTO_CLEAR_CACHE:-true}" @@ -72,7 +72,7 @@ while [[ $# -gt 0 ]]; do echo " --mode MODE Benchmark mode (evaluate_single/evaluate_all) [default: evaluate_single]" echo " --model-name NAME Model name override" echo " --no-nki Disable NKI optimizations" - echo " --seq-len N Sequence length [default: 640]" + echo " --seq-len N Sequence length [default: 2048]" echo " --tp-degree N Tensor parallel degree [default: from config]" echo " --clear-cache Clear compilation cache before running" echo " --no-auto-clear-cache Disable automatic cache clearing on failure" From 93f50411567de83e7e5d481641030a1cf265b5b4 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Mon, 16 Jun 2025 14:44:35 -0700 Subject: [PATCH 29/65] Update nki-llama.sh --- nki-llama.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nki-llama.sh b/nki-llama.sh index e211d8b..b238da8 100755 --- a/nki-llama.sh +++ b/nki-llama.sh @@ -44,7 +44,7 @@ display_banner() { / | / // //_// _/ / / / / / | / |/ / / | / |/ // ,< / /______ / / / / / /| | / /|_/ / / /| | / /| // /| |_/ /_______/ /___/ /___ / ___ |/ / / / / ___ | -/_/ |_//_/ |_/___/ /_____/_____/ /_/ |_/_/ /_/ /_/ |_| +/_/ |_//_/ |_/___/ /_____/_____/ /_/ |_/_/ /_/ /_/ |_| EOF echo -e "${NC}" @@ -655,4 +655,4 @@ main() { } # Run main -main "$@" \ No newline at end of file +main "$@" From 1fa2e8654d27acd768b517c9d41d906bf4a433ee Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 17 Jun 2025 14:54:43 -0700 Subject: [PATCH 30/65] commit vllm updates --- src/inference/scripts/setup-vllm.sh | 2 +- src/inference/scripts/start-server.sh | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index 588f5bd..fb9683c 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -35,7 +35,7 @@ else fi # Install requirements -cd "$VLLM_REPO" +cd /home/vllm/ echo "Installing vLLM requirements..." 
pip install -U -r requirements/neuron.txt diff --git a/src/inference/scripts/start-server.sh b/src/inference/scripts/start-server.sh index a5d48e2..3ad0a15 100755 --- a/src/inference/scripts/start-server.sh +++ b/src/inference/scripts/start-server.sh @@ -40,11 +40,14 @@ echo -e "${YELLOW}Press Ctrl+C to stop${NC}" echo # Start vLLM server +#if using a reasoning model, make sure cd "$HOME" python -m vllm.entrypoints.openai.api_server \ --model="${NKI_MODELS}/${MODEL_NAME}" \ - --max-num-seqs="${MAX_NUM_SEQS}" \ + --max-num-seqs="${MAX_NUM_SEQS}" \ --max-model-len="${MAX_MODEL_LEN}" \ + --enable-reasoning \ + --reasoning-parser deepseek-r1 \ --tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ --port="${INFERENCE_PORT}" \ --device="neuron" \ From d803b1e5608e03c8af592849397bc403a9a3e61a Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 17 Jun 2025 14:56:14 -0700 Subject: [PATCH 31/65] fix: vllm path --- src/inference/scripts/setup-vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index fb9683c..aec9066 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -35,7 +35,7 @@ else fi # Install requirements -cd /home/vllm/ +cd /home/ubuntu/vllm/ echo "Installing vLLM requirements..." pip install -U -r requirements/neuron.txt From 2a1d6c345315cd59b6b361432e526bade14100a2 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 17 Jun 2025 15:07:01 -0700 Subject: [PATCH 32/65] fix: uninstall prev vllm wheels --- src/inference/scripts/setup-vllm.sh | 4 ++++ src/inference/scripts/start-server.sh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index aec9066..6901eab 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -34,6 +34,10 @@ else git clone https://github.com/vllm-project/vllm.git fi +# ---- NEW: make sure no wheel shadows the editable install --------------- +echo "Removing any previously installed vLLM wheels..." +pip uninstall -y vllm vllm-nightly vllm-neuron 2>/dev/null || true + # Install requirements cd /home/ubuntu/vllm/ echo "Installing vLLM requirements..." diff --git a/src/inference/scripts/start-server.sh b/src/inference/scripts/start-server.sh index 3ad0a15..6dc0392 100755 --- a/src/inference/scripts/start-server.sh +++ b/src/inference/scripts/start-server.sh @@ -47,7 +47,7 @@ python -m vllm.entrypoints.openai.api_server \ --max-num-seqs="${MAX_NUM_SEQS}" \ --max-model-len="${MAX_MODEL_LEN}" \ --enable-reasoning \ - --reasoning-parser deepseek-r1 \ + --reasoning-parser deepseek_r1 \ --tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ --port="${INFERENCE_PORT}" \ --device="neuron" \ From ccd9f60363960be5b09ec6f324952451b1db81e2 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 17 Jun 2025 15:16:44 -0700 Subject: [PATCH 33/65] fix: use neuron fork --- src/inference/scripts/setup-vllm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index 6901eab..602cd1d 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -31,7 +31,7 @@ if [[ -d "$VLLM_REPO" ]]; then else echo "Cloning vLLM repository..." 
cd "$(dirname "$VLLM_REPO")" - git clone https://github.com/vllm-project/vllm.git + git clone -b releases/v2.23.0-v0 https://github.com/aws-neuron/upstreaming-to-vllm.git fi # ---- NEW: make sure no wheel shadows the editable install --------------- @@ -41,7 +41,7 @@ pip uninstall -y vllm vllm-nightly vllm-neuron 2>/dev/null || true # Install requirements cd /home/ubuntu/vllm/ echo "Installing vLLM requirements..." -pip install -U -r requirements/neuron.txt +pip install -r requirements/neuron.txt # Install vLLM echo "Installing vLLM for Neuron..." From 49715f151e21a366269abb1ca9acee0e42b992b6 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 17 Jun 2025 15:29:58 -0700 Subject: [PATCH 34/65] fix: downgrade tf to 4.48.8 --- src/inference/scripts/reasoning-bench-lm-eval.sh | 2 +- src/inference/scripts/setup-vllm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh index e7752b7..51af0cb 100755 --- a/src/inference/scripts/reasoning-bench-lm-eval.sh +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -67,7 +67,7 @@ fi # --------------------------------------------------------------------- python - <<'PY' import subprocess, pkg_resources, sys -req = "4.50.0" +req = "4.48.2" try: ver = pkg_resources.get_distribution("transformers").version except pkg_resources.DistributionNotFound: diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index 602cd1d..752821d 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -50,7 +50,7 @@ VLLM_TARGET_DEVICE="neuron" pip install -e . # Ensure transformers < 4.50 (needed by Neuron hf_adapter) python - <<'PY' import subprocess, pkg_resources, sys -req = "4.50.0" +req = "4.48.2" try: ver = pkg_resources.get_distribution("transformers").version except pkg_resources.DistributionNotFound: From b554084dede3ad7ca3057bc6aaabe3d70502c307 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Tue, 17 Jun 2025 15:35:02 -0700 Subject: [PATCH 35/65] add dir name --- src/inference/scripts/setup-vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index 752821d..c0528db 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -39,7 +39,7 @@ echo "Removing any previously installed vLLM wheels..." pip uninstall -y vllm vllm-nightly vllm-neuron 2>/dev/null || true # Install requirements -cd /home/ubuntu/vllm/ +cd /home/ubuntu/upstreaming-to-vllm/ echo "Installing vLLM requirements..." 
pip install -r requirements/neuron.txt From f1fc5a0b8571019dd500271e8aaef3b03c00d1fb Mon Sep 17 00:00:00 2001 From: nithiyn Date: Wed, 18 Jun 2025 10:28:46 -0700 Subject: [PATCH 36/65] test with 2.22 --- nki-llama.config | 2 +- src/inference/scripts/setup-vllm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nki-llama.config b/nki-llama.config index c325c82..ca3b8aa 100644 --- a/nki-llama.config +++ b/nki-llama.config @@ -33,7 +33,7 @@ export NEURON_COMPILE_CACHE="${HOME}/.cache/neuron" export NEURON_RT_NUM_CORES="${NEURON_RT_NUM_CORES:-8}" # vLLM Configuration -export VLLM_REPO="${HOME}/vllm" +export VLLM_REPO="${HOME}/upstreaming-to-vllm" export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" # Dataset Configuration diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index c0528db..faf553d 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -31,7 +31,7 @@ if [[ -d "$VLLM_REPO" ]]; then else echo "Cloning vLLM repository..." cd "$(dirname "$VLLM_REPO")" - git clone -b releases/v2.23.0-v0 https://github.com/aws-neuron/upstreaming-to-vllm.git + git clone -b neuron-2.22-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git fi # ---- NEW: make sure no wheel shadows the editable install --------------- From 3f868d925891cdc765bfb605bed7765200e87faa Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 19 Jun 2025 14:36:37 -0700 Subject: [PATCH 37/65] fix: disable reasoning parser for vllm 2.22 compatibility --- .../scripts/reasoning-bench-lm-eval.sh | 29 +++---------------- src/inference/scripts/setup-vllm.sh | 3 +- src/inference/scripts/start-server.sh | 4 +-- 3 files changed, 6 insertions(+), 30 deletions(-) diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh index 51af0cb..1de139e 100755 --- a/src/inference/scripts/reasoning-bench-lm-eval.sh +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -34,32 +34,11 @@ fi # 2. Clone or update vLLM repo # --------------------------------------------------------------------- if [[ -d "$VLLM_REPO" ]]; then - echo "Updating existing vLLM repo …" - git -C "$VLLM_REPO" pull --ff-only + echo " vLLM repo exists" else - echo "Cloning vLLM repo …" - git clone https://github.com/vllm-project/vllm.git "$VLLM_REPO" -fi - -cd "$VLLM_REPO" - -# --------------------------------------------------------------------- -# 3. Install dependencies once, refresh editable install each run -# • If 'vllm' importable → skip deps, just refresh metadata -# • Else → first run: install deps + editable -# --------------------------------------------------------------------- -if python - <<'PY' >/dev/null 2>&1 -import importlib.util, sys -sys.exit(0 if importlib.util.find_spec("vllm") else 1) -PY -then - echo "vLLM already importable – skipping heavy deps install" - VLLM_TARGET_DEVICE="neuron" \ - pip install --quiet --no-deps -e . --exists-action=i -else - echo "Installing vLLM Neuron deps (first run) …" - pip install --quiet -r requirements/neuron.txt - VLLM_TARGET_DEVICE="neuron" pip install --quiet -e . 
+ echo "Run ./nki-llama inference setup first" + exit 1 + #git clone -b neuron-2.22-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git fi # --------------------------------------------------------------------- diff --git a/src/inference/scripts/setup-vllm.sh b/src/inference/scripts/setup-vllm.sh index faf553d..e15c5d6 100755 --- a/src/inference/scripts/setup-vllm.sh +++ b/src/inference/scripts/setup-vllm.sh @@ -26,7 +26,6 @@ fi if [[ -d "$VLLM_REPO" ]]; then echo "Updating existing vLLM repository..." cd "$VLLM_REPO" - git fetch git pull else echo "Cloning vLLM repository..." @@ -41,7 +40,7 @@ pip uninstall -y vllm vllm-nightly vllm-neuron 2>/dev/null || true # Install requirements cd /home/ubuntu/upstreaming-to-vllm/ echo "Installing vLLM requirements..." -pip install -r requirements/neuron.txt +pip install -r requirements-neuron.txt # Install vLLM echo "Installing vLLM for Neuron..." diff --git a/src/inference/scripts/start-server.sh b/src/inference/scripts/start-server.sh index 6dc0392..5467f1c 100755 --- a/src/inference/scripts/start-server.sh +++ b/src/inference/scripts/start-server.sh @@ -44,10 +44,8 @@ echo cd "$HOME" python -m vllm.entrypoints.openai.api_server \ --model="${NKI_MODELS}/${MODEL_NAME}" \ - --max-num-seqs="${MAX_NUM_SEQS}" \ + --max-num-seqs="${MAX_NUM_SEQS}" \ --max-model-len="${MAX_MODEL_LEN}" \ - --enable-reasoning \ - --reasoning-parser deepseek_r1 \ --tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ --port="${INFERENCE_PORT}" \ --device="neuron" \ From b28f87b33730a1b1bc11fe318cd0daa1a597e8b4 Mon Sep 17 00:00:00 2001 From: nithiyn Date: Thu, 19 Jun 2025 15:41:45 -0700 Subject: [PATCH 38/65] fix:reasoning bench standalone script --- src/inference/scripts/reasoning-bench-lm-eval.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh index 1de139e..01f0d59 100755 --- a/src/inference/scripts/reasoning-bench-lm-eval.sh +++ b/src/inference/scripts/reasoning-bench-lm-eval.sh @@ -7,8 +7,11 @@ set -euo pipefail # 0. 
Config + constants
 # ---------------------------------------------------------------------
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-source "${SCRIPT_DIR}/../../../nki-llama.config"   # sets: VLLM_REPO, NEURON_INFERENCE_VENV, …
+source "${SCRIPT_DIR}/../../../nki-llama.config"
 
+set -a            # auto-export everything that follows
+[ -f "${SCRIPT_DIR}/../../../.env" ] && source "${SCRIPT_DIR}/../../../.env"
+set +a
 # Where we keep AWS Neuron samples
 REASONING_BENCH_DIR="$HOME/aws-neuron-samples"
 

From 21de053b0ee6b914f710ae541c3fe57b1f045151 Mon Sep 17 00:00:00 2001
From: Nithiyn
Date: Mon, 23 Jun 2025 16:42:17 +0000
Subject: [PATCH 39/65] feat: reasoning bench datasets update

---
 src/inference/scripts/reasoning-bench-lm-eval.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/inference/scripts/reasoning-bench-lm-eval.sh b/src/inference/scripts/reasoning-bench-lm-eval.sh
index 01f0d59..4bb7748 100755
--- a/src/inference/scripts/reasoning-bench-lm-eval.sh
+++ b/src/inference/scripts/reasoning-bench-lm-eval.sh
@@ -97,7 +97,7 @@ test:
   accuracy:
     mytest:
       client: "lm_eval"
-      datasets: ["gsm8k_cot", "mmlu_flan_n_shot_generative_logical_fallacies"]
+      datasets: ["mmlu_pro", "gsm8k_cot", "mmlu_flan_cot_zeroshot"]
       max_concurrent_requests: 1
       timeout: 3600
       client_params:

From 44dcb30f733ebf1105e9a7d2db43703f833f3b76 Mon Sep 17 00:00:00 2001
From: Nithiyn
Date: Mon, 23 Jun 2025 17:41:51 +0000
Subject: [PATCH 40/65] chore: update reasoning bench doc

---
 src/inference/docs/reasoning-score-guide.md | 276 ++++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 src/inference/docs/reasoning-score-guide.md

diff --git a/src/inference/docs/reasoning-score-guide.md b/src/inference/docs/reasoning-score-guide.md
new file mode 100644
index 0000000..6e39884
--- /dev/null
+++ b/src/inference/docs/reasoning-score-guide.md
@@ -0,0 +1,276 @@
+# NKI-LLAMA Reasoning Benchmark Setup Guide
+
+This guide walks you through setting up and running reasoning benchmarks on your compiled model using the lm-eval harness. The benchmark evaluates model performance on reasoning tasks including GSM8K Chain-of-Thought, MMLU, and MMLU Pro datasets.
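+
+Before starting, you can optionally check that the task names used below exist in your installed lm-eval version. A minimal sketch (this assumes the `lm_eval` CLI from lm-evaluation-harness is on your PATH; the exact listing flag may differ between versions):
+
+```bash
+# Print the available tasks and filter for the ones this guide uses
+lm_eval --tasks list | grep -E "gsm8k_cot|mmlu_pro|mmlu_flan_cot_zeroshot"
+```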
+ +## Prerequisites + +- Ubuntu environment with AWS Neuron SDK installed +- Git configured with your credentials +- Python virtual environment set up +- tmux for session management + +## Step 1: Clone the Repository + +Navigate to the home directory and clone the AWS Neuron samples repository: + +```bash +cd /home/ubuntu +git clone https://github.com/aws-neuron/aws-neuron-samples.git +``` + +## Step 2: Configure Benchmark Datasets + +For the reasoning benchmark, we'll use three key datasets: +- **gsm8k_cot**: Grade School Math problems with Chain-of-Thought reasoning +- **mmlu_flan_cot_zeroshot**: Massive Multitask Language Understanding with zero-shot prompting +- **mmlu_pro**: Professional-level MMLU questions + +### Edit the Accuracy Configuration + +Navigate to the inference benchmarking directory and modify the accuracy configuration: + +```bash +cd /home/ubuntu/aws-neuron-samples/inference-benchmarking +vim accuracy.py +``` + +Update line 18 to include the required datasets in the `ACCURACY_CLIENTS_DATASETS` dictionary: + +```python +ACCURACY_CLIENTS_DATASETS = { + "lm_eval": [ + "gsm8k_cot", + "mmlu_flan_cot_zeroshot", # add + "mmlu_pro", #add + "mmlu_flan_n_shot_generative_computer_security", + "mmlu_flan_n_shot_generative_logical_fallacies", + "mmlu_flan_n_shot_generative_nutrition", + ], +} +``` + +## Step 3: Configure Sequence Length Requirements + +**Important**: Reasoning benchmarks require a minimum recommended sequence length of 4096 tokens during model compilation. + +### Set Environment Variables + +Navigate to your NKI-LLaMA directory and configure the `.env` file: + +```bash +cd /home/ubuntu/nki-llama/ +vim .env +``` + +Ensure the following environment variables are set: + +```bash +# For compilation with main.py +SEQ_LEN=4096 + +# For runtime inference +MAX_MODEL_LEN=4096 +``` + +## Step 4: Configure Model Paths + +The benchmark script sources environment variables from the top-level `nki-llama` directory. You have two options: + +### Option A: Use Environment Variables (Recommended) + +Ensure your `.env` file contains: + +```bash +MODEL_NAME="your-model-name" +NKI_MODELS="/path/to/your/models" +NKI_COMPILED="/path/to/compiled/models" +TENSOR_PARALLEL_SIZE=1 +INFERENCE_PORT=8000 +MAX_MODEL_LEN=4096 +``` + +### Option B: Hardcode Model Paths + +The benchmark script generates a YAML configuration file. Here's what it looks like: + +```yaml +server: + name: "Reasoning-benchmark server" + model_path: "${NKI_MODELS}/${MODEL_NAME}" + model_s3_path: null + compiled_model_path: "${NKI_COMPILED}/${MODEL_NAME}" + max_seq_len: ${MAX_MODEL_LEN} + context_encoding_len: ${MAX_MODEL_LEN} + tp_degree: ${TENSOR_PARALLEL_SIZE} + n_vllm_threads: ${TENSOR_PARALLEL_SIZE} + server_port: ${INFERENCE_PORT} + continuous_batch_size: 1 + +test: + accuracy: + mytest: + client: "lm_eval" + datasets: ["mmlu_pro", "gsm8k_cot", "mmlu_flan_cot_zeroshot"] + max_concurrent_requests: 1 + timeout: 3600 + client_params: + limit: 200 + use_chat: False +``` + +## Step 5: Running Multiple Model Comparisons + +If you're comparing base and fine-tuned models, ensure you update the model paths between runs: + +### For Base Model Run: +```bash +export MODEL_NAME="base-model-name" +``` + +### For Fine-tuned Model Run: +```bash +export MODEL_NAME="fine-tuned-model-name" +``` + +Alternatively, you can directly edit the generated YAML file to hardcode specific model paths. 
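+
+For reference, a fully hardcoded variant of that file might look like the sketch below. The concrete paths and sizes are illustrative placeholders only (they assume a Llama 3.1 8B model under `/home/ubuntu/models` and compiled artifacts under `/home/ubuntu/compiled`) — substitute the values that match your own environment:
+
+```yaml
+server:
+  name: "Reasoning-benchmark server"
+  model_path: "/home/ubuntu/models/llama-3-1-8b"             # base or fine-tuned HF weights (placeholder)
+  model_s3_path: null
+  compiled_model_path: "/home/ubuntu/compiled/llama-3-1-8b"  # NKI-compiled artifacts (placeholder)
+  max_seq_len: 4096
+  context_encoding_len: 4096
+  tp_degree: 8
+  n_vllm_threads: 8
+  server_port: 8000
+  continuous_batch_size: 1
+
+test:
+  accuracy:
+    mytest:
+      client: "lm_eval"
+      datasets: ["mmlu_pro", "gsm8k_cot", "mmlu_flan_cot_zeroshot"]
+      max_concurrent_requests: 1
+      timeout: 3600
+      client_params:
+        limit: 200
+        use_chat: False
+```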
+ +## Step 6: Execute the Benchmark + +### Start a tmux Session + +```bash +tmux new-session -d -s reasoning-benchmark +tmux attach-session -t reasoning-benchmark +``` + +### Activate Virtual Environment and Run Benchmark + +```bash +# Navigate to the inference scripts directory +cd /home/ubuntu/nki-llama/src/inference/scripts/ + +# Activate your virtual environment +source /path/to/your/venv/bin/activate + +# Run the reasoning benchmark +./reasoning-bench-lm-eval.sh +``` + +## Step 7: Results Analysis + +After completion, the benchmark will generate results in the `aws-neuron-samples` directory under a path specific to your model. + +### Expected Output Format + +The results will be saved as a JSON file with the following structure: + +```json +{ + "results": { + "gsm8k_cot": { + "alias": "gsm8k_cot", + "exact_match,strict-match": 0.78, + "exact_match_stderr,strict-match": 0.029365141882663297, + "exact_match,flexible-extract": 0.72, + "exact_match_stderr,flexible-extract": 0.03182868716477582 + }, + "mmlu_flan_cot_zeroshot": { + "alias": "mmlu_flan_cot_zeroshot", + "acc,none": 0.65, + "acc_stderr,none": 0.0234 + }, + "mmlu_pro": { + "alias": "mmlu_pro", + "acc,none": 0.42, + "acc_stderr,none": 0.0189 + } + }, + "group_subtasks": { + "gsm8k_cot": [], + "mmlu_flan_cot_zeroshot": [], + "mmlu_pro": [] + }, + "configs": { + "gsm8k_cot": { + "task": "gsm8k_cot", + "dataset_path": "gsm8k", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "description": "Answer the following question with step-by-step reasoning." + } + }, + "versions": { + "gsm8k_cot": 1, + "mmlu_flan_cot_zeroshot": 1, + "mmlu_pro": 1 + }, + "n-shot": { + "gsm8k_cot": 0, + "mmlu_flan_cot_zeroshot": 0, + "mmlu_pro": 0 + }, + "higher_is_better": { + "gsm8k_cot": { + "exact_match,strict-match": true, + "exact_match,flexible-extract": true + }, + "mmlu_flan_cot_zeroshot": { + "acc,none": true + }, + "mmlu_pro": { + "acc,none": true + } + } +} +``` + +## Understanding the Results + +### Key Metrics + +- **exact_match,strict-match**: Percentage of exactly correct answers using strict matching +- **exact_match,flexible-extract**: Percentage of correct answers using flexible extraction +- **acc,none**: Overall accuracy percentage +- **stderr**: Standard error of the measurement + +### Benchmark Interpretations + +- **GSM8K CoT**: Measures mathematical reasoning ability with step-by-step problem solving +- **MMLU**: Evaluates broad knowledge across multiple academic domains +- **MMLU Pro**: Tests professional-level understanding and application + +## Troubleshooting + +### Common Issues + +1. **Sequence Length Errors**: Ensure `SEQ_LEN` and `MAX_MODEL_LEN` are set to at least 4096 +2. **Model Path Issues**: Verify environment variables or hardcoded paths are correct +3. **Memory Issues**: Consider adjusting `TENSOR_PARALLEL_SIZE` based on your hardware +4. **Timeout Errors**: Increase the timeout value in the YAML configuration if needed + +### Debug Commands + +```bash +# Check environment variables +cd /home/ubuntu/nki-llama/ +source .env +env | grep -E "(MODEL_NAME|SEQ_LEN|MAX_MODEL_LEN)" + +# Verify model paths exist +ls -la ${NKI_MODELS}/${MODEL_NAME} +ls -la ${NKI_COMPILED}/${MODEL_NAME} + +# Check tmux sessions +tmux list-sessions +``` + +## Best Practices + +1. **Use tmux**: Long-running benchmarks benefit from persistent sessions +2. **Monitor Resources**: Keep an eye on GPU/CPU usage during execution +3. 
**Save Results**: Archive results with timestamps for comparison +4. **Document Changes**: Keep track of configuration changes between runs +5. **Version Control**: Use git to track modifications to benchmark scripts + +This comprehensive setup ensures reliable and reproducible reasoning benchmark results for your AWS Neuron model evaluations. \ No newline at end of file From ba462f859eb5a4a7c7f8d2504dd91709b74f2dde Mon Sep 17 00:00:00 2001 From: nithiyn Date: Mon, 23 Jun 2025 14:12:38 -0400 Subject: [PATCH 41/65] chore: fix docs --- src/inference/docs/reasoning-score-guide.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/inference/docs/reasoning-score-guide.md b/src/inference/docs/reasoning-score-guide.md index 6e39884..32cbb46 100644 --- a/src/inference/docs/reasoning-score-guide.md +++ b/src/inference/docs/reasoning-score-guide.md @@ -4,6 +4,7 @@ This guide walks you through setting up and running reasoning benchmarks on your ## Prerequisites +- Ensure you have run `./nki-llama inference setup` first - Ubuntu environment with AWS Neuron SDK installed - Git configured with your credentials - Python virtual environment set up @@ -273,4 +274,4 @@ tmux list-sessions 4. **Document Changes**: Keep track of configuration changes between runs 5. **Version Control**: Use git to track modifications to benchmark scripts -This comprehensive setup ensures reliable and reproducible reasoning benchmark results for your AWS Neuron model evaluations. \ No newline at end of file +This comprehensive setup ensures reliable and reproducible reasoning benchmark results for your AWS Neuron model evaluations. From 57a22f6e308f68cfc6516dfc84fbe110eebf694c Mon Sep 17 00:00:00 2001 From: nithiyn Date: Mon, 23 Jun 2025 14:14:07 -0400 Subject: [PATCH 42/65] Update reasoning-score-guide.md --- src/inference/docs/reasoning-score-guide.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/inference/docs/reasoning-score-guide.md b/src/inference/docs/reasoning-score-guide.md index 32cbb46..85f38f4 100644 --- a/src/inference/docs/reasoning-score-guide.md +++ b/src/inference/docs/reasoning-score-guide.md @@ -4,11 +4,9 @@ This guide walks you through setting up and running reasoning benchmarks on your ## Prerequisites +- Ensure you are in the pytorch_2_6 nxdi venv - Ensure you have run `./nki-llama inference setup` first -- Ubuntu environment with AWS Neuron SDK installed -- Git configured with your credentials -- Python virtual environment set up -- tmux for session management +- tmux for session management - start a new session eg: `tmux new -s bench` ## Step 1: Clone the Repository From ba4a97a510532de2faf92fb47148e0e88af853f0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 25 Jun 2025 23:34:04 +0000 Subject: [PATCH 43/65] Added Lora Merge Script --- .gitignore | 3 + src/fine-tune/utils/merge_lora_checkpoint.py | 290 +++++++++++++++++++ src/fine-tune/utils/merge_lora_model.sh | 47 +++ 3 files changed, 340 insertions(+) create mode 100644 src/fine-tune/utils/merge_lora_checkpoint.py create mode 100755 src/fine-tune/utils/merge_lora_model.sh diff --git a/.gitignore b/.gitignore index 854245e..dbc6f0f 100644 --- a/.gitignore +++ b/.gitignore @@ -272,5 +272,8 @@ cached_requirements.txt benchmark_finetuning.json benchmark_results.json **/logs/ +compiled_merged_model/ +compiled_model/ +merged_model/ # End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python \ No newline at end of file diff --git 
a/src/fine-tune/utils/merge_lora_checkpoint.py b/src/fine-tune/utils/merge_lora_checkpoint.py new file mode 100644 index 0000000..d1e1011 --- /dev/null +++ b/src/fine-tune/utils/merge_lora_checkpoint.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +""" +Script to merge LoRA weights from NeuronX checkpoint with base model +and convert to standard HuggingFace format +""" + +import torch +import os +import json +from transformers import AutoModelForCausalLM, AutoTokenizer +import argparse +from collections import defaultdict + +def load_neuronx_lora_checkpoint(checkpoint_dir, tp_size=32): + """Load and merge LoRA weights from distributed NeuronX checkpoint""" + + # Add XLA safe globals + try: + import torch_xla.utils.serialization + torch.serialization.add_safe_globals([torch_xla.utils.serialization.TensorReference]) + except ImportError: + print("Warning: torch_xla not available") + + print(f"Loading distributed checkpoint from {checkpoint_dir}") + + # Collect all checkpoint files and their tensor files + checkpoint_files = [] + for f in os.listdir(checkpoint_dir): + if f.endswith('.pt') and not f.endswith('.tensors') and not f.endswith('.info.pt'): + checkpoint_files.append(f) + + checkpoint_files.sort() + print(f"Found {len(checkpoint_files)} checkpoint files") + + # Load all checkpoints and merge weights + merged_state = {} + lora_config = None + + for i, filename in enumerate(checkpoint_files): + print(f"Loading {filename} ({i+1}/{len(checkpoint_files)})") + checkpoint_path = os.path.join(checkpoint_dir, filename) + tensor_path = os.path.join(checkpoint_dir, filename + '.tensors') + + try: + # Load checkpoint metadata + checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False) + + # Extract LoRA config from first checkpoint + if lora_config is None and 'lora_config' in checkpoint: + lora_config = checkpoint['lora_config'] + print(f"Found LoRA config: {lora_config}") + + # Load actual tensor data + if os.path.exists(tensor_path): + print(f" Loading tensors from {filename}.tensors") + tensor_data = torch.load(tensor_path, map_location='cpu', weights_only=False) + + # Merge the checkpoint metadata with tensor data + for key in checkpoint.keys(): + if key == 'lora_config': + continue + + # Get the actual tensor + if key in tensor_data: + tensor = tensor_data[key] + elif hasattr(checkpoint[key], 'materialize'): + tensor = checkpoint[key].materialize() + else: + # Fallback: try to use the checkpoint data directly + tensor = checkpoint[key] + if str(type(tensor)) == "": + print(f"Warning: Could not load tensor for {key}, skipping...") + continue + + if key not in merged_state: + merged_state[key] = [] + merged_state[key].append(tensor) + else: + print(f"Warning: No tensor file found for {filename}, trying direct loading...") + # Try to load tensors directly from checkpoint + for key, tensor_ref in checkpoint.items(): + if key == 'lora_config': + continue + + if hasattr(tensor_ref, 'materialize'): + tensor = tensor_ref.materialize() + elif torch.is_tensor(tensor_ref): + tensor = tensor_ref + else: + print(f"Warning: Could not process tensor for {key}, skipping...") + continue + + if key not in merged_state: + merged_state[key] = [] + merged_state[key].append(tensor) + + except Exception as e: + print(f"Error loading {filename}: {e}") + continue + + # Concatenate tensors across TP ranks + print("Merging tensors across tensor parallel ranks...") + final_state = {} + for key, tensor_list in merged_state.items(): + if len(tensor_list) == 1: + final_state[key] = tensor_list[0] + 
else: + try: + # Determine concatenation dimension based on weight type + if any(x in key for x in ['weight_q', 'weight_k', 'weight_v']): + if 'lora_B' in key: + # LoRA B matrices: concatenate along output dimension (dim=0) + final_state[key] = torch.cat(tensor_list, dim=0) + elif 'lora_A' in key: + # LoRA A matrices: concatenate along input dimension (dim=1) + final_state[key] = torch.cat(tensor_list, dim=1) + elif 'base_layer' in key: + # Base layer weights: concatenate along output dimension (dim=0) + final_state[key] = torch.cat(tensor_list, dim=0) + else: + final_state[key] = torch.cat(tensor_list, dim=0) + elif 'gate_up_proj' in key: + # Gate-up projection: concatenate along output dimension + final_state[key] = torch.cat(tensor_list, dim=0) + elif 'o_proj' in key: + # Output projection: concatenate along input dimension + final_state[key] = torch.cat(tensor_list, dim=1) + elif 'down_proj' in key: + # Down projection: concatenate along input dimension + final_state[key] = torch.cat(tensor_list, dim=1) + else: + # For other weights (embeddings, norms), take the first one (should be identical) + final_state[key] = tensor_list[0] + + print(f" Merged {key}: {[t.shape for t in tensor_list]} -> {final_state[key].shape}") + + except Exception as e: + print(f"Error merging {key}: {e}") + # Fallback: take the first tensor + final_state[key] = tensor_list[0] + + return final_state, lora_config + +def merge_lora_weights(base_weights, lora_weights, lora_config): + """Merge LoRA weights with base weights""" + + print("Starting LoRA weight merging...") + merged_weights = {} + + # Copy all base weights first + for key, weight in base_weights.items(): + merged_weights[key] = weight.clone() + + # Extract LoRA parameters + alpha = lora_config.get('lora_alpha', 32) + r = lora_config.get('r', 16) + scaling = alpha / r + + print(f"LoRA parameters: alpha={alpha}, rank={r}, scaling={scaling}") + + # Group LoRA weights by layer and component + lora_layers = defaultdict(dict) + for key in lora_weights: + if 'lora_A' in key or 'lora_B' in key: + # Parse key: model.layers.X.self_attn.qkv_proj.lora_A.weight + parts = key.split('.') + layer_idx = parts[2] + component = parts[4] # qkv_proj + lora_type = parts[5] # lora_A or lora_B + + if len(parts) > 6: + weight_type = parts[6] # weight_q, weight_k, weight_v + else: + weight_type = 'weight' + + layer_key = f"layers.{layer_idx}.self_attn.{component}" + lora_key = f"{lora_type}.{weight_type}" + lora_layers[layer_key][lora_key] = lora_weights[key] + + # Apply LoRA merging for each layer + for layer_key, lora_params in lora_layers.items(): + print(f"Processing {layer_key}") + + # Handle QKV projections - merge each Q, K, V separately + if 'qkv_proj' in layer_key: + layer_num = layer_key.split('.')[1] + + for weight_type in ['q', 'k', 'v']: + # Find corresponding base layer weight + base_key = f"model.layers.{layer_num}.self_attn.qkv_proj.base_layer.weight_{weight_type}" + lora_a_key = f"lora_A.weight" + lora_b_key = f"lora_B.weight_{weight_type}" + + if (base_key in lora_weights and + lora_a_key in lora_params and + lora_b_key in lora_params): + + base_weight = lora_weights[base_key] # Use the base weight from checkpoint + lora_a = lora_params[lora_a_key] + lora_b = lora_params[lora_b_key] + + print(f" Merging {weight_type} projection:") + print(f" Base: {base_weight.shape}") + print(f" LoRA A: {lora_a.shape}") + print(f" LoRA B: {lora_b.shape}") + + # Apply LoRA: W = W_base + scaling * (B @ A) + try: + delta = scaling * torch.mm(lora_b, lora_a) + merged_weight = 
base_weight + delta + + # Create standard HF key name + standard_key = f"model.layers.{layer_num}.self_attn.{weight_type}_proj.weight" + merged_weights[standard_key] = merged_weight + + print(f" Merged -> {standard_key}: {merged_weight.shape}") + + except Exception as e: + print(f" Error merging {weight_type}: {e}") + # Fallback: use base weight only + standard_key = f"model.layers.{layer_num}.self_attn.{weight_type}_proj.weight" + merged_weights[standard_key] = base_weight + + # Add non-LoRA weights from the checkpoint + for key, weight in lora_weights.items(): + # Skip LoRA-specific keys + if any(x in key for x in ['lora_A', 'lora_B', 'base_layer']): + continue + + # Add standard weights (embeddings, layer norms, MLPs, etc.) + merged_weights[key] = weight + + print(f"Final merged model has {len(merged_weights)} parameters") + return merged_weights + +def main(): + parser = argparse.ArgumentParser(description='Merge LoRA checkpoint with base model') + parser.add_argument('--checkpoint_dir', required=True, help='Path to NeuronX checkpoint directory') + parser.add_argument('--base_model_path', required=True, help='Path to base model') + parser.add_argument('--output_dir', required=True, help='Output directory for merged model') + parser.add_argument('--tp_size', type=int, default=32, help='Tensor parallel size') + + args = parser.parse_args() + + print("Step 1: Loading base model...") + base_model = AutoModelForCausalLM.from_pretrained( + args.base_model_path, + torch_dtype=torch.float16, + device_map="cpu" + ) + + print("Step 2: Loading NeuronX LoRA checkpoint...") + lora_weights, lora_config = load_neuronx_lora_checkpoint(args.checkpoint_dir, args.tp_size) + + print("Step 3: Merging LoRA weights...") + base_state_dict = base_model.state_dict() + merged_weights = merge_lora_weights(base_state_dict, lora_weights, lora_config) + + print("Step 4: Loading merged weights into model...") + # Filter merged weights to only include keys that exist in the base model + filtered_weights = {} + for key in base_model.state_dict().keys(): + if key in merged_weights: + filtered_weights[key] = merged_weights[key] + else: + print(f"Warning: {key} not found in merged weights, keeping original") + filtered_weights[key] = base_model.state_dict()[key] + + base_model.load_state_dict(filtered_weights, strict=True) + + print("Step 5: Saving merged model...") + os.makedirs(args.output_dir, exist_ok=True) + base_model.save_pretrained(args.output_dir, safe_serialization=True) + + # Copy tokenizer + try: + tokenizer = AutoTokenizer.from_pretrained(args.base_model_path) + tokenizer.save_pretrained(args.output_dir) + print("Tokenizer copied successfully") + except Exception as e: + print(f"Warning: Could not copy tokenizer: {e}") + + print(f"Merged model saved to {args.output_dir}") + + # Print some statistics + total_params = sum(p.numel() for p in base_model.parameters()) + print(f"Total parameters in merged model: {total_params:,}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/fine-tune/utils/merge_lora_model.sh b/src/fine-tune/utils/merge_lora_model.sh new file mode 100755 index 0000000..842f06f --- /dev/null +++ b/src/fine-tune/utils/merge_lora_model.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -e + +# Define paths +CHECKPOINT_PATH="/home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/nemo_experiments/hf_llama/2025-06-11_15-11-22/checkpoints/hf_llama3_8B_SFT--step=5000-consumed_samples=319424.0.ckpt/" 
+BASE_MODEL_PATH="/home/ubuntu/nki-llama/src/fine-tune/model_assets/llama_3-1_8b/" +OUTPUT_PATH="/home/ubuntu/nki-llama/merged_model/" +MERGE_SCRIPT="/home/ubuntu/nki-llama/src/fine-tune/merge_lora_checkpoint.py" + +# Ensure output directory exists +mkdir -p "${OUTPUT_PATH}" + +echo "=== LoRA Model Merging Process ===" +echo "Checkpoint: ${CHECKPOINT_PATH}" +echo "Base Model: ${BASE_MODEL_PATH}" +echo "Output: ${OUTPUT_PATH}" +echo "" + +# Activate the training environment (needed for XLA utilities) +echo "Activating NeuronX training environment..." +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# Download packages +echo "=== Downloading Libararies! ===" +pip install transformers +pip install accelerate +echo "=== Download Complete! ===" + +# Run the LoRA merge script +echo "Merging LoRA weights with base model..." +python3 ${MERGE_SCRIPT} \ + --checkpoint_dir "${CHECKPOINT_PATH}/model" \ + --base_model_path "${BASE_MODEL_PATH}" \ + --output_dir "${OUTPUT_PATH}" \ + --tp_size 32 + +echo "" +echo "=== LoRA Model Merge Complete! ===" +echo "Your merged model is ready at: ${OUTPUT_PATH}" +echo "" +echo "You can now use this model directly with transformers:" +echo " from transformers import AutoModelForCausalLM, AutoTokenizer" +echo " model = AutoModelForCausalLM.from_pretrained('${OUTPUT_PATH}')" +echo " tokenizer = AutoTokenizer.from_pretrained('${OUTPUT_PATH}')" + +# Deactivate environment +deactivate \ No newline at end of file From fb65ff9dd2e43d089184fff588db0a8a1eda0c31 Mon Sep 17 00:00:00 2001 From: Nithiyn Date: Thu, 26 Jun 2025 19:17:30 +0000 Subject: [PATCH 44/65] chore: update docs and env --- .env.example | 4 +- README.md | 152 +++++++++++++++++++++--- src/README.md | 314 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 447 insertions(+), 23 deletions(-) diff --git a/.env.example b/.env.example index 0dfbbc8..0bcc739 100644 --- a/.env.example +++ b/.env.example @@ -8,8 +8,8 @@ MODEL_NAME=llama-3-8b # Inference Configuration INFERENCE_PORT=8080 -MAX_MODEL_LEN=2048 # used by vllm- ensure it is the same as seq len -SEQ_LEN=2048 #used by main.py +MAX_MODEL_LEN=8192 # used by vllm- ensure it is the same as seq len +SEQ_LEN=8192 #used by main.py MAX_NUM_SEQS=4 TENSOR_PARALLEL_SIZE=8 diff --git a/README.md b/README.md index bc3aaf4..b2b6816 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,14 @@ NKI-LLAMA provides a streamlined interface for the complete LLM development life ``` ┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ ┌──────────────┐ │ │ │ │ │ │ │ │ -│ Fine-tuning │────▶│ NKI Compilation │────▶│ vLLM Inference │────▶│ Agent │ -│ (NxD) │ │ & Benchmarking │ │ (NxDI) │ │ Development │ +│ Fine-tuning │────▶│ NKI Compilation │────▶│ vLLM Inference │────▶│ Reasoning │-> cumulative score () +│ (NxD) │ │ & Benchmarking │ │ (NxDI) │ │ Benchmark │ │ │ │ │ │ │ │ │ └─────────────────┘ └──────────────────┘ └─────────────────┘ └──────────────┘ - │ │ │ │ - ▼ ▼ ▼ ▼ - Trained Model NKI-Optimized API Endpoint LLM Apps - Model Artifacts (OpenAI Compatible) + │ │ │ │ + ▼ ▼ ▼ ▼ + Trained Model NKI-Optimized API Endpoint test PT + Model Artifacts (OpenAI Compatible) model reasoning ``` ### Key Technologies @@ -89,7 +89,7 @@ tmux new -s benchmark ## 📊 Score Calculation Workflow -The NKI-LLAMA platform includes a comprehensive score calculation system that evaluates both training and inference performance. For detailed information about the scoring system, see the [Score Calculation README](src/README.md). 
+The NKI-LLAMA platform includes a comprehensive score calculation system that evaluates training performance, inference performance, and reasoning capabilities. For detailed information about the scoring system, see the [Score Calculation README](src/README.md). ### Workflow Overview @@ -106,10 +106,16 @@ The NKI-LLAMA platform includes a comprehensive score calculation system that ev - This log contains metrics like latency, throughput, and MFU - The benchmark inference file is always generated at: `benchmark_inference.json` -3. **Score Collection**: - - Once training completes, scores can be calculated using the handler - - If only training is done, you'll get the NKI kernel training score - - If both training and inference are complete, you'll get the full NKI-LLAMA score +3. **Reasoning Evaluation** (Optional): + - Run reasoning benchmarks using the dedicated script + - Results are automatically discovered and integrated by the handler + - Supports multiple reasoning datasets (GSM8K, MMLU, etc.) + +4. **Score Collection**: + - Once components complete, scores can be calculated using the handler + - Training-only: NKI kernel training score + - Training + Inference: Full NKI-LLAMA score + - Training + Inference + Reasoning: Complete benchmark score with reasoning metrics ### Example Test Run @@ -126,7 +132,13 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate ./nki-llama inference benchmark # or directly run from main.py in src/inference/ for full use of flags for bucketing/context encoding and others -# Step 3: Calculate scores +# Step 3: Run reasoning evaluation (optional for complete benchmark) +tmux new -s reasoning +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh +# Results are automatically saved to aws-neuron-samples/inference-benchmarking/results/ + +# Step 4: Calculate scores # For training-only score: python /home/ubuntu/nki-llama/src/handler.py \ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ @@ -159,6 +171,25 @@ python /home/ubuntu/nki-llama/src/handler.py \ --calculate-score \ --detailed \ --verbose + +# For complete score (with reasoning - handler automatically discovers results): +python /home/ubuntu/nki-llama/src/handler.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ + --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ + --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ + --reasoning-results \ + --throughput 2.1 \ + --output benchmark_results.json \ + --training-weight 0.33 \ + --inference-weight 0.33 \ + --reasoning-weight 0.34 \ + --hw-backend trn1 \ + --per-file-scores \ + --calculate-score \ + --detailed \ + --verbose ``` The score calculation provides insights into: @@ -222,6 +253,37 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate > **Note**: The `evaluate_single` mode is currently not functional. Use `evaluate_all` (default) for all benchmarking needs. 
+### Reasoning Evaluation + +```bash +# Activate inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate + +# Run reasoning benchmarks (requires vLLM server) +./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh + +# The script will: +# 1. Start vLLM server with your compiled model +# 2. Run lm-eval on reasoning datasets (GSM8K, MMLU, etc.) +# 3. Save results to aws-neuron-samples/inference-benchmarking/results/ +# 4. Results are automatically discovered by handler.py +``` + +#### Reasoning Datasets + +The reasoning evaluation includes multiple datasets: +- **GSM8K CoT**: Grade school math with chain-of-thought reasoning +- **MMLU Pro**: Massive multitask language understanding (professional level) +- **MMLU Flan CoT**: MMLU with chain-of-thought prompting + +#### Result Integration + +The handler automatically discovers reasoning results by: +1. **Model Mapping**: Maps your model configuration to result directory paths +2. **Score Extraction**: Parses "exact_match,strict-match" scores from JSON files +3. **Path Discovery**: Searches `aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/` +4. **Graceful Fallback**: Works without reasoning results (training+inference only) + ### Inference Serving ```bash @@ -235,6 +297,19 @@ tmux new -s vllm # Server will use NKI-compiled artifacts from benchmarking ``` +### Reasoning Evaluation + +```bash +# Activate inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate + +# Run complete reasoning evaluation +./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh + +# Results are automatically saved and discovered by handler.py +# No additional flags needed - handler finds results by model path +``` + ### Development Tools ```bash @@ -345,6 +420,28 @@ neuron-top export TENSOR_PARALLEL_SIZE=4 # Reduce from 8 ``` +#### Reasoning Evaluation Issues +```bash +# Symptoms: "No reasoning results found" in handler output +# Check if results directory exists: +ls -la ~/aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/ + +# Symptoms: vLLM server connection errors during reasoning eval +# Ensure server is running and accessible: +curl http://localhost:8080/v1/models + +# Symptoms: Reasoning script fails with transformers version error +# The script automatically installs transformers<4.50 for compatibility +# If issues persist, manually install: +pip install "transformers<4.48.2" + +# Symptoms: Handler can't find reasoning results for your model +# Check model path mapping in results directory: +# Results are stored by model path: /home/ubuntu/models/llama-3-1-8b +# becomes: __home__ubuntu__models__llama-3-1-8b +ls -la ~/aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/*/ +``` + ## 🏗️ Project Structure ``` @@ -389,6 +486,12 @@ NEURON_RT_NUM_CORES=8 # Inference Parameters INFERENCE_PORT=8080 MAX_MODEL_LEN=2048 + +# Reasoning Evaluation (Optional) +# These are automatically configured by reasoning-bench-lm-eval.sh +# REASONING_DATASETS="mmlu_pro, gsm8k_cot, mmlu_flan_cot_zeroshot" +# REASONING_LIMIT=200 # Number of samples per dataset +# REASONING_TIMEOUT=3600 # Timeout in seconds ``` ## 🎓 Complete Workflow Example @@ -412,9 +515,28 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate # Detach: Ctrl+B, D ``` -### Step 3: Calculate Performance Score +### Step 3: Reasoning Evaluation (Optional) +```bash +tmux new -s reasoning +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate 
+./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh +# Runs GSM8K, MMLU Pro, and MMLU Flan CoT evaluations +# Results saved to aws-neuron-samples/inference-benchmarking/results/ +# Detach: Ctrl+B, D +``` + +### Step 4: Calculate Performance Score ```bash # After training and/or inference completes +# For complete score with reasoning (if Step 3 was run): +python /home/ubuntu/nki-llama/src/handler.py \ + --compile-dir /path/from/training/logs \ + --log-file logs/nki-llama_latest.log \ + --inference-results benchmark_inference.json \ + --reasoning-results \ + --calculate-score + +# For training+inference only (without reasoning): python /home/ubuntu/nki-llama/src/handler.py \ --compile-dir /path/from/training/logs \ --log-file logs/nki-llama_latest.log \ @@ -422,7 +544,7 @@ python /home/ubuntu/nki-llama/src/handler.py \ --calculate-score ``` -### Step 4: Serve Model +### Step 5: Serve Model ```bash tmux new -s vllm-server source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate @@ -431,7 +553,7 @@ source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate # Detach: Ctrl+B, D ``` -### Step 5: Build Applications +### Step 6: Build Applications ```bash # Terminal 1: Keep server running # Terminal 2: Development diff --git a/src/README.md b/src/README.md index a587c06..23ce614 100644 --- a/src/README.md +++ b/src/README.md @@ -7,15 +7,21 @@ A benchmarking system for evaluating NKI-LLAMA model performance across both tra The NKI-LLAMA Benchmark Handler calculates a unified performance score that combines: - **Training metrics**: MFU (Model FLOPs Utilization), throughput, and NKI kernel usage - **Inference metrics**: Latency, throughput, and accuracy (optional) +- **Reasoning metrics**: Accuracy scores from reasoning benchmarks (GSM8K, MMLU, etc.) - **NKI optimization**: Ratio of NKI (Neuron Kernel Interface) operations to total operations -The system supports two modes: +The system supports multiple modes: - **Training-only mode**: When inference results are not available, provides NKI kernel training score - **Combined mode**: When both training and inference results are available, provides full NKI-LLAMA score +- **Full integration mode**: When training, inference, and reasoning results are available, provides comprehensive NKI-LLAMA score with reasoning component The final score follows the formula: ``` +# Combined mode (training + inference): Score = Accuracy × Reduced Latency × Increased Throughput × (1 + Normalized NKI FLOPS) + +# Full integration mode (training + inference + reasoning): +Score = (Base Score) × (1 + Reasoning Score Weight × Reasoning Accuracy) ``` ## 💻 Usage @@ -42,6 +48,54 @@ python handler.py --calculate-score This provides immediate feedback on NKI kernel optimization progress without requiring inference implementation. +### Reasoning Results Integration + +The handler can automatically discover and integrate reasoning benchmark results from the aws-neuron-samples inference-benchmarking framework. 
+ +#### Automatic Reasoning Result Discovery + +When reasoning results are available, the handler automatically discovers them based on your model configuration: + +```bash +# Run with reasoning integration (automatic discovery) +python handler.py --reasoning-results +``` + +The handler searches for reasoning results in: +``` +aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/ +├── gsm8k_cot/ +├── mmlu_pro/ +└── mmlu_flan_cot_zeroshot/ +``` + +#### Custom Reasoning Results Path + +You can specify a custom path to reasoning results: + +```bash +# Use custom reasoning results directory +python handler.py --reasoning-results /path/to/custom/reasoning/results +``` + +#### Running Complete Workflow + +1. **First, run reasoning benchmarks** using the provided script: + ```bash + # Activate inference environment + source /path/to/inference/venv/bin/activate + + # Run reasoning benchmarks (this takes time) + cd nki-llama/src/inference/scripts + ./reasoning-bench-lm-eval.sh + ``` + +2. **Then run the handler** to get comprehensive scores: + ```bash + # Get full NKI-LLAMA score with reasoning integration + python handler.py --reasoning-results --calculate-score + ``` + ### Advanced Usage #### Custom Training Configuration @@ -83,9 +137,37 @@ python /home/ubuntu/nki-llama/src/handler.py \ #### Adjust Score Weights ```bash +# Adjust training and inference weights python handler.py \ --training-weight 0.3 \ --inference-weight 0.7 + +# Include reasoning with custom weights +python handler.py \ + --training-weight 0.3 \ + --inference-weight 0.5 \ + --reasoning-weight 0.2 \ + --reasoning-results +``` + +#### Reasoning Integration Examples +```bash +# Full workflow with reasoning integration +python handler.py \ + --reasoning-results \ + --calculate-score \ + --verbose + +# Use custom reasoning results directory +python handler.py \ + --reasoning-results-path /custom/reasoning/results \ + --reasoning-weight 0.25 + +# Training + reasoning only (skip inference) +python handler.py \ + --reasoning-results \ + --training-weight 0.8 \ + --reasoning-weight 0.2 ``` #### Verbose Output @@ -113,6 +195,13 @@ python handler.py --verbose | `--reference-latency` | `50000` | Reference implementation latency (ms) | | `--reference-throughput` | `10` | Reference implementation throughput (tokens/s) | +#### Reasoning Metrics Options +| Option | Default | Description | +|--------|---------|-------------| +| `--reasoning-results` | `None` | Enable reasoning results integration (auto-discovery) | +| `--reasoning-results-path` | `~/aws-neuron-samples/inference-benchmarking/results` | Custom path to reasoning results directory | +| `--reasoning-weight` | `0.2` | Weight for reasoning score component (0-1) | + #### Score Calculation Options | Option | Default | Description | |--------|---------|-------------| @@ -161,6 +250,47 @@ Score Weights: ====================================================================== ``` +### Console Output - Full Integration Mode (with Reasoning) +``` +====================================================================== +NKI-LLAMA BENCHMARK RESULTS +====================================================================== + +🏆 FINAL NKI-LLAMA SCORE: 0.0055 + +Score Weights: + Training: 30% + Inference: 50% + Reasoning: 20% + +📊 Component Scores: + Training Score: 0.0077 + Inference Score: 0.0026 + Reasoning Score: 0.555 + NKI Ratio: 0.1846 + +🎯 Training Metrics: + MFU: 15.48% (baseline: 50.00%) + Throughput: 2.10 seq/s (baseline: 100.00) + MFU Improvement: 0.3095x + Throughput 
Improvement: 0.0210x + +⚡ Inference Metrics: + Latency: 12131.49ms (reference: 50000.00ms) + Throughput: 52.76 tokens/s (reference: 10.00) + Latency Reduction: 4.1220x + Throughput Increase: 5.2755x + Accuracy: ✓ Passed + +🧠 Reasoning Metrics: + GSM8K CoT: 55.5% (exact match, strict) + MMLU Pro: Not available + MMLU Flan CoT: Not available + Overall Reasoning Score: 0.555 + +====================================================================== +``` + ### Console Output - Training-Only Mode ``` ====================================================================== @@ -189,15 +319,17 @@ NKI-LLAMA BENCHMARK RESULTS ```json { "timestamp": "2025-01-01T12:00:00", - "mode": "combined", - "nki_kernel_score": 0.0046, + "mode": "full_integration", + "nki_kernel_score": 0.0055, "component_scores": { "training": 0.0077, - "inference": 0.0026 + "inference": 0.0026, + "reasoning": 0.555 }, "weights": { - "training": 0.4, - "inference": 0.6 + "training": 0.3, + "inference": 0.5, + "reasoning": 0.2 }, "nki_ratio": 0.1846, "detailed_breakdown": { @@ -219,6 +351,19 @@ NKI-LLAMA BENCHMARK RESULTS "achieved_latency_ms": 12131.49, "reference_throughput": 10, "achieved_throughput": 52.76 + }, + "reasoning": { + "gsm8k_cot": { + "exact_match_strict": 0.555, + "exact_match_flexible": 0.575, + "n_samples": 200 + }, + "mmlu_pro": null, + "mmlu_flan_cot_zeroshot": null, + "overall_score": 0.555, + "discovered_results": [ + "gsm8k_cot" + ] } } } @@ -237,6 +382,12 @@ NKI-LLAMA BENCHMARK RESULTS - **Increased Throughput**: How many more tokens/second (higher is better) - **NKI FLOPS**: Bonus for using NKI optimized operations +### Reasoning Score Components +- **GSM8K CoT**: Grade school math problems with chain-of-thought reasoning +- **MMLU Pro**: Massive multitask language understanding (professional level) +- **MMLU Flan CoT**: MMLU with chain-of-thought prompting +- **Overall Score**: Weighted average of available reasoning benchmark scores + ### Score Ranges - **0-1**: Poor performance, needs optimization - **1-10**: Baseline performance @@ -263,12 +414,126 @@ NKI-LLAMA BENCHMARK RESULTS - Neuron cache directory contains HLO files - Model config JSON is valid +### Reasoning Results Troubleshooting + +#### Reasoning Results Not Found + +If reasoning results are not discovered automatically: + +1. **Check the results directory structure**: + ```bash + # Expected structure: + ls -la ~/aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/ + # Should show: gsm8k_cot/, mmlu_pro/, mmlu_flan_cot_zeroshot/ + ``` + +2. **Verify model name mapping**: + ```bash + # Handler maps model paths to sanitized names + # /home/ubuntu/models/llama-3-1-8b → __home__ubuntu__models__llama-3-1-8b + find ~/aws-neuron-samples/inference-benchmarking/results -name "*llama*" -type d + ``` + +3. **Check reasoning benchmark results exist**: + ```bash + # Look for JSON result files + find ~/aws-neuron-samples/inference-benchmarking/results -name "results_*.json" | head -5 + ``` + +#### Reasoning Results Format Issues + +If reasoning results are found but parsing fails: + +1. **Validate JSON format**: + ```bash + # Check if result files are valid JSON + python -m json.tool /path/to/results_file.json > /dev/null + ``` + +2. **Check required fields**: + ```bash + # Verify the file contains expected structure + jq '.results | keys' /path/to/results_file.json + # Should show reasoning benchmark names like "gsm8k_cot" + ``` + +3. 
**Inspect score fields**: + ```bash + # Check for exact_match,strict-match scores + jq '.results.gsm8k_cot."exact_match,strict-match"' /path/to/results_file.json + ``` + +#### Running Reasoning Benchmarks + +If you need to generate reasoning results: + +1. **Set up the inference environment**: + ```bash + # Activate the inference virtual environment + source /path/to/inference/venv/bin/activate + + # Verify vLLM and dependencies are installed + python -c "import vllm; print('vLLM available')" + ``` + +2. **Run the reasoning benchmark script**: + ```bash + cd nki-llama/src/inference/scripts + ./reasoning-bench-lm-eval.sh + ``` + +3. **Monitor benchmark progress**: + ```bash + # Check server logs + tail -f ~/aws-neuron-samples/inference-benchmarking/server_*.log + + # Check for result files being created + watch "find ~/aws-neuron-samples/inference-benchmarking/results -name 'results_*.json' | wc -l" + ``` + +#### Custom Reasoning Results Path + +If using custom reasoning results location: + +1. **Specify custom path**: + ```bash + python handler.py --reasoning-results-path /custom/path/to/results + ``` + +2. **Verify directory structure**: + ```bash + # Custom path should have same structure as aws-neuron-samples + ls -la /custom/path/to/results/accuracy/mytest/ + ``` + ### Debug Mode Run with verbose flag to see detailed execution: ```bash python handler.py --verbose 2>&1 | tee debug.log ``` +### Reasoning Integration Debug +For detailed reasoning integration debugging: +```bash +# Enable verbose mode to see reasoning result discovery process +python handler.py --reasoning-results --verbose + +# Check what reasoning results are being discovered +python -c " +import json +from pathlib import Path +results_dir = Path.home() / 'aws-neuron-samples/inference-benchmarking/results/accuracy/mytest' +for benchmark_dir in results_dir.iterdir(): + if benchmark_dir.is_dir(): + print(f'Found benchmark: {benchmark_dir.name}') + for model_dir in benchmark_dir.iterdir(): + if model_dir.is_dir(): + print(f' Model: {model_dir.name}') + for result_file in model_dir.glob('results_*.json'): + print(f' Result: {result_file.name}') +" +``` + ## 📝 Input File Formats ### `benchmark_inference.json` @@ -308,6 +573,43 @@ training: learning_rate: 1e-4 ``` +### Reasoning Results JSON +```json +{ + "results": { + "gsm8k_cot": { + "alias": "gsm8k_cot", + "exact_match,strict-match": 0.555, + "exact_match_stderr,strict-match": 0.0352289710609046, + "exact_match,flexible-extract": 0.575, + "exact_match_stderr,flexible-extract": 0.03504304603451135 + } + }, + "n-samples": { + "gsm8k_cot": { + "original": 1319, + "effective": 200 + } + }, + "model_name": "/home/ubuntu/models/llama-3-1-8b", + "model_name_sanitized": "__home__ubuntu__models__llama-3-1-8b" +} +``` + +### Reasoning Results Directory Structure +``` +aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/ +├── gsm8k_cot/ +│ └── __home__ubuntu__models__llama-3-1-8b/ +│ └── results_2025-06-23T01-34-27.025863.json +├── mmlu_pro/ +│ └── __home__ubuntu__models__llama-3-1-8b/ +│ └── results_2025-06-23T01-35-15.123456.json +└── mmlu_flan_cot_zeroshot/ + └── __home__ubuntu__models__llama-3-1-8b/ + └── results_2025-06-23T01-36-42.789012.json +``` + --- **Note**: Default paths assume standard NKI-LLAMA directory structure. Adjust paths according to your setup. 
\ No newline at end of file From ca9ef498bc3e2812e3b5b936c20d62b0d49c69a0 Mon Sep 17 00:00:00 2001 From: Nithiyn Date: Thu, 26 Jun 2025 19:18:07 +0000 Subject: [PATCH 45/65] fix: nki score normalization --- src/fine-tune/configs/YAML/hf_llama3.1_8B_SFT_lora_config.yaml | 2 +- src/fine-tune/scripts/calculate_training_metrics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fine-tune/configs/YAML/hf_llama3.1_8B_SFT_lora_config.yaml b/src/fine-tune/configs/YAML/hf_llama3.1_8B_SFT_lora_config.yaml index 5f477d5..1202ee6 100644 --- a/src/fine-tune/configs/YAML/hf_llama3.1_8B_SFT_lora_config.yaml +++ b/src/fine-tune/configs/YAML/hf_llama3.1_8B_SFT_lora_config.yaml @@ -6,7 +6,7 @@ trainer: devices: 32 num_nodes: 1 max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 5000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 500 # we do not want val to run during training, hence setting it at a high number check_val_every_n_epoch: null diff --git a/src/fine-tune/scripts/calculate_training_metrics.py b/src/fine-tune/scripts/calculate_training_metrics.py index 59121ae..01a1723 100644 --- a/src/fine-tune/scripts/calculate_training_metrics.py +++ b/src/fine-tune/scripts/calculate_training_metrics.py @@ -46,7 +46,7 @@ def calculate_training_score( # Combine metrics into final score # Similar formula to inference but adapted for training metrics - final_score = mfu_improvement * throughput_improvement * loss_improvement * convergence_rate * (1 + nki_flop_ratio) + final_score = mfu_improvement * throughput_improvement * loss_improvement * convergence_rate * (0.1 + nki_flop_ratio) return { 'score': final_score, From 5c5af1346acdb28cb58717c50341b6fe2d4ab625 Mon Sep 17 00:00:00 2001 From: Nithiyn Date: Fri, 27 Jun 2025 02:35:30 +0000 Subject: [PATCH 46/65] feat: updated handler --- src/handler.py | 321 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 282 insertions(+), 39 deletions(-) diff --git a/src/handler.py b/src/handler.py index 3f9645d..8c4cddf 100644 --- a/src/handler.py +++ b/src/handler.py @@ -10,9 +10,11 @@ import sys import os from pathlib import Path -from typing import Dict, Any, Optional, Tuple +from typing import Dict, Any, Optional, Tuple, List import logging from datetime import datetime +import glob +import re class NKILlamaHandler: @@ -183,17 +185,48 @@ def calculate_inference_score(self, inference_data: Dict[str, Any], return score, breakdown + def calculate_reasoning_score(self, reasoning_metrics: Dict[str, Any]) -> float: + """ + Calculate reasoning score from reasoning evaluation results. + + The reasoning score is based on the overall accuracy across reasoning tasks, + normalized to a 0-1 scale and then scaled to match the scoring range of other components. 
+ + Args: + reasoning_metrics: Dictionary containing reasoning evaluation results + + Returns: + Float representing the reasoning score + """ + try: + # Get the overall score (already averaged across tasks) + overall_score = reasoning_metrics.get("overall_score", 0.0) + + # The overall_score is already in 0-1 range (accuracy percentage) + # Scale it to match the typical range of other components (0-10 range) + reasoning_score = overall_score * 10.0 + + self.logger.debug(f"Reasoning score calculation: {overall_score:.4f} -> {reasoning_score:.4f}") + + return reasoning_score + + except Exception as e: + self.logger.error(f"Error calculating reasoning score: {e}") + return 0.0 + def calculate_combined_score(self, training_metrics: Dict[str, Any], inference_metrics: Optional[Dict[str, Any]] = None, + reasoning_metrics: Optional[Dict[str, Any]] = None, weights: Optional[Dict[str, float]] = None, reference_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ - Calculate combined NKI-LLAMA score from training and inference metrics. - If inference metrics are not available, returns training-only score. + Calculate combined NKI-LLAMA score from training, inference, and reasoning metrics. + Handles graceful fallback when components are not available. Args: training_metrics: Training metrics including NKI analysis inference_metrics: Optional inference benchmark results + reasoning_metrics: Optional reasoning evaluation results weights: Optional weights for combining scores reference_data: Optional reference implementation data for inference scoring @@ -201,58 +234,104 @@ def calculate_combined_score(self, training_metrics: Dict[str, Any], Dictionary containing combined score and breakdown """ if weights is None: - weights = { - "training": 0.4, - "inference": 0.6 - } + # Default weights based on available components + if inference_metrics is not None and reasoning_metrics is not None: + weights = { + "training": 0.3, + "inference": 0.4, + "reasoning": 0.3 + } + elif inference_metrics is not None: + weights = { + "training": 0.4, + "inference": 0.6, + "reasoning": 0.0 + } + elif reasoning_metrics is not None: + weights = { + "training": 0.7, + "inference": 0.0, + "reasoning": 0.3 + } + else: + weights = { + "training": 1.0, + "inference": 0.0, + "reasoning": 0.0 + } # Get training score and NKI ratio training_score = training_metrics.get("training_score", 0.0) nki_ratio = training_metrics["nki_analysis"]["summary"]["overall_nki_ratio"] - # Check if inference metrics are available - if inference_metrics is None: - # Training-only mode + # Calculate reasoning score if available + reasoning_score = None + if reasoning_metrics is not None: + reasoning_score = self.calculate_reasoning_score(reasoning_metrics) + + # Determine execution mode based on available components + available_components = [] + if training_metrics is not None: + available_components.append("training") + if inference_metrics is not None: + available_components.append("inference") + if reasoning_metrics is not None: + available_components.append("reasoning") + + mode = "_".join(available_components) if len(available_components) > 1 else f"{available_components[0]}_only" + + # Handle training-only mode + if inference_metrics is None and reasoning_metrics is None: return { "combined_score": training_score, "training_score": training_score, "inference_score": None, + "reasoning_score": None, "weights": weights, "mode": "training_only", "breakdown": { "training": training_metrics.get("training_score_breakdown", {}), - "inference": 
None + "inference": None, + "reasoning": None }, "nki_ratio": nki_ratio } - # Calculate inference score with NKI ratio - inference_score, inference_breakdown = self.calculate_inference_score(inference_metrics, reference_data) - - # Update inference score with actual NKI FLOPS ratio - inference_breakdown["normalized_nki_flops"] = nki_ratio - inference_score_with_nki = ( - inference_breakdown["accuracy"] * - inference_breakdown["reduced_latency"] * - inference_breakdown["increased_throughput"] * - (1 + nki_ratio) - ) - - # Calculate weighted average - combined_score = ( - weights["training"] * training_score + - weights["inference"] * inference_score_with_nki - ) + # Calculate inference score with NKI ratio if available + inference_score_with_nki = None + inference_breakdown = None + if inference_metrics is not None: + inference_score, inference_breakdown = self.calculate_inference_score(inference_metrics, reference_data) + + # Update inference score with actual NKI FLOPS ratio + inference_breakdown["normalized_nki_flops"] = nki_ratio + inference_score_with_nki = ( + inference_breakdown["accuracy"] * + inference_breakdown["reduced_latency"] * + inference_breakdown["increased_throughput"] * + (1 + nki_ratio) + ) + + # Calculate weighted average based on available components + combined_score = 0.0 + if weights["training"] > 0: + combined_score += weights["training"] * training_score + if weights["inference"] > 0 and inference_score_with_nki is not None: + combined_score += weights["inference"] * inference_score_with_nki + if weights["reasoning"] > 0 and reasoning_score is not None: + combined_score += weights["reasoning"] * reasoning_score return { "combined_score": combined_score, "training_score": training_score, "inference_score": inference_score_with_nki, + "reasoning_score": reasoning_score, "weights": weights, - "mode": "combined", + "mode": mode, "breakdown": { "training": training_metrics.get("training_score_breakdown", {}), - "inference": inference_breakdown + "inference": inference_breakdown, + "reasoning": reasoning_metrics.get("tasks", {}) if reasoning_metrics else None }, "nki_ratio": nki_ratio } @@ -263,9 +342,11 @@ def display_results(self, results: Dict[str, Any]): print("NKI-LLAMA BENCHMARK RESULTS") print("="*70) + mode = results.get("mode", "unknown") + # Check mode and display appropriate results - if results.get("mode") == "training_only": - print("\n⚠️ TRAINING-ONLY MODE (Inference results not available)") + if mode == "training_only": + print("\n⚠️ TRAINING-ONLY MODE (Inference and reasoning results not available)") print(f"\n🏆 NKI KERNEL TRAINING SCORE: {results['training_score']:.4f}") print(f" NKI Ratio: {results['nki_ratio']:.4f}") @@ -279,20 +360,29 @@ def display_results(self, results: Dict[str, Any]): print(f" Throughput Improvement: {tb.get('throughput_improvement', 0):.4f}x") print("\n💡 Note: This score represents training performance only.") - print(" To get the full NKI-LLAMA score, run inference benchmarks and provide") - print(" the results file using --inference-results option.") + print(" To get the full NKI-LLAMA score, run inference benchmarks and reasoning") + print(" evaluation, then provide the results using --inference-results and --reasoning-results options.") else: - # Combined mode - full results + # Multi-component mode - display based on available components print(f"\n🏆 FINAL NKI-LLAMA SCORE: {results['combined_score']:.4f}") + print(f"\nExecution Mode: {mode.replace('_', ' + ').title()}") + print(f"\nScore Weights:") - print(f" Training: 
{results['weights']['training']*100:.0f}%") - print(f" Inference: {results['weights']['inference']*100:.0f}%") + if results['weights']['training'] > 0: + print(f" Training: {results['weights']['training']*100:.0f}%") + if results['weights']['inference'] > 0: + print(f" Inference: {results['weights']['inference']*100:.0f}%") + if results['weights']['reasoning'] > 0: + print(f" Reasoning: {results['weights']['reasoning']*100:.0f}%") # Component scores print(f"\n📊 Component Scores:") print(f" Training Score: {results['training_score']:.4f}") - print(f" Inference Score: {results['inference_score']:.4f}") + if results['inference_score'] is not None: + print(f" Inference Score: {results['inference_score']:.4f}") + if results['reasoning_score'] is not None: + print(f" Reasoning Score: {results['reasoning_score']:.4f}") print(f" NKI Ratio: {results['nki_ratio']:.4f}") # Training breakdown @@ -335,7 +425,160 @@ def save_results(self, results: Dict[str, Any], output_file: str): json.dump(output_data, f, indent=2) self.logger.info(f"Results saved to: {output_file}") - + + def discover_reasoning_results(self, model_path: str, + results_base_path: str = "/home/ubuntu/aws-neuron-samples/inference-benchmarking/results") -> Optional[Dict[str, Any]]: + """ + Discover reasoning results based on model ID in aws-neuron-samples/inference-benchmarking/results/. + + Args: + model_path: Path to the model (e.g., "/home/ubuntu/models/llama-3-1-8b") + results_base_path: Base path to reasoning results directory + + Returns: + Dictionary containing reasoning results or None if not found + """ + try: + # Extract model name from path and create sanitized version + model_name = os.path.basename(model_path.rstrip('/')) + sanitized_model_path = model_path.replace('/', '__') + + self.logger.info(f"Searching for reasoning results for model: {model_name}") + self.logger.debug(f"Sanitized model path: {sanitized_model_path}") + + # Search in accuracy results directory + accuracy_base = os.path.join(results_base_path, "accuracy") + + if not os.path.exists(accuracy_base): + self.logger.warning(f"Accuracy results directory not found: {accuracy_base}") + return None + + # Find all result files matching the model path pattern + search_pattern = os.path.join(accuracy_base, "**", sanitized_model_path, "results_*.json") + result_files = glob.glob(search_pattern, recursive=True) + + if not result_files: + self.logger.info(f"No reasoning result files found for model {model_name}") + self.logger.debug(f"Search pattern used: {search_pattern}") + return None + + # Use the most recent result file (based on timestamp in filename) + latest_file = max(result_files, key=lambda f: os.path.getmtime(f)) + self.logger.info(f"Found reasoning results: {latest_file}") + + # Load and parse the result file + with open(latest_file, 'r') as f: + reasoning_data = json.load(f) + + return self.parse_reasoning_results(reasoning_data, latest_file) + + except Exception as e: + self.logger.error(f"Error discovering reasoning results: {e}") + return None + + def parse_reasoning_results(self, reasoning_data: Dict[str, Any], file_path: str) -> Dict[str, Any]: + """ + Parse JSON result files to extract "exact_match,strict-match" scores. 
+ + Args: + reasoning_data: Raw reasoning results data + file_path: Path to the result file for metadata + + Returns: + Dictionary containing parsed reasoning scores + """ + try: + parsed_results = { + "source_file": file_path, + "model_name": reasoning_data.get("model_name", "unknown"), + "model_name_sanitized": reasoning_data.get("model_name_sanitized", "unknown"), + "evaluation_time": reasoning_data.get("total_evaluation_time_seconds", 0), + "tasks": {}, + "overall_score": 0.0, + "task_count": 0 + } + + # Extract scores from each task + results_section = reasoning_data.get("results", {}) + total_score = 0.0 + task_count = 0 + + for task_name, task_results in results_section.items(): + # Look for exact_match,strict-match score + strict_match_score = task_results.get("exact_match,strict-match") + flexible_extract_score = task_results.get("exact_match,flexible-extract") + + if strict_match_score is not None: + parsed_results["tasks"][task_name] = { + "exact_match_strict": strict_match_score, + "exact_match_flexible": flexible_extract_score, + "primary_score": strict_match_score # Use strict-match as primary + } + total_score += strict_match_score + task_count += 1 + + self.logger.debug(f"Task {task_name}: strict-match={strict_match_score}, flexible-extract={flexible_extract_score}") + + # Calculate overall average score + if task_count > 0: + parsed_results["overall_score"] = total_score / task_count + parsed_results["task_count"] = task_count + + self.logger.info(f"Parsed reasoning results: {task_count} tasks, overall score: {parsed_results['overall_score']:.4f}") + return parsed_results + + except Exception as e: + self.logger.error(f"Error parsing reasoning results: {e}") + return { + "source_file": file_path, + "error": str(e), + "overall_score": 0.0, + "task_count": 0, + "tasks": {} + } + + def map_model_config_to_path(self, model_config_path: str) -> str: + """ + Map model configuration to corresponding model path for reasoning result discovery. 
+ + Args: + model_config_path: Path to model configuration file + + Returns: + Inferred model path for reasoning result lookup + """ + try: + # Try to extract model information from config file + if os.path.exists(model_config_path): + with open(model_config_path, 'r') as f: + config_data = json.load(f) + + # Look for model path hints in config + model_name_or_path = config_data.get("model_name_or_path", "") + if model_name_or_path and os.path.exists(model_name_or_path): + return model_name_or_path + + # Fallback: infer from config path structure + # e.g., /path/to/8B_config_llama3-1/config.json -> llama-3-1-8b + config_dir = os.path.dirname(model_config_path) + config_dir_name = os.path.basename(config_dir) + + # Map common config directory patterns to model names + model_mapping = { + "8B_config_llama3-1": "/home/ubuntu/models/llama-3-1-8b", + "8B_config_llama3": "/home/ubuntu/models/llama-3-8b-distill", + # Add more mappings as needed + } + + if config_dir_name in model_mapping: + return model_mapping[config_dir_name] + + # Final fallback: assume standard model path + return "/home/ubuntu/models/llama-3-1-8b" + + except Exception as e: + self.logger.warning(f"Error mapping model config to path: {e}") + return "/home/ubuntu/models/llama-3-1-8b" def main(): parser = argparse.ArgumentParser( From 10ac55f10bca12a5b39abf7df5a010b86b01da39 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 3 Jul 2025 18:05:59 +0000 Subject: [PATCH 47/65] Added Neuron Profile Test --- test/unit/conftest.py | 28 +++++++++++ test/unit/test_neuron_profile.py | 86 ++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 test/unit/conftest.py create mode 100644 test/unit/test_neuron_profile.py diff --git a/test/unit/conftest.py b/test/unit/conftest.py new file mode 100644 index 0000000..cd663ae --- /dev/null +++ b/test/unit/conftest.py @@ -0,0 +1,28 @@ +import pytest + +def pytest_addoption(parser): + parser.addoption( + "--simulation-only", action="store_true", default=False, help="Run simulation only, it will run test with `simulation` marker in simulation mode" + ) + +def pytest_configure(config): + config.addinivalue_line( + "markers", "simulation: mark simulation test that can be executed without a NeuronDevice" + ) + +@pytest.fixture +def simulation_only(request): + return request.config.getoption("--simulation-only") + +def pytest_collection_modifyitems(session, config, items): + if config.getoption("--simulation-only"): + # Only run cases with `simulation marker` + result = [] + for item in items: + for marker in item.iter_markers(): + if marker.name == 'simulation': + result.append(item) + break + items.clear() + items.extend(result) + \ No newline at end of file diff --git a/test/unit/test_neuron_profile.py b/test/unit/test_neuron_profile.py new file mode 100644 index 0000000..e607705 --- /dev/null +++ b/test/unit/test_neuron_profile.py @@ -0,0 +1,86 @@ +from neuronxcc.nki import benchmark +from neuronxcc.nki import profile +import neuronxcc.nki.language as nl +import numpy as np +import pytest +import os +import shutil +import tempfile + + +WORKING_DIRECTORY = tempfile.mkdtemp() +SAVE_NEFF_NAME = "cus_file123.neff" +SAVE_TRACE_NAME = "profile-custom.ntff" +NUM_EXECS = 20 +PROFILE_NTH = 10 +JSON_REPORTS = "json_reports" + +@profile(working_directory=WORKING_DIRECTORY, save_neff_name=SAVE_NEFF_NAME, overwrite=False , save_trace_name=SAVE_TRACE_NAME, num_execs=NUM_EXECS, profile_nth=PROFILE_NTH) +def nki_tensor_tensor_add(a_tensor, b_tensor): + c_output = 
nl.ndarray(a_tensor.shape, dtype=a_tensor.dtype, buffer=nl.shared_hbm) + + a = nl.load(a_tensor) + b = nl.load(b_tensor) + + c_tile = a + b + + nl.store(c_output, value=c_tile) + + return c_output + +class TestNeuronProfile: + def _get_ntff_path(self, trace_val): + """ + Prepares ntff file name based on execution trace number + """ + if trace_val == 1: + return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}.ntff") + else: + return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}_exec_{trace_val}.ntff") + + @pytest.fixture + def traces(self): + ret = [] + if NUM_EXECS < PROFILE_NTH: + ret.append(self._get_ntff_path(PROFILE_NTH)) + else: + curr = PROFILE_NTH + while curr <= NUM_EXECS: + ret.append(self._get_ntff_path(curr)) + curr += PROFILE_NTH + return ret + + @pytest.fixture + def num_reports(self): + if NUM_EXECS < PROFILE_NTH: + return 1 + else: + return NUM_EXECS // PROFILE_NTH + + def test_output_artifacts_created(self, traces, num_reports): + # delete artifact directory, only testing non-overwrite functionality + if os.path.exists(WORKING_DIRECTORY): + shutil.rmtree(WORKING_DIRECTORY) + + # creates dummy input to invoke profile kernel + a = np.zeros([128, 1024]).astype(np.float16) + b = np.random.random_sample([128, 1024]).astype(np.float16) + + output_nki = nki_tensor_tensor_add(a, b) + + # now asserting artifacts are correctly created + assert os.path.exists(os.path.join(WORKING_DIRECTORY, SAVE_NEFF_NAME)) # neff + + for trace in traces: + assert os.path.exists(trace) # trace + + # json reports + report_dir = os.path.join(WORKING_DIRECTORY, JSON_REPORTS) + + assert os.path.exists(report_dir) # actually exists + assert len(os.listdir(report_dir)) == num_reports # report all iterations queried + + # post condition cleanup + if os.path.exists(WORKING_DIRECTORY): + shutil.rmtree(WORKING_DIRECTORY) + From 17cbb97b3bffa8d5486c422bd0ceda04e75dd8ff Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 9 Jul 2025 15:57:13 +0000 Subject: [PATCH 48/65] docs: add documentation for each path --- README.md | 632 ++++++------------------------------- deployment/deployment.yaml | 406 ++++++++++++++++++++++++ docs/complete-pipeline.md | 555 ++++++++++++++++++++++++++++++++ docs/fine-tuning.md | 362 +++++++++++++++++++++ docs/inference.md | 478 ++++++++++++++++++++++++++++ 5 files changed, 1905 insertions(+), 528 deletions(-) create mode 100644 deployment/deployment.yaml create mode 100644 docs/complete-pipeline.md create mode 100644 docs/fine-tuning.md create mode 100644 docs/inference.md diff --git a/README.md b/README.md index b2b6816..63fcd39 100644 --- a/README.md +++ b/README.md @@ -1,580 +1,156 @@ -# NKI-LLAMA: AWS Neuron Development Platform +# 🚀 NKI-LLAMA Hackathon: Getting Started Guide -A unified platform for fine-tuning, benchmarking, and serving LLaMA models on AWS Trainium and Inferentia using Neuron SDK's advanced optimization capabilities. +Welcome to the **NKI-LLAMA Hackathon**! This guide will help you navigate the documentation and choose the best path for your hackathon journey. -## 🎯 Overview +## 🎯 Welcome Hackathon Participants! -NKI-LLAMA provides a streamlined interface for the complete LLM development lifecycle on AWS Neuron hardware: +You're about to embark on an exciting challenge to optimize LLaMA models using AWS Neuron's cutting-edge NKI (Neuron Kernel Interface) technology. Whether you're focusing on training, inference, or both, we've prepared guides to help you succeed. 
-- **Fine-tune** models using NeuronX Distributed (NxD) -- **Optimize** with Neuron Kernel Interface (NKI) compilation -- **Benchmark** performance with comprehensive evaluation tools -- **Serve** models with vLLM's OpenAI-compatible API -- **Build** LLM-powered applications and agents +## 📚 Choose Your Path -## 🔄 Architecture +We've created three specialized guides based on your optimization focus: -``` -┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ ┌──────────────┐ -│ │ │ │ │ │ │ │ -│ Fine-tuning │────▶│ NKI Compilation │────▶│ vLLM Inference │────▶│ Reasoning │-> cumulative score () -│ (NxD) │ │ & Benchmarking │ │ (NxDI) │ │ Benchmark │ -│ │ │ │ │ │ │ │ -└─────────────────┘ └──────────────────┘ └─────────────────┘ └──────────────┘ - │ │ │ │ - ▼ ▼ ▼ ▼ - Trained Model NKI-Optimized API Endpoint test PT - Model Artifacts (OpenAI Compatible) model reasoning -``` - -### Key Technologies - -- **NKI (Neuron Kernel Interface)**: Custom kernel optimizations for AWS Neuron -- **NxD (NeuronX Distributed)**: Distributed training framework -- **NxDI (NeuronX Distributed Inference)**: Optimized inference runtime -- **vLLM**: High-performance serving with Neuron backend - -## 📋 Requirements - -### System Requirements -- **Instance**: trn1.32xlarge (recommended) -- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) -- **Neuron SDK**: 2.23.0 -- **Python**: 3.10 - -### SDK Components -- NeuronX Distributed Training: 1.3.0 -- NeuronX Distributed Inference: 0.3.5591 -- Neuron Compiler: 2.18.121.0 - -## 🚀 Quick Start - -### 1. Instance Setup -```bash -# Create EC2 instance -# - Type: trn1.32xlarge -# - AMI: Deep Learning AMI Neuron (Ubuntu 22.04) -# - Storage: 512GB+ recommended -``` - -### 2. Installation -```bash -# Clone repository -git clone https://github.com/aws-neuron/nki-llama.git -cd nki-llama - -# Install -chmod +x install.sh -./install.sh - -# Configure -cp .env.example .env -nano .env # Add your HF_TOKEN -# inference env vars, ensure max_model_len= seq_len -``` - -### 3. First Run -```bash -# Interactive setup -./nki-llama setup - -# Download model -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama inference download - -# Run benchmark (compiles model on first run) -tmux new -s benchmark -./nki-llama inference benchmark -``` - -## 📊 Score Calculation Workflow - -The NKI-LLAMA platform includes a comprehensive score calculation system that evaluates training performance, inference performance, and reasoning capabilities. For detailed information about the scoring system, see the [Score Calculation README](src/README.md). - -### Workflow Overview - -1. **Pre-compile Phase**: - - Execute the pre-compile job using `./nki-llama finetune compile` - - This generates a compile directory in the neuron cache - - The pre-compile job creates a log file in `logs/nki-llama_*.log` - - **Important**: Note the compile directory path from the "Pre-compile graphs" log output - - Example: `/home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e` - -2. **Training Execution**: - - Execute the pre-compile job using `./nki-llama finetune train` - - The training job creates a log file in `logs/nki-llama_*.log` - - This log contains metrics like latency, throughput, and MFU - - The benchmark inference file is always generated at: `benchmark_inference.json` - -3. 
**Reasoning Evaluation** (Optional): - - Run reasoning benchmarks using the dedicated script - - Results are automatically discovered and integrated by the handler - - Supports multiple reasoning datasets (GSM8K, MMLU, etc.) - -4. **Score Collection**: - - Once components complete, scores can be calculated using the handler - - Training-only: NKI kernel training score - - Training + Inference: Full NKI-LLAMA score - - Training + Inference + Reasoning: Complete benchmark score with reasoning metrics - -### Example Test Run - -```bash -# Step 1: Run full fine-tuning job and note the compile directory -tmux new -s training -source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -./nki-llama finetune all -# Look for "Pre-compile graphs" in output to find compile directory path - -# Step 2: Run inference benchmark (optional for full score) -tmux new -s benchmark -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama inference benchmark -# or directly run from main.py in src/inference/ for full use of flags for bucketing/context encoding and others - -# Step 3: Run reasoning evaluation (optional for complete benchmark) -tmux new -s reasoning -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh -# Results are automatically saved to aws-neuron-samples/inference-benchmarking/results/ - -# Step 4: Calculate scores -# For training-only score: -python /home/ubuntu/nki-llama/src/handler.py \ - --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ - --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ - --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ - --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ - --throughput 2.1 \ - --output benchmark_results.json \ - --training-weight 0.5 \ - --inference-weight 0.5 \ - --hw-backend trn1 \ - --per-file-scores \ - --calculate-score \ - --detailed \ - --verbose - -# For full score (with inference): -python /home/ubuntu/nki-llama/src/handler.py \ - --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ - --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ - --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ - --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ - --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ - --throughput 2.1 \ - --output benchmark_results.json \ - --training-weight 0.5 \ - --inference-weight 0.5 \ - --hw-backend trn1 \ - --per-file-scores \ - --calculate-score \ - --detailed \ - --verbose - -# For complete score (with reasoning - handler automatically discovers results): -python /home/ubuntu/nki-llama/src/handler.py \ - --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ - --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ - --log-file /home/ubuntu/nki-llama/logs/nki-llama_20250610_014432.log \ - --compile-dir /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e \ - --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ - 
--reasoning-results \ - --throughput 2.1 \ - --output benchmark_results.json \ - --training-weight 0.33 \ - --inference-weight 0.33 \ - --reasoning-weight 0.34 \ - --hw-backend trn1 \ - --per-file-scores \ - --calculate-score \ - --detailed \ - --verbose -``` - -The score calculation provides insights into: -- **Training Performance**: MFU improvement and throughput gains -- **Inference Performance**: Latency reduction and throughput increase -- **NKI Optimization**: Ratio of NKI-optimized operations - -## 💻 Command Reference - -### Core Commands - -| Command | Description | -|---------|-------------| -| `./nki-llama setup` | Interactive setup wizard | -| `./nki-llama status` | System and project status | -| `./nki-llama config` | Display configuration | -| `./nki-llama clean` | Clean artifacts and cache | - -### Fine-tuning Pipeline - -```bash -# Activate environment -source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate - -# Complete pipeline -./nki-llama finetune all - -# Or run individual steps -./nki-llama finetune deps # Install dependencies -./nki-llama finetune data # Download dataset -./nki-llama finetune model # Download base model -./nki-llama finetune convert # Convert to NxDT format -./nki-llama finetune compile # Pre-compile graphs -./nki-llama finetune train # Start training -``` - -### Benchmarking & Compilation - -```bash -# Activate environment -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate - -# Download model (if not already done) -./nki-llama inference download - -# Full benchmark with NKI compilation (default) -./nki-llama inference benchmark - -# Benchmark with options -./nki-llama inference benchmark --seq-len 1024 -./nki-llama inference benchmark --clear-cache # Clear compilation cache -./nki-llama inference benchmark --no-nki # Without NKI optimizations -``` - -#### Benchmark Modes - -| Mode | Description | Status | -|------|-------------|--------| -| `evaluate_all` | Full benchmark with NKI compilation and caching | ✅ Working | -| `evaluate_single` | Quick validation test | ⚠️ Not implemented | - -> **Note**: The `evaluate_single` mode is currently not functional. Use `evaluate_all` (default) for all benchmarking needs. - -### Reasoning Evaluation - -```bash -# Activate inference environment -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate - -# Run reasoning benchmarks (requires vLLM server) -./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh - -# The script will: -# 1. Start vLLM server with your compiled model -# 2. Run lm-eval on reasoning datasets (GSM8K, MMLU, etc.) -# 3. Save results to aws-neuron-samples/inference-benchmarking/results/ -# 4. Results are automatically discovered by handler.py -``` - -#### Reasoning Datasets - -The reasoning evaluation includes multiple datasets: -- **GSM8K CoT**: Grade school math with chain-of-thought reasoning -- **MMLU Pro**: Massive multitask language understanding (professional level) -- **MMLU Flan CoT**: MMLU with chain-of-thought prompting - -#### Result Integration +### 1. 🏋️ [Fine-tuning Only Guide](./docs/fine-tuning.md) +**Perfect for teams focusing on training optimization** +- Optimize Model FLOP Utilization (MFU) during training +- Implement NKI kernels for training operations +- Achieve high throughput with NeuronX Distributed +- **Score Focus**: Training performance metrics -The handler automatically discovers reasoning results by: -1. **Model Mapping**: Maps your model configuration to result directory paths -2. 
**Score Extraction**: Parses "exact_match,strict-match" scores from JSON files -3. **Path Discovery**: Searches `aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/` -4. **Graceful Fallback**: Works without reasoning results (training+inference only) +### 2. ⚡ [Inference with NKI Guide](./docs/inference.md) +**Ideal for teams targeting inference performance** +- Minimize latency with NKI-optimized kernels +- Maximize throughput for production serving +- Implement custom kernels for attention, normalization, and more +- **Score Focus**: Inference latency and throughput -### Inference Serving +### 3. 🎯 [Complete Pipeline Guide](./docs/complete-pipeline.md) +**For teams aiming for the highest overall score** +- Combine training and inference optimizations +- Implement shared NKI kernels across both phases +- Optional reasoning evaluation for bonus points +- **Score Focus**: Performance across all metrics -```bash -# Setup vLLM (one-time) -./nki-llama inference setup - -# Start API server -tmux new -s vllm -./nki-llama inference server - -# Server will use NKI-compiled artifacts from benchmarking -``` +## 🏃 Quick Start (5 Minutes) -### Reasoning Evaluation +### 1. Deploy Your Environment -```bash -# Activate inference environment -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate - -# Run complete reasoning evaluation -./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh +| AWS Region | Launch CloudFormation Stack | +|:-----------|:----------------------------| +| us-east-1 (N. Virginia) |Launch stack | +| us-west-2 (Oregon) |Launch stack | -# Results are automatically saved and discovered by handler.py -# No additional flags needed - handler finds results by model path -``` +**Note**: Create your SSH key pair first in EC2 → Key Pairs for easy download! -### Development Tools +### 2. Connect to Your Instance ```bash -# Start Jupyter Lab -./nki-llama jupyter +# SSH access (recommended) +ssh -i your-key.pem ubuntu@ -# Access at http://your-instance-ip:8888 +# Or use SSM (no key needed) +aws ssm start-session --target ``` -## 🛠️ Advanced Usage - -### Cache Management - -The compilation cache can accumulate failed entries. Monitor and manage it: +### 3. 
Run Setup Wizard ```bash -# Check cache status -./nki-llama status - -# Clear cache before benchmark -./nki-llama inference benchmark --clear-cache - -# Manual cache cleanup -./nki-llama clean -``` - -### Using tmux (Recommended) - -Long-running operations should use tmux to prevent disconnection issues: - -```bash -# Create session -tmux new -s session-name - -# Run command -./nki-llama [command] - -# Detach: Ctrl+B, then D - -# List sessions -tmux ls - -# Reattach -tmux attach -t session-name +cd ~/nki-llama +./nki-llama setup ``` -### Environment Management +## 🎮 Using the NKI-LLAMA CLI -Different operations require specific environments: +The repository includes a unified command-line interface that simplifies all operations: ```bash -# Fine-tuning -source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate - -# Inference & Benchmarking -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate - -# Agent Development -source ~/nki-llama/venv/bin/activate -``` - -## 📊 Monitoring & Debugging - -### System Monitoring -```bash -# Neuron device status -neuron-ls - -# Real-time usage -neuron-top +# View all available commands +./nki-llama help -# Project status +# Check system status ./nki-llama status -``` - -### Log Files -```bash -# View recent logs -ls -la logs/ -tail -f logs/nki-llama_*.log -# Benchmark results -cat logs/benchmarks/*/metadata.json +# Start your chosen workflow +./nki-llama finetune all # For training +./nki-llama inference benchmark # For inference ``` -### Common Issues +## 📊 Understanding the Scoring System -#### Compilation Cache Errors -```bash -# Symptoms: "Got a cached failed neff" errors -# Solution: -./nki-llama inference benchmark --clear-cache -``` +Your submission will be evaluated on: -#### SIGHUP Errors -```bash -# Symptoms: Process terminated during compilation -# Solution: Always use tmux for long operations -tmux new -s benchmark -``` +1. **Accuracy** ✓ - Must maintain model quality +2. **Performance Improvements** 📈 + - Training: MFU and throughput gains + - Inference: Latency reduction and throughput increase +3. **NKI Coverage** 🎯 - Percentage of operations using NKI kernels +4. 
**Reasoning (Bonus)** 🧠 - Optional evaluation on reasoning tasks -#### Memory Issues -```bash -# Monitor memory usage -neuron-top - -# Adjust parallelism if needed -export TENSOR_PARALLEL_SIZE=4 # Reduce from 8 +**Score Formula**: ``` - -#### Reasoning Evaluation Issues -```bash -# Symptoms: "No reasoning results found" in handler output -# Check if results directory exists: -ls -la ~/aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/ - -# Symptoms: vLLM server connection errors during reasoning eval -# Ensure server is running and accessible: -curl http://localhost:8080/v1/models - -# Symptoms: Reasoning script fails with transformers version error -# The script automatically installs transformers<4.50 for compatibility -# If issues persist, manually install: -pip install "transformers<4.48.2" - -# Symptoms: Handler can't find reasoning results for your model -# Check model path mapping in results directory: -# Results are stored by model path: /home/ubuntu/models/llama-3-1-8b -# becomes: __home__ubuntu__models__llama-3-1-8b -ls -la ~/aws-neuron-samples/inference-benchmarking/results/accuracy/mytest/*/ +Score = Accuracy × Performance_Gains × (1 + NKI_Coverage) ``` -## 🏗️ Project Structure +## 🛠️ Essential Resources -``` -nki-llama/ -├── nki-llama.sh # Main CLI interface -├── nki-llama.config # System configuration -├── .env # User configuration -├── install.sh # Installation script -├── README.md # This file -├── src/ -│ ├── README.md # Score calculation documentation -│ ├── handler.py # Score calculation handler -│ ├── fine-tune/ # Training pipeline -│ │ └── scripts/ # Training automation -│ └── inference/ # Inference pipeline -│ ├── main.py # Benchmark entry point -│ └── scripts/ # Inference automation -├── notebooks/ # Example notebooks -│ └── neuron_agents.ipynb -├── logs/ # Operation logs -│ └── benchmarks/ # Benchmark results -└── models/ # Downloaded models - └── compiled/ # NKI-compiled artifacts -``` +### Documentation +- [AWS Neuron SDK Docs](https://awsdocs-neuron.readthedocs-hosted.com/) +- [NKI Programming Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) +- [NKI Sample Kernels](https://github.com/aws-neuron/nki-samples) -## 🔧 Configuration +### Instance Information +- **Instance Type**: trn1.32xlarge (32 Neuron cores) +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520 +- **Pre-installed**: Neuron SDK 2.23.0, PyTorch, NeuronX -### Environment Variables (.env) +### Support +- Create issues in the repository for technical questions +- Check existing issues for common problems +- Use the `#nki-llama` channel in the hackathon Slack -```bash -# Hugging Face Access -HF_TOKEN=your_token_here - -# Model Selection -MODEL_ID=meta-llama/Meta-Llama-3-8B -MODEL_NAME=llama-3-8b - -# Hardware Configuration -TENSOR_PARALLEL_SIZE=8 -NEURON_RT_NUM_CORES=8 - -# Inference Parameters -INFERENCE_PORT=8080 -MAX_MODEL_LEN=2048 - -# Reasoning Evaluation (Optional) -# These are automatically configured by reasoning-bench-lm-eval.sh -# REASONING_DATASETS="mmlu_pro, gsm8k_cot, mmlu_flan_cot_zeroshot" -# REASONING_LIMIT=200 # Number of samples per dataset -# REASONING_TIMEOUT=3600 # Timeout in seconds -``` +## 💡 Tips for Success -## 🎓 Complete Workflow Example +1. **Start Simple**: Get the baseline working before optimizing +2. **Use tmux**: All long operations should run in tmux sessions +3. **Profile First**: Use `neuron-profile` to identify bottlenecks +4. **Iterate Quickly**: Test kernels individually before integration +5. 
**Document Everything**: Keep notes on what works and what doesn't -### Step 1: Fine-tune a Model -```bash -tmux new -s training -source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate -./nki-llama finetune all -# Note the compile directory from "Pre-compile graphs" output -# Detach: Ctrl+B, D -``` - -### Step 2: Benchmark & Compile -```bash -tmux new -s benchmark -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama inference download -./nki-llama inference benchmark -# First run compiles with NKI (10-30 minutes) -# Detach: Ctrl+B, D -``` - -### Step 3: Reasoning Evaluation (Optional) -```bash -tmux new -s reasoning -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh -# Runs GSM8K, MMLU Pro, and MMLU Flan CoT evaluations -# Results saved to aws-neuron-samples/inference-benchmarking/results/ -# Detach: Ctrl+B, D -``` +## 🏆 Winning Strategy -### Step 4: Calculate Performance Score -```bash -# After training and/or inference completes -# For complete score with reasoning (if Step 3 was run): -python /home/ubuntu/nki-llama/src/handler.py \ - --compile-dir /path/from/training/logs \ - --log-file logs/nki-llama_latest.log \ - --inference-results benchmark_inference.json \ - --reasoning-results \ - --calculate-score - -# For training+inference only (without reasoning): -python /home/ubuntu/nki-llama/src/handler.py \ - --compile-dir /path/from/training/logs \ - --log-file logs/nki-llama_latest.log \ - --inference-results benchmark_inference.json \ - --calculate-score -``` +1. **Week 1**: + - Set up environment and understand the codebase + - Get baseline metrics for comparison + - Choose your optimization path -### Step 5: Serve Model -```bash -tmux new -s vllm-server -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -./nki-llama inference server -# API available at http://localhost:8080 -# Detach: Ctrl+B, D -``` +2. **Week 2**: + - Implement core NKI kernels + - Test and validate accuracy + - Measure performance improvements -### Step 6: Build Applications -```bash -# Terminal 1: Keep server running -# Terminal 2: Development -./nki-llama jupyter -# Open browser to http://your-ip:8888 -``` +3. **Week 3**: + - Optimize and fine-tune kernels + - Maximize NKI coverage + - Prepare submission and presentation -## 📚 Additional Resources +## 🚦 Ready to Start? -- [AWS Neuron Documentation](https://awsdocs-neuron.readthedocs-hosted.com/) -- [NeuronX Distributed Training Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html) -- [NKI Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) -- [vLLM Neuron Integration](https://docs.vllm.ai/en/latest/getting_started/neuron-installation.html) +1. **Choose your path** from the three guides above +2. **Deploy your environment** using CloudFormation +3. **Run the setup wizard**: `./nki-llama setup` +4. **Start optimizing** and show us what NKI can do! -## 🐛 Known Issues +## 📝 Submission Checklist -- **First compilation**: Initial NKI compilation can take 10-30 minutes. Subsequent runs use cache. -- **Cache corruption**: If benchmark fails with cache errors, use `--clear-cache` flag. 
+Before submitting, ensure you have: +- [ ] Implemented NKI kernels with measurable improvements +- [ ] Maintained model accuracy +- [ ] Documented your approach +- [ ] Prepared performance comparison data +- [ ] Submit your score -## 📄 License +--- -© 2025 Amazon Web Services. All rights reserved. +**Good luck, and may the best optimizations win!** 🎉 -This project is provided under the AWS Customer Agreement and integrates with AWS Neuron SDK components subject to their respective licenses. \ No newline at end of file +*Remember: The key to success is balancing performance gains with code quality and maintainability. Focus on high-impact optimizations first.* \ No newline at end of file diff --git a/deployment/deployment.yaml b/deployment/deployment.yaml new file mode 100644 index 0000000..587a6a2 --- /dev/null +++ b/deployment/deployment.yaml @@ -0,0 +1,406 @@ +AWSTemplateFormatVersion: "2010-09-09" +Description: Simplified CloudFormation template to deploy NKI-Llama on EC2 + +Parameters: + KeyPairOption: + Description: Choose how to handle SSH key pair + Type: String + Default: use-existing + AllowedValues: + - use-existing + - create-new + - none + + ExistingKeyPairName: + Description: Select an existing EC2 KeyPair from the dropdown (required if KeyPairOption is 'use-existing') + Type: AWS::EC2::KeyPair::KeyName + + NewKeyPairName: + Description: Name for the new EC2 KeyPair to create (required if KeyPairOption is 'create-new') + Type: String + Default: '' + + Ec2InstanceType: + Description: EC2 instance type + Type: String + Default: trn1.32xlarge + AllowedValues: + - trn1.32xlarge + + VpcOption: + Description: Choose whether to use an existing VPC or create a new one + Type: String + Default: create-new + AllowedValues: + - create-new + - use-existing + + ExistingVpcId: + Type: AWS::EC2::VPC::Id + Description: 'Select an existing VPC from the dropdown (required if VpcOption is "use-existing")' + + ExistingSubnetId: + Type: AWS::EC2::Subnet::Id + Description: 'Select an existing Subnet from the dropdown (required if VpcOption is "use-existing")' + + UseExistingSecurityGroup: + Type: String + Description: 'Use an existing security group when using existing VPC?' 
+ Default: 'no' + AllowedValues: + - 'yes' + - 'no' + + ExistingSecurityGroupId: + Type: String + Description: 'Select an existing Security Group ID (optional - only needed if UseExistingSecurityGroup is "yes")' + Default: 'sg-placeholder' # Default placeholder value + + VpcCidrBlock: + Type: String + Description: 'CIDR block for the VPC (only used if creating new VPC)' + Default: '10.4.0.0/16' + AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$' + ConstraintDescription: 'Must be a valid IPv4 CIDR block' + + PublicSubnet1Cidr: + Type: String + Description: 'CIDR block for public subnet 1 (only used if creating new VPC)' + Default: '10.4.1.0/24' + AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$' + ConstraintDescription: 'Must be a valid IPv4 CIDR block' + + PublicSubnet2Cidr: + Type: String + Description: 'CIDR block for public subnet 2 (only used if creating new VPC)' + Default: '10.4.2.0/24' + AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$' + ConstraintDescription: 'Must be a valid IPv4 CIDR block' + +Mappings: + RegionMap: + us-east-1: + AMI: ami-0e65a95c79775d1b6 + us-west-2: + AMI: ami-0d0a2d26f80b645c2 + +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: "SSH Key Configuration" + Parameters: + - KeyPairOption + - ExistingKeyPairName + - NewKeyPairName + - Label: + default: "VPC Configuration" + Parameters: + - VpcOption + - Label: + default: "Existing VPC Settings (only used if VpcOption is 'use-existing')" + Parameters: + - ExistingVpcId + - ExistingSubnetId + - UseExistingSecurityGroup + - ExistingSecurityGroupId + - Label: + default: "New VPC Settings (only used if VpcOption is 'create-new')" + Parameters: + - VpcCidrBlock + - PublicSubnet1Cidr + - PublicSubnet2Cidr + - Label: + default: "EC2 Configuration" + Parameters: + - Ec2InstanceType + ParameterLabels: + VpcOption: + default: "VPC Option" + ExistingVpcId: + default: "Existing VPC" + ExistingSubnetId: + default: "Existing Subnet" + VpcCidrBlock: + default: "New VPC CIDR Block" + PublicSubnet1Cidr: + default: "Public Subnet 1 CIDR" + PublicSubnet2Cidr: + default: "Public Subnet 2 CIDR" + +Conditions: + CreateNewVPC: !Equals [!Ref VpcOption, 'create-new'] + UseExistingVPC: !Equals [!Ref VpcOption, 'use-existing'] + UseExistingKeyPair: !Equals [!Ref KeyPairOption, 'use-existing'] + CreateNewKeyPair: !Equals [!Ref KeyPairOption, 'create-new'] + NoKeyPair: !Equals [!Ref KeyPairOption, 'none'] + HasKeyPair: !Not [!Condition NoKeyPair] + CreateSecurityGroup: !Not [!And [!Condition UseExistingVPC, !Equals [!Ref UseExistingSecurityGroup, 'yes']]] + UseExistingSG: !And [!Condition UseExistingVPC, !Equals [!Ref UseExistingSecurityGroup, 'yes']] + HasValidExistingSecurityGroup: !And + - !Condition UseExistingSG + - !Not [!Equals [!Ref ExistingSecurityGroupId, 'sg-placeholder']] + +Rules: + ValidateNewKeyPairConfiguration: + RuleCondition: !Equals [!Ref KeyPairOption, 'create-new'] + Assertions: + - Assert: !Not [!Equals [!Ref NewKeyPairName, '']] + AssertDescription: "When creating a new key pair, you must provide a name." 
+ + ValidateExistingSecurityGroupConfiguration: + RuleCondition: !And + - !Equals [!Ref VpcOption, 'use-existing'] + - !Equals [!Ref UseExistingSecurityGroup, 'yes'] + Assertions: + - Assert: !Not [!Equals [!Ref ExistingSecurityGroupId, 'sg-placeholder']] + AssertDescription: "When using an existing security group, you must select a valid security group ID." + +Resources: + # EC2 Key Pair (if creating new) + NewKeyPair: + Type: AWS::EC2::KeyPair + Condition: CreateNewKeyPair + Properties: + KeyName: !Ref NewKeyPairName + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-keypair' + + # VPC + VPC: + Type: AWS::EC2::VPC + Condition: CreateNewVPC + Properties: + CidrBlock: !Ref VpcCidrBlock + EnableDnsHostnames: true + EnableDnsSupport: true + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-vpc' + + # Internet Gateway + InternetGateway: + Type: AWS::EC2::InternetGateway + Condition: CreateNewVPC + Properties: + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-igw' + + AttachGateway: + Type: AWS::EC2::VPCGatewayAttachment + Condition: CreateNewVPC + Properties: + VpcId: !Ref VPC + InternetGatewayId: !Ref InternetGateway + + # Public Subnets + PublicSubnet1: + Type: AWS::EC2::Subnet + Condition: CreateNewVPC + Properties: + VpcId: !Ref VPC + CidrBlock: !Ref PublicSubnet1Cidr + # Let AWS choose the AZ to avoid capacity issues + # AvailabilityZone: !Select [0, !GetAZs ''] + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-public-subnet-1' + + PublicSubnet2: + Type: AWS::EC2::Subnet + Condition: CreateNewVPC + Properties: + VpcId: !Ref VPC + CidrBlock: !Ref PublicSubnet2Cidr + # Let AWS choose the AZ to avoid capacity issues + # AvailabilityZone: !Select [1, !GetAZs ''] + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-public-subnet-2' + + # Public Route Table + PublicRouteTable: + Type: AWS::EC2::RouteTable + Condition: CreateNewVPC + Properties: + VpcId: !Ref VPC + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-public-rt' + + PublicRoute: + Type: AWS::EC2::Route + Condition: CreateNewVPC + DependsOn: AttachGateway + Properties: + RouteTableId: !Ref PublicRouteTable + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: !Ref InternetGateway + + PublicSubnet1RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: CreateNewVPC + Properties: + SubnetId: !Ref PublicSubnet1 + RouteTableId: !Ref PublicRouteTable + + PublicSubnet2RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: CreateNewVPC + Properties: + SubnetId: !Ref PublicSubnet2 + RouteTableId: !Ref PublicRouteTable + + # EC2 Security Group + SecurityGroup: + Type: AWS::EC2::SecurityGroup + Condition: CreateSecurityGroup + Properties: + GroupDescription: Security group for NKI-Llama EC2 instance + VpcId: !If [CreateNewVPC, !Ref VPC, !Ref ExistingVpcId] + SecurityGroupIngress: + - IpProtocol: tcp + FromPort: 22 + ToPort: 22 + CidrIp: 0.0.0.0/0 + Description: Allow SSH access + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-security-group' + + # IAM Role for EC2 + EC2Role: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: + - ec2.amazonaws.com + Action: + - sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-ec2-role' + + EC2InstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Path: / + Roles: + - !Ref EC2Role + + # EC2 Instance + 
EC2Instance: + Type: AWS::EC2::Instance + Properties: + IamInstanceProfile: !Ref EC2InstanceProfile + InstanceType: !Ref Ec2InstanceType + ImageId: !FindInMap [RegionMap, !Ref 'AWS::Region', AMI] + KeyName: !If + - HasKeyPair + - !If + - UseExistingKeyPair + - !Ref ExistingKeyPairName + - !Ref NewKeyPair + - !Ref 'AWS::NoValue' + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 800 + VolumeType: gp3 + NetworkInterfaces: + - AssociatePublicIpAddress: true + DeviceIndex: 0 + SubnetId: !If + - CreateNewVPC + - !Ref PublicSubnet1 + - !Ref ExistingSubnetId + GroupSet: + - !If + - HasValidExistingSecurityGroup + - !Ref ExistingSecurityGroupId + - !Ref SecurityGroup + Tags: + - Key: Name + Value: !Sub '${AWS::StackName}-nki-llama' + UserData: + Fn::Base64: !Sub | + #!/bin/bash -x + + # Update system + sudo apt-get update + sudo apt-get upgrade -y + + # Install git + sudo apt-get install git -y + + # Clone the repository with agents branch + cd /home/ubuntu + git clone https://github.com/aws-neuron/nki-llama.git + +Outputs: + VpcId: + Description: 'VPC ID' + Value: !If [CreateNewVPC, !Ref VPC, !Ref ExistingVpcId] + + PublicSubnet1Id: + Description: 'Public Subnet 1 ID' + Value: !If [CreateNewVPC, !Ref PublicSubnet1, 'Using existing subnet'] + Condition: CreateNewVPC + + PublicSubnet2Id: + Description: 'Public Subnet 2 ID' + Value: !If [CreateNewVPC, !Ref PublicSubnet2, 'Using existing subnet'] + Condition: CreateNewVPC + + EC2InstanceId: + Description: 'EC2 Instance ID' + Value: !Ref EC2Instance + + EC2PublicIP: + Description: 'EC2 Instance Public IP' + Value: !GetAtt EC2Instance.PublicIp + + EC2PrivateIP: + Description: 'EC2 Instance Private IP' + Value: !GetAtt EC2Instance.PrivateIp + + SecurityGroupId: + Description: 'Security Group ID' + Value: !If [HasValidExistingSecurityGroup, !Ref ExistingSecurityGroupId, !Ref SecurityGroup] + + SSHCommand: + Description: 'SSH connection information' + Value: !If + - NoKeyPair + - !Sub 'No SSH key configured. Use AWS Systems Manager Session Manager: aws ssm start-session --target ${EC2Instance}' + - !If + - UseExistingKeyPair + - !Sub 'ssh -i ubuntu@${EC2Instance.PublicIp}' + - !Sub 'ssh -i ubuntu@${EC2Instance.PublicIp}' + + KeyPairInfo: + Description: 'Key pair information' + Value: !If + - NoKeyPair + - 'No key pair configured - use SSM Session Manager for access' + - !If + - UseExistingKeyPair + - !Sub 'Using existing key pair: ${ExistingKeyPairName}' + - !Sub 'Created new key pair: ${NewKeyPairName} (Download private key from EC2 console within 24 hours!)' + + ImportantNote: + Description: 'IMPORTANT - Read this for new key pairs' + Value: !If + - CreateNewKeyPair + - 'CRITICAL: You must download the private key from the EC2 console immediately! Go to EC2 > Key Pairs, find your key, and download it. This is the ONLY time you can download it!' + - 'N/A' + Condition: CreateNewKeyPair \ No newline at end of file diff --git a/docs/complete-pipeline.md b/docs/complete-pipeline.md new file mode 100644 index 0000000..1ddfd10 --- /dev/null +++ b/docs/complete-pipeline.md @@ -0,0 +1,555 @@ +# Complete Pipeline Guide: Fine-tuning + Inference with NKI + +## 🎯 Overview + +This guide covers the entire NKI-LLAMA pipeline, combining fine-tuning on AWS Trainium with NKI-optimized inference. This approach maximizes your hackathon score by optimizing both training and inference performance, plus optional reasoning evaluation. 
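+
+If it helps to see how the three components roll up, the sketch below mirrors the weighting step from `src/handler.py` shown earlier in this repository. The component scores and the 0.33/0.33/0.34 split are placeholders; the handler computes the real values from your training logs and benchmark files.
+
+```python
+# Illustrative only: weighted combination of component scores.
+# Real component scores come from src/handler.py, not from this snippet.
+def combined_score(training, inference=None, reasoning=None,
+                   weights=(0.33, 0.33, 0.34)):
+    w_train, w_inf, w_reason = weights
+    score = w_train * training
+    if inference is not None:
+        score += w_inf * inference
+    if reasoning is not None:
+        score += w_reason * reasoning
+    return score
+
+# Example with made-up component scores:
+print(combined_score(training=1.8, inference=2.2, reasoning=0.6))
+```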
+ +## 📋 Prerequisites + +### Instance Requirements +- **Instance Type**: trn1.32xlarge (strongly recommended) +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520 + - **us-east-1**: `ami-0e65a95c79775d1b6` + - **us-west-2**: `ami-0d0a2d26f80b645c2` +- **Storage**: 512GB+ (800GB default in CloudFormation for models and datasets) +- **Neuron SDK**: 2.23.0 + +### Environment Management +Two virtual environments are used: +```bash +# For fine-tuning +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# For inference and benchmarking +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +## 🚀 Deployment + +Deploy the complete NKI-LLAMA environment using AWS CloudFormation with one click: + +| AWS Region | Launch CloudFormation Stack | +|:-----------|:----------------------------| +| us-east-1 (N. Virginia) |Launch stack | +| us-west-2 (Oregon) |Launch stack | + +**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs. + +### Deployment Steps + +1. **Click** on one of the "Launch stack" links above for your preferred region. + +2. **Configure the stack:** + - **Stack name**: Keep default or customize (e.g., `nki-llama-complete`) + - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) + - **ExistingKeyPairName**: Select your key from dropdown (see note below) + - **Ec2InstanceType**: Keep default `trn1.32xlarge` + - **VpcOption**: Keep default `create-new` + - Click **Next** + + **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. + +3. **Configure stack options**: Leave all values as default and click **Next** + +4. **Review and create:** + - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" + - Click **Create stack** + - Stack creation takes ~5-10 minutes + +5. 
**Access your instance:** + - Go to CloudFormation → Select your stack → **Outputs** tab + - Note the **EC2InstanceId** and **EC2PublicIP** + - Connect using your pre-downloaded key or SSM + +### Quick Access Commands + +```bash +# SSH access (with your pre-created key) +ssh -i ~/Downloads/your-key-name.pem ubuntu@ + +# SSM access (no key needed) +aws ssm start-session --target +``` + +### Post-Deployment Setup + +Once connected: + +```bash +# Repository is pre-cloned +cd ~/nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh + +# Configure environment +nano .env # Add your HF_TOKEN + +# Verify setup +neuron-ls # Check Neuron devices +``` + +### 🎮 Using the NKI-LLAMA CLI + +The repository includes a unified command-line interface that simplifies all operations: + +```bash +# Once connected to your instance +cd ~/nki-llama + +# View all available commands +./nki-llama help + +# Run interactive setup wizard +./nki-llama setup +``` + +**Key Commands:** +- `./nki-llama setup` - Interactive setup wizard with environment guidance +- `./nki-llama status` - Check system health and compilation cache +- `./nki-llama clean` - Clean artifacts and cache if needed + +**Pro Tips:** +- Always run the setup wizard first: `./nki-llama setup` +- Use `tmux` for long operations (the CLI will remind you) +- Check `./nki-llama status` if you encounter issues +- The CLI automatically guides you to the correct virtual environment + +## 🏃 Complete Workflow + +### Step 1: Initial Setup +```bash +# Clone repository +git clone https://github.com/aws-neuron/nki-llama.git +cd nki-llama + +# Install and configure +chmod +x install.sh +./install.sh + +# Setup environment +nano .env # Add HF_TOKEN and configure settings +``` + +### Step 2: Fine-tuning Phase +```bash +# Start tmux session for training +tmux new -s training + +# Activate training environment +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# Run complete fine-tuning pipeline +./nki-llama finetune all + +# IMPORTANT: Note the compile directory from output +# Example: /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e +``` + +### Step 3: Inference Optimization +```bash +# Start new tmux session for inference +tmux new -s inference + +# Switch to inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate + +# Download model if not already done +./nki-llama inference download + +# Run benchmark with NKI compilation +./nki-llama inference benchmark +``` + +### Step 4: Reasoning Evaluation (Optional) +```bash +# Start new tmux session for reasoning +tmux new -s reasoning + +# Ensure inference environment is active +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate + +# Run reasoning benchmarks +./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh +``` + +### Step 5: Calculate Combined Score +```bash +# After all components complete +python /home/ubuntu/nki-llama/src/handler.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_[YOUR_TRAINING_LOG].log \ + --compile-dir [YOUR_COMPILE_DIR_FROM_TRAINING] \ + --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ + --reasoning-results \ + --throughput 2.1 \ + --output complete_benchmark_results.json \ + --training-weight 0.33 \ + 
--inference-weight 0.33 \ + --reasoning-weight 0.34 \ + --hw-backend trn1 \ + --per-file-scores \ + --calculate-score \ + --detailed \ + --verbose +``` + +## 🔧 Integrated NKI Optimization Strategy + +### Phase 1: Training Optimizations + +#### Custom Training Kernels +```python +# Example: NKI-optimized gradient computation +@nki.jit +def nki_gradient_accumulation(gradients, accumulated_grads, scale_factor): + """ + Optimized gradient accumulation for distributed training + """ + # Efficient gradient scaling and accumulation + pass + +# Example: NKI-optimized optimizer step +@nki.jit +def nki_adam_step(params, grads, m, v, lr, beta1, beta2, eps): + """ + Fused Adam optimizer step + """ + # Implement fused parameter update + pass +``` + +#### Training-specific Optimizations +1. **Gradient All-Reduce**: Optimize collective operations +2. **Loss Computation**: Fused loss calculation +3. **Activation Checkpointing**: Memory-efficient training +4. **Mixed Precision**: FP16/BF16 optimizations + +### Phase 2: Inference Optimizations + +#### Shared Kernel Optimizations +Many kernels can be shared between training and inference: + +```python +# Shared RMSNorm implementation +@nki.jit +def nki_rmsnorm_kernel(input_tensor, weight, epsilon, training=False): + """ + RMSNorm optimized for both training and inference + """ + # Common normalization logic + normalized = compute_rmsnorm(input_tensor, weight, epsilon) + + if training: + # Store intermediate values for backward pass + save_for_backward(input_tensor, normalized) + + return normalized + +# Shared attention mechanism +@nki.jit +def nki_attention_kernel(q, k, v, mask=None, training=False): + """ + Multi-head attention for training and inference + """ + # Implement scaled dot-product attention + # with different optimizations for each mode + pass +``` + +#### Inference-specific Optimizations +1. **KV Cache Management**: Optimize cache operations +2. **Continuous Batching**: Dynamic batch processing +3. **Speculative Decoding**: Parallel token generation +4. **Quantization**: INT8/INT4 inference + +## 📊 Performance Monitoring Dashboard + +### Unified Monitoring Script +Create a monitoring script to track both phases: + +```bash +#!/bin/bash +# monitor.sh + +echo "=== NKI-LLAMA Performance Monitor ===" + +# Training metrics +if pgrep -f "finetune" > /dev/null; then + echo "📊 Training Status:" + tail -n 20 logs/nki-llama_*.log | grep -E "(loss|throughput|mfu)" +fi + +# Inference metrics +if pgrep -f "inference" > /dev/null; then + echo "📊 Inference Status:" + tail -n 10 src/inference/benchmark_inference.json +fi + +# Device utilization +echo "📊 Device Utilization:" +neuron-top -n 1 + +# Memory usage +echo "📊 Memory Status:" +free -h +``` + +## 🏗️ Architecture Best Practices + +### 1. Kernel Reusability +Design kernels that work for both training and inference: + +```python +class NKIOptimizedLayer(nn.Module): + def __init__(self, config, training_mode=True): + super().__init__() + self.training_mode = training_mode + self.config = config + + def forward(self, x): + if self.config.use_nki: + return nki_kernel(x, training=self.training_mode) + return standard_implementation(x) +``` + +### 2. Configuration Management +Unified configuration for both phases: + +```yaml +# config.yaml +model: + name: llama-3-8b + use_nki: true + +training: + batch_size: 8 + learning_rate: 5e-5 + nki_kernels: + - rmsnorm + - attention + - linear + +inference: + batch_size: 1 + max_length: 2048 + nki_kernels: + - rmsnorm + - attention + - linear + - kv_cache +``` + +### 3. 
Progressive Optimization +Start simple and add complexity: + +1. **Baseline**: Get everything working without NKI +2. **Single Kernel**: Add one NKI kernel (e.g., RMSNorm) +3. **Core Kernels**: Add attention and linear layers +4. **Advanced**: Implement fusion and specialized kernels + +## 🎯 Scoring Optimization Strategy + +### Weight Distribution +For maximum score with all three components: + +```python +# Recommended weight distribution +WEIGHTS = { + "training": 0.33, + "inference": 0.33, + "reasoning": 0.34 +} +``` + +### Focus Areas by Score Impact + +#### High Impact (>20% score improvement) +1. **Attention Optimization**: Both training and inference +2. **Linear Layer Fusion**: Combine with activation functions +3. **Memory Access Patterns**: Optimize for Neuron architecture + +#### Medium Impact (10-20% improvement) +1. **Normalization Layers**: RMSNorm, LayerNorm +2. **Gradient Operations**: Training-specific +3. **KV Cache**: Inference-specific + +#### Low Impact (<10% improvement) +1. **Activation Functions**: Unless fused with other ops +2. **Element-wise Operations**: Minor gains +3. **Data Loading**: Already optimized in framework + +## 🛠️ Development Workflow + +### Iterative Development Cycle +```bash +# 1. Implement kernel +nano src/kernels/my_nki_kernel.py + +# 2. Test in isolation +python test_kernel.py + +# 3. Integrate into model +nano src/llama.py + +# 4. Benchmark improvement +./nki-llama inference benchmark --seq-len 512 + +# 5. Profile and optimize +neuron-profile view profiles/ +``` + +### Continuous Integration Testing +```python +# test_suite.py +import unittest + +class NKIKernelTests(unittest.TestCase): + def test_rmsnorm_accuracy(self): + # Compare NKI vs PyTorch implementation + pass + + def test_attention_performance(self): + # Verify speedup + pass + + def test_training_convergence(self): + # Ensure training still converges + pass +``` + +## 📈 Results Analysis + +### Performance Tracking +Track improvements across iterations: + +```python +# track_performance.py +import json +import matplotlib.pyplot as plt + +def plot_improvements(baseline, optimized): + metrics = ['training_mfu', 'inference_throughput', 'reasoning_accuracy'] + improvements = [(optimized[m] - baseline[m]) / baseline[m] * 100 + for m in metrics] + + plt.bar(metrics, improvements) + plt.ylabel('Improvement (%)') + plt.title('NKI Optimization Impact') + plt.savefig('optimization_impact.png') +``` + +### Score Breakdown Analysis +```bash +# Analyze score components +python src/handler.py \ + --inference-results benchmark_inference.json \ + --analyze-components \ + --output score_analysis.json +``` + +## 🐛 Common Integration Issues + +### Environment Conflicts +```bash +# Issue: Package version mismatch between environments +# Solution: Use separate conda environments +conda create -n nki-training python=3.10 +conda create -n nki-inference python=3.10 +``` + +### Model Compatibility +```bash +# Issue: Model trained with one config, inference with another +# Solution: Always save and load full configuration +torch.save({ + 'model_state_dict': model.state_dict(), + 'config': config, + 'nki_kernels': enabled_kernels +}, 'checkpoint.pt') +``` + +### Cache Conflicts +```bash +# Issue: Stale compiled kernels +# Solution: Clear cache between major changes +rm -rf ~/neuron_cache/* +rm -rf ~/.cache/neuron +``` + +## 🏆 Competition Tips + +### 1. 
Time Management +- **Week 1**: Get baseline working, understand the code +- **Week 2**: Implement core NKI kernels +- **Week 3**: Optimize and fine-tune +- **Final days**: Polish, document, prepare presentation + +### 2. Collaboration Strategy +- **Frontend**: One member on training optimizations +- **Backend**: One member on inference optimizations +- **Integration**: One member on testing and benchmarking + +### 3. Documentation +Keep detailed logs of: +- Kernel implementations +- Performance improvements +- Failed attempts (for learning) +- Configuration changes + +## 📊 Example Complete Run + +```bash +#!/bin/bash +# complete_hackathon_run.sh + +# Setup +echo "🚀 Starting complete NKI-LLAMA pipeline" + +# Training phase +tmux new -d -s training +tmux -a -t training "source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate" Enter +tmux -a -t training "cd ~/nki-llama" Enter +tmux -a -t training "./nki-llama finetune all 2>&1 | tee training.log" Enter + +# Wait for training to reach a checkpoint +sleep 3600 # Adjust based on your training time + +# Inference phase +tmux new -d -s inference +tmux -a -t inference "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter +tmux -a -t inference "cd ~/nki-llama" Enter +tmux -a -t inference "./nki-llama inference benchmark 2>&1 | tee inference.log" Enter + +# Reasoning phase (optional) +tmux new -d -s reasoning +tmux -a -t reasoning "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter +tmux -a -t reasoning "./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh" Enter + +# Monitor all sessions +tmux new -s monitor +watch -n 10 './monitor.sh' +``` + +## 📚 Resources + +### Essential Documentation +- [Complete NKI Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) +- [NeuronX Distributed Training](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html) +- [NeuronX Distributed Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed-inference/index.html) + +### Example Repositories +- [NKI Samples](https://github.com/aws-neuron/nki-samples) +- [NKI Autotune](https://github.com/awslabs/nki-autotune) +- [AWS Neuron Samples](https://github.com/aws-neuron/aws-neuron-samples) + +### Tools and Utilities +- [Neuron Profiler](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html) +- [Neuron Top](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-top-user-guide.html) +- [TensorBoard Integration](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/tutorials/training/tensorboard.html) + +--- + +Remember: The key to maximizing your score is to optimize both training and inference with NKI kernels while maintaining model accuracy. Focus on the highest-impact optimizations first and ensure everything integrates smoothly. Good luck! \ No newline at end of file diff --git a/docs/fine-tuning.md b/docs/fine-tuning.md new file mode 100644 index 0000000..44cd5d5 --- /dev/null +++ b/docs/fine-tuning.md @@ -0,0 +1,362 @@ +# Fine-tuning Guide for NKI-LLAMA Hackathon + +## 🎯 Overview + +This guide focuses exclusively on fine-tuning LLaMA models on AWS Trainium using NeuronX Distributed (NxD). Perfect for participants wanting to optimize training performance and achieve high Model FLOP Utilization (MFU). 
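+
+If you want a quick sense of what MFU means in practice, the sketch below uses the common 6 × parameters × tokens/sec approximation for training FLOPs. Both that approximation and the peak-TFLOPS figure are assumptions for illustration (check the current trn1.32xlarge spec sheet); the official score is computed by `src/handler.py`, not by this snippet.
+
+```python
+# Back-of-the-envelope MFU estimate; all numbers are illustrative placeholders.
+def estimate_mfu(params: float, tokens_per_sec: float, peak_tflops: float) -> float:
+    # ~6 FLOPs per parameter per token covers the forward + backward passes
+    achieved_tflops = 6 * params * tokens_per_sec / 1e12
+    return achieved_tflops / peak_tflops
+
+# Example: 8B-parameter model at 25,000 tokens/s against a placeholder 3,400 TFLOPS peak
+print(f"MFU ~ {estimate_mfu(8e9, 25_000, 3_400):.1%}")
+```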
+ +## 📋 Prerequisites + +### Instance Requirements +- **Instance Type**: trn1.32xlarge (recommended) or trn1.2xlarge (minimum) +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520 + - **us-east-1**: `ami-0e65a95c79775d1b6` + - **us-west-2**: `ami-0d0a2d26f80b645c2` +- **Storage**: 512GB+ recommended (800GB default in CloudFormation) +- **Neuron SDK**: 2.23.0 + +### Environment Setup +```bash +# Activate the training environment +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +``` + +## 🚀 Deployment + +Deploy the NKI-LLAMA training environment using AWS CloudFormation with one click: + +| AWS Region | Launch CloudFormation Stack | +|:-----------|:----------------------------| +| us-east-1 (N. Virginia) |Launch stack | +| us-west-2 (Oregon) |Launch stack | + +**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs. + +### Deployment Steps + +1. **Click** on one of the "Launch stack" links above for your preferred region. + +2. **Configure the stack:** + - **Stack name**: Keep default or customize (e.g., `nki-llama-training`) + - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) + - **ExistingKeyPairName**: Select your key from dropdown (see note below) + - **Ec2InstanceType**: Keep default `trn1.32xlarge` + - Click **Next** + + **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. + +3. **Configure stack options**: Leave all values as default and click **Next** + +4. **Review and create:** + - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" + - Click **Create stack** + - Stack creation takes ~5-10 minutes + +5. 
**Access your instance:** + - Go to CloudFormation → Select your stack → **Outputs** tab + - Copy the **SSHCommand** value + - If you created a new key, download it from EC2 → Key Pairs + - Connect: `ssh -i ubuntu@` + +### Post-Deployment Setup + +Once connected to your instance: + +```bash +# Repository is pre-cloned +cd ~/nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh + +# Configure environment +nano .env # Add your HF_TOKEN +``` + +### 🎮 Using the NKI-LLAMA CLI + +The repository includes a unified command-line interface that simplifies all operations: + +```bash +# Once connected to your instance +cd ~/nki-llama + +# View all available commands +./nki-llama help + +# Run interactive setup wizard +./nki-llama setup +``` + +**Key Commands:** +- `./nki-llama setup` - Interactive setup wizard with environment guidance +- `./nki-llama status` - Check system health and compilation cache +- `./nki-llama clean` - Clean artifacts and cache if needed + +**Pro Tips:** +- Always run the setup wizard first: `./nki-llama setup` +- Use `tmux` for long operations (the CLI will remind you) +- Check `./nki-llama status` if you encounter issues +- The CLI automatically guides you to the correct virtual environment + +## 🚀 Quick Start + +### Step 1 (OPTIONAL): Clone and Setup + +**Please skip this step when deploying the infrastructure with cloudformation** + +```bash +# Clone the repository +git clone https://github.com/aws-neuron/nki-llama.git +cd nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh + +# Configure environment +nano .env # Add your HF_TOKEN +``` + +### Step 2: Run Complete Fine-tuning Pipeline +```bash +# Use tmux for long-running operations +tmux new -s training + +# Activate training environment +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + +# Run the complete pipeline +./nki-llama finetune all +``` + +## 📊 Detailed Fine-tuning Workflow + +### 1. Install Dependencies +```bash +./nki-llama finetune deps +``` +This installs all required Python packages and NeuronX Distributed components. + +### 2. Download Dataset +```bash +./nki-llama finetune data +``` +Downloads and prepares the training dataset (default: dolly_15k). + +### 3. Download Base Model +```bash +./nki-llama finetune model +``` +Downloads the base LLaMA model from Hugging Face (requires HF_TOKEN). + +### 4. Convert Model Format +```bash +./nki-llama finetune convert +``` +Converts the model to NeuronX Distributed Training (NxDT) format. + +### 5. Pre-compile Graphs +```bash +./nki-llama finetune compile +``` +**Important**: Note the compile directory path from the output. You'll need this for score calculation. + +Example output: +``` +Pre-compile graphs: /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e +``` + +### 6. Start Training +```bash +./nki-llama finetune train +``` +Runs the actual fine-tuning process. 
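The score-calculation step below expects a `--throughput` value and points at your training log, so it can help to pull the last reported metrics out of that log automatically. A minimal sketch is shown below; the metric names and the `name: value` line format are assumptions — adjust the regular expression and default path to whatever your run actually prints.

```python
# parse_training_log.py -- grab the latest reported metrics from a training log (assumed format)
import re
import sys

def last_metric(log_path, name):
    """Return the last 'name: value' (or 'name=value') occurrence in the log, or None."""
    pattern = re.compile(rf"{name}[:=]\s*([0-9]*\.?[0-9]+)")
    value = None
    with open(log_path) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                value = float(match.group(1))
    return value

if __name__ == "__main__":
    # Default path is illustrative; pass your actual logs/nki-llama_<timestamp>.log file
    path = sys.argv[1] if len(sys.argv) > 1 else "logs/nki-llama_latest.log"
    for metric in ("step_loss", "throughput", "mfu"):
        print(f"{metric}: {last_metric(path, metric)}")
```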
+ +## 📈 Performance Metrics + +During training, the system tracks: +- **MFU (Model FLOP Utilization)**: Target >40% for good performance +- **Throughput**: Tokens/second processed +- **Loss convergence**: Training and validation loss +- **Memory usage**: HBM utilization + +## 🎯 Score Calculation (Training Only) + +After training completes, calculate your performance score: + +```bash +python /home/ubuntu/nki-llama/src/handler.py \ + --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \ + --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \ + --log-file /home/ubuntu/nki-llama/logs/nki-llama_[YOUR_TIMESTAMP].log \ + --compile-dir [YOUR_COMPILE_DIR_FROM_STEP_5] \ + --throughput 2.1 \ + --output training_score.json \ + --training-weight 1.0 \ + --hw-backend trn1 \ + --calculate-score \ + --detailed \ + --verbose +``` + +The training score evaluates: +- **MFU improvement**: How well your optimizations improve hardware utilization +- **Throughput gains**: Training speed improvements +- **NKI optimization ratio**: Percentage of operations optimized with NKI + +## 🔧 Configuration Options + +### Training Configuration +Edit `src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml`: + +```yaml +# Model parameters +model: + model_id: "meta-llama/Meta-Llama-3-8B" + +# Training parameters +training: + batch_size: 1 + gradient_accumulation_steps: 8 + learning_rate: 5e-5 + num_train_epochs: 1 + +# Hardware configuration +distributed: + tensor_parallel_size: 8 + pipeline_parallel_size: 1 +``` + +### Environment Variables (.env) +```bash +# Hugging Face token (required) +HF_TOKEN=your_token_here + +# Model selection +MODEL_ID=meta-llama/Meta-Llama-3-8B +MODEL_NAME=llama-3-8b + +# Hardware configuration +TENSOR_PARALLEL_SIZE=8 +NEURON_RT_NUM_CORES=8 +``` + +## 🛠️ Advanced Optimizations + +### 1. Implement Custom NKI Kernels +Create optimized kernels for training operations: + +```python +# Example: Optimized attention computation +@nki_jit +def nki_attention_kernel(q, k, v, mask=None): + # Your NKI implementation here + pass +``` + +### 2. Optimize Data Loading +- Use efficient data preprocessing +- Implement prefetching +- Optimize tokenization pipeline + +### 3. Memory Optimization +- Gradient checkpointing +- Mixed precision training +- Efficient tensor layouts + +## 📊 Monitoring Training + +### Real-time Monitoring +```bash +# In a new terminal +neuron-top # Monitor device utilization + +# View training logs +tail -f logs/nki-llama_*.log +``` + +### Key Metrics to Watch +- **step_loss**: Should decrease over time +- **grad_norm**: Should remain stable +- **throughput**: Tokens/second +- **mfu**: Model FLOP Utilization + +## 🐛 Troubleshooting + +### Common Issues + +#### Out of Memory +```bash +# Reduce batch size or model parallelism +export TENSOR_PARALLEL_SIZE=4 # Instead of 8 +``` + +#### Compilation Timeout +```bash +# Increase timeout +export NEURON_COMPILE_TIMEOUT=3600 # 1 hour +``` + +#### Training Instability +- Check gradient norms +- Reduce learning rate +- Enable gradient clipping + +## 📚 Best Practices + +1. **Always use tmux** for long-running operations +2. **Save checkpoints frequently** to prevent data loss +3. **Monitor metrics** throughout training +4. **Document your optimizations** for the presentation +5. **Test incrementally** - verify each optimization works + +## 🏆 Scoring Tips + +To maximize your training-only score: + +1. 
**Focus on MFU**: Implement NKI kernels for compute-intensive operations +2. **Optimize throughput**: Reduce data loading bottlenecks +3. **Increase NKI coverage**: Replace more PyTorch ops with NKI kernels +4. **Profile extensively**: Use neuron-profile to identify bottlenecks + +## 📄 Example Training Session + +```bash +# Complete example workflow +tmux new -s hackathon-training + +# Setup +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate +cd ~/nki-llama + +# Run training +./nki-llama finetune all + +# Monitor progress (in another terminal) +tmux new -s monitoring +neuron-top + +# After completion, calculate score +python src/handler.py --config [...] --calculate-score + +# Detach from tmux: Ctrl+B, then D +``` + +## 🎯 Next Steps + +After mastering fine-tuning: +1. Document your NKI kernel implementations +2. Prepare performance comparison charts +3. Consider exploring inference optimizations (see [inference.md](./inference.md)) +4. Prepare your presentation highlighting training improvements + +## 📚 Resources + +- [NeuronX Distributed Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html) +- [NKI Training Examples](https://github.com/aws-neuron/nki-samples) +- [AWS Neuron SDK Guide](https://awsdocs-neuron.readthedocs-hosted.com/) + +--- + +Remember: Focus on achieving high MFU through effective NKI kernel implementation. Good luck with your hackathon! \ No newline at end of file diff --git a/docs/inference.md b/docs/inference.md new file mode 100644 index 0000000..768671e --- /dev/null +++ b/docs/inference.md @@ -0,0 +1,478 @@ +# Inference with NKI Compilation Guide for NKI-LLAMA Hackathon + +## 🎯 Overview + +This guide focuses on optimizing inference performance using Neuron Kernel Interface (NKI) compilation on AWS Inferentia/Trainium. Perfect for teams wanting to achieve maximum inference throughput and minimal latency without the training component. + +## 📋 Prerequisites + +### Instance Requirements +- **Instance Type**: trn1.32xlarge (recommended) or trn1.2xlarge (minimum) +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520 + - **us-east-1**: `ami-0e65a95c79775d1b6` + - **us-west-2**: `ami-0d0a2d26f80b645c2` +- **Storage**: 256GB+ recommended (800GB default in CloudFormation) +- **Neuron SDK**: 2.23.0 + +### Environment Setup +```bash +# Activate the inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +## 🚀 Deployment + +Deploy the NKI-LLAMA inference environment using AWS CloudFormation with one click: + +| AWS Region | Launch CloudFormation Stack | +|:-----------|:----------------------------| +| us-east-1 (N. Virginia) |Launch stack | +| us-west-2 (Oregon) |Launch stack | + +**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs. + +### Deployment Steps + +1. **Click** on one of the "Launch stack" links above for your preferred region. + +2. **Configure the stack:** + - **Stack name**: Keep default or customize (e.g., `nki-llama-inference`) + - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) + - **ExistingKeyPairName**: Select your key from dropdown (see note below) + - **Ec2InstanceType**: Keep default `trn1.32xlarge` + - **VpcOption**: Choose `create-new` or select existing VPC + - Click **Next** + + **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. 
Alternatively, choose `none` to use SSM Session Manager without keys. + +3. **Configure stack options**: Leave all values as default and click **Next** + +4. **Review and create:** + - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" + - Click **Create stack** + - Stack creation takes ~5-10 minutes + +5. **Access your instance:** + - Go to CloudFormation → Select your stack → **Outputs** tab + - Use **SSHCommand** for SSH access or **EC2InstanceId** for SSM + - For SSM: `aws ssm start-session --target ` + +### Post-Deployment Setup + +Once connected to your instance: + +```bash +# Repository is pre-cloned +cd ~/nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh + +# Configure environment +nano .env # Add your HF_TOKEN and inference settings + +# Activate inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +### 🎮 Using the NKI-LLAMA CLI + +The repository includes a unified command-line interface that simplifies all operations: + +```bash +# Once connected to your instance +cd ~/nki-llama + +# View all available commands +./nki-llama help + +# Run interactive setup wizard +./nki-llama setup +``` + +**Key Commands:** +- `./nki-llama setup` - Interactive setup wizard with environment guidance +- `./nki-llama status` - Check system health and compilation cache +- `./nki-llama clean` - Clean artifacts and cache if needed + +**Pro Tips:** +- Always run the setup wizard first: `./nki-llama setup` +- Use `tmux` for long operations (the CLI will remind you) +- Check `./nki-llama status` if you encounter issues +- The CLI automatically guides you to the correct virtual environment + +## 🚀 Quick Start + +### Step 1 (OPTIONAL): Clone and Setup + +**Please skip this step when deploying the infrastructure with cloudformation** + +```bash +# Clone the repository +git clone https://github.com/aws-neuron/nki-llama.git +cd nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh + +# Configure environment +nano .env # Add your HF_TOKEN and inference settings +``` + +### Step 2: Download Model +```bash +# Download the model using the CLI +./nki-llama inference download + +# Or manually download a specific model +cd ~/models +huggingface-cli download --token YOUR_TOKEN meta-llama/Meta-Llama-3-8B --local-dir /home/ubuntu/models/llama-3-8b +``` + +### Step 3: Run Benchmark with NKI Compilation +```bash +# Use tmux for long-running compilation +tmux new -s benchmark + +# Run benchmark (includes NKI compilation on first run) +./nki-llama inference benchmark +``` + +## 🔧 NKI Kernel Implementation + +### Understanding NKI Optimizations + +NKI (Neuron Kernel Interface) allows you to write custom, highly optimized kernels for Neuron devices. Key targets for optimization: + +1. **RMSNorm** - Layer normalization operations +2. **Attention mechanisms** - Multi-head attention computation +3. **Linear transformations** - Matrix multiplications +4. **Activation functions** - GELU, SiLU, etc. 
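Whichever operation you target, keep a plain framework implementation around as both the fallback path and the numerical baseline for accuracy checks. A minimal PyTorch reference for RMSNorm is sketched below; the name `standard_rmsnorm` simply matches the fallback called in the example that follows, and it is an unoptimized baseline rather than a Neuron-specific implementation.

```python
# Reference (unoptimized) RMSNorm used as a fallback and accuracy baseline
import torch

def standard_rmsnorm(hidden_states: torch.Tensor,
                     weight: torch.Tensor,
                     eps: float = 1e-6) -> torch.Tensor:
    """Plain RMSNorm: weight * x / sqrt(mean(x^2) + eps), computed over the last dim."""
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    return weight * hidden_states * torch.rsqrt(variance + eps)
```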
+ +### Example: Implementing NKI RMSNorm + +```python +import neuron_kernel_interface as nki +import torch.nn as nn + +@nki.jit +def nki_rmsnorm_kernel(input_tensor, weight, epsilon): + """ + Optimized RMSNorm implementation using NKI + """ + # Get tensor dimensions + batch_size = input_tensor.shape[0] + seq_len = input_tensor.shape[1] + hidden_size = input_tensor.shape[2] + + # Allocate output tensor + output = nki.tensor(shape=input_tensor.shape, dtype=input_tensor.dtype) + + # Compute RMS normalization + # Your NKI implementation here + # ... + + return output + +# Modify the model to use NKI kernel +class CustomRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6, nki_enabled=True): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + self.nki_enabled = nki_enabled + + def forward(self, hidden_states): + if self.nki_enabled: + return nki_rmsnorm_kernel(hidden_states, self.weight, self.variance_epsilon) + # Fallback to standard implementation + return standard_rmsnorm(hidden_states, self.weight, self.variance_epsilon) +``` + +### Implementing Additional NKI Kernels + +#### 1. Attention Kernel +```python +@nki.jit +def nki_attention_kernel(q, k, v, mask=None): + """ + Optimized attention computation + """ + # Implement scaled dot-product attention + # with NKI optimizations + pass +``` + +#### 2. Linear Layer Kernel +```python +@nki.jit +def nki_linear_kernel(input, weight, bias=None): + """ + Optimized linear transformation + """ + # Implement matrix multiplication + # with optional bias addition + pass +``` + +## 📊 Benchmarking Process + +### Running Benchmarks + +```bash +# Full benchmark with default settings +./nki-llama inference benchmark + +# Benchmark with custom sequence length +./nki-llama inference benchmark --seq-len 2048 + +# Clear cache and re-benchmark +./nki-llama inference benchmark --clear-cache +``` + +### Direct Benchmark Execution +For more control over benchmarking parameters: + +```bash +cd src/inference +python main.py \ + --mode evaluate_all \ + --seq-len 1024 \ + --batch-size 1 \ + --enable-nki \ + --num-prompts 25 +``` + +## 🎯 Score Calculation (Inference Only) + +After benchmarking completes, calculate your performance score: + +```bash +python /home/ubuntu/nki-llama/src/handler.py \ + --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \ + --output inference_score.json \ + --inference-weight 1.0 \ + --hw-backend trn1 \ + --calculate-score \ + --detailed \ + --verbose +``` + +The inference score evaluates: +- **Latency reduction**: Time to First Token (TTFT) improvement +- **Throughput increase**: Tokens/second improvement +- **NKI coverage**: Percentage of FLOPs using NKI kernels + +## 🔍 Profiling and Optimization + +### Using Neuron Profiler + +```bash +# Enable profiling during benchmark +export NEURON_PROFILE=1 +export NEURON_PROFILE_CONFIG=profile.json + +# Create profile configuration +cat > profile.json << EOF +{ + "capture": { + "enabled": true, + "output_dir": "./profiles", + "duration_ms": 10000 + } +} +EOF + +# Run benchmark with profiling +./nki-llama inference benchmark + +# Analyze results +neuron-profile view ./profiles/profile_*.neff +``` + +### Key Optimization Targets + +1. **Memory Access Patterns** + - Optimize data layout for Neuron memory hierarchy + - Minimize HBM bandwidth usage + - Use efficient tiling strategies + +2. 
**Compute Efficiency** + - Maximize tensor core utilization + - Fuse operations where possible + - Eliminate redundant computations + +3. **Pipeline Optimization** + - Overlap compute and memory operations + - Optimize kernel launch overhead + - Efficient synchronization + +## 🛠️ Advanced NKI Techniques + +### 1. Kernel Fusion +Combine multiple operations into a single kernel: + +```python +@nki.jit +def nki_fused_attention_norm(q, k, v, norm_weight, epsilon): + """ + Fused attention + normalization kernel + """ + # Compute attention + attn_output = nki_attention_kernel(q, k, v) + + # Apply normalization in the same kernel + normalized = nki_rmsnorm_kernel(attn_output, norm_weight, epsilon) + + return normalized +``` + +### 2. Tiling Strategies +Optimize for Neuron's memory hierarchy: + +```python +@nki.jit +def nki_tiled_matmul(a, b, tile_size=128): + """ + Tiled matrix multiplication for better cache usage + """ + # Implement tiled algorithm + # optimized for Neuron architecture + pass +``` + +### 3. Asynchronous Execution +Leverage Neuron's async capabilities: + +```python +# Enable async execution in your kernels +@nki.jit(async_launch=True) +def nki_async_kernel(...): + pass +``` + +## 📈 Performance Monitoring + +### Real-time Monitoring +```bash +# Monitor device utilization +neuron-top + +# Watch compilation progress +tail -f logs/nki-llama_*.log + +# Check benchmark results +cat src/inference/benchmark_inference.json | jq +``` + +### Key Metrics +- **TTFT (Time to First Token)**: Target <100ms +- **Throughput**: Target >1000 tokens/sec +- **Device Utilization**: Target >90% +- **Memory Bandwidth**: Monitor for bottlenecks + +## 🐛 Troubleshooting + +### Common Issues + +#### Compilation Cache Errors +```bash +# Clear the cache +./nki-llama clean +# or +rm -rf ~/neuron_cache/* +``` + +#### Out of Memory During Compilation +```bash +# Reduce parallelism +export NEURON_COMPILE_THREADS=4 +``` + +#### Kernel Launch Failures +- Check tensor dimensions match kernel expectations +- Verify data types are supported +- Enable debug mode: `export NEURON_DEBUG=1` + +## 🏆 Optimization Strategies + +### 1. Target Hot Spots +Focus on operations that consume most time: +- Attention computation (usually 30-40% of time) +- Linear layers (20-30%) +- Normalization (10-15%) + +### 2. Incremental Optimization +- Start with one kernel (e.g., RMSNorm) +- Validate correctness +- Measure improvement +- Move to next kernel + +## 📊 Benchmark Configuration + +### Custom Prompt Testing +Create your own prompts for testing: + +```bash +# Edit prompts.txt +nano ./data/prompts.json +``` + +### Batch Processing +Test different batch sizes: + +```bash +for batch in 1 2 4 8; do + ./nki-llama inference benchmark --batch-size $batch +done +``` + +## 🎯 Next Steps + +After mastering inference optimization: +1. Document your NKI kernel implementations +2. Create performance comparison charts +3. Consider adding fine-tuning (see [complete-pipeline.md](./complete-pipeline.md)) +4. 
Prepare reasoning benchmarks for additional scoring + +## 📚 Example Inference Session + +```bash +# Complete workflow example +tmux new -s hackathon-inference + +# Setup +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +cd ~/nki-llama + +# Download model +./nki-llama inference download + +# Run initial benchmark +./nki-llama inference benchmark + +# Implement NKI optimizations +nano src/llama.py # Add your NKI kernels + +# Re-benchmark with optimizations +./nki-llama inference benchmark --clear-cache + +# Calculate score +python src/handler.py --inference-results benchmark_inference.json --calculate-score + +# Start serving (optional) +./nki-llama inference server +``` + +## 📚 Resources + +- [NKI Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) +- [NKI Samples Repository](https://github.com/aws-neuron/nki-samples) +- [NKI Autotune Tool](https://github.com/awslabs/nki-autotune) +- [Neuron Profiler Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html) + +--- + +Remember: Focus on implementing high-performance NKI kernels for critical operations. The key to success is identifying and optimizing the bottlenecks in your model's inference pipeline! \ No newline at end of file From 8e8fad25f13ce833086e5858a94f8b9a1c0efb3d Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 9 Jul 2025 16:01:17 +0000 Subject: [PATCH 49/65] docs: add documentation for each path --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index 63fcd39..9f74a35 100644 --- a/README.md +++ b/README.md @@ -103,11 +103,6 @@ Score = Accuracy × Performance_Gains × (1 + NKI_Coverage) - **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520 - **Pre-installed**: Neuron SDK 2.23.0, PyTorch, NeuronX -### Support -- Create issues in the repository for technical questions -- Check existing issues for common problems -- Use the `#nki-llama` channel in the hackathon Slack - ## 💡 Tips for Success 1. **Start Simple**: Get the baseline working before optimizing From e0286b4268d58cddf43ab5fdb960044ba2e74733 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 9 Jul 2025 16:02:02 +0000 Subject: [PATCH 50/65] docs: add documentation for each path --- README.md | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/README.md b/README.md index 9f74a35..8d7c1fb 100644 --- a/README.md +++ b/README.md @@ -111,23 +111,6 @@ Score = Accuracy × Performance_Gains × (1 + NKI_Coverage) 4. **Iterate Quickly**: Test kernels individually before integration 5. **Document Everything**: Keep notes on what works and what doesn't -## 🏆 Winning Strategy - -1. **Week 1**: - - Set up environment and understand the codebase - - Get baseline metrics for comparison - - Choose your optimization path - -2. **Week 2**: - - Implement core NKI kernels - - Test and validate accuracy - - Measure performance improvements - -3. **Week 3**: - - Optimize and fine-tune kernels - - Maximize NKI coverage - - Prepare submission and presentation - ## 🚦 Ready to Start? 1. 
**Choose your path** from the three guides above From 473f961f8c59697591c5abe74fba67327792d638 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 9 Jul 2025 16:03:35 +0000 Subject: [PATCH 51/65] docs: add documentation for each path --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d7c1fb..cc44966 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ You're about to embark on an exciting challenge to optimize LLaMA models using A We've created three specialized guides based on your optimization focus: -### 1. 🏋️ [Fine-tuning Only Guide](./docs/fine-tuning.md) +### 1. 🏋️ [Fine-tuning Guide](./docs/fine-tuning.md) **Perfect for teams focusing on training optimization** - Optimize Model FLOP Utilization (MFU) during training - Implement NKI kernels for training operations From 4767568a95881ef259a6e9a1ef8ce33fe7341235 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Wed, 9 Jul 2025 16:05:17 +0000 Subject: [PATCH 52/65] docs: add documentation for each path --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cc44966..65eaf13 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ We've created three specialized guides based on your optimization focus: | AWS Region | Launch CloudFormation Stack | |:-----------|:----------------------------| -| us-east-1 (N. Virginia) |Launch stack | -| us-west-2 (Oregon) |Launch stack | +| us-east-1 (N. Virginia) |Launch stack | +| us-west-2 (Oregon) |Launch stack | **Note**: Create your SSH key pair first in EC2 → Key Pairs for easy download! From cf3b8e097d7720516d3753543b6d1e67b20dbd23 Mon Sep 17 00:00:00 2001 From: Arhamama-AMZ Date: Thu, 10 Jul 2025 17:23:33 +0000 Subject: [PATCH 53/65] Self-Attention Path Updated --- README.md | 25 +- docs/self-attention.md | 254 ++++ src/self-attention/README.md | 134 ++ src/self-attention/attention.py | 1171 +++++++++++++++++ .../self-attention/tests}/conftest.py | 0 .../tests/test_flash_attn_bwd.py | 460 +++++++ .../tests/test_flash_attn_fwd.py | 466 +++++++ test/unit/test_neuron_profile.py | 86 -- 8 files changed, 2501 insertions(+), 95 deletions(-) create mode 100644 docs/self-attention.md create mode 100644 src/self-attention/README.md create mode 100644 src/self-attention/attention.py rename {test/unit => src/self-attention/tests}/conftest.py (100%) create mode 100644 src/self-attention/tests/test_flash_attn_bwd.py create mode 100644 src/self-attention/tests/test_flash_attn_fwd.py delete mode 100644 test/unit/test_neuron_profile.py diff --git a/README.md b/README.md index 65eaf13..76f1bfc 100644 --- a/README.md +++ b/README.md @@ -8,23 +8,30 @@ You're about to embark on an exciting challenge to optimize LLaMA models using A ## 📚 Choose Your Path -We've created three specialized guides based on your optimization focus: +We've created four specialized guides based on your optimization focus: -### 1. 🏋️ [Fine-tuning Guide](./docs/fine-tuning.md) -**Perfect for teams focusing on training optimization** -- Optimize Model FLOP Utilization (MFU) during training -- Implement NKI kernels for training operations -- Achieve high throughput with NeuronX Distributed -- **Score Focus**: Training performance metrics +### 1. 
⚡ [Flash Self-Attention Kernel Optimization Guide](./docs/self-attention.md) +**Great for teams to get started with kernel optimizations** +- Increase performance gains running Flash forward and backward kernels +- Analyze performance and numerical computation results from implemented kernels +- Further optimize attention kernels +- **Score Focus**: Performance and Numerical Unit Tests -### 2. ⚡ [Inference with NKI Guide](./docs/inference.md) +### 2. 🚀 [Inference with NKI Guide](./docs/inference.md) **Ideal for teams targeting inference performance** - Minimize latency with NKI-optimized kernels - Maximize throughput for production serving - Implement custom kernels for attention, normalization, and more - **Score Focus**: Inference latency and throughput -### 3. 🎯 [Complete Pipeline Guide](./docs/complete-pipeline.md) +### 3. 🏋️ [Fine-tuning Guide](./docs/fine-tuning.md) +**Perfect for teams focusing on training optimization** +- Optimize Model FLOP Utilization (MFU) during training +- Implement NKI kernels for training operations +- Achieve high throughput with NeuronX Distributed +- **Score Focus**: Training performance metrics + +### 4. 🎯 [Complete Pipeline Guide](./docs/complete-pipeline.md) **For teams aiming for the highest overall score** - Combine training and inference optimizations - Implement shared NKI kernels across both phases diff --git a/docs/self-attention.md b/docs/self-attention.md new file mode 100644 index 0000000..1653220 --- /dev/null +++ b/docs/self-attention.md @@ -0,0 +1,254 @@ +# Flash Self-Attention Kernel Optimizations Guide for NKI-LLAMA Hackathon + +## 🎯 Overview + +This guide focuses on working with the sefl-attention kernels provided and optimizing them further using the Neuron Kernel Interface (NKI) compilation on AWS Inferentia/Trainium. This is a perfect starting place for teams who want to learn more about NKI and how kernel optimizations can be applied without having to train or inference components. + +### Instance Requirements +- **Instance Type**: trn1.2xlarge (minimum) or trn1.32xlarge +- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520 + - **us-east-1**: `ami-0e65a95c79775d1b6` + - **us-west-2**: `ami-0d0a2d26f80b645c2` +- **Storage**: 256GB+ recommended (800GB default in CloudFormation) +- **Neuron SDK**: 2.23.0 + +### Environment Setup +```bash +# Activate the inference environment +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +## 🚀 Deployment + +Deploy the NKI-LLAMA training environment using AWS CloudFormation with one click: + +| AWS Region | Launch CloudFormation Stack | +|:-----------|:----------------------------| +| us-east-1 (N. Virginia) |Launch stack | +| us-west-2 (Oregon) |Launch stack | + +**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs. + +### Deployment Steps + +1. **Click** on one of the "Launch stack" links above for your preferred region. + +2. **Configure the stack:** + - **Stack name**: Keep default or customize (e.g., `nki-llama-training`) + - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) + - **ExistingKeyPairName**: Select your key from dropdown (see note below) + - **Ec2InstanceType**: Default: `trn1.32xlarge` - can be changed to use `trn1.2xlarge` + - Click **Next** + + **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. + +3. 
**Configure stack options**: Leave all values as default and click **Next** + +4. **Review and create:** + - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" + - Click **Create stack** + - Stack creation takes ~5-10 minutes + +5. **Access your instance:** + - Go to CloudFormation → Select your stack → **Outputs** tab + - Copy the **SSHCommand** value + - If you created a new key, download it from EC2 → Key Pairs + - Connect: `ssh -i ubuntu@` + +### Post-Deployment Setup + +Once connected to your instance: + +```bash +# Repository is pre-cloned +cd ~/nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh +``` + +## 📁 File Overview + +### Core Test Files + +| File | Description | Purpose | +|------|-------------|---------| +| `test_flash_attn_fwd.py` | Forward pass tests | Performance + numerical validation for forward attention | +| `test_flash_attn_bwd.py` | Backward pass tests | Performance + numerical validation for backward attention | + +### Kernel Implementation Files +| File | Description | Key Functions | +|------|-------------|---------| +| `attention.py` | Core NKI Kernel implementation | `flash_fwd, flash_attn_bwd, fused_self_attn_for_SD_small_head_size` | +| `FlashConfig` | Configuration dataclass | Performance tuning parameters | + +### Kernel Functions Overview + +**`flash_fwd` - Flash Attention Forward Pass** + +- **Purpose:** Optimized forward attention computation with tiling and memory efficiency +- **Features:** Causal masking, mixed precision, dropout, GQA/MQA support, logit bias +- **Optimizations:** Memory tiling, recomputation, SBUF management +Usage: flash_fwd[batch_size, kv_heads](q, k, v, seed, config=FlashConfig(...)) + +**`flash_attn_bwd` - Flash Attention Backward Pass** + +- **Purpose:** Backward pass gradient computation for attention +- **Features:** Efficient gradient calculation for Q, K, V with recomputation +- **Optimizations:** Tiled computation, memory-efficient recomputation +- **Usage:** flash_attn_bwd[batch_size, heads](q, k, v, o, dy, lse, seed) + +**`fused_self_attn_for_SD_small_head_size` - Stable Diffusion Specialized** + +- **Purpose:** Optimized attention for small head sizes (≤128) in Stable Diffusion +- **Features:** Specialized for SD workloads, different tensor layouts +- **Usage:** fused_self_attn_for_SD_small_head_size[batch_size](q, k, v) + +## 🚀 Quick Start + +### Step 1 (OPTIONAL): Clone and Setup + +**Please skip this step when deploying the infrastructure with cloudformation** + +```bash +# Clone the repository +git clone https://github.com/aws-neuron/nki-llama.git +cd nki-llama + +# Install dependencies +chmod +x install.sh +./install.sh +``` + +### Step 2: Modify and Optimiza the Kernel Implementations + +Refer to the `attention.py` file for details on the kernel implementation. This is the main file where contestants would want to edit to implement their optimization before testing the kernels. 
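If you want a quick harness outside of pytest while iterating on `attention.py`, a sketch like the one below calls `flash_fwd` directly and compares two `seq_tile_size` choices. The shapes, dtypes, and tile sizes are illustrative assumptions, and actually executing the kernel requires a Trainium device (the simulation-marked tests in Step 3 provide a hardware-free route).

```python
# bench_flash_config.py -- illustrative harness with assumed toy shapes; run next to attention.py
import numpy as np
from attention import flash_fwd, FlashConfig

bs, kv_heads, d, seq = 1, 1, 128, 4096                          # seq must be divisible by seq_tile_size
q = np.random.randn(bs, kv_heads, d, seq).astype(np.float32)    # (bs, heads, d, seq_q)
k = np.random.randn(bs, kv_heads, d, seq).astype(np.float32)    # (bs, heads, d, seq_k)
v = np.random.randn(bs, kv_heads, seq, d).astype(np.float32)    # (bs, heads, seq_v, d), should_transpose_v=False
seed = np.array([42], dtype=np.int32)

# Smaller KV tiles reduce SBUF pressure at the cost of more loop iterations.
for tile in (512, 2048):
    cfg = FlashConfig(seq_tile_size=tile, training=True)
    out, lse = flash_fwd[bs, kv_heads](q, k, v, seed,
                                       use_causal_mask=True,
                                       mixed_precision=True,
                                       config=cfg)
    print(f"seq_tile_size={tile}: out {out.shape}, lse {lse.shape}")
```

Add your own timing around the call once it runs end to end; the unit tests in Step 3 remain the source of truth for performance and numerical accuracy.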
+ +### Step 3: Run the Flash Self-Attention Kernel Unit Tests +```bash +# Run the unit tests +cd ~/nki-llama/src/self-attention/tests + +# Run all forward and backward tests with full verbosity +pytest test_flash_attn_*.py -v -s + +# Run specific test suite +pytest test_flash_attn_fwd.py -v -s +pytest test_flash_attn_bwd.py -v -s + +# Performance tests only +pytest -k "perf" -v -s +# Numerical accuracy tests only +pytest -k "numerical" -v -s +# Simulation tests only +pytest -m simulation -v -s +``` + +## 🧪 Test Categories + +### Performance Tests (`test_*_perf`) + +**Purpose**: Validate that Flash Attention kernels meet latency requirements under various configurations. + +**What they test:** +- Execution latency across different percentiles (P50, P90, P95, P99) +- Memory usage efficiency +- Performance scaling with sequence length and batch size + +**Example output:** +``` +📈 PERFORMANCE METRICS: + P50 Latency: 12,500,000 ns (0.013s) ✅ PASS + P90 Latency: 15,200,000 ns (0.015s) ✅ PASS + P95 Latency: 16,800,000 ns (0.017s) ✅ PASS + Expected: 15,100,000,000 ns (15.100s) + +💾 MEMORY USAGE ESTIMATES: + Q tensor: 3072.00 MB + K tensor: 3072.00 MB + V tensor: 3072.00 MB + Total Input: 9216.00 MB + Est. Peak: 18432.00 MB (2x for intermediate) +``` + +### Numerical Accuracy Tests (`test_*_numerical`) + +**Purpose**: Ensure computational accuracy by comparing Flash Attention outputs against reference CPU implementations. + +**What they test:** +- Numerical correctness within tolerance (1e-2) +- Forward pass: Output tensors and LSE (Log-Sum-Exp) values +- Backward pass: Gradient tensors (dQ, dK, dV) +- Cross-validation between hardware and simulation modes + +**Example output:** +``` +📊 dQ Gradient Comparison: + Max absolute difference: 0.000847 + Mean absolute difference: 0.000234 + Mean relative error: 0.001245 + Tolerance: 0.01 + Result: ✅ PASS + +🔬 NUMERICAL VERIFICATION: + Flash Output vs Reference CPU: + ✅ Output tensor: PASS (max_diff: 0.00234) + ✅ LSE tensor: PASS (max_diff: 0.00156) + 🎉 All numerical checks passed! 
+``` + +## 🛠️ Advanced Usage + +### Custom Test Execution +``` +# Run with maximum verbosity and detailed tracebacks +pytest test_flash_attn_fwd_verbose.py -v -s --tb=long + +# Run specific parameter combinations +pytest test_flash_attn_fwd_verbose.py::TestAttention::test_flash_attn_fwd_perf[1-6-32768-32768-96-bfloat16-True-True-True-2048-3-False-87000000000] -v -s + +# Stop on first failure for debugging +pytest test_flash_attn_fwd_verbose.py -v -s -x + +# Run with timing information +pytest test_flash_attn_fwd_verbose.py -v -s --durations=10 + +# Capture output to file +pytest test_flash_attn_fwd_verbose.py -v -s > test_results.log 2>&1 +``` + +## 🔧 Troubleshooting + +### Common Issues + +#### Performance Test Failures: +- Check hardware availability and configuration +- Verify expected latency thresholds are appropriate for your hardware +- Review memory usage estimates for resource constraint + +#### Numerical Test Failures: +- Increase tolerance if needed for specific hardware characteristics +- Check tensor shapes and data types match expectations +- Verify reference implementation correctness + +#### Simulation Mode Issues: +- Ensure simulation environment is properly configured +- Check that all required kernels are available in simulation + +### Expected Test Outcomes + +**Performance Tests:** +- ✅ Pass: Latency within expected bounds +- ❌ Fail: Latency exceeds thresholds (check hardware load, configuration) +- ⚠️ xfail: Known issues (marked with ticket numbers) + +**Numerical Tests:** +- ✅ Pass: All gradients/outputs within tolerance +- ❌ Fail: Numerical differences exceed tolerance (check implementation) + +**Test Status:** +- 🎉 PASSED: All metrics within acceptable ranges +- 💥 FAILED: One or more metrics exceeded thresholds +- ⚠️ xfail: Expected failure due to known issues +- ❓ Cannot Determine: Missing metric data (API issues) diff --git a/src/self-attention/README.md b/src/self-attention/README.md new file mode 100644 index 0000000..4894cfa --- /dev/null +++ b/src/self-attention/README.md @@ -0,0 +1,134 @@ +# Self-Attention Module for NKI-LLAMA + +This module implements optimized Flash Attention kernels using the Neuron Kernel Interface (NKI) for AWS Inferentia/Trainium hardware. The implementation focuses on high-performance, memory-efficient attention mechanisms for large language models. + +## Overview + +The self-attention module provides optimized implementations of attention mechanisms that are critical for transformer-based models like LLaMA. These implementations leverage NKI to achieve high performance on AWS Neuron hardware. + +## Key Components + +### Core Files + +- **`attention.py`**: Main implementation of Flash Attention kernels using NKI + - `flash_fwd`: Forward pass implementation of Flash Attention + - `flash_attn_bwd`: Backward pass implementation for gradient computation + - `fused_self_attn_for_SD_small_head_size`: Specialized attention for small head sizes + +### Configuration + +- **`FlashConfig`**: Configuration dataclass for tuning attention performance parameters + - `seq_tile_size`: Size of sequence tiles for attention computation (default: 2048) + - `attn_core_tile_size`: Size of attention core tiles (default: 256) + - `training`: Flag to indicate training vs. 
inference mode (default: True) + - `should_transpose_v`: Flag to control V tensor layout (default: False) + - `lse_dtype`: Data type for log-sum-exp computation (default: "") + +### Tests + +- **`tests/test_flash_attn_fwd.py`**: Tests for forward pass performance and numerical accuracy +- **`tests/test_flash_attn_bwd.py`**: Tests for backward pass performance and numerical accuracy + +## Features + +- **Optimized Memory Usage**: Implements tiling strategies to efficiently use limited on-chip memory +- **Mixed Precision Support**: Configurable precision for computation vs. accumulation +- **Causal Masking**: Support for causal attention patterns used in decoder-only models +- **Dropout Support**: Configurable dropout for training stability +- **GQA/MQA Support**: Grouped Query Attention and Multi-Query Attention support +- **Performance Tuning**: Configurable parameters for different hardware configurations + +## Usage + +### Basic Usage + +```python +from attention import flash_fwd, FlashConfig + +# Configure the attention parameters +config = FlashConfig( + seq_tile_size=2048, + training=True, + should_transpose_v=False +) + +# Run the forward pass +# q: shape (bs, n_heads, d, seq_q) +# k: shape (bs, nk_heads, d, seq_k) +# v: shape (bs, nv_heads, d, seq_v) if config.should_transpose_v else (bs, nv_heads, seq_v, d) +output = flash_fwd[batch_size, kv_heads]( + q, k, v, seed, + use_causal_mask=True, + mixed_precision=True, + config=config +) +``` + +### Training Usage + +```python +from attention import flash_fwd, flash_attn_bwd, FlashConfig + +# Forward pass +output, lse = flash_fwd[batch_size, kv_heads]( + q, k, v, seed, + use_causal_mask=True, + mixed_precision=True, + config=FlashConfig(training=True) +) + +# Backward pass +dq, dk, dv = flash_attn_bwd[batch_size, heads]( + q, k, v, output, dy, lse, seed, + use_causal_mask=True, + mixed_precision=True +) +``` + +## Performance Considerations + +- **Sequence Length**: Performance scales with sequence length; use appropriate tiling +- **Head Dimensions**: Optimized for head dimensions ≤ 128 +- **Batch Size**: Consider batch size impact on memory usage and parallelism +- **Tile Sizes**: Adjust `seq_tile_size` and `attn_core_tile_size` based on model size and hardware + +## Testing + +Run the tests to validate performance and numerical accuracy: + +```bash +# Navigate to the tests directory +cd tests + +# Run all tests +pytest test_flash_attn_*.py -v -s + +# Run specific test suites +pytest test_flash_attn_fwd.py -v -s # Forward pass tests +pytest test_flash_attn_bwd.py -v -s # Backward pass tests + +# Run only performance tests +pytest -k "perf" -v -s + +# Run only numerical accuracy tests +pytest -k "numerical" -v -s + +# Run simulation tests +pytest -m simulation -v -s +``` + +## Optimization Opportunities + +Areas for potential optimization: + +1. **Memory Tiling**: Improve tiling strategies for better memory locality +2. **Instruction Scheduling**: Optimize instruction ordering for better hardware utilization +3. **Precision Control**: Fine-tune mixed precision operations for specific model requirements +4. **Specialized Kernels**: Create specialized kernels for specific sequence lengths or head sizes +5. 
**Fused Operations**: Combine operations to reduce memory transfers + +## References + +- [Flash Attention Paper](https://arxiv.org/abs/2205.14135) +- [AWS Neuron SDK Documentation](https://awsdocs-neuron.readthedocs-hosted.com/) +- [NKI Programming Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) \ No newline at end of file diff --git a/src/self-attention/attention.py b/src/self-attention/attention.py new file mode 100644 index 0000000..a6e1f9a --- /dev/null +++ b/src/self-attention/attention.py @@ -0,0 +1,1171 @@ +""" +Copyright (c) 2023, Amazon.com. All Rights Reserved + +kernels - Builtin high performance attention kernels + +""" +import numpy as np + +import neuronxcc.nki.isa as nisa +import neuronxcc.nki.language as nl +from neuronxcc import nki + +from neuronxcc.nki.language import par_dim +from dataclasses import dataclass +from functools import reduce as functools_reduce +from operator import mul as operator_mul + + +def n_elts(shape): + return functools_reduce(operator_mul, shape, 1) + + +def linearize(shape, indices): + return sum(i * (n_elts(shape[dim + 1:])) + for dim, i in enumerate(indices)) + + +def div_ceil(n, d): + return (n + d - 1) // d + + +@dataclass(frozen=True) +class FlashConfig: + """ + Config class for flash attention with default values + """ + seq_tile_size:int = 2048 + attn_core_tile_size:int = 256 + training:bool = True + should_transpose_v:bool = False + lse_dtype: str = "" + + +@nki.jit(mode='trace') +def transpose_p_local(p_local_transposed, p_local, LARGE_TILE_SZ, use_dma_transpose=False): + for i in nl.affine_range(LARGE_TILE_SZ // 512): + # Temporarily disable use_dma_tranpose by default until we stablized it + if use_dma_transpose and nisa.get_nc_version() >= nisa.nc_version.gen3: + p_local_t_tmp = nl.ndarray((par_dim(128), 512), buffer=nl.sbuf, dtype=p_local.dtype) + else: + p_local_t_tmp = nl.ndarray((par_dim(128), 512), buffer=nl.psum, dtype=np.float32) + + for j in nl.affine_range(512 // 128): + j_128_slice = nl.ds(j * 128, 128) + i_j_128_slice = nl.ds(i * 512 + j * 128, 128) + + if use_dma_transpose and nisa.get_nc_version() >= nisa.nc_version.gen3: + p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( + p_local[:, i_j_128_slice]) + else: + p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( + p_local[:, i_j_128_slice]) + + p_local_transposed[:, nl.ds(i * 512, 512)] = nl.copy( + p_local_t_tmp, dtype=p_local_transposed.dtype) + + +@nki.jit(mode='trace') +def dropout_p_local(p_local, dropout_p, dropout_p_tensor, seed_tensor, + seed_offset_base, k_r_i, REDUCTION_TILE): + B_F_SIZE = 512 + for k_d_i in nl.sequential_range(REDUCTION_TILE // B_F_SIZE): + p_local_f_slice = nl.ds(k_r_i * REDUCTION_TILE + k_d_i * B_F_SIZE, B_F_SIZE) + + offset = k_d_i + seed_offset_base + offset_seed = nl.add(seed_tensor, offset, dtype=nl.int32) + nl.random_seed(seed=offset_seed) + softmax_dropout = nl.dropout(p_local[:, p_local_f_slice], + rate=dropout_p_tensor[:, 0]) + p_local[:, p_local_f_slice] = nl.multiply( + softmax_dropout, 1 / (1 - dropout_p)) + + +@nki.jit(mode='trace') +def _flash_attention_core(q_local_tile, k, v, + q_h_per_k_h, seqlen_q, nheads, + o_buffer, l_buffer, m_buffer, + batch_id, head_id, gqa_head_idx, q_tile_idx, + local_k_large_tile_idx, + kernel_dtype, acc_type, + flash_config: FlashConfig, + use_causal_mask, initialize, + B_P_SIZE=128, B_F_SIZE=512, B_D_SIZE=128, + dropout_p=0.0, dropout_p_tensor=None, seed_tensor=None, + logit_bias_tile=None): + """ + The flash attention core function to calcualte self 
attention between a tile of q and a block of K and V. + The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF already. The block size of K and V + is defined in the seq_tile_size of the flash_config. The results are stored in the following three buffers + o_buffer: (B_P_SIZE, d) + l_buffer: (B_P_SIZE, 1) + m_buffer: (B_P_SIZE, 1) + """ + LARGE_TILE_SZ = flash_config.seq_tile_size + num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE + seqlen_k = k.shape[-1] + seq_q_num_tiles = seqlen_q // B_P_SIZE + seq_k_num_tiles = seqlen_k // B_F_SIZE + + qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), buffer=nl.sbuf, dtype=acc_type) + max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), dtype=acc_type) + + for k_i in nl.affine_range(num_k_tile_per_large_tile): + k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) + + qk_psum = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), + dtype=np.float32, buffer=nl.psum) # (128, 512) + if use_causal_mask: + multiplication_required_selection = q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * LARGE_TILE_SZ + k_i * B_F_SIZE + else: + multiplication_required_selection = True + + if multiplication_required_selection: + qk_psum[:, :] = nl.matmul(q_local_tile, k[:, k_i_b_f_slice], transpose_x=True) # (p(128), 512) + else: + qk_psum[:, :] = 0 + + if use_causal_mask: + left_diagonal_selection = q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * LARGE_TILE_SZ + (k_i + 1) * B_F_SIZE + diagonal_and_right_selection = (q_tile_idx * B_P_SIZE < local_k_large_tile_idx * LARGE_TILE_SZ + (k_i + 1) * B_F_SIZE) + right_diagonal_selection = ((q_tile_idx + 1) * B_P_SIZE <= local_k_large_tile_idx * LARGE_TILE_SZ + k_i * B_F_SIZE) + diagonal = ((q_tile_idx * B_P_SIZE < local_k_large_tile_idx * LARGE_TILE_SZ + (k_i + 1) * B_F_SIZE) & + ((q_tile_idx + 1) * B_P_SIZE > local_k_large_tile_idx * LARGE_TILE_SZ + k_i * B_F_SIZE)) + + i_q_p, i_q_f = nl.mgrid[0:B_P_SIZE, 0:B_F_SIZE] + q_pos = q_tile_idx * B_P_SIZE + i_q_p + k_pos = local_k_large_tile_idx * LARGE_TILE_SZ + k_i * B_F_SIZE + i_q_f + pred = q_pos >= k_pos + + qk_select_tmp = nl.ndarray(qk_psum.shape, dtype=qk_psum.dtype, buffer=nl.sbuf) + + if logit_bias_tile is not None: + if right_diagonal_selection: + qk_select_tmp[...] = qk_psum + + # For tiles to the right of the diagonal, do affine_select. + # Magic number -9984.0 to replace -inf similar to what Tensorizer uses + qk_res_buf[:, k_i_b_f_slice] = nisa.affine_select( + pred=pred, + on_true_tile=qk_select_tmp, on_false_value=-9984.0, dtype=acc_type) + + # For tiles on the diagonal, add logit bias and need to do affine_select. + intermediate = \ + nl.add(qk_psum, logit_bias_tile[:, k_i_b_f_slice], + dtype=acc_type, mask=diagonal) + qk_res_buf[:, k_i_b_f_slice] = nisa.affine_select( + pred=pred, + on_true_tile=intermediate, on_false_value=-9984.0, dtype=acc_type, + mask=diagonal) + + # For tiles on the left of the diagonal, just add logit bias, no select required. + qk_res_buf[:, k_i_b_f_slice] = \ + nl.add(qk_psum, logit_bias_tile[:, k_i_b_f_slice], + dtype=acc_type, mask=left_diagonal_selection) + else: + # For tiles on and to the right of the diagonal, need to do affine_select. + # Magic number -9984.0 to replace -inf similar to what Tensorizer uses + if diagonal_and_right_selection: + qk_select_tmp[...] = qk_psum + + qk_res_buf[:, k_i_b_f_slice] = nisa.affine_select( + pred=pred, + on_true_tile=qk_select_tmp, on_false_value=-9984.0, dtype=acc_type) + + # For tiles on the left of the diagonal, direct copy, no select required. 
+ qk_res_buf[:, k_i_b_f_slice] = \ + nl.copy(qk_psum, dtype=acc_type, mask=left_diagonal_selection) + else: + if logit_bias_tile is not None: + # Simply add logit bias which copies back to sbuf at the same time + qk_res_buf[:, k_i_b_f_slice] = \ + nl.add(qk_psum, logit_bias_tile[:, k_i_b_f_slice], dtype=acc_type) + else: + # Simply send psum result back to sbuf + qk_res_buf[:, k_i_b_f_slice] = nl.copy(qk_psum, dtype=acc_type) + + # Calculate max of the current tile + max_local[:, k_i] = nisa.tensor_reduce( + np.max, qk_res_buf[:, k_i_b_f_slice], axis=(1,), dtype=acc_type, + negate=False) + + max_ = nisa.tensor_reduce(np.max, max_local[:, :], axis=(1, ), + dtype=acc_type, negate=False) + + o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), dtype=o_buffer.dtype) + + if initialize: + m_buffer[:, 0] = nl.copy(max_) + m_current = max_ + else: + m_previous = nl.copy(m_buffer[:, 0]) + m_buffer[:, 0] = nl.maximum(m_previous, max_) # (128,1) + + m_current = m_buffer[:, 0] + # Compute scaling factor + alpha = nisa.activation(np.exp, m_current, bias=m_previous, scale=-1.0) + o_previous_scaled[...] = nl.multiply(o_buffer[:, :], alpha) + + p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) + REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) + + p_partial_sum = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type) + + for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): + k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) + + # dropout + if dropout_p > 0.0: + # compute exp(qk-max) + p_local[:, k_r_i_reduce_slice] = \ + nisa.activation(np.exp, qk_res_buf[:, k_r_i_reduce_slice], + bias=-1 * m_current, scale=1.0, + dtype=kernel_dtype) + + seed_offset_base = k_r_i * (REDUCTION_TILE // B_F_SIZE) \ + + local_k_large_tile_idx * (LARGE_TILE_SZ // B_F_SIZE) \ + + q_tile_idx * seq_k_num_tiles \ + + (head_id * q_h_per_k_h + gqa_head_idx) * seq_k_num_tiles * seq_q_num_tiles \ + + batch_id * nheads * seq_k_num_tiles * seq_q_num_tiles + + dropout_p_local(p_local=p_local, dropout_p=dropout_p, + dropout_p_tensor=dropout_p_tensor, seed_tensor=seed_tensor, + seed_offset_base=seed_offset_base, k_r_i=k_r_i, + REDUCTION_TILE=REDUCTION_TILE) + + # Compute partial row-tile sum of exp(qk-max)) + # FIXME: Use activation accumulate and accumulate over k_r_i loop? + p_partial_sum[:, k_r_i] = nl.sum(p_local[:, k_r_i_reduce_slice], + axis=1, dtype=acc_type) + else: + # compute exp(qk-max) + # Compute partial row-tile sum of exp(qk-max)) + # FIXME: Use activation accumulate to accumulate over k_r_i loop? 
+ p_local[:, k_r_i_reduce_slice] = \ + nisa.activation_reduce(np.exp, qk_res_buf[:, k_r_i_reduce_slice], + bias=-1 * m_current, scale=1.0, + reduce_op=nl.add, reduce_res=p_partial_sum[:, k_r_i], + dtype=kernel_dtype) + + ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type) + + p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) + transpose_p_local(p_local_transposed=p_local_transposed, p_local=p_local, + LARGE_TILE_SZ=LARGE_TILE_SZ) + + pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE), dtype=np.float32, + buffer=nl.psum, lazy_initialization=True) + for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): + pv_psum[:, :] += nl.matmul(p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], + v[k_i, :, :], transpose_x=True) # (128, 128) (p(Br), d) + + if initialize: + o_buffer[:, :] = nl.copy(pv_psum[:, :]) + l_buffer[:, 0] = nl.add(nl.log(ps), max_) + else: + o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum) + + exp = nisa.activation(nl.exp, m_current, bias=l_buffer[:, 0], scale=-1.0) + l_buffer[:, 0] = nl.add(m_current, nisa.activation(nl.log, exp, bias=ps)) + + +@nki.jit(mode='trace') +def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config): + LARGE_TILE_SZ = config.seq_tile_size + B_P_SIZE = 128 + + if not config.should_transpose_v: + cur_v_tile[v_i, :, :] = nl.load( + v_hbm_tile[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :], + dtype=cur_v_tile.dtype) + return + + if nisa.get_nc_version() >= nisa.nc_version.gen3: + cur_v_tile_transposed = nisa.dma_transpose( + v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)]) + cur_v_tile[v_i, :, :] = nisa.tensor_copy(cur_v_tile_transposed, + dtype=cur_v_tile.dtype) + return + + cur_v_tile[v_i, :, :] = nl.load_transpose2d( + v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)], + dtype=cur_v_tile.dtype) + + + +@nki.jit +def flash_fwd(q, k, v, seed, logit_bias=None, + softmax_scale=None, + use_causal_mask=True, + mixed_precision=True, + dropout_p=0.0, config=None): + """ + Flash Attention Forward kernel + + IO tensor layouts: + - q: shape (bs, n_heads, d, seq_q) + - k: shape (bs, nk_heads, d, seq_k) + - v: shape (bs, nv_heads, d, seq_v) if config.should_transpose_v else (bs, nv_heads, seq_v, d) + - seed: shape (1,) + - logit_bias: shape (bs, n_heads, seq_q, seq_k) + - o: shape (bs, n_heads, seq_q, d) + - lse: shape (bs, n_heads, nl.tile_size.pmax, seq // nl.tile_size.pmax) if training else None + - This kernel requires seq_k == seq_v + + IO tensor dtypes: + - This kernel assumes all IO tensors have the same dtype + - If mixed_precision is True, then all Tensor Engine operation will be performed in + bfloat16 and accumulation will be performed in float32. Otherwise the intermediates + will be in the same type as the inputs. 
+ + Compile-time Constants: + - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` + - mixed_precision: flag to set non-matmul ops in fp32 precision, default is set to `true`, if false, we use same precision as input types + - causal_mask: flag to set causal masking + - config: Instance of :class:`nki.kernels.attention.FlashConfig` with Performance config parameters for flash attention with default values + seq_tile_size: `default=2048`, size of the kv tile size for attention computation reduction + training: bool to indicate training vs inference `default=True` + + Performance Notes: + For better performance, the kernel is tiled to be of size `config.seq_tile_size`, and Flash attention math techniques are applied in unit + of `config.seq_tile_size`. Seqlen that is not divisible by `config.seq_tile_size` is not supported at the moment. + + For large seqlen, `o_buffer` will overflow the statebuf. the kernel is tile `o_buffer` based on the value of `config.attn_core_tile_size`. + This is a tradeoff between memory usage and performance. The default value of `config.attn_core_tile_size` is 256, which means the `o_buffer` + will roughly take half of the statebuf. The computes are also tiled accordingly. DMA will be rematerialized + `seqlen_q // B_P_SIZE // attn_core_tile_size times`. + + + + GQA support Notes: + the spmd kernel for launching kernel should be on kv_heads instead of nheads + + Example usage: + MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d] + usage: `flash_fwd[b, h](q, k, v, ...)` + GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d] + usage: `flash_fwd[b, kv_h](q, k, v, ...)` + """ + config = config or FlashConfig() + B_F_SIZE=512 + B_P_SIZE=128 + b, h, d, seqlen_q = q.shape + B_D_SIZE = d + _, k_h, _, seqlen_k = k.shape + if config.should_transpose_v: + assert tuple(v.shape) == (b, k_h, d, seqlen_k), f"Expect shape of V to be {(b, k_h, d, seqlen_k)} (batch, heads, d_head, seqlen_k) but got {v.shape}" + assert tuple(k.shape) == (b, k_h, d, seqlen_k), f"Expect shape of K to be {(b, k_h, d, seqlen_k)} (batch, heads, d_head, seqlen_k) but got {k.shape}" + else: + assert tuple(v.shape) == (b, k_h, seqlen_k, d), f"Expect shape of V to be {(b, k_h, seqlen_k, d)} (batch, heads, seqlen_k, d_head) but got {v.shape}" + assert tuple(k.shape) == (b, k_h, d, seqlen_k), f"Expect shape of K to be {(b, k_h, d, seqlen_k)} (batch, heads, d_head, seqlen_k) but got {k.shape}" + assert d <= 128, f" we do not support head_dim > 128, got head dim {d}" + kernel_dtype = nl.bfloat16 if mixed_precision else q.dtype + acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype + + o = nl.ndarray((b, h, seqlen_q, d), dtype=q.dtype, buffer=nl.shared_hbm) + if config.training: + if config.lse_dtype: + lse_dtype = getattr(nl, config.lse_dtype) + else: + lse_dtype = acc_type + lse = nl.ndarray((b, h, nl.tile_size.pmax, seqlen_q // nl.tile_size.pmax), + dtype=lse_dtype, buffer=nl.shared_hbm) + else: + lse = None + + assert nl.program_ndim() == 2,\ + f'Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!' + batch_id = nl.program_id(axis=0) + head_id = nl.program_id(axis=1) + + softmax_scale = softmax_scale or (1.0 / (d ** 0.5)) + + n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine + + LARGE_TILE_SZ = config.seq_tile_size + attn_core_tile_size = config.attn_core_tile_size + + # FIXME: Add masking for different seqlen values. 
+  assert config.seq_tile_size >= 512, f" seq tile_size {config.seq_tile_size} cannot be less than 512"
+  assert seqlen_k % LARGE_TILE_SZ == 0, f"Need seqlen_k to be divisible by {LARGE_TILE_SZ} but got {seqlen_k}"
+  num_large_k_tile = seqlen_k // LARGE_TILE_SZ
+
+  # inference flag, check if lse is none
+  inference = not config.training
+  if inference:
+    assert lse is None, "lse should be none for inference"
+    assert seed is None, f"seed should be None for inference, but got {seed}"
+    assert dropout_p==0.0, f"dropout should be 0.0 for inference but got {dropout_p}"
+  else:
+    assert lse is not None, "lse should not be none for training"
+  q_h_per_k_h = h // k_h
+
+  if dropout_p > 0.0 and not inference:
+    seed_local = nl.load(seed[0])
+    # TODO: Remove this once the dropout supports scale prob
+    dropout_p_tensor = nl.full((B_P_SIZE, 1), fill_value=dropout_p, dtype=np.float32)
+  else:
+    dropout_p_tensor = None
+    seed_local = None
+
+  if logit_bias is not None:
+    b_logit_bias, h_logit_bias, _, _ = logit_bias.shape
+    assert b_logit_bias == 1 and h_logit_bias == 1, "only support broadcasting logit_bias with batch 1, n_heads 1"
+
+  n_remat = div_ceil(n_tile_q, attn_core_tile_size)
+  attn_core_tile_size = min(n_tile_q, attn_core_tile_size)
+
+  for i_q_h in nl.affine_range(q_h_per_k_h):
+    # =============== Global Flash Attention accumulators ====================== #
+    l_buffer = nl.zeros((par_dim(B_P_SIZE), n_tile_q), dtype=acc_type,
+                        buffer=nl.sbuf, lazy_initialization=True)
+    # =============== Global Flash Attention accumulators END ================== #
+
+    for i0 in nl.sequential_range(n_remat):
+      # =============== Global Flash Attention accumulators ====================== #
+      o_buffer = nl.zeros((attn_core_tile_size, par_dim(B_P_SIZE), d), dtype=acc_type,
+                          buffer=nl.sbuf, lazy_initialization=True)
+      m_buffer = nl.zeros((attn_core_tile_size, par_dim(B_P_SIZE), 1), dtype=acc_type,
+                          buffer=nl.sbuf, lazy_initialization=True)
+      # =============== Global Flash Attention accumulators END ================== #
+
+      for j in nl.sequential_range(0, num_large_k_tile):
+        cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype)
+        cur_v_tile = nl.ndarray((LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), dtype=kernel_dtype)
+
+        cur_k_tile[:, :] = nl.load(k[batch_id, head_id, :, nl.ds(j*LARGE_TILE_SZ, LARGE_TILE_SZ)])
+
+        load_tile_size = B_P_SIZE
+
+        v_hbm_tile = v[batch_id, head_id]
+        for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size):
+          load_v_tile(v_hbm_tile=v_hbm_tile, cur_v_tile=cur_v_tile, j=j, v_i=v_i,
+                      config=config)
+
+        for i1 in nl.affine_range(attn_core_tile_size):
+          i = i0 * attn_core_tile_size + i1
+          # masks are used to apply computation only to the lower half of the matrix,
+          # which reduces the arithmetic intensity by half.
+          # forward_mask implies initialize, i.e.
if forward_mask is false, initialize will + # be false as well + if use_causal_mask: + forward_mask = i * B_P_SIZE >= j * LARGE_TILE_SZ + else: + forward_mask = True + + if (i < n_tile_q) & forward_mask: + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE),dtype=kernel_dtype) + q_hbm_tile = q[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load(q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + + logit_bias_tile = None + if logit_bias is not None: + logit_bias_tile = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) + logit_bias_tile[:, :] = nl.load( + logit_bias[0, 0, nl.ds(i * B_P_SIZE, B_P_SIZE), + nl.ds(j * LARGE_TILE_SZ, LARGE_TILE_SZ)]) + + _flash_attention_core(q_local_tile=q_tile, k=cur_k_tile, v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, seqlen_q=seqlen_q, nheads=h, + o_buffer=o_buffer[i1], l_buffer=l_buffer[:, i], m_buffer=m_buffer[i1], + batch_id=batch_id, head_id=head_id, + gqa_head_idx=i_q_h, q_tile_idx=i, local_k_large_tile_idx=j, + kernel_dtype=kernel_dtype, acc_type=acc_type, + flash_config=config, use_causal_mask=use_causal_mask, + initialize=j == 0, + B_P_SIZE=B_P_SIZE, B_F_SIZE=B_F_SIZE, B_D_SIZE=B_D_SIZE, + dropout_p=dropout_p, dropout_p_tensor=dropout_p_tensor, + seed_tensor=seed_local, logit_bias_tile=logit_bias_tile) + + # -------- write output to buffer on HBM ------------ # + for i1 in nl.affine_range(attn_core_tile_size): + i = i0 * attn_core_tile_size + i1 + + if i < n_tile_q: + exp = nisa.activation(np.exp, l_buffer[:, i], bias=m_buffer[i1, :, :], + scale=-1.0) + out = nl.multiply(o_buffer[i1, :, :], exp, + dtype=kernel_dtype) + + nl.store(o[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i*B_P_SIZE, B_P_SIZE), :], out) + + if not inference: + nl.store(lse[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], l_buffer[:, :]) + + if config.training: + return o, lse + + return o + + + +@nki.jit +def flash_attn_bwd( + q_ref, k_ref, v_ref, o_ref, + dy_ref, + lse_ref, + seed_ref, + logit_bias_ref=None, + use_causal_mask=False, + mixed_precision=False, + dropout_p=0.0, + softmax_scale=None, +): + """ + Flash attention backward kernel. Compute the backward gradients. + + IO tensor layouts: + - q_ref: shape (bs, nheads, head_size, seq) + - k_ref: shape (bs, nheads, head_size, seq) + - v_ref: shape (bs, nheads, head_size, seq) + - o_ref: shape (bs, nheads, head_size, seq) + - dy_ref: shape (bs, nheads, head_size, seq) + - lse_ref: shape (bs, nheads, nl.tile_size.pmax, seq // nl.tile_size.pmax) + - seed_ref: shape (1,) + - logit_bias_ref: shape (bs, n_heads, seq_q, seq_k) + - out_dq_ref: shape (bs, nheads, head_size, seq) + - out_dk_ref: shape (bs, nheads, head_size, seq) + - out_dv_ref: shape (bs, nheads, head_size, seq) + + Detailed steps: + 1. D = rowsum(dO ◦ O) (pointwise multiply) + + 2. Recompute (softmax(Q^T@K + logic_bias)) + + 2.1 Q^T@K + 2.2 Scale the QK score + 2.3 Apply causal mask and add logit_bias + 2.4 softmax + + 3. Compute the gradients of y = score @ V with respect to the loss + + 4. Compute the gradients of y = softmax(x) + + 5. 
Compute the gradients of Q^T@K
+
+      5.1 Compute dK
+      5.2 Compute dQ
+  """
+
+  # Use q_ref dtype as the intermediate tensor dtype
+  # Assume all IO tensors have the same dtype
+  kernel_dtype = q_ref.dtype
+  mixed_dtype = np.dtype(np.float32) if mixed_precision else kernel_dtype
+
+  assert q_ref.dtype == k_ref.dtype == v_ref.dtype == o_ref.dtype == dy_ref.dtype
+
+  # Shape checking
+  bs, nheads, d_head, seqlen_q = q_ref.shape
+  _, _, _, seqlen_k = k_ref.shape
+  assert tuple(k_ref.shape) == (bs, nheads, d_head, seqlen_k), \
+    f"Input K shape mismatch, got {k_ref.shape}"
+  assert tuple(v_ref.shape) == (bs, nheads, d_head, seqlen_k), \
+    f"Input V shape mismatch, got {v_ref.shape}"
+  assert tuple(o_ref.shape) == (bs, nheads, d_head, seqlen_q), \
+    f"Input o shape mismatch, got {o_ref.shape}"
+  assert tuple(dy_ref.shape) == (bs, nheads, d_head, seqlen_q), \
+    f"Input dy shape mismatch, got {dy_ref.shape}"
+  assert tuple(lse_ref.shape) == (bs, nheads, nl.tile_size.pmax, seqlen_q // nl.tile_size.pmax), \
+    f"Input lse shape mismatch, got {lse_ref.shape}"
+  if seed_ref is not None:
+    assert tuple(seed_ref.shape) == (1,), \
+      f"Input seed shape mismatch, got {seed_ref.shape}"
+
+  out_dq_ref = nl.ndarray((bs, nheads, d_head, seqlen_q), dtype=q_ref.dtype,
+                          buffer=nl.shared_hbm)
+  out_dk_ref = nl.ndarray((bs, nheads, d_head, seqlen_k), dtype=q_ref.dtype,
+                          buffer=nl.shared_hbm)
+  out_dv_ref = nl.ndarray((bs, nheads, d_head, seqlen_k), dtype=q_ref.dtype,
+                          buffer=nl.shared_hbm)
+
+  # FIXME: Add masking for different seqlen values.
+  assert seqlen_q % 128 == 0 and seqlen_k % 128 == 0, \
+    f"Input sequence lengths must be divisible by 128, got seqlen_q == {seqlen_q} and seqlen_k == {seqlen_k}"
+
+  # Softmax scaling factor, multiplied onto Q
+  softmax_scale = softmax_scale or 1.0 / float(d_head ** 0.5)
+
+  assert nl.program_ndim() == 2,\
+    f'Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!'
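+  # Illustrative launch (mirrors the unit tests later in this patch): the 2-D spmd grid is
+  # (batch, heads), e.g.
+  #   dq, dk, dv = flash_attn_bwd[bs, nheads](q, k, v, o, dy, lse, seed,
+  #                                           use_causal_mask=True, mixed_precision=True)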
+  # Different batch samples/attention heads have independent attention
+  batch_id = nl.program_id(axis=0)
+  head_id = nl.program_id(axis=1)
+
+  assert nl.num_programs(1) == nheads, \
+    f"The grid shape mismatch, got {nl.num_programs(1)} but should be {nheads}"
+
+  if logit_bias_ref is not None:
+    b_logit_bias, h_logit_bias, _, _ = logit_bias_ref.shape
+    assert b_logit_bias == 1 and h_logit_bias == 1, "Only support broadcasting logit_bias with batch 1, n_heads 1"
+
+  q_seq_n_tiles, q_seq_tile_size = div_ceil(seqlen_q, 128), 128
+  d_head_n_tiles, d_head_tile_size = div_ceil(d_head, 128), min(d_head, 128)
+
+  if seqlen_k >= 512:
+    k_seq_n_tiles, k_seq_tile_size = seqlen_k // 512, 512
+  else:
+    k_seq_n_tiles, k_seq_tile_size = seqlen_k // 128, 128
+
+  k_seq_n_tiles_backward, k_seq_tile_size_backward = seqlen_k // 128, 128
+  k_seq_fwd_bwd_tile_multipler = k_seq_tile_size // k_seq_tile_size_backward
+
+  ##############################################################
+  # Step 2.4 Prefetch exp bias for softmax
+  ##############################################################
+  softmax_exp_bias = nl.zeros((par_dim(q_seq_tile_size), q_seq_n_tiles), dtype=mixed_dtype)
+  lse_local = nl.load(lse_ref[batch_id, head_id, :, :], dtype=mixed_dtype)
+  softmax_exp_bias[:, :] = lse_local * -1.0
+
+  ##############################################################
+  # Step 1 Compute rowsum(dO ◦ O)
+  ##############################################################
+  dy_o_sum = nl.ndarray((q_seq_n_tiles, par_dim(q_seq_tile_size), 1), dtype=mixed_dtype)
+  compute_rowsum(dy_o_sum=dy_o_sum,
+                 dy_ref_hbm_tile=dy_ref[batch_id, head_id],
+                 o_ref_hbm_tile=o_ref[batch_id, head_id],
+                 d_head_n_tiles=d_head_n_tiles, d_head_tile_size=d_head_tile_size,
+                 q_seq_n_tiles=q_seq_n_tiles, q_seq_tile_size=q_seq_tile_size)
+
+  if dropout_p > 0.0:
+    seed_local = nl.load(seed_ref[0])
+    # TODO: Remove this once the dropout supports scale prob
+    dropout_p_local = nl.full((q_seq_tile_size, 1), fill_value=dropout_p, dtype=np.float32)
+  else:
+    seed_local = None
+    dropout_p_local = None
+
+  dq_local_reduced = nl.zeros((q_seq_n_tiles, d_head_n_tiles, par_dim(d_head_tile_size), q_seq_tile_size),
+                              dtype=mixed_dtype)
+
+  # affine_range gives the compiler permission to vectorize instructions
+  # inside the loop, which improves performance. However, when using
+  # dropout we should use sequential_range to avoid vectorizing the
+  # seed setting.
TODO: the compiler should avoid vectorizing seed setting + _range = nl.sequential_range if dropout_p > 0.0 else nl.affine_range + + for i_k_seq_tile in nl.affine_range(k_seq_n_tiles): + i_k_seq_dslice = nl.ds(i_k_seq_tile * k_seq_tile_size, k_seq_tile_size) + + # Prefetch V, K + v_local = nl.zeros((d_head_n_tiles, par_dim(d_head_tile_size), k_seq_tile_size), + dtype=kernel_dtype) + k_local = nl.zeros((d_head_n_tiles, par_dim(d_head_tile_size), k_seq_tile_size), + dtype=kernel_dtype) + transposed_k_local = nl.zeros((k_seq_fwd_bwd_tile_multipler, d_head_n_tiles, + par_dim(k_seq_tile_size_backward), d_head_tile_size), + dtype=kernel_dtype) + + load_kv(k_ref_hbm_tile=k_ref[batch_id, head_id], + v_ref_hbm_tile=v_ref[batch_id, head_id], + k_local=k_local, transposed_k_local=transposed_k_local, v_local=v_local, + d_head_n_tiles=d_head_n_tiles, d_head_tile_size=d_head_tile_size, + i_k_seq_tile=i_k_seq_tile, k_seq_tile_size=k_seq_tile_size, + k_seq_tile_size_backward=k_seq_tile_size_backward) + + # FIXME: Pass sbuf instead, we will have psum spilling in the current implementation + dv_psum = nl.zeros((d_head_n_tiles, par_dim(d_head_tile_size), k_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + dk_psum = nl.zeros((d_head_n_tiles, par_dim(d_head_tile_size), k_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + for i_q_seq_tile in _range(q_seq_n_tiles): + # Prefetch dy, Q + dy_local = nl.zeros((d_head_n_tiles, par_dim(d_head_tile_size), q_seq_tile_size), dtype=kernel_dtype) + q_local = nl.zeros((d_head_n_tiles, par_dim(d_head_tile_size), q_seq_tile_size), dtype=kernel_dtype) + + load_dy_q(dy_ref_hbm_tile = dy_ref[batch_id, head_id], + q_ref_hbm_tile = q_ref[batch_id, head_id], + dy_local=dy_local, q_local=q_local, d_head_n_tiles=d_head_n_tiles, + d_head_tile_size=d_head_tile_size, i_q_seq_tile=i_q_seq_tile, + q_seq_tile_size=q_seq_tile_size, softmax_scale=softmax_scale) + + logit_bias_tile = None + if logit_bias_ref is not None: + i_q_seq_dslice = nl.ds(i_q_seq_tile * q_seq_tile_size, q_seq_tile_size) + logit_bias_tile = nl.ndarray((par_dim(q_seq_tile_size), k_seq_tile_size), + buffer=nl.sbuf, dtype=kernel_dtype) + logit_bias_tile[:, :] = nl.load( + logit_bias_ref[0, 0, i_q_seq_dslice, i_k_seq_dslice]) + + _flash_attn_bwd_core( + q_local=q_local, k_local=k_local, transposed_k_local=transposed_k_local, + v_local=v_local, dy_local=dy_local, + dk_psum=dk_psum, dv_psum=dv_psum, dq_local_reduced=dq_local_reduced, + softmax_exp_bias=softmax_exp_bias, dy_o_sum=dy_o_sum, + local_i_q_seq_tile=i_q_seq_tile, local_i_k_seq_tile=i_k_seq_tile, + seqlen_q=seqlen_q, seqlen_k=seqlen_k, d_head=d_head, nheads=nheads, + use_causal_mask=use_causal_mask, + kernel_dtype=kernel_dtype, mixed_dtype=mixed_dtype, + softmax_scale=softmax_scale, + seed_local=seed_local, dropout_p=dropout_p, dropout_p_local=dropout_p_local, + logit_bias_tile=logit_bias_tile + ) + + # Write dK, dV + store_dk_dv(out_dk_ref_hbm_tile=out_dk_ref[batch_id, head_id], + out_dv_ref_hbm_tile=out_dv_ref[batch_id, head_id], + local_dk=dk_psum, local_dv=dv_psum, i_k_seq_dslice=i_k_seq_dslice, + d_head_n_tiles=d_head_n_tiles, d_head_tile_size=d_head_tile_size) + + # Write dQ + for i_q_seq_tile in nl.affine_range(q_seq_n_tiles): + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + i_q_seq_dslice = nl.ds(i_q_seq_tile * q_seq_tile_size, q_seq_tile_size) + i_d_head_dslice = nl.ds(i_d_head_tile * d_head_tile_size, d_head_tile_size) + nl.store( + out_dq_ref[batch_id, head_id, i_d_head_dslice, i_q_seq_dslice], + value=dq_local_reduced[i_q_seq_tile, 
i_d_head_tile, :, :], + ) + + return out_dq_ref, out_dk_ref, out_dv_ref + + +@nki.jit(mode='trace') +def load_dy_q(dy_ref_hbm_tile, q_ref_hbm_tile, dy_local, q_local, d_head_n_tiles, d_head_tile_size, i_q_seq_tile, + q_seq_tile_size, softmax_scale): + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + i_d_head_dslice = nl.ds(i_d_head_tile * d_head_tile_size, d_head_tile_size) + i_q_seq_dslice = nl.ds(i_q_seq_tile * q_seq_tile_size, q_seq_tile_size) + + dy_local[i_d_head_tile, :, :] = nl.load( + dy_ref_hbm_tile[i_d_head_dslice, i_q_seq_dslice], + dtype=dy_local.dtype) + + q_local[i_d_head_tile, :, :] = nl.load( + q_ref_hbm_tile[i_d_head_dslice, i_q_seq_dslice], + dtype=q_local.dtype) * softmax_scale + + +@nki.jit(mode='trace') +def store_dk_dv(out_dk_ref_hbm_tile, out_dv_ref_hbm_tile, local_dk, local_dv, + d_head_n_tiles, d_head_tile_size, i_k_seq_dslice): + for i in nl.affine_range(d_head_n_tiles): + i_d_head_dslice = nl.ds(i * d_head_tile_size, d_head_tile_size) + + nl.store(out_dv_ref_hbm_tile[i_d_head_dslice, i_k_seq_dslice], + value=local_dv[i, :, :]) + + nl.store(out_dk_ref_hbm_tile[i_d_head_dslice, i_k_seq_dslice], + value=local_dk[i, :, :]) + + +@nki.jit(mode='trace') +def load_kv(k_ref_hbm_tile, v_ref_hbm_tile, k_local, transposed_k_local, v_local, + d_head_n_tiles, d_head_tile_size, i_k_seq_tile, k_seq_tile_size, + k_seq_tile_size_backward): + k_seq_fwd_bwd_tile_multipler = k_seq_tile_size // k_seq_tile_size_backward + + for i in nl.affine_range(d_head_n_tiles): + i_d_head_dslice = nl.ds(i * d_head_tile_size, d_head_tile_size) + i_k_seq_dslice = nl.ds(i_k_seq_tile * k_seq_tile_size, k_seq_tile_size) + k_local[i, :, :] = nl.load(k_ref_hbm_tile[i_d_head_dslice, i_k_seq_dslice], + dtype=k_local.dtype) + v_local[i, :, :] = nl.load(v_ref_hbm_tile[i_d_head_dslice, i_k_seq_dslice], + dtype=v_local.dtype) + ############################################################## + # Prefetch k transpose for the backward too + ############################################################## + for j in nl.affine_range(k_seq_fwd_bwd_tile_multipler): + i_k_dslice = nl.ds(j * k_seq_tile_size_backward, k_seq_tile_size_backward) + transposed_k_local[j, i, :, :] = nisa.nc_transpose(k_local[i, :, i_k_dslice]) + + +@nki.jit(mode='trace') +def compute_rowsum(dy_o_sum, dy_ref_hbm_tile, o_ref_hbm_tile, d_head_n_tiles, d_head_tile_size, q_seq_n_tiles, + q_seq_tile_size): + mixed_dtype = dy_o_sum.dtype + for i in nl.affine_range(q_seq_n_tiles): + dy_o_partial = nl.zeros((par_dim(q_seq_tile_size), d_head_n_tiles), dtype=mixed_dtype) + for j in nl.affine_range(d_head_n_tiles): + d_head_dslice = nl.ds(j * d_head_tile_size, d_head_tile_size) + q_seq_dslice = nl.ds(i * q_seq_tile_size, q_seq_tile_size) + + dy_local = nl.load_transpose2d(dy_ref_hbm_tile[d_head_dslice, q_seq_dslice], + dtype=mixed_dtype) + o_local = nl.load_transpose2d(o_ref_hbm_tile[d_head_dslice, q_seq_dslice], + dtype=mixed_dtype) + + dy_o = nl.multiply(dy_local, o_local, dtype=mixed_dtype) + dy_o_partial[:, j] = nisa.tensor_reduce(np.add, data=dy_o, axis=(1,), + dtype=mixed_dtype) + + dy_o_sum[i, :, 0] = nisa.tensor_reduce( + np.add, data=dy_o_partial[:, :], axis=(1,), dtype=mixed_dtype) + + +@nki.jit(mode='trace') +def _flash_attn_bwd_core( + q_local, k_local, transposed_k_local, v_local, dy_local, + dk_psum, dv_psum, dq_local_reduced, + softmax_exp_bias, dy_o_sum, + local_i_q_seq_tile, local_i_k_seq_tile, + seqlen_q, seqlen_k, d_head, nheads, + use_causal_mask, + kernel_dtype, mixed_dtype, + softmax_scale, + seed_local, dropout_p, 
dropout_p_local, + logit_bias_tile=None): + """ + The flash backward core function to calculate the gradients of Q, K and V + of the given tiles. The result will be accumulated into the dk, dv, dq psum + """ + q_seq_n_tiles, q_seq_tile_size = div_ceil(seqlen_q, 128), 128 + d_head_n_tiles, d_head_tile_size = div_ceil(d_head, 128), min(d_head, 128) + if seqlen_k >= 512: + k_seq_n_tiles, k_seq_tile_size = seqlen_k // 512, 512 + else: + k_seq_n_tiles, k_seq_tile_size = seqlen_k // 128, 128 + k_seq_n_tiles_backward, k_seq_tile_size_backward = seqlen_k // 128, 128 + k_seq_fwd_bwd_tile_multipler = k_seq_tile_size // k_seq_tile_size_backward + + mask = local_i_q_seq_tile * q_seq_tile_size >= local_i_k_seq_tile * k_seq_tile_size if use_causal_mask else None + # PSUM buffer shape: [q_seq_tile_size P, k_seq_tile_size F] + qk_psum = nl.zeros((par_dim(q_seq_tile_size), k_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + qk_res_buf = nl.ndarray((par_dim(q_seq_tile_size), k_seq_tile_size), buffer=nl.sbuf, dtype=kernel_dtype) + + batch_id = nl.program_id(axis=0) + head_id = nl.program_id(axis=1) + + # Loop over contraction dim of QK matmul + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + ############################################################## + # Step 2.1 Compute Q^T@K, with matmul(stationary=tensor_q, moving=tensor_k, contract=d_head) + ############################################################## + qk_psum[:, :] += nisa.nc_matmul(q_local[i_d_head_tile, :, :], + k_local[i_d_head_tile, :, :], + mask=mask) + + ###################################### + # Step 2.2. Apply optional causal mask + ###################################### + if use_causal_mask: + iq, ik = nl.mgrid[0:q_seq_tile_size, 0:k_seq_tile_size] + causal_pred = (local_i_q_seq_tile * q_seq_tile_size + iq >= local_i_k_seq_tile * k_seq_tile_size + ik) + if logit_bias_tile is not None: + # Magic number -9984.0 to replace -inf similar to what Tensorizer uses + intermediate = \ + nl.add(qk_psum[:, :], logit_bias_tile[:, :], dtype=mixed_dtype, mask=mask) + qk_res_buf[:, :] = nisa.affine_select( + pred=causal_pred, + on_true_tile=intermediate, on_false_value=-9984.0, dtype=mixed_dtype, + mask=mask + ) + + else: + # Magic number -9984.0 to replace -inf similar to what Tensorizer uses + qk_res_buf[:, :] = nisa.affine_select( + pred=causal_pred, + on_true_tile=qk_psum[:, :], on_false_value=-9984.0, dtype=mixed_dtype, + mask=mask) + else: + if logit_bias_tile is not None: + # Simply add logit bias which copies back to sbuf at the same time + qk_res_buf[:, :] = \ + nl.add(qk_psum[:, :], logit_bias_tile[:, :], dtype=mixed_dtype) + else: + # Simply send psum result back to sbuf + qk_res_buf[:, :] = \ + nl.copy(qk_psum[:, :], dtype=mixed_dtype) + + softmax_y = nl.ndarray((par_dim(q_seq_tile_size), k_seq_tile_size), dtype=kernel_dtype, buffer=nl.sbuf) + softmax_y[:, :] = nisa.activation(np.exp, + data=qk_res_buf[:, :], + bias=softmax_exp_bias[:, local_i_q_seq_tile], + scale=1.0, + mask=mask) + ##################################################################### + # Dropout + ##################################################################### + if dropout_p > 0.0: + offset = local_i_k_seq_tile + local_i_q_seq_tile * k_seq_n_tiles \ + + head_id * k_seq_n_tiles * q_seq_n_tiles \ + + batch_id * nheads * k_seq_n_tiles * q_seq_n_tiles + offset_seed = nl.add(seed_local[0, 0], offset, mask=mask) + nl.random_seed(seed=offset_seed, mask=mask) + softmax_y[:, :] = nl.dropout(softmax_y[:, :], rate=dropout_p_local[:, 0], mask=mask) + softmax_y[:, :] = 
nl.multiply(softmax_y[:, :], 1 / (1 - dropout_p), mask=mask) + + ##################################################################### + # Step 3.1 Calculate the backward gradients dL/dV, where y=softmax@V + # in value projection with matmul(stationary=dy, moving=softmax) + ##################################################################### + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + trans_dy = nisa.nc_transpose(dy_local[i_d_head_tile, :, :], + mask=mask) + dv_psum[i_d_head_tile, :, :] += \ + nisa.nc_matmul(trans_dy, softmax_y[:, :], mask=mask) + + ##################################################################### + # Step 3.2 Calculate the backward gradients dL/dsoftmax, where y=softmax@V + # in value projection with matmul(stationary=dy, moving=v) + ##################################################################### + softmax_dy_psum = nl.zeros((par_dim(q_seq_tile_size), k_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + softmax_dy_psum[:, :] += \ + nisa.nc_matmul(dy_local[i_d_head_tile, :, :], + v_local[i_d_head_tile, :, :], + mask=mask) + + softmax_dy = nl.ndarray((par_dim(q_seq_tile_size), k_seq_tile_size), dtype=kernel_dtype, buffer=nl.sbuf) + softmax_dy[:, :] = nl.copy(softmax_dy_psum[:, :], dtype=kernel_dtype, + mask=mask) + + ##################################################################### + # Step 4 Calculate the softmax backward gradients dL/dx, where y=softmax(x) + # dL/dx = y * (dL/dy - rowsum(dO_O)), where y = softmax(x) + ##################################################################### + softmax_dx_local = nl.ndarray((par_dim(q_seq_tile_size), k_seq_tile_size), dtype=kernel_dtype, buffer=nl.sbuf) + softmax_dx_local[:, :] = \ + nisa.scalar_tensor_tensor(data=softmax_dy[:, :], + op0=np.subtract, + operand0=dy_o_sum[local_i_q_seq_tile, :, 0], + op1=np.multiply, + operand1=softmax_y[:, :], + mask=mask) + + ##################################################################### + # Step 5.1 Calculate dK, with matmul(stationary=Q, moving=softmax_dx) + ##################################################################### + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + trans_q_local = nisa.nc_transpose(q_local[i_d_head_tile, :, :], + mask=mask) + dk_psum[i_d_head_tile, :, :] += \ + nisa.nc_matmul(trans_q_local, + softmax_dx_local[:, :], + mask=mask) + + ##################################################################### + # Step 5.2 Calculate dQ + ##################################################################### + for i_d_head_tile in nl.affine_range(d_head_n_tiles): + dq_psum = nl.zeros((par_dim(d_head_tile_size), q_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + for i_k_seq_tile_backward in nl.affine_range(k_seq_fwd_bwd_tile_multipler): + i_k_seq_dslice = nl.ds(i_k_seq_tile_backward * k_seq_tile_size_backward, + k_seq_tile_size_backward) + transposed_softmax_dx_local = \ + nisa.nc_transpose(softmax_dx_local[:, i_k_seq_dslice], + mask=mask) + dq_psum[:, :] += nisa.nc_matmul( + transposed_k_local[i_k_seq_tile_backward, i_d_head_tile, :, :], + transposed_softmax_dx_local, + mask=mask) + dq_local = nl.multiply(dq_psum[:, :], softmax_scale, dtype=kernel_dtype, mask=mask) + dq_local_reduced[local_i_q_seq_tile, i_d_head_tile, :, :] = nl.loop_reduce( + dq_local, op=np.add, loop_indices=(local_i_k_seq_tile,), + dtype=mixed_dtype, mask=mask) + + +@nki.jit +def fused_self_attn_for_SD_small_head_size(q_ref, k_ref, v_ref, use_causal_mask=False, + mixed_precision=True): + """ + Fused 
self attention kernel for small head size Stable Diffusion workload. + + Computes softmax(QK^T)V. Decoder model can optionally include a causal mask + application. Does not include QKV projection, output projection, dropout, + residual connection, etc. + + This kernel is designed to be used for Stable Diffusion models where the + n_heads is smaller or equal to 128. Assertion is thrown if `n_heads` does + not satisfy the requirement. + + IO tensor layouts: + - q_ptr: shape (bs, n_heads, seq_q) + - k_ptr: shape (bs, seq_k, n_heads) + - v_ptr: shape (bs, seq_v, n_heads) + - out_ptr: shape (bs, seq_q, n_heads) + - We use seq_q and seq_k just for clarity, this kernel requires seq_q == seq_k + + IO tensor dtypes: + - This kernel assumes all IO tensors have the same dtype + - If mixed_precision is True, then all Tensor Engine operation will be performed in + bfloat16 and accumulation will be performed in float32. Otherwise the intermediates + will be in the same type as the inputs. + """ + # Use q_ref dtype as the intermediate tensor dtype + # Assume all IO tensors have the same dtype + kernel_dtype = q_ref.dtype + pe_in_dt = nl.bfloat16 if mixed_precision else np.float32 + assert q_ref.dtype == k_ref.dtype == v_ref.dtype + + # Shape checking + bs, d_head, seqlen = q_ref.shape + assert d_head <= 128, "Cannot use this kernel for d_head > 128" + assert tuple(q_ref.shape) == (bs, d_head, seqlen), 'Input shape mismatch!' + assert tuple(k_ref.shape) == (bs, seqlen, d_head), 'Input shape mismatch!' + assert tuple(v_ref.shape) == (bs, seqlen, d_head), \ + f'Input shape mismatch! Expected: {(bs, seqlen, d_head)} Actual: {tuple(v_ref.shape)}' + + out_ref = nl.ndarray((bs, seqlen, d_head), dtype=q_ref.dtype, buffer=nl.shared_hbm) + + # Softmax scaling factor, multiplied onto Q + softmax_scale = 0.125 + + # Different batch samples/attention heads have independent attention + batch_id = nl.program_id(axis=0) + # batch_id = 0 + + # TODO: make q_seq_tile_size user input + # The matmuls currently use a fixed tile size of (128, 128). This may not achieve the best + # performance for dense attention. However, since this kernel is in preparation + # for block-sparse attention, this tile size is acceptable because the block + # size of block-sparse attention cannot be too large. + q_seq_n_tiles, q_seq_tile_size = seqlen // 128, 128 + k_seq_n_tiles, k_seq_tile_size = seqlen // 128, 128 + # No tiling on d_head dimension since the number of d_head fits in SB + d_head_tile_size = d_head + v_seq_n_tiles, v_seq_tile_size = seqlen // 128, 128 + + ################################### + # Step 1. 
transpose(tensor_v) + ################################### + # Buffer for v matrix transposed + # Pre-fetch and keep it in SBUF throughout different softmax tiles + trans_v = nl.ndarray((par_dim(v_seq_tile_size), v_seq_n_tiles, d_head), dtype=pe_in_dt) + + for i_k_seq_tile in nl.affine_range(k_seq_n_tiles): + ip_v = nl.arange(v_seq_tile_size)[:, None] + if_v = nl.arange(d_head_tile_size)[None, :] + trans_v[ip_v, i_k_seq_tile, if_v] = nl.load( + v_ref[batch_id, i_k_seq_tile * k_seq_tile_size + ip_v, if_v], + dtype=pe_in_dt) + + q_local = nl.ndarray((q_seq_n_tiles, par_dim(d_head_tile_size), q_seq_tile_size), dtype=pe_in_dt) + ip_q = nl.arange(d_head_tile_size)[:, None] + if_q = nl.arange(q_seq_tile_size)[None, :] + for i_q_seq_tile in nl.affine_range(q_seq_n_tiles): + q_local[i_q_seq_tile, ip_q, if_q] = nl.load( + q_ref[batch_id, ip_q, i_q_seq_tile * q_seq_tile_size + if_q], + dtype=pe_in_dt) * softmax_scale + + k_local = nl.ndarray((k_seq_n_tiles, par_dim(d_head_tile_size), k_seq_tile_size), dtype=pe_in_dt) + ip_k = nl.arange(d_head_tile_size)[:, None] + if_k = nl.arange(k_seq_tile_size)[None, :] + for i_k_seq_tile in nl.affine_range(k_seq_n_tiles): + k_local[i_k_seq_tile, ip_k, if_k] = nl.load_transpose2d( + k_ref[batch_id, + i_k_seq_tile * k_seq_tile_size + nl.arange(k_seq_tile_size)[:, None], + nl.arange(d_head_tile_size)[None, :]], + dtype=pe_in_dt) + + for i_q_seq_tile in nl.affine_range(q_seq_n_tiles): # indent = 2 + # A SBUF buffer for an independent softmax tile + qk_res_buf = nl.ndarray((par_dim(q_seq_tile_size), seqlen), dtype=kernel_dtype) + + neg_max_res = nl.ndarray((par_dim(q_seq_tile_size), k_seq_n_tiles), dtype=kernel_dtype) + ip_max = nl.arange(q_seq_tile_size)[:, None] + if_max = nl.arange(k_seq_n_tiles)[None, :] + + # Loop over RHS free of matmul(stationary=tensor_q, moving=tensor_k, contract=d_head) + for i_k_seq_tile in nl.affine_range(k_seq_n_tiles): # indent = 4 + + # Since the K^T tile is the RHS, the q_seq_len dimension will be P in the result + # PSUM buffer shape: [q_seq_tile_size P, k_seq_tile_size F] + qk_psum = nl.zeros((par_dim(q_seq_tile_size), k_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + + # Tensor indices for accessing qk result in k_seq_tile_size + ip_qk = nl.arange(q_seq_tile_size)[:, None] + if_qk = nl.arange(k_seq_tile_size)[None, :] + + ############################################################## + # Step 2. matmul(stationary=tensor_q, moving=tensor_k, contract=d_head) + ############################################################## + qk_psum[ip_qk, if_qk] += nisa.nc_matmul(moving=k_local[i_k_seq_tile, ip_k, if_k], + stationary=q_local[i_q_seq_tile, ip_q, if_q]) + + ################################### + # Step 3. Apply optional causal mask + ################################### + if use_causal_mask: + # Magic number -9984.0 to replace -inf similar to what Tensorizer uses + qk_res_buf[ip_qk, i_k_seq_tile * k_seq_tile_size + if_qk] = nisa.affine_select( + pred=(i_q_seq_tile * q_seq_tile_size + ip_qk >= i_k_seq_tile * k_seq_tile_size + if_qk), + on_true_tile=qk_psum[ip_qk, if_qk], on_false_value=-9984.0, dtype=kernel_dtype) + else: + # Simply send psum result back to sbuf + qk_res_buf[ip_qk, i_k_seq_tile * k_seq_tile_size + if_qk] = nl.copy(qk_psum[ip_qk, if_qk], + dtype=kernel_dtype) + + ################################### + # Step 4. 
Softmax + ################################### + # TODO: use TensorScalarCacheReduce to avoid an extra copy + # We want to break this reduction in tiles because we want to overlap it with the previous matmul + neg_max_res[ip_max, i_k_seq_tile] = nisa.tensor_reduce( + np.max, data=qk_res_buf[ip_qk, i_k_seq_tile * k_seq_tile_size + if_qk], + axis=(1,), dtype=kernel_dtype, negate=True) + + neg_max_res_final = nisa.tensor_reduce( + np.min, data=neg_max_res[ip_max, if_max], + axis=(1,), dtype=kernel_dtype, negate=False) + + ip_softmax = nl.arange(q_seq_tile_size)[:, None] + if_softmax = nl.arange(seqlen)[None, :] + ip_sum_res = nl.arange(q_seq_tile_size)[:, None] + if_sum_res = nl.arange(d_head_tile_size)[None, :] + + softmax_res = nl.ndarray((par_dim(q_seq_tile_size), seqlen), dtype=pe_in_dt) + sum_divisor = nl.ndarray((par_dim(q_seq_tile_size), d_head_tile_size), dtype=kernel_dtype) + + # Simply use a large tile of seq_len in size since this is a "blocking" instruction + # Assuming the compiler will merge exp and reduce_add into a single instruction on ACT + exp_res = nisa.activation(np.exp, + data=qk_res_buf[ip_softmax, if_softmax], + bias=neg_max_res_final, scale=1.0) + + sum_res = nisa.tensor_reduce(np.add, data=exp_res, axis=(1,), + dtype=kernel_dtype) + softmax_res[ip_softmax, if_softmax] = nl.copy(exp_res, dtype=pe_in_dt) + + sum_reciprocal_broadcast = (1.0 / sum_res).broadcast_to((q_seq_tile_size, d_head_tile_size)) + sum_divisor[ip_sum_res, if_sum_res] = nl.copy(sum_reciprocal_broadcast, dtype=kernel_dtype) + + # Buffer for transposed softmax results (FP32 in PSUM) + trans_softmax_res = nl.ndarray( + (par_dim(k_seq_tile_size), k_seq_n_tiles, q_seq_tile_size), + dtype=pe_in_dt) + + # Result psum buffer has the hidden dim as P + attn_res_psum = nl.zeros((par_dim(d_head_tile_size), q_seq_tile_size), + dtype=np.float32, buffer=nl.psum) + + ip_scores_t = nl.arange(k_seq_tile_size)[:, None] + if_scores_t = nl.arange(q_seq_tile_size)[None, :] + # Loop over matmul_1 contraction + for i_k_seq_tile in nl.affine_range(k_seq_n_tiles): + ################################### + # Step 5. transpose(softmax_res) + ################################### + ip_scores = nl.arange(q_seq_tile_size)[:, None] + if_scores = nl.arange(k_seq_tile_size)[None, :] + + trans_softmax_res[ip_scores_t, i_k_seq_tile, if_scores_t] = nisa.nc_transpose( + softmax_res[ip_scores, i_k_seq_tile * k_seq_tile_size + if_scores]) + + ip_out = nl.arange(d_head_tile_size)[:, None] + if_out = nl.arange(q_seq_tile_size)[None, :] + for i_k_seq_tile in nl.affine_range(k_seq_n_tiles): + ###################################################################### + # Step 6. 
matmul_1(stationary=trans_v, moving=trans_softmax_res, contract=seqlen_v=seqlen_k) + ###################################################################### + ip_v_t = nl.arange(k_seq_tile_size)[:, None] + if_v_t = nl.arange(d_head_tile_size)[None, :] + attn_res_psum[ip_out, if_out] += \ + nisa.nc_matmul(moving=trans_softmax_res[ip_scores_t, i_k_seq_tile, if_scores_t], + stationary=trans_v[ip_v_t, i_k_seq_tile, if_v_t]) + + attn_res_sbuf = nl.copy(attn_res_psum[ip_out, if_out], dtype=kernel_dtype) + + attn_res_div = attn_res_sbuf * nisa.nc_transpose(sum_divisor[ip_sum_res, if_sum_res]) + + nl.store( + out_ref[batch_id, i_q_seq_tile * q_seq_tile_size + if_out, ip_out], + value=attn_res_div) + + return out_ref diff --git a/test/unit/conftest.py b/src/self-attention/tests/conftest.py similarity index 100% rename from test/unit/conftest.py rename to src/self-attention/tests/conftest.py diff --git a/src/self-attention/tests/test_flash_attn_bwd.py b/src/self-attention/tests/test_flash_attn_bwd.py new file mode 100644 index 0000000..1f948bf --- /dev/null +++ b/src/self-attention/tests/test_flash_attn_bwd.py @@ -0,0 +1,460 @@ +""" +Copyright (c) 2023, Amazon.com. All Rights Reserved +""" +import pytest +import sys +import os +import logging +import time +from typing import Optional, Tuple +import numpy as np + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from attention import flash_attn_bwd +from neuronxcc.nki import benchmark, baremetal, simulate_kernel +import neuronxcc.nki.language as nl + +# Configure logging for verbose output +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +xfail = pytest.mark.arch_specific_xfail +bench_func = benchmark(warmup=5, iters=10)(flash_attn_bwd) + +def print_test_header(test_name: str, params: dict): + """Print a formatted test header with parameters""" + print("\n" + "="*80) + print(f"🧪 RUNNING TEST: {test_name}") + print("="*80) + print("📋 Test Parameters:") + for key, value in params.items(): + print(f" {key:20}: {value}") + print("="*80) + +def print_tensor_info(name: str, tensor: np.ndarray): + """Print detailed tensor information""" + print(f"📊 {name} Info:") + print(f" Shape: {tensor.shape}") + print(f" Dtype: {tensor.dtype}") + print(f" Size (elements): {tensor.size:,}") + print(f" Memory (MB): {tensor.nbytes / 1024 / 1024:.2f}") + print(f" Min/Max: {tensor.min():.6f} / {tensor.max():.6f}") + print(f" Mean/Std: {tensor.mean():.6f} / {tensor.std():.6f}") + +def print_performance_metrics(latency_res, expected_latency: int, test_name: str): + """Print detailed performance metrics with robust error handling""" + print("\n📈 PERFORMANCE METRICS:") + print("-" * 40) + + # Try to get available percentiles, fallback to common ones + percentiles = [50, 90, 95, 99] + available_percentiles = [] + + for p in percentiles: + try: + latency = latency_res.get_latency_percentile(p) + available_percentiles.append(p) + status = "✅ PASS" if latency <= expected_latency else "❌ FAIL" + print(f" P{p:2d} Latency: {latency:,} ns ({latency/1e9:.3f}s) {status}") + except (KeyError, AttributeError, Exception) as e: + print(f" P{p:2d} Latency: ❓ NOT AVAILABLE ({type(e).__name__})") + + # Try to get basic stats if percentiles fail + if not available_percentiles: + try: + # Try alternative methods to get latency data + if hasattr(latency_res, 'mean'): + mean_latency = latency_res.mean + print(f" Mean Latency: {mean_latency:,} ns ({mean_latency/1e9:.3f}s)") + if 
hasattr(latency_res, 'min'): + min_latency = latency_res.min + print(f" Min Latency: {min_latency:,} ns ({min_latency/1e9:.3f}s)") + if hasattr(latency_res, 'max'): + max_latency = latency_res.max + print(f" Max Latency: {max_latency:,} ns ({max_latency/1e9:.3f}s)") + + print(f" Available attributes: {[attr for attr in dir(latency_res) if not attr.startswith('_')]}") + except Exception as e: + print(f" ⚠️ Could not extract latency metrics: {e}") + print(f" Latency result type: {type(latency_res)}") + print(f" Available methods: {[method for method in dir(latency_res) if not method.startswith('_')]}") + + print(f" Expected: {expected_latency:,} ns ({expected_latency/1e9:.3f}s)") + + # Use P50 if available, otherwise try other metrics + try: + p50_latency = latency_res.get_latency_percentile(50) + test_passed = p50_latency <= expected_latency + print(f" Test Status: {'✅ PASSED' if test_passed else '❌ FAILED'}") + return p50_latency + except: + print(f" Test Status: ❓ CANNOT DETERMINE (P50 not available)") + return None + +def print_memory_usage(bs: int, nheads: int, seqlen: int, d: int, dtype): + """Calculate and print memory usage estimates for backward pass""" + element_size = 2 if dtype == nl.bfloat16 else 4 # bytes + + # Input tensors + q_size = bs * nheads * d * seqlen * element_size + k_size = bs * nheads * d * seqlen * element_size + v_size = bs * nheads * d * seqlen * element_size + dy_size = bs * nheads * d * seqlen * element_size + o_proj_size = bs * nheads * d * seqlen * element_size + lse_size = bs * nheads * nl.tile_size.pmax * (seqlen // nl.tile_size.pmax) * 4 # float32 + + # Output gradients + dq_size = q_size + dk_size = k_size + dv_size = v_size + + total_input_size = q_size + k_size + v_size + dy_size + o_proj_size + lse_size + total_output_size = dq_size + dk_size + dv_size + total_size = total_input_size + total_output_size + + print("\n💾 MEMORY USAGE ESTIMATES:") + print("-" * 40) + print(" Input Tensors:") + print(f" Q tensor: {q_size / 1024 / 1024:.2f} MB") + print(f" K tensor: {k_size / 1024 / 1024:.2f} MB") + print(f" V tensor: {v_size / 1024 / 1024:.2f} MB") + print(f" dY tensor: {dy_size / 1024 / 1024:.2f} MB") + print(f" O_proj tensor: {o_proj_size / 1024 / 1024:.2f} MB") + print(f" LSE tensor: {lse_size / 1024 / 1024:.2f} MB") + print(" Output Gradients:") + print(f" dQ tensor: {dq_size / 1024 / 1024:.2f} MB") + print(f" dK tensor: {dk_size / 1024 / 1024:.2f} MB") + print(f" dV tensor: {dv_size / 1024 / 1024:.2f} MB") + print("-" * 40) + print(f" Total Input: {total_input_size / 1024 / 1024:.2f} MB") + print(f" Total Output: {total_output_size / 1024 / 1024:.2f} MB") + print(f" Total Memory: {total_size / 1024 / 1024:.2f} MB") + print(f" Est. 
Peak: {total_size * 2 / 1024 / 1024:.2f} MB (2x for intermediate)") + +def softmax(x: np.ndarray, dim: int, zero_max_mode=False, + mixed_precision=False, return_max_reduce=False): + """Softmax implementation with verbose logging""" + logger.debug(f"Computing softmax on tensor shape {x.shape} along dim {dim}") + + max_value = np.amax(x, axis=dim, keepdims=True) + max_value = np.maximum(0, max_value) if zero_max_mode else max_value + exp = np.exp(x - max_value) + + if mixed_precision: + reduce = np.add.reduce(exp.astype(np.float32), axis=dim, keepdims=True).astype(x.dtype) + else: + reduce = np.add.reduce(exp, axis=dim, keepdims=True) + + if return_max_reduce: + return exp / reduce, -max_value, np.reciprocal(reduce) + return exp / reduce + +def softmax_dx(dy: np.ndarray, y: np.ndarray, dim: int, mixed_precision=False): + """Softmax gradient computation with logging""" + logger.debug(f"Computing softmax gradient on tensors shape {dy.shape}") + + # dx_i = (dy_i - sum(dy_k*y_k)) * y_i + prod = dy * y + if mixed_precision: + reduce = np.add.reduce(prod.astype(np.float32), axis=dim, keepdims=True).astype(dy.dtype) + else: + reduce = np.add.reduce(prod, axis=dim, keepdims=True) + subtract = dy - reduce + return subtract * y + +def cpu_attention_backward(q, k, v, dy, use_causal_mask=True, mixed_precision=True): + """ + Compute the attention backward with the softmax recomputation + """ + logger.info("🔄 Computing CPU reference attention backward pass...") + start_time = time.time() + + def mixed_precision_matmul(a, b): + input_dtype = a.dtype + a, b = a.astype(np.float32), b.astype(np.float32) + c = np.matmul(a, b) + return c.astype(input_dtype) + + _, _, d, _ = q.shape + logger.debug(f"Attention head dimension: {d}") + + # Compute golden output + softmax_scale = 1.0 / (d ** 0.5) + logger.debug(f"Softmax scale factor: {softmax_scale:.6f}") + + q_scaled = q * softmax_scale + + logger.debug("Computing attention scores...") + raw_score = mixed_precision_matmul(q_scaled.transpose(0, 1, 3, 2), k) + + if use_causal_mask: + logger.debug("Applying causal mask...") + for i in range(raw_score.shape[0]): + for j in range(raw_score.shape[1]): + # -inf triggers invalid input error in softmax implementation, use a small negative instead + # k=1 to exclude the diagonal, because each token can still attend to itself + raw_score[i, j][np.triu_indices_from(raw_score[i, j], k=1)] = -9984.0 + + logger.debug("Computing forward softmax...") + norm_score, cached_negative_max, cached_sum_reciprocal = \ + softmax(raw_score, dim=-1, mixed_precision=mixed_precision, return_max_reduce=True) + + logger.debug("Computing backward pass gradients...") + + # Calculate softmax_dy = (dL/dy)^T @ V + logger.debug("Computing softmax gradient input...") + softmax_dy = mixed_precision_matmul(dy.transpose(0, 1, 3, 2), v) + + # Calculate dv = (dL/dy) @ softmax_y + logger.debug("Computing dV gradient...") + dv_golden = mixed_precision_matmul(dy, norm_score) + + # Calculate softmax_dx + logger.debug("Computing softmax gradient...") + softmax_dx_golden = softmax_dx(softmax_dy, norm_score, dim=-1, mixed_precision=mixed_precision) + + # Calculate dq + logger.debug("Computing dQ gradient...") + dq_golden = mixed_precision_matmul(k, softmax_dx_golden.transpose(0, 1, 3, 2)) * softmax_scale + + # Calculate dk + logger.debug("Computing dK gradient...") + dk_golden = mixed_precision_matmul(q_scaled, softmax_dx_golden) + + # Calculate output projection + logger.debug("Computing output projection...") + o_proj = np.matmul(norm_score, v.transpose(0, 1, 3, 
2)).transpose(0, 1, 3, 2) + + elapsed_time = time.time() - start_time + logger.info(f"✅ CPU reference backward pass completed in {elapsed_time:.2f} seconds") + + return dq_golden, dk_golden, dv_golden, cached_negative_max, cached_sum_reciprocal, o_proj + +def print_gradient_comparison(grad_name: str, computed_grad: np.ndarray, reference_grad: np.ndarray, tolerance: float = 1e-2): + """Print detailed comparison of gradients""" + max_diff = np.max(np.abs(computed_grad - reference_grad)) + mean_diff = np.mean(np.abs(computed_grad - reference_grad)) + relative_error = np.mean(np.abs(computed_grad - reference_grad) / (np.abs(reference_grad) + 1e-8)) + close = np.allclose(computed_grad, reference_grad, atol=tolerance) + + print(f"📊 {grad_name} Gradient Comparison:") + print(f" Max absolute difference: {max_diff:.6f}") + print(f" Mean absolute difference: {mean_diff:.6f}") + print(f" Mean relative error: {relative_error:.6f}") + print(f" Tolerance: {tolerance}") + print(f" Result: {'✅ PASS' if close else '❌ FAIL'}") + + if not close: + # Additional debugging info for failures + print(f" Computed - Min/Max: {computed_grad.min():.6f} / {computed_grad.max():.6f}") + print(f" Reference - Min/Max: {reference_grad.min():.6f} / {reference_grad.max():.6f}") + + return close + +class TestAttention: + + @xfail # P167481231 + @pytest.mark.parametrize("bs, nheads, seqlen, d, dtype, latency", [ + [1, 4, 32*1024, 128, nl.bfloat16, 117000], + ]) + def test_flash_attn_bwd_perf(self, bs, nheads, seqlen, d, dtype, latency): + + # Print test header with all parameters + test_params = { + 'Batch Size': bs, + 'Num Heads': nheads, + 'Sequence Length': f"{seqlen:,}", + 'Head Dimension': d, + 'Data Type': str(dtype), + 'Expected Latency': f"{latency:,} ns" + } + + print_test_header("Flash Attention Backward Performance Test", test_params) + print_memory_usage(bs, nheads, seqlen, d, dtype) + + print("\n⚙️ SETUP PHASE:") + print("-" * 40) + + # Generate test data + print("🎲 Generating random test tensors...") + q = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + k = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + v = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + dy = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + o_proj = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + lse = np.random.random_sample([bs, nheads, nl.tile_size.pmax, seqlen // nl.tile_size.pmax]).astype(np.float32) + seed = None + + # Print tensor information + print_tensor_info("Q", q) + print_tensor_info("K", k) + print_tensor_info("V", v) + print_tensor_info("dY (output gradient)", dy) + print_tensor_info("O_proj (forward output)", o_proj) + print_tensor_info("LSE (log-sum-exp)", lse) + + # Cast to target dtype + print(f"\n🔄 Converting tensors to {dtype}...") + q = nl.static_cast(q, dtype) + k = nl.static_cast(k, dtype) + v = nl.static_cast(v, dtype) + o_proj = nl.static_cast(o_proj, dtype) + dy = nl.static_cast(dy, dtype) + + print("\n🚀 BENCHMARKING PHASE:") + print("-" * 40) + print("⏱️ Running benchmark with warmup=5, iters=10...") + print("⚠️ Note: This test is marked as xfail due to P167481231") + + bench_func_ = bench_func[bs, nheads] + + # Run the benchmark + start_time = time.time() + bench_func_(q, k, v, o_proj, dy, lse, seed, + use_causal_mask=True, mixed_precision=True) + benchmark_time = time.time() - start_time + + print(f"✅ Benchmark completed in {benchmark_time:.2f} seconds") + + # Get and display results + latency_res = bench_func_.benchmark_result.nc_latency + 
p50_latency = print_performance_metrics(latency_res, latency, "Flash Attention Backward") + + # Final assertion with better error handling + if p50_latency is not None: + try: + assert p50_latency <= latency + print(f"\n🎉 TEST PASSED! P50 latency ({p50_latency:,} ns) <= expected ({latency:,} ns)") + except AssertionError: + print(f"\n💥 TEST FAILED! P50 latency ({p50_latency:,} ns) > expected ({latency:,} ns)") + raise + else: + # Fallback: try to find any available latency metric + print(f"\n⚠️ WARNING: Could not determine P50 latency for comparison") + print(f" Benchmark result type: {type(bench_func_.benchmark_result)}") + print(f" NC latency type: {type(latency_res)}") + + # Try alternative assertion methods + try: + # Look for any latency value we can use + if hasattr(latency_res, 'mean'): + mean_latency = latency_res.mean + assert mean_latency <= latency + print(f"✅ Using mean latency for comparison: {mean_latency:,} ns <= {latency:,} ns") + else: + print("❌ No suitable latency metric found for assertion") + raise AssertionError("Cannot determine latency for comparison") + except Exception as e: + print(f"💥 Assertion failed: {e}") + raise + + print("\n" + "="*80 + "\n") + + @pytest.mark.simulation + @pytest.mark.parametrize("bs, nheads, seqlen, d, dtype", [ + [1, 4, 4096, 128, np.float32], + ]) + def test_flash_attn_bwd_numerical(self, simulation_only, bs, nheads, seqlen, d, dtype): + + # Print test header + test_params = { + 'Batch Size': bs, + 'Num Heads': nheads, + 'Sequence Length': f"{seqlen:,}", + 'Head Dimension': d, + 'Data Type': str(dtype), + 'Simulation Only': simulation_only + } + + print_test_header("Flash Attention Backward Numerical Test", test_params) + print_memory_usage(bs, nheads, seqlen, d, dtype) + + print("\n⚙️ SETUP PHASE:") + print("-" * 40) + + # Generate test data + print("🎲 Generating random test tensors...") + q = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + k = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + v = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + dy = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 + + # Print tensor information + print_tensor_info("Q", q) + print_tensor_info("K", k) + print_tensor_info("V", v) + print_tensor_info("dY (output gradient)", dy) + + # Cast to target dtype + print(f"\n🔄 Converting tensors to {dtype}...") + q = nl.static_cast(q, dtype) + k = nl.static_cast(k, dtype) + v = nl.static_cast(v, dtype) + dy = nl.static_cast(dy, dtype) + seed = None + + print("\n🔍 REFERENCE COMPUTATION:") + print("-" * 40) + + # Compute reference (golden) output + dq_golden, dk_golden, dv_golden, cached_negative_max, cached_sum_reciprocal, o_proj = \ + cpu_attention_backward(q, k, v, dy, use_causal_mask=True) + + # Reshape reference outputs to match expected format + cached_negative_max = cached_negative_max.reshape(bs, nheads, seqlen // nl.tile_size.pmax, + nl.tile_size.pmax).transpose(0, 1, 3, 2) + cached_sum_reciprocal = cached_sum_reciprocal.reshape(bs, nheads, seqlen // nl.tile_size.pmax, + nl.tile_size.pmax).transpose(0, 1, 3, 2) + lse = -1.0 * (cached_negative_max + np.log(cached_sum_reciprocal)) + + print_tensor_info("Reference dQ", dq_golden) + print_tensor_info("Reference dK", dk_golden) + print_tensor_info("Reference dV", dv_golden) + print_tensor_info("Reference O_proj", o_proj) + print_tensor_info("LSE (computed)", lse) + + print("\n🚀 FLASH ATTENTION BACKWARD COMPUTATION:") + print("-" * 40) + + numeric_func = baremetal(flash_attn_bwd) + + if simulation_only: + 
print("🔬 Running in simulation mode...") + start_time = time.time() + out_dq, out_dk, out_dv = simulate_kernel(numeric_func[bs, nheads], q, k, v, o_proj, dy, lse, seed, + use_causal_mask=True, + mixed_precision=True) + compute_time = time.time() - start_time + print(f"✅ Simulation completed in {compute_time:.2f} seconds") + else: + print("⚡ Running on hardware...") + start_time = time.time() + out_dq, out_dk, out_dv = numeric_func[bs, nheads](q, k, v, o_proj, dy, lse, seed, + use_causal_mask=True, + mixed_precision=True) + compute_time = time.time() - start_time + print(f"✅ Hardware execution completed in {compute_time:.2f} seconds") + + print("\n🔬 NUMERICAL VERIFICATION:") + print("-" * 40) + + print_tensor_info("Flash dQ", out_dq) + print_tensor_info("Flash dK", out_dk) + print_tensor_info("Flash dV", out_dv) + + # Check all gradients + dq_close = print_gradient_comparison("dQ", out_dq, dq_golden, tolerance=1e-2) + dk_close = print_gradient_comparison("dK", out_dk, dk_golden, tolerance=1e-2) + dv_close = print_gradient_comparison("dV", out_dv, dv_golden, tolerance=1e-2) + + # Final assertions + try: + assert dq_close, f"dQ gradient mismatch" + assert dk_close, f"dK gradient mismatch" + assert dv_close, f"dV gradient mismatch" + print(f"\n🎉 TEST PASSED! All gradients match reference within tolerance") + except AssertionError as e: + print(f"\n💥 TEST FAILED! {str(e)}") + raise + + print("\n" + "="*80 + "\n") \ No newline at end of file diff --git a/src/self-attention/tests/test_flash_attn_fwd.py b/src/self-attention/tests/test_flash_attn_fwd.py new file mode 100644 index 0000000..94feeef --- /dev/null +++ b/src/self-attention/tests/test_flash_attn_fwd.py @@ -0,0 +1,466 @@ +""" +Copyright (c) 2023, Amazon.com. All Rights Reserved +""" +import pytest +import sys +import os +import logging +import time +from typing import Optional, Tuple +import numpy as np + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from attention import flash_fwd, FlashConfig +from neuronxcc.nki import benchmark, baremetal, simulate_kernel +import neuronxcc.nki.language as nl + +# Configure logging for verbose output +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +bench_func = benchmark(warmup=5, iters=10)(flash_fwd) + +def print_test_header(test_name: str, params: dict): + """Print a formatted test header with parameters""" + print("\n" + "="*80) + print(f"🧪 RUNNING TEST: {test_name}") + print("="*80) + print("📋 Test Parameters:") + for key, value in params.items(): + print(f" {key:20}: {value}") + print("="*80) + +def print_tensor_info(name: str, tensor: np.ndarray): + """Print detailed tensor information""" + print(f"📊 {name} Info:") + print(f" Shape: {tensor.shape}") + print(f" Dtype: {tensor.dtype}") + print(f" Size (elements): {tensor.size:,}") + print(f" Memory (MB): {tensor.nbytes / 1024 / 1024:.2f}") + print(f" Min/Max: {tensor.min():.6f} / {tensor.max():.6f}") + print(f" Mean/Std: {tensor.mean():.6f} / {tensor.std():.6f}") + +def print_performance_metrics(latency_res, expected_latency: int, test_name: str): + """Print detailed performance metrics""" + print("\n📈 PERFORMANCE METRICS:") + print("-" * 40) + + # Try to get available percentiles, fallback to common ones + percentiles = [50, 90, 95, 99] + available_percentiles = [] + + for p in percentiles: + try: + latency = latency_res.get_latency_percentile(p) + available_percentiles.append(p) + status = "✅ PASS" if latency <= expected_latency 
else "❌ FAIL" + print(f" P{p:2d} Latency: {latency:,} ns ({latency/1e9:.3f}s) {status}") + except (KeyError, AttributeError, Exception) as e: + print(f" P{p:2d} Latency: ❓ NOT AVAILABLE ({type(e).__name__})") + + # Try to get basic stats if percentiles fail + if not available_percentiles: + try: + # Try alternative methods to get latency data + if hasattr(latency_res, 'mean'): + mean_latency = latency_res.mean + print(f" Mean Latency: {mean_latency:,} ns ({mean_latency/1e9:.3f}s)") + if hasattr(latency_res, 'min'): + min_latency = latency_res.min + print(f" Min Latency: {min_latency:,} ns ({min_latency/1e9:.3f}s)") + if hasattr(latency_res, 'max'): + max_latency = latency_res.max + print(f" Max Latency: {max_latency:,} ns ({max_latency/1e9:.3f}s)") + + print(f" Available attributes: {[attr for attr in dir(latency_res) if not attr.startswith('_')]}") + except Exception as e: + print(f" ⚠️ Could not extract latency metrics: {e}") + print(f" Latency result type: {type(latency_res)}") + print(f" Available methods: {[method for method in dir(latency_res) if not method.startswith('_')]}") + + print(f" Expected: {expected_latency:,} ns ({expected_latency/1e9:.3f}s)") + + # Use P50 if available, otherwise try other metrics + try: + p50_latency = latency_res.get_latency_percentile(50) + test_passed = p50_latency <= expected_latency + print(f" Test Status: {'✅ PASSED' if test_passed else '❌ FAILED'}") + return p50_latency + except: + print(f" Test Status: ❓ CANNOT DETERMINE (P50 not available)") + return None + +def print_memory_usage(bs: int, nheads: int, seqlen_q: int, seqlen_k: int, d: int, dtype): + """Calculate and print memory usage estimates""" + element_size = 2 if dtype == nl.bfloat16 else 4 # bytes + + q_size = bs * nheads * d * seqlen_q * element_size + k_size = bs * nheads * d * seqlen_k * element_size + v_size = bs * nheads * seqlen_k * d * element_size + total_size = q_size + k_size + v_size + + print("\n💾 MEMORY USAGE ESTIMATES:") + print("-" * 40) + print(f" Q tensor: {q_size / 1024 / 1024:.2f} MB") + print(f" K tensor: {k_size / 1024 / 1024:.2f} MB") + print(f" V tensor: {v_size / 1024 / 1024:.2f} MB") + print(f" Total Input: {total_size / 1024 / 1024:.2f} MB") + print(f" Est. 
Peak: {total_size * 2 / 1024 / 1024:.2f} MB (2x for intermediate)") + +def softmax(x: np.ndarray, dim: int, zero_max_mode=False, + mixed_precision=False, return_max_reduce=False): + """Softmax implementation with verbose logging""" + logger.debug(f"Computing softmax on tensor shape {x.shape} along dim {dim}") + + max_value = np.amax(x, axis=dim, keepdims=True) + max_value = np.maximum(0, max_value) if zero_max_mode else max_value + exp = np.exp(x - max_value) + + if mixed_precision: + reduce = np.add.reduce(exp.astype(np.float32), axis=dim, keepdims=True).astype(x.dtype) + else: + reduce = np.add.reduce(exp, axis=dim, keepdims=True) + + if return_max_reduce: + return exp / reduce, -max_value, np.reciprocal(reduce) + return exp / reduce + +def cpu_attention_forward(q, k, v, use_causal_mask=True, mixed_precision=True): + """CPU attention forward pass with verbose logging""" + logger.info("🔄 Computing CPU reference attention forward pass...") + start_time = time.time() + + def mixed_precision_matmul(a, b): + input_dtype = a.dtype + a, b = a.astype(np.float32), b.astype(np.float32) + c = np.matmul(a, b) + return c.astype(input_dtype) + + _, _, d, _ = q.shape + logger.debug(f"Attention head dimension: {d}") + + # Compute golden output + softmax_scale = 1.0 / (d ** 0.5) + logger.debug(f"Softmax scale factor: {softmax_scale:.6f}") + + q_scaled = q * softmax_scale + nheads = q.shape[1] + kv_heads = k.shape[1] + + if nheads > kv_heads: + logger.info(f"📡 Expanding KV heads from {kv_heads} to {nheads} (GQA/MQA)") + k = np.repeat(k, nheads//kv_heads, axis=1) + v = np.repeat(v, nheads//kv_heads, axis=1) + + logger.debug("Computing attention scores...") + raw_score = mixed_precision_matmul(q_scaled.transpose(0, 1, 3, 2), k) + + if use_causal_mask: + logger.debug("Applying causal mask...") + for i in range(raw_score.shape[0]): + for j in range(raw_score.shape[1]): + # -inf triggers invalid input error in softmax implementation, use a small negative instead + # k=1 to exclude the diagonal, because each token can still attend to itself + raw_score[i, j][np.triu_indices_from(raw_score[i, j], k=1)] = -9984.0 + + logger.debug("Computing softmax...") + norm_score, cached_negative_max, cached_sum_reciprocal = \ + softmax(raw_score, dim=-1, mixed_precision=mixed_precision, return_max_reduce=True) + + logger.debug("Computing final output...") + # Transpose the result so it has the same layout as ours + out_golden = mixed_precision_matmul(norm_score, v.transpose(0, 1, 3, 2)).transpose(0, 1, 3, 2) + + elapsed_time = time.time() - start_time + logger.info(f"✅ CPU reference completed in {elapsed_time:.2f} seconds") + + return out_golden, cached_negative_max, cached_sum_reciprocal + +class TestAttention: + + @pytest.mark.parametrize("bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask,\ + mixed_precision, training, tile_size, kv_heads, should_transpose_v, latency", [ + [1, 6, 32*1024, 32*1024, 96, nl.bfloat16, True, True, True, 2048, 3, False, 87000000000], + [1, 1, 32*1024, 32*1024, 96, nl.bfloat16, True, True, False, 2048, None, False, 15100000000], + # Non-square + [1, 3, 32*1024, 16*1024, 96, nl.bfloat16, True, True, False, 2048, None, False, 7550000000], + [1, 3, 16*1024, 32*1024, 96, nl.bfloat16, True, True, False, 2048, None, False, 7550000000], + ]) + def test_flash_attn_fwd_perf(self, bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, + mixed_precision, training, tile_size, kv_heads, should_transpose_v, latency): + + # Print test header with all parameters + test_params = { + 'Batch Size': bs, + 
'Num Heads': nheads, + 'Q Sequence Length': f"{seqlen_q:,}", + 'K Sequence Length': f"{seqlen_k:,}", + 'Head Dimension': d, + 'Data Type': str(dtype), + 'Causal Mask': use_causal_mask, + 'Mixed Precision': mixed_precision, + 'Training Mode': training, + 'Tile Size': tile_size, + 'KV Heads': kv_heads or nheads, + 'Transpose V': should_transpose_v, + 'Expected Latency': f"{latency:,} ns" + } + + print_test_header("Flash Attention Forward Performance Test", test_params) + print_memory_usage(bs, nheads, seqlen_q, seqlen_k, d, dtype) + + print("\n⚙️ SETUP PHASE:") + print("-" * 40) + + # Generate test data + print("🎲 Generating random test tensors...") + q = (np.random.random_sample([bs, nheads, d, seqlen_q]) - 0.5) * 2 + k = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 + + if should_transpose_v: + v = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 + print(" V tensor: Using transposed layout") + else: + v = (np.random.random_sample([bs, nheads, seqlen_k, d]) - 0.5) * 2 + print(" V tensor: Using standard layout") + + o_proj = np.zeros(shape=[bs, nheads, seqlen_q, d], dtype=dtype) + out_lse = np.zeros(shape=[bs, nheads, int(nl.tile_size.pmax), seqlen_q // nl.tile_size.pmax], + dtype=nl.float32 if mixed_precision else dtype) if training else None + seed = None + + # Print tensor information + print_tensor_info("Q", q) + print_tensor_info("K", k) + print_tensor_info("V", v) + + # Cast to target dtype + print(f"\n🔄 Converting tensors to {dtype}...") + q = nl.static_cast(q, dtype) + k = nl.static_cast(k, dtype) + v = nl.static_cast(v, dtype) + + # Setup configuration + config = FlashConfig(**{'seq_tile_size':tile_size, 'training':training, 'should_transpose_v':should_transpose_v}) + print(f"📝 Flash Config: {config.__dict__}") + + heads = nheads if kv_heads is None else kv_heads + + print("\n🚀 BENCHMARKING PHASE:") + print("-" * 40) + print("⏱️ Running benchmark with warmup=5, iters=10...") + + bench_func_ = bench_func[bs, heads] + + # Run the benchmark + start_time = time.time() + bench_func_(q, k, v, seed, use_causal_mask=use_causal_mask, + mixed_precision=mixed_precision, config=config) + benchmark_time = time.time() - start_time + + print(f"✅ Benchmark completed in {benchmark_time:.2f} seconds") + + # Get and display results + latency_res = bench_func_.benchmark_result.nc_latency + p50_latency = print_performance_metrics(latency_res, latency, "Flash Attention Forward") + + # Final assertion with better error handling + if p50_latency is not None: + try: + assert p50_latency <= latency + print(f"\n🎉 TEST PASSED! P50 latency ({p50_latency:,} ns) <= expected ({latency:,} ns)") + except AssertionError: + print(f"\n💥 TEST FAILED! 
P50 latency ({p50_latency:,} ns) > expected ({latency:,} ns)") + raise + else: + # Fallback: try to find any available latency metric + print(f"\n⚠️ WARNING: Could not determine P50 latency for comparison") + print(f" Benchmark result type: {type(bench_func_.benchmark_result)}") + print(f" NC latency type: {type(latency_res)}") + + # Try alternative assertion methods + try: + # Look for any latency value we can use + if hasattr(latency_res, 'mean'): + mean_latency = latency_res.mean + assert mean_latency <= latency + print(f"✅ Using mean latency for comparison: {mean_latency:,} ns <= {latency:,} ns") + else: + print("❌ No suitable latency metric found for assertion") + raise AssertionError("Cannot determine latency for comparison") + except Exception as e: + print(f"💥 Assertion failed: {e}") + raise + + print("\n" + "="*80 + "\n") + + @pytest.mark.simulation + @pytest.mark.parametrize("bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask,\ + training, tile_size, kv_heads, should_transpose_v", [ + [1, 6, 4096, 4096, 128, np.float32, True, True, 2048, 3, False], + [1, 1, 4096, 4096, 128, np.float32, True, False, 2048, None, False], + [1, 1, 8192, 4096, 128, np.float32, True, False, 2048, None, False], + [1, 1, 4096, 8192, 128, np.float32, True, False, 2048, None, False], + ]) + def test_flash_attn_fwd_numerical(self, simulation_only, bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, + training, tile_size, kv_heads, should_transpose_v): + + # Print test header + test_params = { + 'Batch Size': bs, + 'Num Heads': nheads, + 'Q Sequence Length': f"{seqlen_q:,}", + 'K Sequence Length': f"{seqlen_k:,}", + 'Head Dimension': d, + 'Data Type': str(dtype), + 'Causal Mask': use_causal_mask, + 'Training Mode': training, + 'Tile Size': tile_size, + 'KV Heads': kv_heads or nheads, + 'Transpose V': should_transpose_v, + 'Simulation Only': simulation_only + } + + print_test_header("Flash Attention Forward Numerical Test", test_params) + print_memory_usage(bs, nheads, seqlen_q, seqlen_k, d, dtype) + + print("\n⚙️ SETUP PHASE:") + print("-" * 40) + + # Generate test data + print("🎲 Generating random test tensors...") + q = (np.random.random_sample([bs, nheads, d, seqlen_q]) - 0.5) * 2 + k = (np.random.random_sample([bs, kv_heads or nheads, d, seqlen_k]) - 0.5) * 2 + + if should_transpose_v: + v = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 + cpu_permute = (0, 1, 2, 3) + print(" V tensor: Using transposed layout") + else: + v = (np.random.random_sample([bs, kv_heads or nheads, seqlen_k, d]) - 0.5) * 2 + cpu_permute = (0, 1, 3, 2) + print(" V tensor: Using standard layout") + + # Print tensor information + print_tensor_info("Q", q) + print_tensor_info("K", k) + print_tensor_info("V", v) + + # Cast to target dtype + print(f"\n🔄 Converting tensors to {dtype}...") + q = nl.static_cast(q, dtype) + k = nl.static_cast(k, dtype) + v = nl.static_cast(v, dtype) + seed = None + + print("\n🔍 REFERENCE COMPUTATION:") + print("-" * 40) + + # Compute reference (golden) output + o_proj_golden, cached_negative_max, cached_sum_reciprocal = \ + cpu_attention_forward(q, k, v.transpose(cpu_permute), use_causal_mask=use_causal_mask, mixed_precision=True) + + # Reshape reference outputs to match expected format + o_proj_golden = o_proj_golden.transpose(0,1,3,2) # (b,h, d, seq) + cached_negative_max = cached_negative_max.reshape(bs, nheads, seqlen_q // nl.tile_size.pmax, + nl.tile_size.pmax).transpose(0, 1, 3, 2) + cached_sum_reciprocal = cached_sum_reciprocal.reshape(bs, nheads, seqlen_q // 
nl.tile_size.pmax, + nl.tile_size.pmax).transpose(0, 1, 3, 2) + lse_golden = -1.0 * (cached_negative_max + np.log(cached_sum_reciprocal)) if training else None + + print_tensor_info("Reference Output", o_proj_golden) + if training: + print_tensor_info("Reference LSE", lse_golden) + + # Setup configuration + config = FlashConfig(**{'seq_tile_size':tile_size, 'training':training, 'should_transpose_v':should_transpose_v}) + print(f"📝 Flash Config: {config.__dict__}") + + heads = nheads if kv_heads is None else kv_heads + + print("\n🚀 FLASH ATTENTION COMPUTATION:") + print("-" * 40) + + numeric_func = baremetal(flash_fwd) + + if simulation_only: + print("🔬 Running in simulation mode...") + start_time = time.time() + results = simulate_kernel(numeric_func[bs, heads], q, k, v, seed, + use_causal_mask=use_causal_mask, + mixed_precision=True, + config=config) + compute_time = time.time() - start_time + print(f"✅ Simulation completed in {compute_time:.2f} seconds") + else: + print("⚡ Running on hardware...") + start_time = time.time() + results = numeric_func[bs, heads](q, k, v, seed, + use_causal_mask=use_causal_mask, + mixed_precision=True, + config=config) + compute_time = time.time() - start_time + print(f"✅ Hardware execution completed in {compute_time:.2f} seconds") + + print("\n🔬 NUMERICAL VERIFICATION:") + print("-" * 40) + + if training: + o_proj, out_lse = results + print_tensor_info("Flash Output", o_proj) + print_tensor_info("Flash LSE", out_lse) + + # Check output tensor + output_close = np.allclose(o_proj, o_proj_golden, atol=1e-2) + output_max_diff = np.max(np.abs(o_proj - o_proj_golden)) + output_mean_diff = np.mean(np.abs(o_proj - o_proj_golden)) + + print(f"📊 Output Comparison:") + print(f" Max absolute difference: {output_max_diff:.6f}") + print(f" Mean absolute difference: {output_mean_diff:.6f}") + print(f" Tolerance: 1e-2") + print(f" Result: {'✅ PASS' if output_close else '❌ FAIL'}") + + # Check LSE tensor + lse_close = np.allclose(out_lse, lse_golden, atol=1e-2) + lse_max_diff = np.max(np.abs(out_lse - lse_golden)) + lse_mean_diff = np.mean(np.abs(out_lse - lse_golden)) + + print(f"📊 LSE Comparison:") + print(f" Max absolute difference: {lse_max_diff:.6f}") + print(f" Mean absolute difference: {lse_mean_diff:.6f}") + print(f" Tolerance: 1e-2") + print(f" Result: {'✅ PASS' if lse_close else '❌ FAIL'}") + + # Final assertions + try: + assert output_close, f"Output mismatch: max_diff={output_max_diff:.6f} > 1e-2" + assert lse_close, f"LSE mismatch: max_diff={lse_max_diff:.6f} > 1e-2" + print(f"\n🎉 TEST PASSED! Both output and LSE match reference within tolerance") + except AssertionError as e: + print(f"\n💥 TEST FAILED! {str(e)}") + raise + else: + o_proj = results + print_tensor_info("Flash Output", o_proj) + + # Check output tensor + output_close = np.allclose(o_proj, o_proj_golden, atol=1e-2) + output_max_diff = np.max(np.abs(o_proj - o_proj_golden)) + output_mean_diff = np.mean(np.abs(o_proj - o_proj_golden)) + + print(f"📊 Output Comparison:") + print(f" Max absolute difference: {output_max_diff:.6f}") + print(f" Mean absolute difference: {output_mean_diff:.6f}") + print(f" Tolerance: 1e-2") + print(f" Result: {'✅ PASS' if output_close else '❌ FAIL'}") + + # Final assertion + try: + assert output_close, f"Output mismatch: max_diff={output_max_diff:.6f} > 1e-2" + print(f"\n🎉 TEST PASSED! Output matches reference within tolerance") + except AssertionError as e: + print(f"\n💥 TEST FAILED! 
{str(e)}") + raise + + print("\n" + "="*80 + "\n") \ No newline at end of file diff --git a/test/unit/test_neuron_profile.py b/test/unit/test_neuron_profile.py deleted file mode 100644 index e607705..0000000 --- a/test/unit/test_neuron_profile.py +++ /dev/null @@ -1,86 +0,0 @@ -from neuronxcc.nki import benchmark -from neuronxcc.nki import profile -import neuronxcc.nki.language as nl -import numpy as np -import pytest -import os -import shutil -import tempfile - - -WORKING_DIRECTORY = tempfile.mkdtemp() -SAVE_NEFF_NAME = "cus_file123.neff" -SAVE_TRACE_NAME = "profile-custom.ntff" -NUM_EXECS = 20 -PROFILE_NTH = 10 -JSON_REPORTS = "json_reports" - -@profile(working_directory=WORKING_DIRECTORY, save_neff_name=SAVE_NEFF_NAME, overwrite=False , save_trace_name=SAVE_TRACE_NAME, num_execs=NUM_EXECS, profile_nth=PROFILE_NTH) -def nki_tensor_tensor_add(a_tensor, b_tensor): - c_output = nl.ndarray(a_tensor.shape, dtype=a_tensor.dtype, buffer=nl.shared_hbm) - - a = nl.load(a_tensor) - b = nl.load(b_tensor) - - c_tile = a + b - - nl.store(c_output, value=c_tile) - - return c_output - -class TestNeuronProfile: - def _get_ntff_path(self, trace_val): - """ - Prepares ntff file name based on execution trace number - """ - if trace_val == 1: - return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}.ntff") - else: - return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}_exec_{trace_val}.ntff") - - @pytest.fixture - def traces(self): - ret = [] - if NUM_EXECS < PROFILE_NTH: - ret.append(self._get_ntff_path(PROFILE_NTH)) - else: - curr = PROFILE_NTH - while curr <= NUM_EXECS: - ret.append(self._get_ntff_path(curr)) - curr += PROFILE_NTH - return ret - - @pytest.fixture - def num_reports(self): - if NUM_EXECS < PROFILE_NTH: - return 1 - else: - return NUM_EXECS // PROFILE_NTH - - def test_output_artifacts_created(self, traces, num_reports): - # delete artifact directory, only testing non-overwrite functionality - if os.path.exists(WORKING_DIRECTORY): - shutil.rmtree(WORKING_DIRECTORY) - - # creates dummy input to invoke profile kernel - a = np.zeros([128, 1024]).astype(np.float16) - b = np.random.random_sample([128, 1024]).astype(np.float16) - - output_nki = nki_tensor_tensor_add(a, b) - - # now asserting artifacts are correctly created - assert os.path.exists(os.path.join(WORKING_DIRECTORY, SAVE_NEFF_NAME)) # neff - - for trace in traces: - assert os.path.exists(trace) # trace - - # json reports - report_dir = os.path.join(WORKING_DIRECTORY, JSON_REPORTS) - - assert os.path.exists(report_dir) # actually exists - assert len(os.listdir(report_dir)) == num_reports # report all iterations queried - - # post condition cleanup - if os.path.exists(WORKING_DIRECTORY): - shutil.rmtree(WORKING_DIRECTORY) - From 0b7591099884176b8f2fa0bd024b36f79d615c3c Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 11 Jul 2025 21:59:20 +0000 Subject: [PATCH 54/65] docs: add hyperlink for deployment.yaml --- README.md | 34 ++++++++++++++++++++++++++++++++++ docs/complete-pipeline.md | 18 +++++++++++++----- docs/fine-tuning.md | 18 +++++++++++++----- docs/inference.md | 18 +++++++++++++----- docs/self-attention.md | 20 ++++++++++++++------ 5 files changed, 87 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 76f1bfc..7801f1e 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,40 @@ We've created four specialized guides based on your optimization focus: **Note**: Create your SSH key pair first in EC2 → Key Pairs for easy download! 
+#### Deployment Steps + +1. **Download the CloudFormation template**: + - Click here to download: [deployment.yaml](../deployment/deployment.yaml) + +2. **Click** on one of the CloudFormation Console links above for your preferred region. + +3. **Upload the template**: + - Choose **Upload a template file** + - Click **Choose file** and select the downloaded `deployment.yaml` + - Click **Next** + +4. **Configure the stack:** + - **Stack name**: Keep default or customize (e.g., `nki-llama-hackathon`) + - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) + - **ExistingKeyPairName**: Select your key from dropdown (see note below) + - **Ec2InstanceType**: Default: `trn1.32xlarge` - can be changed to use `trn1.2xlarge` + - Click **Next** + + **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. + +5. **Configure stack options**: Leave all values as default and click **Next** + +6. **Review and create:** + - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" + - Click **Create stack** + - Stack creation takes ~5-10 minutes + +7. **Access your instance:** + - Go to CloudFormation → Select your stack → **Outputs** tab + - Copy the **SSHCommand** value + - If you created a new key, download it from EC2 → Key Pairs + - Connect: `ssh -i ubuntu@` + ### 2. Connect to Your Instance ```bash diff --git a/docs/complete-pipeline.md b/docs/complete-pipeline.md index 1ddfd10..fcd9c6c 100644 --- a/docs/complete-pipeline.md +++ b/docs/complete-pipeline.md @@ -37,9 +37,17 @@ Deploy the complete NKI-LLAMA environment using AWS CloudFormation with one clic ### Deployment Steps -1. **Click** on one of the "Launch stack" links above for your preferred region. +1. **Download the CloudFormation template**: + - Click here to download: [deployment.yaml](../deployment/deployment.yaml) -2. **Configure the stack:** +2. **Click** on one of the CloudFormation Console links above for your preferred region. + +3. **Upload the template**: + - Choose **Upload a template file** + - Click **Choose file** and select the downloaded `deployment.yaml` + - Click **Next** + +4. **Configure the stack:** - **Stack name**: Keep default or customize (e.g., `nki-llama-complete`) - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) - **ExistingKeyPairName**: Select your key from dropdown (see note below) @@ -49,14 +57,14 @@ Deploy the complete NKI-LLAMA environment using AWS CloudFormation with one clic **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. -3. **Configure stack options**: Leave all values as default and click **Next** +5. **Configure stack options**: Leave all values as default and click **Next** -4. **Review and create:** +6. **Review and create:** - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" - Click **Create stack** - Stack creation takes ~5-10 minutes -5. **Access your instance:** +7. 
**Access your instance:** - Go to CloudFormation → Select your stack → **Outputs** tab - Note the **EC2InstanceId** and **EC2PublicIP** - Connect using your pre-downloaded key or SSM diff --git a/docs/fine-tuning.md b/docs/fine-tuning.md index 44cd5d5..ecd5932 100644 --- a/docs/fine-tuning.md +++ b/docs/fine-tuning.md @@ -33,9 +33,17 @@ Deploy the NKI-LLAMA training environment using AWS CloudFormation with one clic ### Deployment Steps -1. **Click** on one of the "Launch stack" links above for your preferred region. +1. **Download the CloudFormation template**: + - Click here to download: [deployment.yaml](../deployment/deployment.yaml) -2. **Configure the stack:** +2. **Click** on one of the CloudFormation Console links above for your preferred region. + +3. **Upload the template**: + - Choose **Upload a template file** + - Click **Choose file** and select the downloaded `deployment.yaml` + - Click **Next** + +4. **Configure the stack:** - **Stack name**: Keep default or customize (e.g., `nki-llama-training`) - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) - **ExistingKeyPairName**: Select your key from dropdown (see note below) @@ -44,14 +52,14 @@ Deploy the NKI-LLAMA training environment using AWS CloudFormation with one clic **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. -3. **Configure stack options**: Leave all values as default and click **Next** +5. **Configure stack options**: Leave all values as default and click **Next** -4. **Review and create:** +6. **Review and create:** - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" - Click **Create stack** - Stack creation takes ~5-10 minutes -5. **Access your instance:** +7. **Access your instance:** - Go to CloudFormation → Select your stack → **Outputs** tab - Copy the **SSHCommand** value - If you created a new key, download it from EC2 → Key Pairs diff --git a/docs/inference.md b/docs/inference.md index 768671e..5356734 100644 --- a/docs/inference.md +++ b/docs/inference.md @@ -33,9 +33,17 @@ Deploy the NKI-LLAMA inference environment using AWS CloudFormation with one cli ### Deployment Steps -1. **Click** on one of the "Launch stack" links above for your preferred region. +1. **Download the CloudFormation template**: + - Click here to download: [deployment.yaml](../deployment/deployment.yaml) -2. **Configure the stack:** +2. **Click** on one of the CloudFormation Console links above for your preferred region. + +3. **Upload the template**: + - Choose **Upload a template file** + - Click **Choose file** and select the downloaded `deployment.yaml` + - Click **Next** + +4. **Configure the stack:** - **Stack name**: Keep default or customize (e.g., `nki-llama-inference`) - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) - **ExistingKeyPairName**: Select your key from dropdown (see note below) @@ -45,14 +53,14 @@ Deploy the NKI-LLAMA inference environment using AWS CloudFormation with one cli **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. Alternatively, choose `none` to use SSM Session Manager without keys. -3. **Configure stack options**: Leave all values as default and click **Next** +5. **Configure stack options**: Leave all values as default and click **Next** -4. **Review and create:** +6. 
**Review and create:** - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" - Click **Create stack** - Stack creation takes ~5-10 minutes -5. **Access your instance:** +7. **Access your instance:** - Go to CloudFormation → Select your stack → **Outputs** tab - Use **SSHCommand** for SSH access or **EC2InstanceId** for SSM - For SSM: `aws ssm start-session --target ` diff --git a/docs/self-attention.md b/docs/self-attention.md index 1653220..86b3da8 100644 --- a/docs/self-attention.md +++ b/docs/self-attention.md @@ -31,10 +31,18 @@ Deploy the NKI-LLAMA training environment using AWS CloudFormation with one clic ### Deployment Steps -1. **Click** on one of the "Launch stack" links above for your preferred region. +1. **Download the CloudFormation template**: + - Click here to download: [deployment.yaml](../deployment/deployment.yaml) -2. **Configure the stack:** - - **Stack name**: Keep default or customize (e.g., `nki-llama-training`) +2. **Click** on one of the CloudFormation Console links above for your preferred region. + +3. **Upload the template**: + - Choose **Upload a template file** + - Click **Choose file** and select the downloaded `deployment.yaml` + - Click **Next** + +4. **Configure the stack:** + - **Stack name**: Keep default or customize (e.g., `nki-llama-attention`) - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first) - **ExistingKeyPairName**: Select your key from dropdown (see note below) - **Ec2InstanceType**: Default: `trn1.32xlarge` - can be changed to use `trn1.2xlarge` @@ -42,14 +50,14 @@ Deploy the NKI-LLAMA training environment using AWS CloudFormation with one clic **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. -3. **Configure stack options**: Leave all values as default and click **Next** +5. **Configure stack options**: Leave all values as default and click **Next** -4. **Review and create:** +6. **Review and create:** - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources" - Click **Create stack** - Stack creation takes ~5-10 minutes -5. **Access your instance:** +7. **Access your instance:** - Go to CloudFormation → Select your stack → **Outputs** tab - Copy the **SSHCommand** value - If you created a new key, download it from EC2 → Key Pairs From 14f9ec304811db970451cb1cb37cc203b6e9da34 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 11 Jul 2025 22:00:37 +0000 Subject: [PATCH 55/65] docs: add hyperlink for deployment.yaml --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7801f1e..0ba75cc 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ We've created four specialized guides based on your optimization focus: #### Deployment Steps 1. **Download the CloudFormation template**: - - Click here to download: [deployment.yaml](../deployment/deployment.yaml) + - Click here to download: [deployment.yaml](./deployment/deployment.yaml) 2. **Click** on one of the CloudFormation Console links above for your preferred region. 
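
For teams that script their setup, the console walkthrough above can also be driven
from the AWS CLI. A minimal sketch follows; the stack and key names are examples, and
it assumes the template exposes the same `KeyPairOption`, `ExistingKeyPairName`, and
`Ec2InstanceType` parameters referenced in the steps:

```bash
aws cloudformation create-stack \
    --stack-name nki-llama-hackathon \
    --template-body file://deployment.yaml \
    --parameters \
        ParameterKey=KeyPairOption,ParameterValue=use-existing \
        ParameterKey=ExistingKeyPairName,ParameterValue=nki-llama-key \
        ParameterKey=Ec2InstanceType,ParameterValue=trn1.32xlarge \
    --capabilities CAPABILITY_IAM

# Wait for CREATE_COMPLETE (typically 5-10 minutes), then read the stack outputs
aws cloudformation wait stack-create-complete --stack-name nki-llama-hackathon
aws cloudformation describe-stacks \
    --stack-name nki-llama-hackathon \
    --query 'Stacks[0].Outputs'
```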

From 7a2c19f108083ba93dc068bb5c67c2de6bc4c7bf Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 11 Jul 2025 22:22:00 +0000
Subject: [PATCH 56/65] feat: Llama 1B/8B Implementation

---
 install.sh                                 |  6 +-
 src/fine-tune/scripts/download_model.sh    | 49 +++++++++--
 src/fine-tune/scripts/instance_type.sh     | 54 ++++++++++++
 src/inference/scripts/download-model.sh    | 95 ++++++++++++++++++++++
 src/inference/scripts/instance_type.sh     | 54 ++++++++++++
 src/inference/scripts/run-nki-benchmark.sh |  7 ++
 6 files changed, 254 insertions(+), 11 deletions(-)
 create mode 100755 src/fine-tune/scripts/instance_type.sh
 create mode 100755 src/inference/scripts/instance_type.sh

diff --git a/install.sh b/install.sh
index 0127d6f..e789286 100755
--- a/install.sh
+++ b/install.sh
@@ -61,8 +61,10 @@ if [[ ! -f "${INSTALL_DIR}/.env" ]]; then
     cat > "${INSTALL_DIR}/.env" << 'EOF'
 # NKI-LLAMA Configuration
 HF_TOKEN=
-MODEL_ID=meta-llama/Meta-Llama-3-8B
-MODEL_NAME=llama-3-8b
+MODEL_ID_8B=meta-llama/Meta-Llama-3-8B
+MODEL_NAME_8B=llama-3-8b
+MODEL_ID_1B=meta-llama/Llama-3.2-1B
+MODEL_NAME_1B=llama-3-2-1b
 TENSOR_PARALLEL_SIZE=8
 INFERENCE_PORT=8080
 MAX_MODEL_LEN=2048
diff --git a/src/fine-tune/scripts/download_model.sh b/src/fine-tune/scripts/download_model.sh
index b31dbaf..8d12177 100755
--- a/src/fine-tune/scripts/download_model.sh
+++ b/src/fine-tune/scripts/download_model.sh
@@ -3,6 +3,10 @@ set -e
 
 echo "==== Starting Llama model download and conversion script ===="
 
+# Variable names
+MODEL_NAME=none
+HF_WEIGHT_NAME=none
+
 # Check if HF_TOKEN is set
 if [ -z "$HF_TOKEN" ]; then
     echo "Error: HF_TOKEN environment variable is not set!"
@@ -10,12 +14,35 @@ if [ -z "$HF_TOKEN" ]; then
     exit 1
 fi
 
-# Check if MODEL_ID is set
-if [ -z "$MODEL_ID" ]; then
-    echo "Error: MODEL_ID environment variable is not set!"
-    echo "Please make sure MODEL_ID is defined in your .env file"
-    exit 1
-fi
+# Source the instance_type script so that EC2_INSTANCE_TYPE is exported into this
+# shell (running it as a child process would not propagate the variable)
+source ./nki-llama/src/fine-tune/scripts/instance_type.sh
+
+# Check the instance type to set the variables for the model download
+if [ "$EC2_INSTANCE_TYPE" == "trn1.2xlarge" ]; then
+    # Check if MODEL_ID_1B is set
+    if [ -z "$MODEL_ID_1B" ]; then
+        echo "Error: MODEL_ID_1B environment variable is not set!"
+        echo "Please make sure MODEL_ID_1B is defined in your .env file"
+        exit 1
+    fi
+
+    MODEL_NAME=llama-3-2_1b
+    HF_WEIGHT_NAME=llama3_2-1b_hf_weights_bin
+
+    echo "🚀 Model: Downloading Llama-3.2 1B..."
+elif [ "$EC2_INSTANCE_TYPE" == "trn1.32xlarge" ]; then
+    # Check if MODEL_ID_8B is set
+    if [ -z "$MODEL_ID_8B" ]; then
+        echo "Error: MODEL_ID_8B environment variable is not set!"
+        echo "Please make sure MODEL_ID_8B is defined in your .env file"
+        exit 1
+    fi
+
+    MODEL_NAME=llama-3-1_8b
+    HF_WEIGHT_NAME=llama3_1-8b_hf_weights_bin
+
+    echo "🚀 Model: Downloading Llama-3 8B..."
+else
+    echo "Error: Unsupported instance type: $EC2_INSTANCE_TYPE"
+    echo "This script requires either trn1.2xlarge or trn1.32xlarge"
+    exit 1
+fi
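+# Instance type → model selection used above:
+#   trn1.2xlarge  → Llama-3.2 1B (MODEL_ID_1B, weights dir llama3_2-1b_hf_weights_bin)
+#   trn1.32xlarge → Llama-3 8B   (MODEL_ID_8B, weights dir llama3_1-8b_hf_weights_bin)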
+ echo "==== Changing to fine-tune workspace ====" # Go to your fine-tune workspace @@ -25,8 +52,8 @@ echo "Current directory: $(pwd)" echo "==== Setting path variables ====" # Paths export TOKENIZER_DIR=~/nki-llama/src/fine-tune/model_assets/llama_tokenizer -export MODEL_DIR=~/nki-llama/src/fine-tune/model_assets/llama_3-1_8b -export BIN_MODEL_DIR=~/nki-llama/src/fine-tune/model_assets/llama3-8B_hf_weights_bin +export MODEL_DIR=~/nki-llama/src/fine-tune/model_assets/MODEL_NAME +export BIN_MODEL_DIR=~/nki-llama/src/fine-tune/model_assets/HF_WEIGHT_NAME export CONSOLIDATED_BIN_MODEL_DIR=~/nki-llama/src/fine-tune/model_assets/pckpt/ echo "Tokenizer directory: $TOKENIZER_DIR" @@ -61,7 +88,11 @@ import os from transformers import AutoTokenizer, AutoModelForCausalLM from pathlib import Path -model_id = os.environ.get("MODEL_ID", "meta-llama/Meta-Llama-3-8B") +instance_type = os.environ.get("EC2_INSTANCE_TYPE") +if instance == "trn1.2xlarge": + model_id = os.environ.get("MODEL_ID", "meta-llama/Llama-3.2-1B") +elif instance == "trn1.32xlarge": + model_id = os.environ.get("MODEL_ID", "meta-llama/Meta-Llama-3-8B") tokenizer_dir = os.path.expanduser(os.environ["TOKENIZER_DIR"]) model_dir = os.path.expanduser(os.environ["MODEL_DIR"]) hf_token = os.environ.get("HF_TOKEN") diff --git a/src/fine-tune/scripts/instance_type.sh b/src/fine-tune/scripts/instance_type.sh new file mode 100755 index 0000000..264073a --- /dev/null +++ b/src/fine-tune/scripts/instance_type.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Function to get metadata with retry +get_metadata() { + local metadata_url="http://169.254.169.254/latest/meta-data" + local max_attempts=5 + local attempt=1 + + while [ $attempt -le $max_attempts ]; do + # Try IMDSv2 first + TOKEN=$(curl -s -f -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) + if [ $? -eq 0 ]; then + RESULT=$(curl -s -f -H "X-aws-ec2-metadata-token: $TOKEN" $metadata_url/$1 2>/dev/null) + else + # Fallback to IMDSv1 + RESULT=$(curl -s -f $metadata_url/$1 2>/dev/null) + fi + + if [ ! -z "$RESULT" ]; then + echo "$RESULT" + return 0 + fi + + echo "Attempt $attempt failed. Retrying..." >&2 + sleep 2 + ((attempt++)) + done + + echo "Failed to retrieve metadata after $max_attempts attempts" >&2 + return 1 +} + +# Get the instance type +INSTANCE_TYPE=$(get_metadata instance-type) + +if [ $? -ne 0 ]; then + echo "Failed to determine instance type" + exit 1 +fi + +# Set the environment variable +export EC2_INSTANCE_TYPE=$INSTANCE_TYPE + +# Print the instance type out +if [ "$EC2_INSTANCE_TYPE" == "trn1.2xlarge" ] || [ "$EC2_INSTANCE_TYPE" == "trn1.32xlarge" ]; then + echo "======================================================" + echo "✅ EC2_INSTANCE_TYPE: $EC2_INSTANCE_TYPE" + echo "======================================================" +else + echo "=========================================================================================" + echo "❌ This is not a trn1.2xlarge or trn1.32xlarge instance. 
It is a $INSTANCE_TYPE" + echo "⚠️ Please use a valid instance type ⚠️" + echo "=========================================================================================" +fi diff --git a/src/inference/scripts/download-model.sh b/src/inference/scripts/download-model.sh index c6ad790..8b448ac 100755 --- a/src/inference/scripts/download-model.sh +++ b/src/inference/scripts/download-model.sh @@ -3,6 +3,10 @@ set -euo pipefail +ENV_FILE=".env" +KEY_1="NEURON_RT_NUM_CORES" +KEY_2="TENSOR_PARALLEL_SIZE" + # Load configuration SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/../../../nki-llama.config" @@ -42,6 +46,79 @@ if not ver or pkg_resources.parse_version(ver) >= pkg_resources.parse_version(re subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"transformers<{req}"]) PY +# Determine the instance type before deciding the model +source "${SCRIPT_DIR}/../../../src/inference/scripts/instance_type.sh" + +if [ "$EC2_INSTANCE_TYPE" == "trn1.2xlarge" ]; then + # Check if MODEL_ID_1B is set + if [ -z "$MODEL_ID_1B" ]; then + echo "Error: MODEL_ID_1B environment variable is not set!" + echo "Please make sure MODEL_ID_1B is defined in your .env file" + exit 1 + fi + + export MODEL_ID="$MODEL_ID_1B" + export MODEL_NAME="$MODEL_NAME_1B" + + # Set NeuronCore number + VALUE="2" + # Check if key exists and update it, otherwise add it - NEURON_RT_NUM_CORES + if grep -q "^${KEY_1}=" "$ENV_FILE" 2>/dev/null; then + # Key exists, update it + sed -i "s/^${KEY_1}=.*/${KEY_1}=${VALUE}/" "$ENV_FILE" + else + # Key doesn't exist, add it + echo "${KEY_1}=${VALUE}" >> "$ENV_FILE" + fi + + # Check if key exists and update it, otherwise add it - TENSOR_PARALLEL_SIZE + if grep -q "^${KEY_2}=" "$ENV_FILE" 2>/dev/null; then + # Key exists, update it + sed -i "s/^${KEY_2}=.*/${KEY_2}=${VALUE}/" "$ENV_FILE" + else + # Key doesn't exist, add it + echo "${KEY_2}=${VALUE}" >> "$ENV_FILE" + fi + + echo "🚀 Model: Downloading Llama-3.2 1B..." +elif [ "$EC2_INSTANCE_TYPE" == "trn1.32xlarge" ]; then + # Check if MODEL_ID_8B is set + if [ -z "$MODEL_ID_8B" ]; then + echo "Error: MODEL_ID_8B environment variable is not set!" + echo "Please make sure MODEL_ID_8B is defined in your .env file" + exit 1 + fi + + export MODEL_ID="$MODEL_ID_8B" + export MODEL_NAME="$MODEL_NAME_8B" + + # Set NeuronCore number + VALUE="8" + # Check if key exists and update it, otherwise add it - NEURON_RT_NUM_CORES + if grep -q "^${KEY_1}=" "$ENV_FILE" 2>/dev/null; then + # Key exists, update it + sed -i "s/^${KEY_1}=.*/${KEY}=${VALUE}/" "$ENV_FILE" + else + # Key doesn't exist, add it + echo "${KEY_1}=${VALUE}" >> "$ENV_FILE" + fi + + # Check if key exists and update it, otherwise add it - TENSOR_PARALLEL_SIZE + if grep -q "^${KEY_2}=" "$ENV_FILE" 2>/dev/null; then + # Key exists, update it + sed -i "s/^${KEY_2}=.*/${KEY_2}=${VALUE}/" "$ENV_FILE" + else + # Key doesn't exist, add it + echo "${KEY_2}=${VALUE}" >> "$ENV_FILE" + fi + + echo "🚀 Model: Downloading Llama-3 8B..." +else + echo "Error: Unsupported instance type: $EC2_INSTANCE_TYPE" + echo "This script requires either trn1.2xlarge or trn1.32xlarge" + exit 1 +fi + # Create models directory mkdir -p "$NKI_MODELS" @@ -52,6 +129,24 @@ huggingface-cli download \ "$MODEL_ID" \ --local-dir "${NKI_MODELS}/${MODEL_NAME}" +# Export variables to environment for other scripts to use +echo "Exporting model variables to environment..." 
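+# model_env.sh (written below) persists MODEL_ID / MODEL_NAME so that later
+# steps such as run-nki-benchmark.sh can source them instead of re-detecting
+# the instance type from the metadata service.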
+echo "MODEL_NAME=$MODEL_NAME" +echo "MODEL_ID=$MODEL_ID" + +# Create a file to store these variables for other scripts +cat > "${SCRIPT_DIR}/model_env.sh" << EOF +#!/bin/bash +# Auto-generated by download-model.sh +# Contains model environment variables for other scripts + +export MODEL_NAME="${MODEL_NAME}" +export MODEL_ID="${MODEL_ID}" +EOF + +chmod +x "${SCRIPT_DIR}/model_env.sh" +echo "✅ Created model environment file at: ${SCRIPT_DIR}/model_env.sh" + echo -e "${GREEN}✓ Model downloaded successfully${NC}" echo "Location: ${NKI_MODELS}/${MODEL_NAME}" diff --git a/src/inference/scripts/instance_type.sh b/src/inference/scripts/instance_type.sh new file mode 100755 index 0000000..264073a --- /dev/null +++ b/src/inference/scripts/instance_type.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Function to get metadata with retry +get_metadata() { + local metadata_url="http://169.254.169.254/latest/meta-data" + local max_attempts=5 + local attempt=1 + + while [ $attempt -le $max_attempts ]; do + # Try IMDSv2 first + TOKEN=$(curl -s -f -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) + if [ $? -eq 0 ]; then + RESULT=$(curl -s -f -H "X-aws-ec2-metadata-token: $TOKEN" $metadata_url/$1 2>/dev/null) + else + # Fallback to IMDSv1 + RESULT=$(curl -s -f $metadata_url/$1 2>/dev/null) + fi + + if [ ! -z "$RESULT" ]; then + echo "$RESULT" + return 0 + fi + + echo "Attempt $attempt failed. Retrying..." >&2 + sleep 2 + ((attempt++)) + done + + echo "Failed to retrieve metadata after $max_attempts attempts" >&2 + return 1 +} + +# Get the instance type +INSTANCE_TYPE=$(get_metadata instance-type) + +if [ $? -ne 0 ]; then + echo "Failed to determine instance type" + exit 1 +fi + +# Set the environment variable +export EC2_INSTANCE_TYPE=$INSTANCE_TYPE + +# Print the instance type out +if [ "$EC2_INSTANCE_TYPE" == "trn1.2xlarge" ] || [ "$EC2_INSTANCE_TYPE" == "trn1.32xlarge" ]; then + echo "======================================================" + echo "✅ EC2_INSTANCE_TYPE: $EC2_INSTANCE_TYPE" + echo "======================================================" +else + echo "=========================================================================================" + echo "❌ This is not a trn1.2xlarge or trn1.32xlarge instance. 
It is a $INSTANCE_TYPE" + echo "⚠️ Please use a valid instance type ⚠️" + echo "=========================================================================================" +fi diff --git a/src/inference/scripts/run-nki-benchmark.sh b/src/inference/scripts/run-nki-benchmark.sh index f6c2f9c..eef0e1c 100755 --- a/src/inference/scripts/run-nki-benchmark.sh +++ b/src/inference/scripts/run-nki-benchmark.sh @@ -31,6 +31,13 @@ RETRY_FAILED="${RETRY_FAILED:-false}" # Cache paths NEURON_CACHE_DIR="/var/tmp/neuron-compile-cache" +# Load model environment variables if available +MODEL_ENV_FILE="${SCRIPT_DIR}/model_env.sh" +if [ -f "$MODEL_ENV_FILE" ]; then + echo -e "${BLUE}Loading model environment from: $MODEL_ENV_FILE${NC}" + source "$MODEL_ENV_FILE" +fi + # Parse command line arguments while [[ $# -gt 0 ]]; do case $1 in From b0444fae257b29c6fb6e87bc8d510aa053554905 Mon Sep 17 00:00:00 2001 From: arm-diaz Date: Fri, 11 Jul 2025 23:16:35 +0000 Subject: [PATCH 57/65] docs: delete unecessary content --- docs/complete-pipeline.md | 218 +------------------------------------- 1 file changed, 2 insertions(+), 216 deletions(-) diff --git a/docs/complete-pipeline.md b/docs/complete-pipeline.md index fcd9c6c..6190e8d 100644 --- a/docs/complete-pipeline.md +++ b/docs/complete-pipeline.md @@ -273,212 +273,8 @@ def nki_attention_kernel(q, k, v, mask=None, training=False): 3. **Speculative Decoding**: Parallel token generation 4. **Quantization**: INT8/INT4 inference -## 📊 Performance Monitoring Dashboard - -### Unified Monitoring Script -Create a monitoring script to track both phases: - -```bash -#!/bin/bash -# monitor.sh - -echo "=== NKI-LLAMA Performance Monitor ===" - -# Training metrics -if pgrep -f "finetune" > /dev/null; then - echo "📊 Training Status:" - tail -n 20 logs/nki-llama_*.log | grep -E "(loss|throughput|mfu)" -fi - -# Inference metrics -if pgrep -f "inference" > /dev/null; then - echo "📊 Inference Status:" - tail -n 10 src/inference/benchmark_inference.json -fi - -# Device utilization -echo "📊 Device Utilization:" -neuron-top -n 1 - -# Memory usage -echo "📊 Memory Status:" -free -h -``` - -## 🏗️ Architecture Best Practices - -### 1. Kernel Reusability -Design kernels that work for both training and inference: - -```python -class NKIOptimizedLayer(nn.Module): - def __init__(self, config, training_mode=True): - super().__init__() - self.training_mode = training_mode - self.config = config - - def forward(self, x): - if self.config.use_nki: - return nki_kernel(x, training=self.training_mode) - return standard_implementation(x) -``` - -### 2. Configuration Management -Unified configuration for both phases: - -```yaml -# config.yaml -model: - name: llama-3-8b - use_nki: true - -training: - batch_size: 8 - learning_rate: 5e-5 - nki_kernels: - - rmsnorm - - attention - - linear - -inference: - batch_size: 1 - max_length: 2048 - nki_kernels: - - rmsnorm - - attention - - linear - - kv_cache -``` - -### 3. Progressive Optimization -Start simple and add complexity: - -1. **Baseline**: Get everything working without NKI -2. **Single Kernel**: Add one NKI kernel (e.g., RMSNorm) -3. **Core Kernels**: Add attention and linear layers -4. 
**Advanced**: Implement fusion and specialized kernels - -## 🎯 Scoring Optimization Strategy - -### Weight Distribution -For maximum score with all three components: - -```python -# Recommended weight distribution -WEIGHTS = { - "training": 0.33, - "inference": 0.33, - "reasoning": 0.34 -} -``` - -### Focus Areas by Score Impact - -#### High Impact (>20% score improvement) -1. **Attention Optimization**: Both training and inference -2. **Linear Layer Fusion**: Combine with activation functions -3. **Memory Access Patterns**: Optimize for Neuron architecture - -#### Medium Impact (10-20% improvement) -1. **Normalization Layers**: RMSNorm, LayerNorm -2. **Gradient Operations**: Training-specific -3. **KV Cache**: Inference-specific - -#### Low Impact (<10% improvement) -1. **Activation Functions**: Unless fused with other ops -2. **Element-wise Operations**: Minor gains -3. **Data Loading**: Already optimized in framework - -## 🛠️ Development Workflow - -### Iterative Development Cycle -```bash -# 1. Implement kernel -nano src/kernels/my_nki_kernel.py - -# 2. Test in isolation -python test_kernel.py - -# 3. Integrate into model -nano src/llama.py - -# 4. Benchmark improvement -./nki-llama inference benchmark --seq-len 512 - -# 5. Profile and optimize -neuron-profile view profiles/ -``` - -### Continuous Integration Testing -```python -# test_suite.py -import unittest - -class NKIKernelTests(unittest.TestCase): - def test_rmsnorm_accuracy(self): - # Compare NKI vs PyTorch implementation - pass - - def test_attention_performance(self): - # Verify speedup - pass - - def test_training_convergence(self): - # Ensure training still converges - pass -``` - -## 📈 Results Analysis - -### Performance Tracking -Track improvements across iterations: - -```python -# track_performance.py -import json -import matplotlib.pyplot as plt - -def plot_improvements(baseline, optimized): - metrics = ['training_mfu', 'inference_throughput', 'reasoning_accuracy'] - improvements = [(optimized[m] - baseline[m]) / baseline[m] * 100 - for m in metrics] - - plt.bar(metrics, improvements) - plt.ylabel('Improvement (%)') - plt.title('NKI Optimization Impact') - plt.savefig('optimization_impact.png') -``` - -### Score Breakdown Analysis -```bash -# Analyze score components -python src/handler.py \ - --inference-results benchmark_inference.json \ - --analyze-components \ - --output score_analysis.json -``` - ## 🐛 Common Integration Issues -### Environment Conflicts -```bash -# Issue: Package version mismatch between environments -# Solution: Use separate conda environments -conda create -n nki-training python=3.10 -conda create -n nki-inference python=3.10 -``` - -### Model Compatibility -```bash -# Issue: Model trained with one config, inference with another -# Solution: Always save and load full configuration -torch.save({ - 'model_state_dict': model.state_dict(), - 'config': config, - 'nki_kernels': enabled_kernels -}, 'checkpoint.pt') -``` - ### Cache Conflicts ```bash # Issue: Stale compiled kernels @@ -493,14 +289,9 @@ rm -rf ~/.cache/neuron - **Week 1**: Get baseline working, understand the code - **Week 2**: Implement core NKI kernels - **Week 3**: Optimize and fine-tune -- **Final days**: Polish, document, prepare presentation - -### 2. Collaboration Strategy -- **Frontend**: One member on training optimizations -- **Backend**: One member on inference optimizations -- **Integration**: One member on testing and benchmarking +- **Final days**: Polish, document, prepare submission -### 3. Documentation +### 2. 
Documentation Keep detailed logs of: - Kernel implementations - Performance improvements @@ -523,7 +314,6 @@ tmux -a -t training "cd ~/nki-llama" Enter tmux -a -t training "./nki-llama finetune all 2>&1 | tee training.log" Enter # Wait for training to reach a checkpoint -sleep 3600 # Adjust based on your training time # Inference phase tmux new -d -s inference @@ -535,10 +325,6 @@ tmux -a -t inference "./nki-llama inference benchmark 2>&1 | tee inference.log" tmux new -d -s reasoning tmux -a -t reasoning "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter tmux -a -t reasoning "./nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh" Enter - -# Monitor all sessions -tmux new -s monitor -watch -n 10 './monitor.sh' ``` ## 📚 Resources From 62d7ecfdbe23b36ffd71416a76a3c07fb7cf3417 Mon Sep 17 00:00:00 2001 From: Arhamama-AMZ Date: Tue, 15 Jul 2025 19:12:41 +0000 Subject: [PATCH 58/65] feat: self-attention score implementation --- README.md | 2 +- docs/self-attention.md | 74 +++++- src/self-attention/README.md | 66 ++++- src/self-attention/scripts/calculate_score.py | 197 ++++++++++++++ .../scripts/self-attention_benchmark.sh | 250 ++++++++++++++++++ src/self-attention/tests/config.py | 226 ++++++++++++++++ .../tests/test_flash_attn_bwd.py | 46 +++- .../tests/test_flash_attn_fwd.py | 53 +++- 8 files changed, 889 insertions(+), 25 deletions(-) create mode 100644 src/self-attention/scripts/calculate_score.py create mode 100644 src/self-attention/scripts/self-attention_benchmark.sh create mode 100644 src/self-attention/tests/config.py diff --git a/README.md b/README.md index 0ba75cc..5f14601 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ We've created four specialized guides based on your optimization focus: - Increase performance gains running Flash forward and backward kernels - Analyze performance and numerical computation results from implemented kernels - Further optimize attention kernels -- **Score Focus**: Performance and Numerical Unit Tests +- **Score Focus**: Self-Attention test latency and correctness ### 2. 🚀 [Inference with NKI Guide](./docs/inference.md) **Ideal for teams targeting inference performance** diff --git a/docs/self-attention.md b/docs/self-attention.md index 86b3da8..a5a44ba 100644 --- a/docs/self-attention.md +++ b/docs/self-attention.md @@ -136,23 +136,38 @@ Refer to the `attention.py` file for details on the kernel implementation. 
This ### Step 3: Run the Flash Self-Attention Kernel Unit Tests ```bash # Run the unit tests -cd ~/nki-llama/src/self-attention/tests +cd ~/nki-llama/src/self-attention/scripts -# Run all forward and backward tests with full verbosity -pytest test_flash_attn_*.py -v -s +# Run the comprehensive benchmark script +./self-attention_benchmark.sh # Run specific test suite -pytest test_flash_attn_fwd.py -v -s -pytest test_flash_attn_bwd.py -v -s - -# Performance tests only -pytest -k "perf" -v -s -# Numerical accuracy tests only -pytest -k "numerical" -v -s -# Simulation tests only -pytest -m simulation -v -s +pytest ../tests/test_flash_attn_fwd.py -v -s +pytest ../tests/test_flash_attn_bwd.py -v -s + +``` + +### Step 4: Understand the Scoring Mechanism + +The benchmark calculates a combined score based on the following formula: + +``` +final_score = accuracy * latency_improvement * throughput_improvement * (1.0 + nki_flop_ratio) ``` +Where: +- `accuracy`: Binary value (1.0 or 0.0) indicating if numerical tests pass +- `latency_improvement`: Ratio of baseline latency to measured latency +- `throughput_improvement`: Inversely proportional to latency (higher is better) +- `nki_flop_ratio`: Ratio of operations executed on NKI hardware (hardware utilization) + +The NKI FLOP ratio is automatically calculated based on the kernel characteristics, considering: +- Matrix multiplication operations (highly accelerated on NKI) +- Softmax operations (partially accelerated) +- Batch size, sequence length, and head dimension effects on hardware utilization + +This scoring mechanism rewards both correctness and performance improvements, with a bonus for efficient hardware utilization. + ## 🧪 Test Categories ### Performance Tests (`test_*_perf`) @@ -178,6 +193,41 @@ pytest -m simulation -v -s V tensor: 3072.00 MB Total Input: 9216.00 MB Est. Peak: 18432.00 MB (2x for intermediate) + +🔢 NKI FLOP RATIO: 0.8734 + Calculated NKI FLOP ratio represents the percentage of operations + that can be accelerated by the NKI hardware. 
+``` + +**Performance Metrics JSON:** +The benchmark generates a detailed JSON file with accumulated metrics: +```json +{ + "timestamp": "2025-07-14T18:45:23Z", + "forward": { + "latency": 12500000, + "base_latency": 15100000000, + "latency_improvement": 1208.00, + "throughput_improvement": 1208.00, + "numerical_accuracy": 1.0, + "score": 1459264.00 + }, + "backward": { + "latency": 41482, + "base_latency": 117000, + "latency_improvement": 2.82, + "throughput_improvement": 2.82, + "numerical_accuracy": 1.0, + "score": 7.95 + }, + "combined": { + "forward_weight": 0.4, + "backward_weight": 0.6, + "nki_flop_ratio": 0.87, + "raw_score": 583710.37, + "score": 1091538.39 + } +} ``` ### Numerical Accuracy Tests (`test_*_numerical`) diff --git a/src/self-attention/README.md b/src/self-attention/README.md index 4894cfa..36b59a2 100644 --- a/src/self-attention/README.md +++ b/src/self-attention/README.md @@ -98,25 +98,71 @@ Run the tests to validate performance and numerical accuracy: ```bash # Navigate to the tests directory -cd tests +cd nki-llama/src/self-attention/scripts # Run all tests -pytest test_flash_attn_*.py -v -s +./self-attention_benchmark.sh # Run specific test suites -pytest test_flash_attn_fwd.py -v -s # Forward pass tests -pytest test_flash_attn_bwd.py -v -s # Backward pass tests +pytest ../tests/test_flash_attn_fwd.py -v -s # Forward pass tests +pytest ../tests/test_flash_attn_bwd.py -v -s # Backward pass tests +``` + +### Benchmarking + +The module includes a comprehensive benchmarking script that evaluates both forward and backward passes: + +```bash +# Run the benchmark script +./scripts/self-attention_benchmark.sh +``` + +The benchmark calculates a combined score based on the following formula: -# Run only performance tests -pytest -k "perf" -v -s +``` +final_score = accuracy * latency_improvement * throughput_improvement * (1.0 + nki_flop_ratio) +``` + +Where: +- `accuracy`: Binary value (1.0 or 0.0) indicating if numerical tests pass +- `latency_improvement`: Ratio of baseline latency to measured latency +- `throughput_improvement`: Inversely proportional to latency (higher is better) +- `nki_flop_ratio`: Ratio of operations executed on NKI hardware (hardware utilization) + +#### NKI FLOP Ratio Calculation -# Run only numerical accuracy tests -pytest -k "numerical" -v -s +The NKI FLOP ratio is automatically calculated based on the kernel characteristics: -# Run simulation tests -pytest -m simulation -v -s +```python +def calculate_nki_flop_ratio(bs, nheads, seq_len, d, is_backward=False): + # Calculate total FLOPs for attention operations + qk_bmm_flops = 2 * bs * nheads * seq_len * seq_len * d # Q*K^T matrix multiply + attn_v_bmm_flops = 2 * bs * nheads * seq_len * seq_len * d # Attention * V matrix multiply + softmax_flops = bs * nheads * seq_len * seq_len * 5 # Softmax operations + + # Additional operations for backward pass + if is_backward: + dq_flops = 2 * bs * nheads * seq_len * seq_len * d # dQ calculation + dk_flops = 2 * bs * nheads * seq_len * seq_len * d # dK calculation + dv_flops = 2 * bs * nheads * seq_len * seq_len * d # dV calculation + dsoftmax_flops = bs * nheads * seq_len * seq_len * 10 # Softmax gradient + # Calculate total and NKI-accelerated operations + # ... + + # Apply adjustments based on empirical observations + # Larger batch sizes and head dimensions tend to have better utilization + # ... 
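+    # For reference, the example output in docs/self-attention.md reports a ratio
+    # of roughly 0.87 for the 32K-sequence forward test, and the benchmark script
+    # falls back to a default of 0.85 when no measured value is available.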
+ + return adjusted_ratio # Between 0.0 and 1.0 ``` +This calculation considers: +- Matrix multiplication operations (highly accelerated on NKI) +- Softmax operations (partially accelerated) +- Batch size, sequence length, and head dimension effects on hardware utilization + +The benchmark generates detailed logs and a JSON results file in the `logs/self_attention/` directory, including accumulated metrics across multiple test runs. + ## Optimization Opportunities Areas for potential optimization: diff --git a/src/self-attention/scripts/calculate_score.py b/src/self-attention/scripts/calculate_score.py new file mode 100644 index 0000000..cb7641f --- /dev/null +++ b/src/self-attention/scripts/calculate_score.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +""" +Simple Self-Attention Score Calculator + +This script calculates a combined score for self-attention based on forward and backward pass metrics. +It can be used directly after running the tests or with manually provided metrics. + +The scoring formula is: + final_score = accuracy * latency_improvement * throughput_improvement * (1.0 + nki_flop_ratio) + +Where: +- accuracy: Binary value (1.0 or 0.0) indicating if numerical tests pass +- latency_improvement: Ratio of baseline latency to measured latency +- throughput_improvement: Inversely proportional to latency (higher is better) +- nki_flop_ratio: Ratio of operations executed on NKI hardware (hardware utilization) + +The NKI FLOP ratio is calculated based on the kernel characteristics and represents +the percentage of operations that are accelerated by the NKI hardware. +""" +import argparse +import json +import os +import sys +from datetime import datetime + +def calculate_score(fwd_latency, fwd_base_latency, fwd_numerical_accuracy, + bwd_latency, bwd_base_latency, bwd_numerical_accuracy, + fwd_weight=0.4, bwd_weight=0.6, nki_flop_ratio=0.0): + """ + Calculate a combined score for self-attention. 
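+
+    Example (illustrative values only, not a measured run):
+
+        >>> r = calculate_score(
+        ...     fwd_latency=1.0e7, fwd_base_latency=2.0e7, fwd_numerical_accuracy=True,
+        ...     bwd_latency=5.0e4, bwd_base_latency=1.0e5, bwd_numerical_accuracy=True,
+        ...     nki_flop_ratio=0.85)
+        >>> sorted(r.keys())
+        ['backward', 'combined', 'forward']
+        >>> r['forward']['latency_improvement']
+        2.0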
+ + Parameters: + - fwd_latency: Measured latency for forward pass (ns) + - fwd_base_latency: Baseline latency for forward pass (ns) + - fwd_numerical_accuracy: Boolean indicating if forward numerical tests passed + - bwd_latency: Measured latency for backward pass (ns) + - bwd_base_latency: Baseline latency for backward pass (ns) + - bwd_numerical_accuracy: Boolean indicating if backward numerical tests passed + - fwd_weight: Weight for forward pass in combined score (default: 0.4) + - bwd_weight: Weight for backward pass in combined score (default: 0.6) + - nki_flop_ratio: Ratio of NKI FLOPs to total FLOPs (default: 0.0) + + Returns: + - Dictionary containing all score components and the final score + """ + # Convert boolean accuracy to 1.0 or 0.0 + fwd_accuracy = 1.0 if fwd_numerical_accuracy else 0.0 + bwd_accuracy = 1.0 if bwd_numerical_accuracy else 0.0 + + # Calculate latency improvements + fwd_latency_improvement = fwd_base_latency / fwd_latency if fwd_latency > 0 else 0.0 + bwd_latency_improvement = bwd_base_latency / bwd_latency if bwd_latency > 0 else 0.0 + + # Calculate throughput improvements (inversely proportional to latency) + fwd_throughput_improvement = fwd_latency_improvement + bwd_throughput_improvement = bwd_latency_improvement + + # Calculate individual scores + # Score = accuracy * latency_improvement * throughput_improvement * (1.0 + nki_flop_ratio) + fwd_score = fwd_accuracy * fwd_latency_improvement * fwd_throughput_improvement + bwd_score = bwd_accuracy * bwd_latency_improvement * bwd_throughput_improvement + + # Calculate combined score with NKI FLOP ratio bonus + # If either test fails numerically, the combined score is 0 + combined_numerical_accuracy = fwd_accuracy * bwd_accuracy + if combined_numerical_accuracy < 1.0: + raw_score = 0.0 + combined_score = 0.0 + else: + # Apply the NKI FLOP ratio bonus to the weighted sum of forward and backward scores + raw_score = ((fwd_weight * fwd_score) + (bwd_weight * bwd_score)) / 1000000000 # Dividing by ns + combined_score = raw_score * (1.0 + nki_flop_ratio) # Dividing by nanoseconds + + # Return all components + return { + "forward": { + "latency": fwd_latency, + "base_latency": fwd_base_latency, + "latency_improvement": fwd_latency_improvement, + "throughput_improvement": fwd_throughput_improvement, + "numerical_accuracy": fwd_accuracy, + "score": fwd_score + }, + "backward": { + "latency": bwd_latency, + "base_latency": bwd_base_latency, + "latency_improvement": bwd_latency_improvement, + "throughput_improvement": bwd_throughput_improvement, + "numerical_accuracy": bwd_accuracy, + "score": bwd_score + }, + "combined": { + "forward_weight": fwd_weight, + "backward_weight": bwd_weight, + "nki_flop_ratio": nki_flop_ratio, + "raw_score": raw_score if combined_numerical_accuracy >= 1.0 else 0.0, + "score": combined_score + } + } + +def print_results(results): + """Print formatted results to console""" + print("\n" + "="*60) + print("Self-Attention Benchmark Results") + print("="*60) + + print("\nForward Pass:") + print(f" Latency: {results['forward']['latency']:,} ns") + print(f" Base Latency: {results['forward']['base_latency']:,} ns") + print(f" Latency Improvement: {results['forward']['latency_improvement']:.2f}x") + print(f" Throughput Improvement: {results['forward']['throughput_improvement']:.2f}x") + print(f" Numerical Accuracy: {'✅ PASS' if results['forward']['numerical_accuracy'] == 1.0 else '❌ FAIL'}") + print(f" Forward Score: {results['forward']['score']:.2f}") + + print("\nBackward Pass:") + print(f" 
Latency: {results['backward']['latency']:,} ns") + print(f" Base Latency: {results['backward']['base_latency']:,} ns") + print(f" Latency Improvement: {results['backward']['latency_improvement']:.2f}x") + print(f" Throughput Improvement: {results['backward']['throughput_improvement']:.2f}x") + print(f" Numerical Accuracy: {'✅ PASS' if results['backward']['numerical_accuracy'] == 1.0 else '❌ FAIL'}") + print(f" Backward Score: {results['backward']['score']:.2f}") + + print("\nCombined Metrics:") + print(f" Forward Weight: {results['combined']['forward_weight']:.2f}") + print(f" Backward Weight: {results['combined']['backward_weight']:.2f}") + print(f" NKI FLOP Ratio: {results['combined']['nki_flop_ratio']:.2f}") + print(f" Raw Score: {results['combined']['raw_score']:.2f}") + print(f" Final Score: {results['combined']['score']:.2f} = Raw Score * (1 + NKI Flop Ratio)") + + # Print overall status + if results['combined']['score'] > 0.0: + print("\n🎉 OVERALL STATUS: PASS") + else: + print("\n❌ OVERALL STATUS: FAIL") + + print("\n" + "="*60) + +def main(): + """Main function to parse arguments and calculate score""" + parser = argparse.ArgumentParser(description="Calculate Self-Attention Score") + + # Required arguments + parser.add_argument("--fwd-latency", type=float, required=True, + help="Measured latency for forward pass (ns)") + parser.add_argument("--fwd-base-latency", type=float, required=True, + help="Baseline latency for forward pass (ns)") + parser.add_argument("--fwd-numerical-accuracy", type=str, required=True, choices=["pass", "fail"], + help="Whether forward numerical tests passed") + + parser.add_argument("--bwd-latency", type=float, required=True, + help="Measured latency for backward pass (ns)") + parser.add_argument("--bwd-base-latency", type=float, required=True, + help="Baseline latency for backward pass (ns)") + parser.add_argument("--bwd-numerical-accuracy", type=str, required=True, choices=["pass", "fail"], + help="Whether backward numerical tests passed") + + # Optional arguments + parser.add_argument("--fwd-weight", type=float, default=0.4, + help="Weight for forward pass in combined score (default: 0.4)") + parser.add_argument("--bwd-weight", type=float, default=0.6, + help="Weight for backward pass in combined score (default: 0.6)") + parser.add_argument("--nki-flop-ratio", type=float, default=0.0, + help="Ratio of NKI FLOPs to total FLOPs (default: 0.0)") + parser.add_argument("--output", type=str, + help="Path to save results as JSON") + + args = parser.parse_args() + + # Convert string accuracy to boolean + fwd_numerical_accuracy = args.fwd_numerical_accuracy.lower() == "pass" + bwd_numerical_accuracy = args.bwd_numerical_accuracy.lower() == "pass" + + # Calculate score + results = calculate_score( + args.fwd_latency, args.fwd_base_latency, fwd_numerical_accuracy, + args.bwd_latency, args.bwd_base_latency, bwd_numerical_accuracy, + args.fwd_weight, args.bwd_weight, args.nki_flop_ratio + ) + + # Add timestamp + results["timestamp"] = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + + # Print results + print_results(results) + + # Save results if output path is provided + if args.output: + os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) + with open(args.output, 'w') as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to: {args.output}") + + # Return success if combined score is positive + return 0 if results["combined"]["score"] > 0.0 else 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff 
--git a/src/self-attention/scripts/self-attention_benchmark.sh b/src/self-attention/scripts/self-attention_benchmark.sh
new file mode 100644
index 0000000..e6fd68f
--- /dev/null
+++ b/src/self-attention/scripts/self-attention_benchmark.sh
@@ -0,0 +1,250 @@
+#!/bin/bash
+# Simple Self-Attention Benchmark Script
+# Runs forward and backward tests and calculates a combined score
+
+set -euo pipefail
+
+# Get script directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../../" && pwd)"
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo -e "${BLUE}NKI Self-Attention Benchmark${NC}"
+echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+
+# Create log directory
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+LOG_DIR="${PROJECT_ROOT}/logs/self_attention/${TIMESTAMP}"
+mkdir -p "$LOG_DIR"
+
+# Default NKI FLOP ratio (will be overridden by the ratio calculated from the tests)
+# This represents the ratio of operations that are accelerated by NKI
+# Higher values indicate better hardware utilization
+NKI_FLOP_RATIO=0.85  # Default: 85% of operations executed on NKI
+
+# Step 1: Run forward test
+# Capture pytest exit codes explicitly so a failing test does not abort the
+# script under 'set -e'; the exit codes feed the pass/fail scoring below.
+echo -e "\n${YELLOW}Running forward attention test...${NC}"
+cd "${PROJECT_ROOT}/src/self-attention"
+FWD_PERF_STATUS=0
+python -m pytest tests/test_flash_attn_fwd.py::TestAttention::test_flash_attn_fwd_perf -v -s > "${LOG_DIR}/forward_perf.log" || FWD_PERF_STATUS=$?
+
+FWD_NUM_STATUS=0
+python -m pytest tests/test_flash_attn_fwd.py::TestAttention::test_flash_attn_fwd_numerical -v -s > "${LOG_DIR}/forward_numerical.log" || FWD_NUM_STATUS=$?
+
+# Step 2: Run backward test
+echo -e "\n${YELLOW}Running backward attention test...${NC}"
+BWD_PERF_STATUS=0
+python -m pytest tests/test_flash_attn_bwd.py::TestAttention::test_flash_attn_bwd_perf -v -s > "${LOG_DIR}/backward_perf.log" || BWD_PERF_STATUS=$?
+
+BWD_NUM_STATUS=0
+python -m pytest tests/test_flash_attn_bwd.py::TestAttention::test_flash_attn_bwd_numerical -v -s > "${LOG_DIR}/backward_numerical.log" || BWD_NUM_STATUS=$?
+
+# Step 3: Extract latency values from logs and config
+echo -e "\n${YELLOW}Extracting metrics from test results...${NC}"
+
+# Extract P50 latency from forward performance test (digits only, commas stripped)
+FWD_P50_LATENCY=$(grep -o "P50 Latency: [0-9,]* ns" "${LOG_DIR}/forward_perf.log" | head -1 | sed 's/[^0-9]//g')
+if [[ -z "$FWD_P50_LATENCY" ]]; then
+    echo -e "${RED}Could not extract forward P50 latency. Using baseline.${NC}"
+    FWD_P50_LATENCY=$FWD_BASE_LATENCY
+fi
+
+# Extract P50 latency from backward performance test (digits only, commas stripped)
+BWD_P50_LATENCY=$(grep -o "P50 Latency: [0-9,]* ns" "${LOG_DIR}/backward_perf.log" | head -1 | sed 's/[^0-9]//g')
+if [[ -z "$BWD_P50_LATENCY" ]]; then
+    echo -e "${RED}Could not extract backward P50 latency. 
Using baseline.${NC}" + BWD_P50_LATENCY=$BWD_BASE_LATENCY +fi + +# Extract accumulated latency metrics from config file +CONFIG_FILE="${PROJECT_ROOT}/src/self-attention/config/performance_metrics.json" +if [[ -f "$CONFIG_FILE" ]]; then + echo -e "${GREEN}Loading accumulated metrics from config file...${NC}" + + # Use jq if available, otherwise use grep and sed + if command -v jq &> /dev/null; then + FWD_LATENCY_TOTAL=$(jq -r '.FWD_LATENCY_TOTAL' "$CONFIG_FILE") + FWD_BASE_LATENCY_TOTAL=$(jq -r '.FWD_BASE_LATENCY_TOTAL' "$CONFIG_FILE") + FWD_TEST_COUNT=$(jq -r '.FWD_TEST_COUNT' "$CONFIG_FILE") + BWD_LATENCY_TOTAL=$(jq -r '.BWD_LATENCY_TOTAL' "$CONFIG_FILE") + BWD_BASE_LATENCY_TOTAL=$(jq -r '.BWD_BASE_LATENCY_TOTAL' "$CONFIG_FILE") + BWD_TEST_COUNT=$(jq -r '.BWD_TEST_COUNT' "$CONFIG_FILE") + else + # Fallback to grep and sed if jq is not available + FWD_LATENCY_TOTAL=$(grep -o '"FWD_LATENCY_TOTAL": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //') + FWD_BASE_LATENCY_TOTAL=$(grep -o '"FWD_BASE_LATENCY_TOTAL": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //') + FWD_TEST_COUNT=$(grep -o '"FWD_TEST_COUNT": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //') + BWD_LATENCY_TOTAL=$(grep -o '"BWD_LATENCY_TOTAL": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //') + BWD_BASE_LATENCY_TOTAL=$(grep -o '"BWD_BASE_LATENCY_TOTAL": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //') + BWD_TEST_COUNT=$(grep -o '"BWD_TEST_COUNT": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //') + fi + + # Use the accumulated metrics if available + if [[ -n "$FWD_LATENCY_TOTAL" && -n "$FWD_BASE_LATENCY_TOTAL" && "$FWD_TEST_COUNT" -gt 0 ]]; then + echo -e "${GREEN}Using accumulated forward metrics from ${FWD_TEST_COUNT} tests${NC}" + FWD_AVG_LATENCY=$(echo "scale=2; $FWD_LATENCY_TOTAL / $FWD_TEST_COUNT" | bc) + FWD_AVG_BASE_LATENCY=$(echo "scale=2; $FWD_BASE_LATENCY_TOTAL / $FWD_TEST_COUNT" | bc) + echo -e " Average achieved latency: ${FWD_AVG_LATENCY} ns" + echo -e " Average baseline latency: ${FWD_AVG_BASE_LATENCY} ns" + FWD_P50_LATENCY=$FWD_AVG_LATENCY + FWD_BASE_LATENCY=$FWD_AVG_BASE_LATENCY + fi + + if [[ -n "$BWD_LATENCY_TOTAL" && -n "$BWD_BASE_LATENCY_TOTAL" && "$BWD_TEST_COUNT" -gt 0 ]]; then + echo -e "${GREEN}Using accumulated backward metrics from ${BWD_TEST_COUNT} tests${NC}" + BWD_AVG_LATENCY=$(echo "scale=2; $BWD_LATENCY_TOTAL / $BWD_TEST_COUNT" | bc) + BWD_AVG_BASE_LATENCY=$(echo "scale=2; $BWD_BASE_LATENCY_TOTAL / $BWD_TEST_COUNT" | bc) + echo -e " Average achieved latency: ${BWD_AVG_LATENCY} ns" + echo -e " Average baseline latency: ${BWD_AVG_BASE_LATENCY} ns" + BWD_P50_LATENCY=$BWD_AVG_LATENCY + BWD_BASE_LATENCY=$BWD_AVG_BASE_LATENCY + fi +else + echo -e "${YELLOW}No accumulated metrics found. 
Using single test results.${NC}"
+fi
+
+# Step 4: Calculate scores using the Python script
+echo -e "\n${YELLOW}Calculating scores...${NC}"
+
+# Convert test status to pass/fail strings for the Python script
+FWD_NUMERICAL_RESULT=$([ $FWD_NUM_STATUS -eq 0 ] && echo "pass" || echo "fail")
+BWD_NUMERICAL_RESULT=$([ $BWD_NUM_STATUS -eq 0 ] && echo "pass" || echo "fail")
+
+# Define weights
+FWD_WEIGHT=0.4
+BWD_WEIGHT=0.6
+
+# Run the Python script to calculate scores
+SCORE_OUTPUT_FILE="${LOG_DIR}/score_details.json"
+echo -e "${BLUE}Running score calculation script...${NC}"
+
+# Extract NKI_FLOP_RATIO from config if available
+if [[ -f "$CONFIG_FILE" ]]; then
+    if command -v jq &> /dev/null; then
+        CONFIG_NKI_FLOP_RATIO=$(jq -r '.NKI_FLOP_RATIO' "$CONFIG_FILE")
+        if [[ -n "$CONFIG_NKI_FLOP_RATIO" && "$CONFIG_NKI_FLOP_RATIO" != "null" ]]; then
+            NKI_FLOP_RATIO=$CONFIG_NKI_FLOP_RATIO
+            echo -e "${GREEN}Using calculated NKI_FLOP_RATIO from config: ${NKI_FLOP_RATIO}${NC}"
+        fi
+    else
+        # Fallback to grep and sed if jq is not available
+        CONFIG_NKI_FLOP_RATIO=$(grep -o '"NKI_FLOP_RATIO": [0-9.]*' "$CONFIG_FILE" | sed 's/.*: //')
+        if [[ -n "$CONFIG_NKI_FLOP_RATIO" ]]; then
+            NKI_FLOP_RATIO=$CONFIG_NKI_FLOP_RATIO
+            echo -e "${GREEN}Using calculated NKI_FLOP_RATIO from config: ${NKI_FLOP_RATIO}${NC}"
+        fi
+    fi
+else
+    echo -e "${YELLOW}No config file found. Using default NKI_FLOP_RATIO: ${NKI_FLOP_RATIO}${NC}"
+fi
+
+python "${SCRIPT_DIR}/calculate_score.py" \
+    --fwd-latency "$FWD_P50_LATENCY" \
+    --fwd-base-latency "$FWD_BASE_LATENCY" \
+    --fwd-numerical-accuracy "$FWD_NUMERICAL_RESULT" \
+    --bwd-latency "$BWD_P50_LATENCY" \
+    --bwd-base-latency "$BWD_BASE_LATENCY" \
+    --bwd-numerical-accuracy "$BWD_NUMERICAL_RESULT" \
+    --fwd-weight "$FWD_WEIGHT" \
+    --bwd-weight "$BWD_WEIGHT" \
+    --nki-flop-ratio "$NKI_FLOP_RATIO" \
+    --output "$SCORE_OUTPUT_FILE"
+
+# Extract values from the JSON file
+if [ -f "$SCORE_OUTPUT_FILE" ]; then
+    # Use jq if available, otherwise use grep and sed
+    if command -v jq &> /dev/null; then
+        FWD_LATENCY_IMPROVEMENT=$(jq -r '.forward.latency_improvement' "$SCORE_OUTPUT_FILE")
+        BWD_LATENCY_IMPROVEMENT=$(jq -r '.backward.latency_improvement' "$SCORE_OUTPUT_FILE")
+        FWD_THROUGHPUT_IMPROVEMENT=$(jq -r '.forward.throughput_improvement' "$SCORE_OUTPUT_FILE")
+        BWD_THROUGHPUT_IMPROVEMENT=$(jq -r '.backward.throughput_improvement' "$SCORE_OUTPUT_FILE")
+        FWD_SCORE=$(jq -r '.forward.score' "$SCORE_OUTPUT_FILE")
+        BWD_SCORE=$(jq -r '.backward.score' "$SCORE_OUTPUT_FILE")
+        RAW_SCORE=$(jq -r '.combined.raw_score' "$SCORE_OUTPUT_FILE")
+        COMBINED_SCORE=$(jq -r '.combined.score' "$SCORE_OUTPUT_FILE")
+    else
+        # Fallback to grep and sed if jq is not available
+        FWD_LATENCY_IMPROVEMENT=$(grep -o '"latency_improvement": [0-9.]*' "$SCORE_OUTPUT_FILE" | head -1 | sed 's/.*: //')
+        BWD_LATENCY_IMPROVEMENT=$(grep -o '"latency_improvement": [0-9.]*' "$SCORE_OUTPUT_FILE" | tail -1 | sed 's/.*: //')
+        FWD_THROUGHPUT_IMPROVEMENT=$(grep -o '"throughput_improvement": [0-9.]*' "$SCORE_OUTPUT_FILE" | head -1 | sed 's/.*: //')
+        BWD_THROUGHPUT_IMPROVEMENT=$(grep -o '"throughput_improvement": [0-9.]*' "$SCORE_OUTPUT_FILE" | tail -1 | sed 's/.*: //')
+        FWD_SCORE=$(grep -o '"score": [0-9.]*' "$SCORE_OUTPUT_FILE" | head -1 | sed 's/.*: //')
+        BWD_SCORE=$(grep -o '"score": [0-9.]*' "$SCORE_OUTPUT_FILE" | head -2 | tail -1 | sed 's/.*: //')
+        RAW_SCORE=$(grep -o '"raw_score": [0-9.]*' "$SCORE_OUTPUT_FILE" | sed 's/.*: //')
+        COMBINED_SCORE=$(grep -o '"score": [0-9.]*' "$SCORE_OUTPUT_FILE" | tail -1 | sed 's/.*: //')
+    fi
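+    # For reference: an illustrative sketch (not output from a real run) of the
+    # score_details.json layout these lookups assume, matching the dictionary
+    # returned by calculate_score.py:
+    #   {
+    #     "forward":  {"latency": ..., "base_latency": ..., "latency_improvement": ...,
+    #                  "throughput_improvement": ..., "numerical_accuracy": 1.0, "score": ...},
+    #     "backward": {... same fields as "forward" ...},
+    #     "combined": {"forward_weight": 0.4, "backward_weight": 0.6, "nki_flop_ratio": ...,
+    #                  "raw_score": ..., "score": ...},
+    #     "timestamp": "..."
+    #   }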
+else + echo -e "${RED}Score calculation failed. Using manual calculation.${NC}" + + # Convert test status to numerical accuracy (1.0 for pass, 0.0 for fail) + FWD_NUMERICAL_ACCURACY=$([ $FWD_NUM_STATUS -eq 0 ] && echo 1.0 || echo 0.0) + BWD_NUMERICAL_ACCURACY=$([ $BWD_NUM_STATUS -eq 0 ] && echo 1.0 || echo 0.0) + + # Calculate latency improvements + FWD_LATENCY_IMPROVEMENT=$(echo "scale=2; $FWD_BASE_LATENCY / $FWD_P50_LATENCY" | bc) + BWD_LATENCY_IMPROVEMENT=$(echo "scale=2; $BWD_BASE_LATENCY / $BWD_P50_LATENCY" | bc) + FWD_THROUGHPUT_IMPROVEMENT=$FWD_LATENCY_IMPROVEMENT + BWD_THROUGHPUT_IMPROVEMENT=$BWD_LATENCY_IMPROVEMENT + + # Calculate individual scores + FWD_SCORE=$(echo "scale=2; $FWD_NUMERICAL_ACCURACY * $FWD_LATENCY_IMPROVEMENT * $FWD_THROUGHPUT_IMPROVEMENT" | bc) + BWD_SCORE=$(echo "scale=2; $BWD_NUMERICAL_ACCURACY * $BWD_LATENCY_IMPROVEMENT * $BWD_THROUGHPUT_IMPROVEMENT" | bc) + + # Calculate combined score + COMBINED_NUMERICAL_ACCURACY=$(echo "scale=2; $FWD_NUMERICAL_ACCURACY * $BWD_NUMERICAL_ACCURACY" | bc) + if (( $(echo "$COMBINED_NUMERICAL_ACCURACY < 1.0" | bc -l) )); then + RAW_SCORE=0.0 + COMBINED_SCORE=0.0 + else + RAW_SCORE=$(echo "scale=2; ($FWD_WEIGHT * $FWD_SCORE) + ($BWD_WEIGHT * $BWD_SCORE)" | bc) + COMBINED_SCORE=$(echo "scale=2; $RAW_SCORE * (1.0 + $NKI_FLOP_RATIO)" | bc) + fi +fi + +echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +echo -e "\nLog files saved to: ${LOG_DIR}" +echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + +# Save results to JSON file if the Python script didn't already create one +if [ ! -f "$SCORE_OUTPUT_FILE" ]; then + cat > "${LOG_DIR}/results.json" << EOF +{ + "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", + "forward": { + "latency": ${FWD_P50_LATENCY}, + "base_latency": ${FWD_BASE_LATENCY}, + "latency_improvement": ${FWD_LATENCY_IMPROVEMENT}, + "throughput_improvement": ${FWD_THROUGHPUT_IMPROVEMENT}, + "numerical_accuracy": $([ "$FWD_NUMERICAL_RESULT" = "pass" ] && echo 1.0 || echo 0.0), + "score": ${FWD_SCORE} + }, + "backward": { + "latency": ${BWD_P50_LATENCY}, + "base_latency": ${BWD_BASE_LATENCY}, + "latency_improvement": ${BWD_LATENCY_IMPROVEMENT}, + "throughput_improvement": ${BWD_THROUGHPUT_IMPROVEMENT}, + "numerical_accuracy": $([ "$BWD_NUMERICAL_RESULT" = "pass" ] && echo 1.0 || echo 0.0), + "score": ${BWD_SCORE} + }, + "combined": { + "forward_weight": ${FWD_WEIGHT}, + "backward_weight": ${BWD_WEIGHT}, + "nki_flop_ratio": ${NKI_FLOP_RATIO}, + "raw_score": ${RAW_SCORE}, + "score": ${COMBINED_SCORE} + } +} +EOF +else + # Copy the score details file to the standard results.json location + cp "$SCORE_OUTPUT_FILE" "${LOG_DIR}/results.json" +fi + +exit 0 \ No newline at end of file diff --git a/src/self-attention/tests/config.py b/src/self-attention/tests/config.py new file mode 100644 index 0000000..7d12f0e --- /dev/null +++ b/src/self-attention/tests/config.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +""" +Configuration utilities for self-attention tests +""" +import os +import json +import logging +from typing import Dict, Any + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Path to the config file +CONFIG_FILE = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "config", "performance_metrics.json") + +def ensure_config_dir(): + """Ensure the config directory exists""" + config_dir = 
os.path.dirname(CONFIG_FILE) + if not os.path.exists(config_dir): + try: + os.makedirs(config_dir) + logger.info(f"Created config directory: {config_dir}") + except Exception as e: + logger.error(f"Failed to create config directory: {e}") + return False + return True + +def load_config() -> Dict[str, Any]: + """Load the configuration from the JSON file""" + if not ensure_config_dir(): + return {} + + if not os.path.exists(CONFIG_FILE): + logger.info(f"Config file not found, creating default: {CONFIG_FILE}") + default_config = { + "FWD_LATENCY_TOTAL": 0, + "FWD_BASE_LATENCY_TOTAL": 0, + "FWD_TEST_COUNT": 0, + "BWD_LATENCY_TOTAL": 0, + "BWD_BASE_LATENCY_TOTAL": 0, + "BWD_TEST_COUNT": 0, + "NKI_FLOP_RATIO": 0.85, + "LAST_RUN_TIMESTAMP": "", + "test_details": [] + } + save_config(default_config) + return default_config + + try: + with open(CONFIG_FILE, 'r') as f: + return json.load(f) + except Exception as e: + logger.error(f"Failed to load config file: {e}") + return {} + +def save_config(config: Dict[str, Any]) -> bool: + """Save the configuration to the JSON file""" + if not ensure_config_dir(): + return False + + try: + with open(CONFIG_FILE, 'w') as f: + json.dump(config, f, indent=2) + logger.info(f"Config saved to: {CONFIG_FILE}") + return True + except Exception as e: + logger.error(f"Failed to save config file: {e}") + return False + +def update_config(updates: Dict[str, Any]) -> bool: + """Update specific configuration values""" + config = load_config() + config.update(updates) + return save_config(config) + +def add_test_result(test_type: str, achieved_latency: int, expected_latency: int, test_params: Dict[str, Any]) -> bool: + """Add a test result and update the accumulated latency totals + + Args: + test_type: Either 'FWD' or 'BWD' for forward or backward tests + achieved_latency: The measured latency in nanoseconds + expected_latency: The baseline/expected latency in nanoseconds + test_params: Dictionary of test parameters (batch size, heads, etc.) + + Returns: + bool: True if update was successful + """ + if test_type not in ['FWD', 'BWD']: + logger.error(f"Invalid test type: {test_type}. 
Must be 'FWD' or 'BWD'.") + return False + + config = load_config() + + # Update the accumulated totals + latency_key = f"{test_type}_LATENCY_TOTAL" + base_key = f"{test_type}_BASE_LATENCY_TOTAL" + count_key = f"{test_type}_TEST_COUNT" + + config[latency_key] = config.get(latency_key, 0) + achieved_latency + config[base_key] = config.get(base_key, 0) + expected_latency + config[count_key] = config.get(count_key, 0) + 1 + + # Add timestamp + import datetime + config["LAST_RUN_TIMESTAMP"] = datetime.datetime.now().isoformat() + + # Add test details to the history + if "test_details" not in config: + config["test_details"] = [] + + test_details = { + "timestamp": config["LAST_RUN_TIMESTAMP"], + "test_type": test_type, + "achieved_latency": achieved_latency, + "expected_latency": expected_latency, + "params": test_params + } + + config["test_details"].append(test_details) + + # Keep only the last 20 test details to avoid the file growing too large + if len(config["test_details"]) > 20: + config["test_details"] = config["test_details"][-20:] + + return save_config(config) + +def get_latency_improvement_ratio(test_type: str) -> float: + """Calculate the latency improvement ratio for a test type + + Args: + test_type: Either 'FWD' or 'BWD' for forward or backward tests + + Returns: + float: The latency improvement ratio (baseline/achieved) + """ + config = load_config() + + latency_key = f"{test_type}_LATENCY_TOTAL" + base_key = f"{test_type}_BASE_LATENCY_TOTAL" + + achieved = config.get(latency_key, 0) + baseline = config.get(base_key, 0) + + if achieved <= 0: + return 0.0 + + return baseline / achieved + + +def calculate_nki_flop_ratio(bs: int, nheads: int, seq_len: int, d: int, is_backward: bool = False) -> float: + """ + Calculate the NKI FLOP ratio based on kernel characteristics. + This estimates what percentage of operations are executed on NKI hardware. 
+ + Args: + bs: Batch size + nheads: Number of attention heads + seq_len: Sequence length + d: Head dimension + is_backward: Whether this is a backward pass calculation + + Returns: + float: The estimated NKI FLOP ratio (0.0 to 1.0) + """ + # Calculate total FLOPs for attention + # For forward pass: 2 * bs * nheads * seq_len * seq_len * d + # For backward pass: ~3x the forward pass + + # Calculate FLOPs for different components + qk_bmm_flops = 2 * bs * nheads * seq_len * seq_len * d # Q*K^T matrix multiply + attn_v_bmm_flops = 2 * bs * nheads * seq_len * seq_len * d # Attention * V matrix multiply + softmax_flops = bs * nheads * seq_len * seq_len * 5 # Softmax operations (exp, sum, div) + + # Total FLOPs for forward pass + total_forward_flops = qk_bmm_flops + attn_v_bmm_flops + softmax_flops + + # For backward pass, we need gradients for Q, K, V + if is_backward: + # Backward pass has additional operations for gradients + dq_flops = 2 * bs * nheads * seq_len * seq_len * d # dQ calculation + dk_flops = 2 * bs * nheads * seq_len * seq_len * d # dK calculation + dv_flops = 2 * bs * nheads * seq_len * seq_len * d # dV calculation + dsoftmax_flops = bs * nheads * seq_len * seq_len * 10 # Softmax gradient operations + + total_flops = total_forward_flops + dq_flops + dk_flops + dv_flops + dsoftmax_flops + else: + total_flops = total_forward_flops + + # Estimate NKI accelerated operations + # Matrix multiplies and most vector operations can be accelerated + nki_accelerated_flops = qk_bmm_flops + attn_v_bmm_flops + + if is_backward: + nki_accelerated_flops += dq_flops + dk_flops + dv_flops + + # Some softmax operations can be accelerated too + nki_accelerated_flops += softmax_flops * 0.7 # Assume 70% of softmax ops are accelerated + + if is_backward: + nki_accelerated_flops += dsoftmax_flops * 0.7 + + # Calculate the ratio + nki_flop_ratio = nki_accelerated_flops / total_flops + + # Apply some adjustments based on empirical observations + # Larger batch sizes and head dimensions tend to have better utilization + batch_factor = min(1.0, 0.8 + (bs * 0.05)) # Increases with batch size + head_factor = min(1.0, 0.8 + (d / 256) * 0.2) # Increases with head dimension + + # Sequence length affects utilization - very long sequences may have lower utilization + seq_factor = 1.0 + if seq_len > 8192: + seq_factor = 0.95 # Slight reduction for very long sequences + + # Apply the adjustments + adjusted_ratio = nki_flop_ratio * batch_factor * head_factor * seq_factor + + # Ensure the ratio is between 0.0 and 1.0 + return max(0.0, min(1.0, adjusted_ratio)) + +def get_config_value(key: str, default=None) -> Any: + """Get a specific configuration value""" + config = load_config() + return config.get(key, default) \ No newline at end of file diff --git a/src/self-attention/tests/test_flash_attn_bwd.py b/src/self-attention/tests/test_flash_attn_bwd.py index 1f948bf..7c75f35 100644 --- a/src/self-attention/tests/test_flash_attn_bwd.py +++ b/src/self-attention/tests/test_flash_attn_bwd.py @@ -6,13 +6,14 @@ import os import logging import time -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, Any import numpy as np sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from attention import flash_attn_bwd from neuronxcc.nki import benchmark, baremetal, simulate_kernel import neuronxcc.nki.language as nl +from config import update_config, load_config, add_test_result, calculate_nki_flop_ratio # Configure logging for verbose output logging.basicConfig(level=logging.INFO, 
format='%(asctime)s - %(levelname)s - %(message)s') @@ -320,6 +321,49 @@ def test_flash_attn_bwd_perf(self, bs, nheads, seqlen, d, dtype, latency): latency_res = bench_func_.benchmark_result.nc_latency p50_latency = print_performance_metrics(latency_res, latency, "Flash Attention Backward") + # Update performance metrics in config file + print("\n📊 Adding latency metrics to config file...") + # Initialize variables to track total latency + achieved_latency = p50_latency if p50_latency is not None else 0 + + # If p50_latency is None, try to get mean latency + if achieved_latency == 0 and hasattr(latency_res, 'mean'): + achieved_latency = latency_res.mean + print(f" Using mean latency: {achieved_latency:,} ns") + + # Update the config variables if we have valid latency data + if achieved_latency > 0: + # Calculate NKI FLOP ratio for this kernel configuration + nki_flop_ratio = calculate_nki_flop_ratio( + bs=bs, + nheads=nheads, + seq_len=seqlen, + d=d, + is_backward=True + ) + print(f" Calculated NKI FLOP ratio: {nki_flop_ratio:.4f}") + + # Create test parameters dictionary for logging + test_params = { + "batch_size": bs, + "num_heads": nheads, + "seq_len": seqlen, + "head_dim": d, + "dtype": str(dtype), + "nki_flop_ratio": nki_flop_ratio + } + + # Update the NKI FLOP ratio in the config + update_config({'NKI_FLOP_RATIO': nki_flop_ratio}) + + # Add the test result to our accumulated metrics + update_success = add_test_result('BWD', achieved_latency, latency, test_params) + print(f" Config update {'succeeded' if update_success else 'failed'}") + print(f" Added to BWD_LATENCY_TOTAL: {achieved_latency:,} ns") + print(f" Added to BWD_BASE_LATENCY_TOTAL: {latency:,} ns") + else: + print(" No valid latency data to update config") + # Final assertion with better error handling if p50_latency is not None: try: diff --git a/src/self-attention/tests/test_flash_attn_fwd.py b/src/self-attention/tests/test_flash_attn_fwd.py index 94feeef..425ab94 100644 --- a/src/self-attention/tests/test_flash_attn_fwd.py +++ b/src/self-attention/tests/test_flash_attn_fwd.py @@ -6,13 +6,14 @@ import os import logging import time -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, Any import numpy as np sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from attention import flash_fwd, FlashConfig from neuronxcc.nki import benchmark, baremetal, simulate_kernel import neuronxcc.nki.language as nl +from config import update_config, load_config, add_test_result, calculate_nki_flop_ratio # Configure logging for verbose output logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -264,6 +265,56 @@ def test_flash_attn_fwd_perf(self, bs, nheads, seqlen_q, seqlen_k, d, dtype, use latency_res = bench_func_.benchmark_result.nc_latency p50_latency = print_performance_metrics(latency_res, latency, "Flash Attention Forward") + # Update performance metrics in config file + print("\n📊 Adding latency metrics to config file...") + # Initialize variables to track total latency + achieved_latency = p50_latency if p50_latency is not None else 0 + + # If p50_latency is None, try to get mean latency + if achieved_latency == 0 and hasattr(latency_res, 'mean'): + achieved_latency = latency_res.mean + print(f" Using mean latency: {achieved_latency:,} ns") + + # Update the config variables if we have valid latency data + if achieved_latency > 0: + # Calculate NKI FLOP ratio for this kernel configuration + nki_flop_ratio = calculate_nki_flop_ratio( + 
bs=bs, + nheads=nheads, + seq_len=max(seqlen_q, seqlen_k), + d=d, + is_backward=False + ) + print(f" Calculated NKI FLOP ratio: {nki_flop_ratio:.4f}") + + # Create test parameters dictionary for logging + test_params = { + "batch_size": bs, + "num_heads": nheads, + "seq_len_q": seqlen_q, + "seq_len_k": seqlen_k, + "head_dim": d, + "dtype": str(dtype), + "causal_mask": use_causal_mask, + "mixed_precision": mixed_precision, + "training": training, + "tile_size": tile_size, + "kv_heads": kv_heads if kv_heads is not None else nheads, + "transpose_v": should_transpose_v, + "nki_flop_ratio": nki_flop_ratio + } + + # Update the NKI FLOP ratio in the config + update_config({'NKI_FLOP_RATIO': nki_flop_ratio}) + + # Add the test result to our accumulated metrics + update_success = add_test_result('FWD', achieved_latency, latency, test_params) + print(f" Config update {'succeeded' if update_success else 'failed'}") + print(f" Added to FWD_LATENCY_TOTAL: {achieved_latency:,} ns") + print(f" Added to FWD_BASE_LATENCY_TOTAL: {latency:,} ns") + else: + print(" No valid latency data to update config") + # Final assertion with better error handling if p50_latency is not None: try: From 6d1962e2bb171e685d133f5cc24194de242cc246 Mon Sep 17 00:00:00 2001 From: Arhamama-AMZ Date: Tue, 15 Jul 2025 19:23:32 +0000 Subject: [PATCH 59/65] feat: Doc implementation and script integration for self attention --- src/self-attention/scripts/self-attention_benchmark.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 src/self-attention/scripts/self-attention_benchmark.sh diff --git a/src/self-attention/scripts/self-attention_benchmark.sh b/src/self-attention/scripts/self-attention_benchmark.sh old mode 100644 new mode 100755 From 87d9fc150ea4faaae866d6888972499f06109ce8 Mon Sep 17 00:00:00 2001 From: Armando Diaz Date: Tue, 22 Jul 2025 17:38:19 +0000 Subject: [PATCH 60/65] docs: improve docs & deployment --- .gitignore | 1 + deployment/deployment.yaml | 2 + nki-llama.sh | 204 +++++++++++++++++++++++++++++++++-- src/self-attention/README.md | 3 + 4 files changed, 200 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index dbc6f0f..8ea5731 100644 --- a/.gitignore +++ b/.gitignore @@ -275,5 +275,6 @@ benchmark_results.json compiled_merged_model/ compiled_model/ merged_model/ +src/self-attention/config # End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python \ No newline at end of file diff --git a/deployment/deployment.yaml b/deployment/deployment.yaml index 587a6a2..28225f5 100644 --- a/deployment/deployment.yaml +++ b/deployment/deployment.yaml @@ -345,6 +345,8 @@ Resources: # Clone the repository with agents branch cd /home/ubuntu git clone https://github.com/aws-neuron/nki-llama.git + sudo chown -R ubuntu:ubuntu /home/ubuntu/nki-llama/ + git config --global --add safe.directory /home/ubuntu/nki-llama Outputs: VpcId: diff --git a/nki-llama.sh b/nki-llama.sh index b238da8..e58ad17 100755 --- a/nki-llama.sh +++ b/nki-llama.sh @@ -101,6 +101,25 @@ check_neuron_env() { fi } +# Check specific environment activation +check_specific_env() { + local required_env="$1" + if [[ -z "${VIRTUAL_ENV:-}" ]]; then + echo -e "${RED}❌ No virtual environment active${NC}" + echo -e "${YELLOW}Please activate: ${CYAN}source ${required_env}/bin/activate${NC}" + return 1 + elif [[ "$VIRTUAL_ENV" != "$required_env" ]]; then + echo -e "${RED}❌ Wrong environment active${NC}" + echo -e "${YELLOW}Current: ${VIRTUAL_ENV}${NC}" + echo -e "${YELLOW}Required: 
${required_env}${NC}" + echo -e "${YELLOW}Please activate: ${CYAN}source ${required_env}/bin/activate${NC}" + return 1 + else + echo -e "${GREEN}✓ Correct environment active${NC}" + return 0 + fi +} + # Initialize logging init_logging() { mkdir -p "$NKI_LOGS" @@ -131,6 +150,103 @@ run_script() { fi } +############################################################################### +# Self-Attention Commands +############################################################################### + +cmd_self_attention_test() { + echo -e "${BOLD}Running self-attention tests...${NC}" + + # Check environment + if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + return 1 + fi + + # Check if scripts directory exists + local SELF_ATTN_SCRIPTS="${SCRIPT_DIR}/src/self-attention/scripts" + if [[ ! -d "$SELF_ATTN_SCRIPTS" ]]; then + echo -e "${RED}❌ Self-attention scripts directory not found: $SELF_ATTN_SCRIPTS${NC}" + return 1 + fi + + # Change to scripts directory + cd "$SELF_ATTN_SCRIPTS" || { + echo -e "${RED}❌ Failed to navigate to self-attention scripts directory${NC}" + return 1 + } + + # Run the benchmark script + if [[ -f "./self-attention_benchmark.sh" ]]; then + echo -e "${CYAN}Running all self-attention tests...${NC}" + bash ./self-attention_benchmark.sh + else + echo -e "${RED}❌ self-attention_benchmark.sh not found${NC}" + return 1 + fi + + # Return to original directory + cd - > /dev/null +} + +cmd_self_attention_test_forward() { + echo -e "${BOLD}Running self-attention forward pass tests...${NC}" + + # Check environment + if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + return 1 + fi + + # Check if tests directory exists + local SELF_ATTN_TESTS="${SCRIPT_DIR}/src/self-attention/tests" + if [[ ! -d "$SELF_ATTN_TESTS" ]]; then + echo -e "${RED}❌ Self-attention tests directory not found: $SELF_ATTN_TESTS${NC}" + return 1 + fi + + # Run forward pass tests + echo -e "${CYAN}Running forward pass tests...${NC}" + cd "$SELF_ATTN_TESTS" || return 1 + pytest test_flash_attn_fwd.py -v -s + cd - > /dev/null +} + +cmd_self_attention_test_backward() { + echo -e "${BOLD}Running self-attention backward pass tests...${NC}" + + # Check environment + if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + return 1 + fi + + # Check if tests directory exists + local SELF_ATTN_TESTS="${SCRIPT_DIR}/nki-llama/src/self-attention/tests" + if [[ ! -d "$SELF_ATTN_TESTS" ]]; then + echo -e "${RED}❌ Self-attention tests directory not found: $SELF_ATTN_TESTS${NC}" + return 1 + fi + + # Run backward pass tests + echo -e "${CYAN}Running backward pass tests...${NC}" + cd "$SELF_ATTN_TESTS" || return 1 + pytest test_flash_attn_bwd.py -v -s + cd - > /dev/null +} + +cmd_self_attention_all() { + echo -e "${BOLD}Running complete self-attention validation...${NC}\n" + + # Check environment first + if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + return 1 + fi + + echo -e "${YELLOW}💡 This will run all self-attention tests to validate NKI kernels${NC}" + echo -e "${YELLOW} Tests include forward and backward pass validation${NC}" + echo -e "${YELLOW} This should be run before training or inference${NC}\n" + + cmd_self_attention_test +} + ############################################################################### # Fine-tuning Commands ############################################################################### @@ -213,6 +329,7 @@ cmd_finetune_all() { if [[ -z "${TMUX:-}" ]]; then echo -e "${YELLOW}⚠️ Not running in tmux. 
${BOLD}This is critical for the full pipeline!${NC}" echo -e "${YELLOW} The complete pipeline includes:${NC}" + echo -e "${YELLOW} • Self-attention validation${NC}" echo -e "${YELLOW} • Dependency installation${NC}" echo -e "${YELLOW} • Dataset download${NC}" echo -e "${YELLOW} • Model download${NC}" @@ -232,6 +349,9 @@ cmd_finetune_all() { fi fi + # Run self-attention tests first + echo -e "${YELLOW}⚠️ Running self-attention validation before training...${NC}" + cmd_self_attention_all && \ cmd_finetune_deps && \ cmd_finetune_data && \ cmd_finetune_model && \ @@ -322,13 +442,28 @@ cmd_inference_benchmark() { fi fi - bash "${NKI_INFERENCE_SCRIPTS}/run-nki-benchmark.sh" --mode "$mode" "${args[@]}" + # Run self-attention tests before benchmark + echo -e "${YELLOW}⚠️ Running self-attention validation before benchmark...${NC}" + if cmd_self_attention_all; then + bash "${NKI_INFERENCE_SCRIPTS}/run-nki-benchmark.sh" --mode "$mode" "${args[@]}" + else + echo -e "${RED}❌ Self-attention tests failed. Please fix issues before running benchmark.${NC}" + return 1 + fi } cmd_inference_server() { echo -e "${BOLD}Starting vLLM server...${NC}" - suggest_tmux "vLLM Server" "vllm-server" "inference server" - bash "${NKI_INFERENCE_SCRIPTS}/start-server.sh" + + # Run self-attention tests before server + echo -e "${YELLOW}⚠️ Running self-attention validation before starting server...${NC}" + if cmd_self_attention_all; then + suggest_tmux "vLLM Server" "vllm-server" "inference server" + bash "${NKI_INFERENCE_SCRIPTS}/start-server.sh" + else + echo -e "${RED}❌ Self-attention tests failed. Please fix issues before starting server.${NC}" + return 1 + fi } ############################################################################### @@ -344,6 +479,21 @@ cmd_status() { print_config echo + echo -e "${BOLD}Self-Attention Status:${NC}" + if [[ -d "${SCRIPT_DIR}/nki-llama/src/self-attention" ]]; then + echo -e "• Self-attention: ${GREEN}✓${NC}" + # Check if tests have been run recently + local TEST_LOG=$(find "$NKI_LOGS" -name "*self-attention*" -mtime -1 2>/dev/null | head -1) + if [[ -n "$TEST_LOG" ]]; then + echo -e "• Recent test: ${GREEN}✓${NC} (within 24h)" + else + echo -e "• Recent test: ${YELLOW}⚠${NC} (run ./nki-llama self-attention test)" + fi + else + echo -e "• Self-attention: ${RED}✗${NC}" + fi + echo + echo -e "${BOLD}Fine-tuning Status:${NC}" [[ -d "$DATASET_DIR" ]] && echo -e "• Dataset: ${GREEN}✓${NC}" || echo -e "• Dataset: ${YELLOW}⚠${NC}" [[ -d "$HF_WEIGHTS_DIR" ]] && echo -e "• Weights: ${GREEN}✓${NC}" || echo -e "• Weights: ${YELLOW}⚠${NC}" @@ -459,6 +609,13 @@ show_help() { echo -e " ./nki-llama jupyter - Start Jupyter Lab" echo + echo -e "${CYAN}Self-Attention Commands:${NC}" + echo -e " ./nki-llama self-attention test - Run all tests" + echo -e " ./nki-llama self-attention forward - Test forward pass only" + echo -e " ./nki-llama self-attention backward - Test backward pass only" + echo -e " ./nki-llama self-attention all - Complete validation" + echo + echo -e "${CYAN}Fine-tuning Commands:${NC}" echo -e " ./nki-llama finetune deps - Install dependencies" echo -e " ./nki-llama finetune data - Download training dataset" @@ -503,14 +660,22 @@ show_help() { echo echo -e "${CYAN}Environment Setup:${NC}" - echo -e " Fine-tuning: source ${NEURON_VENV}/bin/activate" - echo -e " Inference: source ${NEURON_INFERENCE_VENV}/bin/activate" + echo -e " Self-attention: source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate" + echo -e " Fine-tuning: source ${NEURON_VENV}/bin/activate" + echo -e " Inference: 
source ${NEURON_INFERENCE_VENV}/bin/activate" + echo + + echo -e "${CYAN}Recommended Workflow:${NC}" + echo -e " 1. Run self-attention tests to validate NKI kernels" + echo -e " 2. Run fine-tuning or inference as needed" + echo -e " 3. Self-attention tests are automatically run before training/inference" echo echo -e "${CYAN}Troubleshooting:${NC}" echo -e " • Always use tmux for long operations (compile, train, benchmark)" echo -e " • If benchmark fails with cache errors, use --clear-cache" echo -e " • Check status to see if compilation cache has failed entries" + echo -e " • Ensure correct environment is activated for self-attention tests" echo } @@ -544,21 +709,25 @@ EOF # Show quick start echo -e "${BOLD}Quick Start Guide:${NC}" echo -e "1. Edit .env file with your Hugging Face token" - echo -e "2. For fine-tuning:" + echo -e "2. For self-attention testing:" + echo -e " ${CYAN}source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate${NC}" + echo -e " ${CYAN}./nki-llama self-attention test${NC}" + echo -e "3. For fine-tuning:" echo -e " ${CYAN}source ${NEURON_VENV}/bin/activate${NC}" echo -e " ${CYAN}tmux new -s training # ${YELLOW}IMPORTANT: Use tmux!${NC}" echo -e " ${CYAN}./nki-llama finetune all${NC}" - echo -e "3. For model benchmarking:" + echo -e "4. For model benchmarking:" echo -e " ${CYAN}source ${NEURON_INFERENCE_VENV}/bin/activate${NC}" echo -e " ${CYAN}./nki-llama inference download${NC}" echo -e " ${CYAN}tmux new -s benchmark # ${YELLOW}IMPORTANT: Use tmux!${NC}" echo -e " ${CYAN}./nki-llama inference benchmark # Full benchmark${NC}" echo -e " ${CYAN}./nki-llama inference benchmark single # Quick test${NC}" - echo -e "4. For inference serving:" + echo -e "5. For inference serving:" echo -e " ${CYAN}./nki-llama inference setup${NC}" echo -e " ${CYAN}./nki-llama inference server${NC}" echo echo -e "${YELLOW}💡 Pro Tips:${NC}" + echo -e " • Self-attention tests validate NKI kernels before use" echo -e " • Always use tmux for long operations" echo -e " • Check ./nki-llama status for system health" echo -e " • Use --clear-cache if benchmark fails with cache errors" @@ -577,7 +746,7 @@ main() { # Initialize logging for actual operations case "${1:-help}" in - finetune|inference|train|server|clean) + finetune|inference|train|server|clean|self-attention) init_logging ;; esac @@ -603,6 +772,21 @@ main() { bash "${NKI_INFERENCE_SCRIPTS}/jupyter.sh" "$@" ;; + # Self-attention commands + self-attention) + subcmd="${1:-all}" + shift || true + case "$subcmd" in + test|forward|backward|all) + cmd_self_attention_"$subcmd" "$@" + ;; + *) + echo -e "${RED}Unknown self-attention command: $subcmd${NC}" + show_help + ;; + esac + ;; + # Fine-tuning commands finetune) subcmd="${1:-all}" @@ -655,4 +839,4 @@ main() { } # Run main -main "$@" +main "$@" \ No newline at end of file diff --git a/src/self-attention/README.md b/src/self-attention/README.md index 36b59a2..b29674c 100644 --- a/src/self-attention/README.md +++ b/src/self-attention/README.md @@ -97,6 +97,9 @@ dq, dk, dv = flash_attn_bwd[batch_size, heads]( Run the tests to validate performance and numerical accuracy: ```bash +# Activate python environment +source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate + # Navigate to the tests directory cd nki-llama/src/self-attention/scripts From b1923bafb720e4bef039d4318ba58921034d4619 Mon Sep 17 00:00:00 2001 From: Armando Diaz Date: Tue, 22 Jul 2025 20:54:02 +0000 Subject: [PATCH 61/65] fix: add a fixed release for the dependecies with neuron --- .gitignore | 1 + deployment/deployment.yaml | 2 +- 
nki-llama.sh | 314 +++++++++++-------- src/fine-tune/scripts/bootstrap.sh | 2 +- src/fine-tune/scripts/convert_checkpoints.sh | 2 +- src/fine-tune/scripts/download_model.sh | 93 +++--- src/fine-tune/scripts/merge_checkpoints.py | 172 ++++++++-- 7 files changed, 394 insertions(+), 192 deletions(-) diff --git a/.gitignore b/.gitignore index 8ea5731..12bf41a 100644 --- a/.gitignore +++ b/.gitignore @@ -276,5 +276,6 @@ compiled_merged_model/ compiled_model/ merged_model/ src/self-attention/config +requirements.txt.** # End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python \ No newline at end of file diff --git a/deployment/deployment.yaml b/deployment/deployment.yaml index 28225f5..f71d6a1 100644 --- a/deployment/deployment.yaml +++ b/deployment/deployment.yaml @@ -344,7 +344,7 @@ Resources: # Clone the repository with agents branch cd /home/ubuntu - git clone https://github.com/aws-neuron/nki-llama.git + git clone -b agents https://github.com/arm-diaz/nki-llama.git sudo chown -R ubuntu:ubuntu /home/ubuntu/nki-llama/ git config --global --add safe.directory /home/ubuntu/nki-llama diff --git a/nki-llama.sh b/nki-llama.sh index e58ad17..fc8c284 100755 --- a/nki-llama.sh +++ b/nki-llama.sh @@ -101,25 +101,6 @@ check_neuron_env() { fi } -# Check specific environment activation -check_specific_env() { - local required_env="$1" - if [[ -z "${VIRTUAL_ENV:-}" ]]; then - echo -e "${RED}❌ No virtual environment active${NC}" - echo -e "${YELLOW}Please activate: ${CYAN}source ${required_env}/bin/activate${NC}" - return 1 - elif [[ "$VIRTUAL_ENV" != "$required_env" ]]; then - echo -e "${RED}❌ Wrong environment active${NC}" - echo -e "${YELLOW}Current: ${VIRTUAL_ENV}${NC}" - echo -e "${YELLOW}Required: ${required_env}${NC}" - echo -e "${YELLOW}Please activate: ${CYAN}source ${required_env}/bin/activate${NC}" - return 1 - else - echo -e "${GREEN}✓ Correct environment active${NC}" - return 0 - fi -} - # Initialize logging init_logging() { mkdir -p "$NKI_LOGS" @@ -154,97 +135,204 @@ run_script() { # Self-Attention Commands ############################################################################### -cmd_self_attention_test() { - echo -e "${BOLD}Running self-attention tests...${NC}" +# Check if self-attention environment is active +check_self_attention_env() { + if [[ -z "${VIRTUAL_ENV:-}" ]]; then + echo -e "${RED}❌ No virtual environment active${NC}" + echo -e "${YELLOW}Please activate the environment:${NC}" + echo -e "${CYAN}source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate${NC}" + return 1 + elif [[ "$VIRTUAL_ENV" == *"pytorch_2_6"* ]]; then + echo -e "${GREEN}✓ Self-attention environment active${NC}" + return 0 + else + echo -e "${YELLOW}⚠️ Wrong environment active: ${VIRTUAL_ENV}${NC}" + echo -e "${YELLOW}Please activate the correct environment:${NC}" + echo -e "${CYAN}source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate${NC}" + return 1 + fi +} + +cmd_self_attention_benchmark() { + echo -e "${BOLD}Running self-attention benchmarks...${NC}" # Check environment - if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + if ! check_self_attention_env; then return 1 fi - # Check if scripts directory exists - local SELF_ATTN_SCRIPTS="${SCRIPT_DIR}/src/self-attention/scripts" - if [[ ! -d "$SELF_ATTN_SCRIPTS" ]]; then - echo -e "${RED}❌ Self-attention scripts directory not found: $SELF_ATTN_SCRIPTS${NC}" - return 1 + # Check if we're in tmux + if [[ -z "${TMUX:-}" ]]; then + echo -e "${YELLOW}⚠️ Not running in tmux. 
${BOLD}This is important for benchmarking!${NC}" + echo -e "${YELLOW} Benchmarks can take considerable time to complete.${NC}" + echo -e "${YELLOW} Disconnections will terminate the process.${NC}" + echo + echo -e " ${CYAN}tmux new -s self-attention${NC}" + echo -e " ${CYAN}./nki-llama self-attention benchmark${NC}" + echo + read -p "Continue without tmux? [y/N] " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${BLUE}Please start tmux with: ${CYAN}tmux new -s self-attention${NC}" + exit 0 + fi fi - # Change to scripts directory - cd "$SELF_ATTN_SCRIPTS" || { - echo -e "${RED}❌ Failed to navigate to self-attention scripts directory${NC}" + # Navigate to scripts directory and run benchmark + local self_attention_dir="${SCRIPT_DIR}/src/self-attention" + + if [[ ! -d "$self_attention_dir/scripts" ]]; then + echo -e "${RED}❌ Self-attention scripts directory not found: $self_attention_dir/scripts${NC}" return 1 - } + fi + + cd "$self_attention_dir/scripts" - # Run the benchmark script if [[ -f "./self-attention_benchmark.sh" ]]; then - echo -e "${CYAN}Running all self-attention tests...${NC}" - bash ./self-attention_benchmark.sh + echo -e "${MAGENTA}▶ Running: self-attention_benchmark.sh${NC}" + bash ./self-attention_benchmark.sh "$@" else - echo -e "${RED}❌ self-attention_benchmark.sh not found${NC}" + echo -e "${RED}❌ Benchmark script not found: ./self-attention_benchmark.sh${NC}" return 1 fi - - # Return to original directory - cd - > /dev/null } -cmd_self_attention_test_forward() { - echo -e "${BOLD}Running self-attention forward pass tests...${NC}" +cmd_self_attention_test() { + echo -e "${BOLD}Running self-attention tests...${NC}" # Check environment - if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + if ! check_self_attention_env; then return 1 fi - # Check if tests directory exists - local SELF_ATTN_TESTS="${SCRIPT_DIR}/src/self-attention/tests" - if [[ ! -d "$SELF_ATTN_TESTS" ]]; then - echo -e "${RED}❌ Self-attention tests directory not found: $SELF_ATTN_TESTS${NC}" + local test_type="${1:-all}" + shift || true + + # Navigate to self-attention directory + local self_attention_dir="${SCRIPT_DIR}/src/self-attention" + + if [[ ! -d "$self_attention_dir/tests" ]]; then + echo -e "${RED}❌ Self-attention tests directory not found: $self_attention_dir/tests${NC}" return 1 fi - # Run forward pass tests - echo -e "${CYAN}Running forward pass tests...${NC}" - cd "$SELF_ATTN_TESTS" || return 1 - pytest test_flash_attn_fwd.py -v -s - cd - > /dev/null + cd "$self_attention_dir" + + case "$test_type" in + all) + echo -e "${CYAN}Running all self-attention tests...${NC}" + if command -v pytest &> /dev/null; then + pytest tests/ -v -s "$@" + else + echo -e "${RED}❌ pytest not found. 
Please install pytest.${NC}" + return 1 + fi + ;; + forward|fwd) + echo -e "${CYAN}Running forward pass tests...${NC}" + if [[ -f "tests/test_flash_attn_fwd.py" ]]; then + pytest tests/test_flash_attn_fwd.py -v -s "$@" + else + echo -e "${RED}❌ Forward test file not found: tests/test_flash_attn_fwd.py${NC}" + return 1 + fi + ;; + backward|bwd) + echo -e "${CYAN}Running backward pass tests...${NC}" + if [[ -f "tests/test_flash_attn_bwd.py" ]]; then + pytest tests/test_flash_attn_bwd.py -v -s "$@" + else + echo -e "${RED}❌ Backward test file not found: tests/test_flash_attn_bwd.py${NC}" + return 1 + fi + ;; + *) + echo -e "${RED}Unknown test type: $test_type${NC}" + echo -e "Available: all, forward (fwd), backward (bwd)" + return 1 + ;; + esac } -cmd_self_attention_test_backward() { - echo -e "${BOLD}Running self-attention backward pass tests...${NC}" +cmd_self_attention_run() { + echo -e "${BOLD}Running self-attention script...${NC}" # Check environment - if ! check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then + if ! check_self_attention_env; then return 1 fi - # Check if tests directory exists - local SELF_ATTN_TESTS="${SCRIPT_DIR}/nki-llama/src/self-attention/tests" - if [[ ! -d "$SELF_ATTN_TESTS" ]]; then - echo -e "${RED}❌ Self-attention tests directory not found: $SELF_ATTN_TESTS${NC}" + local script_name="$1" + shift || true + + if [[ -z "$script_name" ]]; then + echo -e "${RED}❌ No script specified${NC}" + echo -e "Usage: ./nki-llama self-attention run [args...]" return 1 fi - # Run backward pass tests - echo -e "${CYAN}Running backward pass tests...${NC}" - cd "$SELF_ATTN_TESTS" || return 1 - pytest test_flash_attn_bwd.py -v -s - cd - > /dev/null + # Navigate to scripts directory + local self_attention_dir="${SCRIPT_DIR}/src/self-attention" + cd "$self_attention_dir/scripts" + + if [[ -f "./${script_name}" ]]; then + echo -e "${MAGENTA}▶ Running: ${script_name}${NC}" + bash "./${script_name}" "$@" + elif [[ -f "./${script_name}.sh" ]]; then + echo -e "${MAGENTA}▶ Running: ${script_name}.sh${NC}" + bash "./${script_name}.sh" "$@" + else + echo -e "${RED}❌ Script not found: ${script_name}${NC}" + echo -e "Available scripts in $self_attention_dir/scripts:" + ls -1 *.sh 2>/dev/null || echo "No .sh scripts found" + return 1 + fi } -cmd_self_attention_all() { - echo -e "${BOLD}Running complete self-attention validation...${NC}\n" +cmd_self_attention_status() { + echo -e "${BOLD}Self-Attention Status:${NC}" - # Check environment first - if ! 
check_specific_env "/opt/aws_neuronx_venv_pytorch_2_6"; then - return 1 + # Check environment + check_self_attention_env || true + echo + + # Check directories + local self_attention_dir="${SCRIPT_DIR}/src/self-attention" + + echo -e "${BOLD}Directory Structure:${NC}" + [[ -d "$self_attention_dir" ]] && echo -e "• Base directory: ${GREEN}✓${NC}" || echo -e "• Base directory: ${RED}✗${NC}" + [[ -d "$self_attention_dir/scripts" ]] && echo -e "• Scripts: ${GREEN}✓${NC}" || echo -e "• Scripts: ${RED}✗${NC}" + [[ -d "$self_attention_dir/tests" ]] && echo -e "• Tests: ${GREEN}✓${NC}" || echo -e "• Tests: ${RED}✗${NC}" + + # Check for benchmark script + if [[ -f "$self_attention_dir/scripts/self-attention_benchmark.sh" ]]; then + echo -e "• Benchmark script: ${GREEN}✓${NC}" + else + echo -e "• Benchmark script: ${RED}✗${NC}" + fi + + # Check for test files + echo -e "\n${BOLD}Test Files:${NC}" + if [[ -f "$self_attention_dir/tests/test_flash_attn_fwd.py" ]]; then + echo -e "• Forward tests: ${GREEN}✓${NC}" + else + echo -e "• Forward tests: ${RED}✗${NC}" fi - echo -e "${YELLOW}💡 This will run all self-attention tests to validate NKI kernels${NC}" - echo -e "${YELLOW} Tests include forward and backward pass validation${NC}" - echo -e "${YELLOW} This should be run before training or inference${NC}\n" + if [[ -f "$self_attention_dir/tests/test_flash_attn_bwd.py" ]]; then + echo -e "• Backward tests: ${GREEN}✓${NC}" + else + echo -e "• Backward tests: ${RED}✗${NC}" + fi - cmd_self_attention_test + # Check for pytest + if command -v pytest &> /dev/null; then + echo -e "\n${BOLD}Dependencies:${NC}" + echo -e "• pytest: ${GREEN}✓${NC} ($(pytest --version 2>&1 | head -1))" + else + echo -e "\n${BOLD}Dependencies:${NC}" + echo -e "• pytest: ${RED}✗${NC} (not installed)" + fi } ############################################################################### @@ -329,7 +417,6 @@ cmd_finetune_all() { if [[ -z "${TMUX:-}" ]]; then echo -e "${YELLOW}⚠️ Not running in tmux. ${BOLD}This is critical for the full pipeline!${NC}" echo -e "${YELLOW} The complete pipeline includes:${NC}" - echo -e "${YELLOW} • Self-attention validation${NC}" echo -e "${YELLOW} • Dependency installation${NC}" echo -e "${YELLOW} • Dataset download${NC}" echo -e "${YELLOW} • Model download${NC}" @@ -349,9 +436,6 @@ cmd_finetune_all() { fi fi - # Run self-attention tests first - echo -e "${YELLOW}⚠️ Running self-attention validation before training...${NC}" - cmd_self_attention_all && \ cmd_finetune_deps && \ cmd_finetune_data && \ cmd_finetune_model && \ @@ -442,28 +526,13 @@ cmd_inference_benchmark() { fi fi - # Run self-attention tests before benchmark - echo -e "${YELLOW}⚠️ Running self-attention validation before benchmark...${NC}" - if cmd_self_attention_all; then - bash "${NKI_INFERENCE_SCRIPTS}/run-nki-benchmark.sh" --mode "$mode" "${args[@]}" - else - echo -e "${RED}❌ Self-attention tests failed. Please fix issues before running benchmark.${NC}" - return 1 - fi + bash "${NKI_INFERENCE_SCRIPTS}/run-nki-benchmark.sh" --mode "$mode" "${args[@]}" } cmd_inference_server() { echo -e "${BOLD}Starting vLLM server...${NC}" - - # Run self-attention tests before server - echo -e "${YELLOW}⚠️ Running self-attention validation before starting server...${NC}" - if cmd_self_attention_all; then - suggest_tmux "vLLM Server" "vllm-server" "inference server" - bash "${NKI_INFERENCE_SCRIPTS}/start-server.sh" - else - echo -e "${RED}❌ Self-attention tests failed. 
Please fix issues before starting server.${NC}" - return 1 - fi + suggest_tmux "vLLM Server" "vllm-server" "inference server" + bash "${NKI_INFERENCE_SCRIPTS}/start-server.sh" } ############################################################################### @@ -480,18 +549,10 @@ cmd_status() { echo echo -e "${BOLD}Self-Attention Status:${NC}" - if [[ -d "${SCRIPT_DIR}/nki-llama/src/self-attention" ]]; then - echo -e "• Self-attention: ${GREEN}✓${NC}" - # Check if tests have been run recently - local TEST_LOG=$(find "$NKI_LOGS" -name "*self-attention*" -mtime -1 2>/dev/null | head -1) - if [[ -n "$TEST_LOG" ]]; then - echo -e "• Recent test: ${GREEN}✓${NC} (within 24h)" - else - echo -e "• Recent test: ${YELLOW}⚠${NC} (run ./nki-llama self-attention test)" - fi - else - echo -e "• Self-attention: ${RED}✗${NC}" - fi + local self_attention_dir="${SCRIPT_DIR}/nki-llama/src/self-attention" + [[ -d "$self_attention_dir" ]] && echo -e "• Module: ${GREEN}✓${NC}" || echo -e "• Module: ${YELLOW}⚠${NC}" + [[ -d "$self_attention_dir/scripts" ]] && echo -e "• Scripts: ${GREEN}✓${NC}" || echo -e "• Scripts: ${YELLOW}⚠${NC}" + [[ -d "$self_attention_dir/tests" ]] && echo -e "• Tests: ${GREEN}✓${NC}" || echo -e "• Tests: ${YELLOW}⚠${NC}" echo echo -e "${BOLD}Fine-tuning Status:${NC}" @@ -610,10 +671,12 @@ show_help() { echo echo -e "${CYAN}Self-Attention Commands:${NC}" - echo -e " ./nki-llama self-attention test - Run all tests" - echo -e " ./nki-llama self-attention forward - Test forward pass only" - echo -e " ./nki-llama self-attention backward - Test backward pass only" - echo -e " ./nki-llama self-attention all - Complete validation" + echo -e " ./nki-llama self-attention benchmark - Run all benchmarks" + echo -e " ./nki-llama self-attention test - Run all tests" + echo -e " ./nki-llama self-attention test forward - Run forward pass tests" + echo -e " ./nki-llama self-attention test backward - Run backward pass tests" + echo -e " ./nki-llama self-attention run