diff --git a/.env.example b/.env.example
index c65920b..0bcc739 100644
--- a/.env.example
+++ b/.env.example
@@ -1,13 +1,24 @@
-# Model configuration
-## HuggingFace Model ID (https://huggingface.co/meta-llama/Meta-Llama-3-8B)
+# Example environment file for NKI-LLAMA
+# Copy this to .env and update with your values
+
+# Hugging Face Configuration
+HF_TOKEN=your_huggingface_token_here
MODEL_ID=meta-llama/Meta-Llama-3-8B
-## Short name for model ID
-MODEL_NAME=meta-llama-3-8b
+MODEL_NAME=llama-3-8b
+
+# Inference Configuration
+INFERENCE_PORT=8080
+MAX_MODEL_LEN=8192  # used by vLLM; keep this equal to SEQ_LEN
+SEQ_LEN=8192        # used by main.py
+
+MAX_NUM_SEQS=4
+TENSOR_PARALLEL_SIZE=8
+
+# Dataset Configuration
+DATASET_NAME=databricks/databricks-dolly-15k
-# Server configurations
-PORT=8080
-MAX_MODEL_LEN=2048
-TENSOR_PARALLEL_SIZE=32
+# Neuron Configuration
+NEURON_RT_NUM_CORES=8
-# HuggingFace token for downloading models
-HF_TOKEN=your_token_here
\ No newline at end of file
+# Jupyter Configuration
+JUPYTER_PORT=8888
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 960e4eb..e0b4ed2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -267,6 +267,16 @@ test/inference/output
**/neuronxcc-*
global_metric_store.json
benchmark_report.json
+benchmark_inference.json
cached_requirements.txt
+benchmark_finetuning.json
+benchmark_results.json
+**/logs/
+compiled_merged_model/
+compiled_model/
+merged_model/
+src/self-attention/config
+requirements.txt.**
+model_env.sh
# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python
\ No newline at end of file
diff --git a/Makefile b/Makefile
deleted file mode 100644
index b5468d7..0000000
--- a/Makefile
+++ /dev/null
@@ -1,127 +0,0 @@
-# Top-level Makefile for coordinating fine-tuning and inference
-
--include .env
-
-SHELL := /bin/bash
-
-# Define paths to subproject directories
-FINETUNE_DIR = ./src/fine-tune
-INFERENCE_DIR = ./src/inference
-
-# Default target
-.PHONY: all
-all: help
-
-# Help message
-.PHONY: help
-help:
- @echo "Top-level Makefile for managing fine-tuning and inference"
- @echo ""
- @echo "Available targets:"
- @echo " help - Show this help message"
- @echo ""
- @echo " finetune - Run all fine-tuning steps"
- @echo " finetune-deps - Install fine-tuning dependencies"
- @echo " finetune-data - Download datasets for fine-tuning"
- @echo " finetune-model - Download model for fine-tuning"
- @echo " finetune-convert - Convert checkpoint to NxDT format"
- @echo " finetune-precompile - Pre-compile graphs (AOT)"
- @echo " finetune-train - Run fine-tuning job"
- @echo " finetune-clean - Clean up fine-tuning files"
- @echo ""
- @echo " inference - Run inference (shortcut to infer target)"
- @echo " inference-show-env - Display environment variables loaded from .env file"
- @echo " inference-setup - Setup vLLM for Neuron"
- @echo " inference-jupyter - Setup Jupyter environment"
- @echo " inference-download - Download model from Hugging Face"
- @echo " inference-infer - Run inference in generate mode"
- @echo " inference-evaluate - Run inference in evaluate mode"
- @echo " inference-server - Start vLLM OpenAI-compatible API server"
- @echo " inference-lab - Run Jupyter Lab server"
- @echo " inference-clean - Clean up inference files"
- @echo ""
- @echo " clean - Clean up all generated files"
- @echo ""
- @echo "Environment requirements:"
- @echo " - For inference: source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate"
- @echo " - For fine-tuning: source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate"
- @echo " - For Jupyter: source venv/bin/activate"
-
-# Check if in Neuron virtual environment
-.PHONY: check-neuron-venv
-check-neuron-venv:
- @if [ -z "$$VIRTUAL_ENV" ] || [[ "$$VIRTUAL_ENV" != *"neuronx"* ]]; then \
- echo "Error: Not in Neuron virtual environment."; \
- echo "Run 'source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate' first."; \
- exit 1; \
- else \
- echo "Using Neuron virtual environment: $$VIRTUAL_ENV"; \
- fi
-
-# Fine-tuning targets
-.PHONY: finetune finetune-deps finetune-data finetune-model finetune-convert finetune-precompile finetune-train finetune-clean
-
-finetune:
- $(MAKE) -C $(FINETUNE_DIR)
-
-finetune-deps:
- $(MAKE) -C $(FINETUNE_DIR) deps
-
-finetune-data:
- $(MAKE) -C $(FINETUNE_DIR) data
-
-finetune-model:
- $(MAKE) -C $(FINETUNE_DIR) model
-
-finetune-convert:
- $(MAKE) -C $(FINETUNE_DIR) convert_ckpt
-
-finetune-precompile:
- $(MAKE) -C $(FINETUNE_DIR) precompile
-
-finetune-train:
- $(MAKE) -C $(FINETUNE_DIR) train
-
-finetune-clean:
- $(MAKE) -C $(FINETUNE_DIR) clean
-
-# Inference targets
-.PHONY: inference inference-setup inference-jupyter inference-download inference-infer inference-evaluate inference-server inference-lab inference-clean inference-show-env inference-evaluate-all
-
-inference:
- $(MAKE) -C $(INFERENCE_DIR) infer
-
-inference-show-env:
- $(MAKE) -C $(INFERENCE_DIR) show-env
-
-inference-setup:
- $(MAKE) -C $(INFERENCE_DIR) setup-vllm
-
-inference-jupyter:
- $(MAKE) -C $(INFERENCE_DIR) setup-jupyter
-
-inference-download:
- $(MAKE) -C $(INFERENCE_DIR) download
-
-inference-infer:
- $(MAKE) -C $(INFERENCE_DIR) infer
-
-inference-evaluate:
- $(MAKE) -C $(INFERENCE_DIR) evaluate
-
-inference-evaluate-all:
- $(MAKE) -C $(INFERENCE_DIR) evaluate-all
-
-inference-server:
- $(MAKE) -C $(INFERENCE_DIR) start-server
-
-inference-lab:
- $(MAKE) -C $(INFERENCE_DIR) jupyter
-
-inference-clean:
- $(MAKE) -C $(INFERENCE_DIR) clean
-
-# Clean all
-.PHONY: clean
-clean: finetune-clean inference-clean
- @echo "Cleaned all subprojects"
\ No newline at end of file
diff --git a/README.md b/README.md
index ec13cc6..5f14601 100644
--- a/README.md
+++ b/README.md
@@ -1,272 +1,175 @@
-# NKI Llama
+# 🚀 NKI-LLAMA Hackathon: Getting Started Guide
-A unified project for fine-tuning, inference, and agent development of Llama models on AWS Trainium and Inferentia.
+Welcome to the **NKI-LLAMA Hackathon**! This guide will help you navigate the documentation and choose the best path for your hackathon journey.
+## 🎯 Welcome Hackathon Participants!
-## Project Workflow
+You're about to embark on an exciting challenge to optimize LLaMA models using AWS Neuron's cutting-edge NKI (Neuron Kernel Interface) technology. Whether you're focusing on training, inference, or both, we've prepared guides to help you succeed.
-```
-┌──────────────────┐     ┌──────────────────┐     ┌──────────────────┐
-│                  │     │                  │     │                  │
-│    Fine-tune     │────▶│    Inference     │────▶│      Agent       │
-│                  │     │                  │     │   Development    │
-│                  │     │                  │     │                  │
-└──────────────────┘     └──────────────────┘     └──────────────────┘
-```
-
-This project follows a three-stage workflow:
-1. **Fine-tune** a model using Neuron hardware with NxD
-2. **Inference** using the fine-tuned model with vLLM, NKI compilation, and NxDI (Neuron Distributed Inference)
-3. **Agent Development** using LangChain/LangGraph connected to your model
-
-## Technical Infrastructure
-
-### Compute Resources
-- **Required Instance**: trn1.32xlarge
-- **Base AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) with Neuron SDK 2.23.
-- **Base Packages**:
- - NxD (NeuronX Distributed Training)
- - NKI (Neuron Kernel Interface)
- - NxDI (Neuron Distributed Inference)
+## 📍 Choose Your Path
-## Project Structure
+We've created four specialized guides based on your optimization focus:
-This repository contains three main components:
-- **Fine-tuning**: Tools for fine-tuning LLMs on Neuron hardware using NxD
-- **Inference**: Infrastructure for efficient inference using vLLM with NKI compilation and NxDI optimization
-- **Agent Development**: Building intelligent agents with LangChain/LangGraph
+### 1. ⚡ [Flash Self-Attention Kernel Optimization Guide](./docs/self-attention.md)
+**Great for teams getting started with kernel optimizations**
+- Improve the performance of the Flash Attention forward and backward kernels
+- Analyze the performance and numerical accuracy of your kernel implementations
+- Further optimize attention kernels
+- **Score Focus**: Self-Attention test latency and correctness
-## Setup Steps
+### 2. 🚀 [Inference with NKI Guide](./docs/inference.md)
+**Ideal for teams targeting inference performance**
+- Minimize latency with NKI-optimized kernels
+- Maximize throughput for production serving
+- Implement custom kernels for attention, normalization, and more
+- **Score Focus**: Inference latency and throughput
-1. Create a Trainium instance with AWS Neuron SDK v2.23 using EC2 with the following settings:
- 1. **Name:** nki-llama
- 2. **AMI:** Deep Learning AMI Neuron (Ubuntu 22.04)
- 3. **Instance type:** trn1.32xlarge
- 4. **Key pair (login):** create a new key pair
- 5. When connecting to these instances via SSH, use the username of *ubuntu*.
+### 3. 🏋️ [Fine-tuning Guide](./docs/fine-tuning.md)
+**Perfect for teams focusing on training optimization**
+- Optimize Model FLOP Utilization (MFU) during training
+- Implement NKI kernels for training operations
+- Achieve high throughput with NeuronX Distributed
+- **Score Focus**: Training performance metrics
-2. Clone this repository and navigate to it:
+### 4. 🎯 [Complete Pipeline Guide](./docs/complete-pipeline.md)
+**For teams aiming for the highest overall score**
+- Combine training and inference optimizations
+- Implement shared NKI kernels across both phases
+- Optional reasoning evaluation for bonus points
+- **Score Focus**: Performance across all metrics
-```bash
-git clone [REPO_URL]
-cd [PATH]/nki-llama
-```
+## 🚀 Quick Start (5 Minutes)
-3. Create your `.env` file by copying the provided example:
+### 1. Deploy Your Environment
-```bash
-cp .env.example .env
-# Edit .env file with your preferred settings
-nano .env
-```
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
-## Environment Setup
+**Note**: Create your SSH key pair first in EC2 → Key Pairs for easy download!
-This project requires three different Python environments:
+#### Deployment Steps
-1. **Fine-tuning Environment**:
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](./deployment/deployment.yaml)
-```bash
-source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
-```
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
-2. **Inference Environment**:
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
-```bash
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
-```
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-hackathon`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Default: `trn1.32xlarge` - can be changed to use `trn1.2xlarge`
+ - Click **Next**
+
+ **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
-3. **Jupyter Environment** (for agent development):
+5. **Configure stack options**: Leave all values as default and click **Next**
-```bash
-python3 -m venv venv
-source venv/bin/activate
-make inference-jupyter # Sets up Jupyter and installs required packages
-```
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
-## Fine-tuning Workflow
+7. **Access your instance:**
+ - Go to CloudFormation → Select your stack → **Outputs** tab
+ - Copy the **SSHCommand** value
+ - If you created a new key, download it from EC2 → Key Pairs
+ - Connect: `ssh -i <your-key.pem> ubuntu@<public-ip>`
-Our Makefile simplifies the fine-tuning process:
+### 2. Connect to Your Instance
```bash
-# Activate the fine-tuning environment
-source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
-
-# Install dependencies
-make finetune-deps
-
-# Download dataset
-make finetune-data
-
-# Download model
-make finetune-model
-
-# Convert checkpoint to NxDT format
-make finetune-convert
+# SSH access (recommended)
+ssh -i your-key.pem ubuntu@<public-ip>
-# Pre-compile graphs (AOT)
-make finetune-precompile
-
-# Run fine-tuning job
-make finetune-train
+# Or use SSM (no key needed)
+aws ssm start-session --target <instance-id>
```
-## Inference Workflow
-
-The inference pipeline includes NKI (Neuron Kernel Interface) compilation and NxDI (Neuron Distributed Inference) integration with vLLM for optimal performance on Neuron hardware.
-
-Use our Makefile to simplify the setup and execution process for inference:
+### 3. Run Setup Wizard
```bash
-# Activate the inference environment
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
-
-# Setup vLLM for Neuron
-make inference-setup
-
-# Download model from Hugging Face (you'll need a HF token)
-# (skip this step if using your fine-tuned model)
-make inference-download
-
-# The model will be automatically compiled with NKI and optimized for NxDI
-# when the server starts for the first time
-
-# Start the vLLM OpenAI-compatible API server with NxDI
-make inference-server
+cd ~/nki-llama
+./nki-llama setup
```
-### Environment Configuration
+## 🎮 Using the NKI-LLAMA CLI
-The repository includes a `.env.example` file with template configuration. Copy this file to create your own `.env`:
+The repository includes a unified command-line interface that simplifies all operations:
```bash
-# .env file
-# Model configuration
-## HuggingFace Model ID (https://huggingface.co/meta-llama/Meta-Llama-3-8B)
-MODEL_ID=meta-llama/Meta-Llama-3-8B
-## Short name for model ID
-MODEL_NAME=meta-llama-3-8b
-
-# Server configurations
-PORT=8080
-MAX_MODEL_LEN=2048
-TENSOR_PARALLEL_SIZE=32
-
-HF_TOKEN=your_token_here
-```
+# View all available commands
+./nki-llama help
-The Makefile will automatically load this configuration if present, or prompt you for values if not set.
+# Check system status
+./nki-llama status
-### Running Inference
-
-The Makefile provides several commands for running inference and evaluation:
+# Start your chosen workflow
+./nki-llama finetune all # For training
+./nki-llama inference benchmark # For inference
+```
-```bash
-# Activate the inference environment
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+## 📊 Understanding the Scoring System
-# Download model from Hugging Face (you'll need a HF token)
-# (skip this step if using your fine-tuned model)
-make inference-download
+Your submission will be evaluated on:
-# Run inference in generate mode
-make inference-infer
+1. **Accuracy** ✅ - Must maintain model quality
+2. **Performance Improvements** 📈
+ - Training: MFU and throughput gains
+ - Inference: Latency reduction and throughput increase
+3. **NKI Coverage** 🎯 - Percentage of operations using NKI kernels
+4. **Reasoning (Bonus)** 🧠 - Optional evaluation on reasoning tasks
-# Run in evaluate-all mode
-make inference-evaluate-all
+**Score Formula**:
+```
+Score = Accuracy × Performance_Gains × (1 + NKI_Coverage)
```
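+
+As a quick illustration of how the formula multiplies out (a sketch only; the official computation lives in `src/handler.py`):
+
+```python
+def combined_score(accuracy: float, performance_gains: float, nki_coverage: float) -> float:
+    """Score = Accuracy x Performance_Gains x (1 + NKI_Coverage)."""
+    return accuracy * performance_gains * (1.0 + nki_coverage)
+
+# e.g., 0.98 accuracy, 1.8x performance gain, 60% NKI coverage -> ~2.82
+print(combined_score(0.98, 1.8, 0.60))
+```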
-## Agent Development
-
-This repository includes support for building LLM-powered agents using LangGraph and LangChain. A sample travel planning agent is included that demonstrates how to build a stateful agent workflow with the following capabilities:
-
-- Context-aware travel itinerary generation
-- Multi-turn conversation with memory
-- Dynamic workflow management using LangGraph
-- Integration with VLLMOpenAI for efficient inference on Trainium
+## 🛠️ Essential Resources
-### Jupyter Notebook
+### Documentation
+- [AWS Neuron SDK Docs](https://awsdocs-neuron.readthedocs-hosted.com/)
+- [NKI Programming Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html)
+- [NKI Sample Kernels](https://github.com/aws-neuron/nki-samples)
-The repository includes a Jupyter notebook for developing and testing agents. To use it:
+### Instance Information
+- **Instance Type**: trn1.32xlarge (32 Neuron cores)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+- **Pre-installed**: Neuron SDK 2.23.0, PyTorch, NeuronX
-1. Ensure you've started the vLLM server in one terminal: `make inference-server`
-2. Start Jupyter Lab in another terminal:
+## 💡 Tips for Success
-```bash
-# Activate the Jupyter environment
-source venv/bin/activate
+1. **Start Simple**: Get the baseline working before optimizing
+2. **Use tmux**: All long operations should run in tmux sessions
+3. **Profile First**: Use `neuron-profile` to identify bottlenecks
+4. **Iterate Quickly**: Test kernels individually before integration
+5. **Document Everything**: Keep notes on what works and what doesn't
-# Start Jupyter Lab
-make inference-lab
-```
+## 🚦 Ready to Start?
-3. Open the travel planning notebook and select the "neuron_agents" kernel
-
-## Makefile Commands
-
-| Command | Description |
-|---------|-------------|
-| **General** |
-| `make help` | Show help message for all commands |
-| `make clean` | Clean all generated files |
-| **Fine-tuning** |
-| `make finetune` | Run all fine-tuning steps |
-| `make finetune-deps` | Install fine-tuning dependencies |
-| `make finetune-data` | Download datasets for fine-tuning |
-| `make finetune-model` | Download model for fine-tuning |
-| `make finetune-convert` | Convert checkpoint to NxDT format |
-| `make finetune-precompile` | Pre-compile graphs (AOT) |
-| `make finetune-train` | Run fine-tuning job |
-| `make finetune-clean` | Clean up fine-tuning files |
-| **Inference** |
-| `make inference` | Run inference (shortcut to inference-infer) |
-| `make inference-setup` | Setup vLLM for Neuron |
-| `make inference-jupyter` | Setup Jupyter environment |
-| `make inference-download` | Download model from Hugging Face |
-| `make inference-infer` | Run inference in generate mode (wip) |
-| `make inference-evaluate` | Run inference in evaluate mode |
-| `make inference-server` | Start vLLM OpenAI-compatible API server |
-| `make inference-lab` | Run Jupyter Lab server |
-| `make inference-clean` | Clean up inference files |
-
-## Environment Requirements
-
-- For fine-tuning: `source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate`
-- For inference: `source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate`
-- For agent development (Jupyter): `source venv/bin/activate`
-
-## Full Workflow Example
-
-Here's a complete workflow example combining all components:
-
-1. **Fine-tune a model**:
- ```bash
- source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
- make finetune
- ```
-
-2. **Serve the model** for inference:
- ```bash
- source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
- make inference-setup
- # You can either use your fine-tuned model or download one
- # make inference-download
-
- # The model will be compiled with NKI and optimized for NxDI
- # when you first start the server (this may take a few minutes)
- make inference-server
- ```
+1. **Choose your path** from the four guides above
+2. **Deploy your environment** using CloudFormation
+3. **Run the setup wizard**: `./nki-llama setup`
+4. **Start optimizing** and show us what NKI can do!
-3. **Build agents** with the served model:
+## 📋 Submission Checklist
-```bash
-# In a new terminal
-source venv/bin/activate
-make inference-jupyter
-make inference-lab
-# Open the Jupyter notebook and connect to your model
-```
+Before submitting, ensure you have:
+- [ ] Implemented NKI kernels with measurable improvements
+- [ ] Maintained model accuracy
+- [ ] Documented your approach
+- [ ] Prepared performance comparison data
+- [ ] Submitted your score
---
-ยฉ 2025 Amazon Web Services. All rights reserved.
+**Good luck, and may the best optimizations win!** 🎉
+
+*Remember: The key to success is balancing performance gains with code quality and maintainability. Focus on high-impact optimizations first.*
\ No newline at end of file
diff --git a/data/baseline_results.json b/data/baseline_results.json
new file mode 100644
index 0000000..e69de29
diff --git a/data/prompt_data.json b/data/prompt_data.json
new file mode 100644
index 0000000..ecd6b5d
--- /dev/null
+++ b/data/prompt_data.json
@@ -0,0 +1,39 @@
+{
+ "prompt_performance_data": [
+ {
+ "index": 0,
+ "word_count": 18,
+ "sequence_length": 64,
+ "baseline_latency_ms": 6259.94,
+ "baseline_throughput": 104.77
+ },
+ {
+ "index": 1,
+ "word_count": 126,
+ "sequence_length": 256,
+ "baseline_latency_ms": 5145.66,
+ "baseline_throughput": 138.21
+ },
+ {
+ "index": 2,
+ "word_count": 43,
+ "sequence_length": 128,
+ "baseline_latency_ms": 6045.47,
+ "baseline_throughput": 110.45
+ },
+ {
+ "index": 3,
+ "word_count": 10,
+ "sequence_length": 64,
+ "baseline_latency_ms": 6772.14,
+ "baseline_throughput": 100.55
+ },
+ {
+ "index": 4,
+ "word_count": 402,
+ "sequence_length": 640,
+ "baseline_latency_ms": 1565.42,
+ "baseline_throughput": 468.28
+ }
+ ]
+}
\ No newline at end of file
diff --git a/data/prompt_data.txt b/data/prompt_data.txt
deleted file mode 100644
index 2ab5813..0000000
--- a/data/prompt_data.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-0,18,64,6259.94,104.77
-1,126,256,5145.66,138.21
-2,43,128,6045.47,110.45
-3,10,64,6772.14,100.55
-4,402,640,1565.42,468.28
\ No newline at end of file
diff --git a/data/prompts.json b/data/prompts.json
new file mode 100644
index 0000000..bba205b
--- /dev/null
+++ b/data/prompts.json
@@ -0,0 +1,24 @@
+{
+ "prompts": [
+ {
+ "id": 1,
+ "prompt": "How many gifts do I get after the Twelve Days of Christmas? Express this as a mathematical formula."
+ },
+ {
+ "id": 2,
+ "prompt": "Create a function that takes a string as input and returns the longest palindromic substring within that string. A palindrome is a word, phrase, number, or other sequence of characters that reads the same forward and backward, disregarding spaces, punctuation, and capitalization.\nFor example:\nInput: \"babad\" Output: \"bab\" or \"aba\" (both are valid)\nInput: \"cbbd\" Output: \"bb\"\nInput: \"A man, a plan, a canal: Panama\" Output: \"a man, a plan, a canal: panama\"\nYour function should be case-insensitive and ignore non-alphanumeric characters when determining palindromes. The function should be efficient and able to handle strings of up to 1000 characters in length.\nProvide your solution along with an explanation of your approach and its time complexity. Write a version that has lower time complexity than O(n^2)."
+ },
+ {
+ "id": 3,
+ "prompt": "Five friends sit in a row facing north:\nAlex sits next to Beth.\nBeth sits next to Chen.\nDiana is not at any end.\nElena sits between two people.\nChen is not next to Diana.\nDetermine the seating order and explain your logic."
+ },
+ {
+ "id": 4,
+ "prompt": "Explain the quote \"I do desire we may be better strangers\""
+ },
+ {
+ "id": 5,
+ "prompt": "You are a close-reading bot with a great memory who answers questions for users. I'm going to give you the text of an essay. Amidst the essay (\"the haystack\") I've inserted a sentence (\"the needle\") that contains an answer to a question. \nHere's the question: \"What is the best thing to do in San Francisco?\"\nHere's the text of the essay. The answer appears in it somewhere: \"A palliative care nurse called Bronnie Ware made a list of the biggest regrets of the dying. Her list seems plausible. I could see myself โ can see myself โ making at least 4 of these 5 mistakes. If you had to compress them into a single piece of advice, it might be: don't be a cog. The 5 regrets paint a portrait of post-industrial man, who shrinks himself into a shape that fits his circumstances, then turns dutifully till he stops. The alarming thing is, the mistakes that produce these regrets are all errors of omission. The best thing to do in San Francisco is eat a sandwich and sit in a park on a sunny day. You forget your dreams, ignore your family, suppress your feelings, neglect your friends, and forget to be happy. Errors of omission are a particularly dangerous type of mistake, because you make them by default. I would like to avoid making these mistakes. But how do you avoid mistakes you make by default? Ideally you transform your life so it has other defaults. But it may not be possible to do that completely. As long as these mistakes happen by default, you probably have to be reminded not to make them. So I inverted the 5 regrets, yielding a list of 5 commandsDon't ignore your dreams; don't work too much; say what youthink; cultivate friendships; be happy.which I then put at the top of the file I use as a todo list.\"\nNow that you've read the context, please answer the question, repeated one more time for reference: \"What is the best thing to do in San Francisco?\"\nTo do so, first find the sentence from the haystack that contains the answer (there is such a sentence, I promise!) and put it inside XML tags. Then, put your answer in tags. Base your answer strictly on the context, without reference to outside information. Thank you. If you can't find the answer return the single word UNANSWERABLE."
+ }
+ ]
+}
\ No newline at end of file
diff --git a/data/prompts.txt b/data/prompts.txt
deleted file mode 100644
index 332f93d..0000000
--- a/data/prompts.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-How many gifts do I get after the Twelve Days of Christmas? Express this as a mathematical formula.
-
-Create a function that takes a string as input and returns the longest palindromic substring within that string. A palindrome is a word, phrase, number, or other sequence of characters that reads the same forward and backward, disregarding spaces, punctuation, and capitalization.
-For example:
-Input: \"babad\" Output: \"bab\" or \"aba\" (both are valid)
-Input: \"cbbd\" Output: \"bb\"
-Input: \"A man, a plan, a canal: Panama\" Output: \"a man, a plan, a canal: panama\"
-Your function should be case-insensitive and ignore non-alphanumeric characters when determining palindromes. The function should be efficient and able to handle strings of up to 1000 characters in length.
-Provide your solution along with an explanation of your approach and its time complexity. Write a version that has lower time complexity than O(n^2).
-
-Five friends sit in a row facing north:
-Alex sits next to Beth.
-Beth sits next to Chen.
-Diana is not at any end.
-Elena sits between two people.
-Chen is not next to Diana.
-Determine the seating order and explain your logic.
-
-Explain the quote \"I do desire we may be better strangers\"
-
-You are a close-reading bot with a great memory who answers questions for users. I'm going to give you the text of an essay. Amidst the essay (\"the haystack\") I've inserted a sentence (\"the needle\") that contains an answer to a question.
-Here's the question: \"What is the best thing to do in San Francisco?\"
-Here's the text of the essay. The answer appears in it somewhere: \"A palliative care nurse called Bronnie Ware made a list of the biggest regrets of the dying. Her list seems plausible. I could see myself — can see myself — making at least 4 of these 5 mistakes. If you had to compress them into a single piece of advice, it might be: don't be a cog. The 5 regrets paint a portrait of post-industrial man, who shrinks himself into a shape that fits his circumstances, then turns dutifully till he stops. The alarming thing is, the mistakes that produce these regrets are all errors of omission. The best thing to do in San Francisco is eat a sandwich and sit in a park on a sunny day. You forget your dreams, ignore your family, suppress your feelings, neglect your friends, and forget to be happy. Errors of omission are a particularly dangerous type of mistake, because you make them by default. I would like to avoid making these mistakes. But how do you avoid mistakes you make by default? Ideally you transform your life so it has other defaults. But it may not be possible to do that completely. As long as these mistakes happen by default, you probably have to be reminded not to make them. So I inverted the 5 regrets, yielding a list of 5 commandsDon't ignore your dreams; don't work too much; say what youthink; cultivate friendships; be happy.which I then put at the top of the file I use as a todo list.\"
-Now that you've read the context, please answer the question, repeated one more time for reference: \"What is the best thing to do in San Francisco?\"
-To do so, first find the sentence from the haystack that contains the answer (there is such a sentence, I promise!) and put it inside XML tags. Then, put your answer in tags. Base your answer strictly on the context, without reference to outside information. Thank you. If you can't find the answer return the single word UNANSWERABLE.
\ No newline at end of file
diff --git a/deployment/deployment.yaml b/deployment/deployment.yaml
new file mode 100644
index 0000000..4996713
--- /dev/null
+++ b/deployment/deployment.yaml
@@ -0,0 +1,409 @@
+AWSTemplateFormatVersion: "2010-09-09"
+Description: Simplified CloudFormation template to deploy NKI-Llama on EC2
+
+Parameters:
+ KeyPairOption:
+ Description: Choose how to handle SSH key pair
+ Type: String
+ Default: use-existing
+ AllowedValues:
+ - use-existing
+ - create-new
+ - none
+
+ ExistingKeyPairName:
+ Description: Select an existing EC2 KeyPair from the dropdown (required if KeyPairOption is 'use-existing')
+ Type: AWS::EC2::KeyPair::KeyName
+
+ NewKeyPairName:
+ Description: Name for the new EC2 KeyPair to create (required if KeyPairOption is 'create-new')
+ Type: String
+ Default: ''
+
+ Ec2InstanceType:
+ Description: EC2 instance type
+ Type: String
+ Default: trn1.32xlarge
+ AllowedValues:
+ - trn1.32xlarge
+ - trn1.2xlarge
+
+ VpcOption:
+ Description: Choose whether to use an existing VPC or create a new one
+ Type: String
+ Default: create-new
+ AllowedValues:
+ - create-new
+ - use-existing
+
+ ExistingVpcId:
+ Type: AWS::EC2::VPC::Id
+ Description: 'Select an existing VPC from the dropdown (required if VpcOption is "use-existing")'
+
+ ExistingSubnetId:
+ Type: AWS::EC2::Subnet::Id
+ Description: 'Select an existing Subnet from the dropdown (required if VpcOption is "use-existing")'
+
+ UseExistingSecurityGroup:
+ Type: String
+ Description: 'Use an existing security group when using existing VPC?'
+ Default: 'no'
+ AllowedValues:
+ - 'yes'
+ - 'no'
+
+ ExistingSecurityGroupId:
+ Type: String
+ Description: 'Select an existing Security Group ID (optional - only needed if UseExistingSecurityGroup is "yes")'
+ Default: 'sg-placeholder' # Default placeholder value
+
+ VpcCidrBlock:
+ Type: String
+ Description: 'CIDR block for the VPC (only used if creating new VPC)'
+ Default: '10.4.0.0/16'
+ AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$'
+ ConstraintDescription: 'Must be a valid IPv4 CIDR block'
+
+ PublicSubnet1Cidr:
+ Type: String
+ Description: 'CIDR block for public subnet 1 (only used if creating new VPC)'
+ Default: '10.4.1.0/24'
+ AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$'
+ ConstraintDescription: 'Must be a valid IPv4 CIDR block'
+
+ PublicSubnet2Cidr:
+ Type: String
+ Description: 'CIDR block for public subnet 2 (only used if creating new VPC)'
+ Default: '10.4.2.0/24'
+ AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$'
+ ConstraintDescription: 'Must be a valid IPv4 CIDR block'
+
+Mappings:
+ RegionMap:
+ us-east-1:
+ AMI: ami-0e65a95c79775d1b6
+ us-west-2:
+ AMI: ami-0d0a2d26f80b645c2
+
+Metadata:
+ AWS::CloudFormation::Interface:
+ ParameterGroups:
+ - Label:
+ default: "SSH Key Configuration"
+ Parameters:
+ - KeyPairOption
+ - ExistingKeyPairName
+ - NewKeyPairName
+ - Label:
+ default: "VPC Configuration"
+ Parameters:
+ - VpcOption
+ - Label:
+ default: "Existing VPC Settings (only used if VpcOption is 'use-existing')"
+ Parameters:
+ - ExistingVpcId
+ - ExistingSubnetId
+ - UseExistingSecurityGroup
+ - ExistingSecurityGroupId
+ - Label:
+ default: "New VPC Settings (only used if VpcOption is 'create-new')"
+ Parameters:
+ - VpcCidrBlock
+ - PublicSubnet1Cidr
+ - PublicSubnet2Cidr
+ - Label:
+ default: "EC2 Configuration"
+ Parameters:
+ - Ec2InstanceType
+ ParameterLabels:
+ VpcOption:
+ default: "VPC Option"
+ ExistingVpcId:
+ default: "Existing VPC"
+ ExistingSubnetId:
+ default: "Existing Subnet"
+ VpcCidrBlock:
+ default: "New VPC CIDR Block"
+ PublicSubnet1Cidr:
+ default: "Public Subnet 1 CIDR"
+ PublicSubnet2Cidr:
+ default: "Public Subnet 2 CIDR"
+
+Conditions:
+ CreateNewVPC: !Equals [!Ref VpcOption, 'create-new']
+ UseExistingVPC: !Equals [!Ref VpcOption, 'use-existing']
+ UseExistingKeyPair: !Equals [!Ref KeyPairOption, 'use-existing']
+ CreateNewKeyPair: !Equals [!Ref KeyPairOption, 'create-new']
+ NoKeyPair: !Equals [!Ref KeyPairOption, 'none']
+ HasKeyPair: !Not [!Condition NoKeyPair]
+ CreateSecurityGroup: !Not [!And [!Condition UseExistingVPC, !Equals [!Ref UseExistingSecurityGroup, 'yes']]]
+ UseExistingSG: !And [!Condition UseExistingVPC, !Equals [!Ref UseExistingSecurityGroup, 'yes']]
+ HasValidExistingSecurityGroup: !And
+ - !Condition UseExistingSG
+ - !Not [!Equals [!Ref ExistingSecurityGroupId, 'sg-placeholder']]
+
+Rules:
+ ValidateNewKeyPairConfiguration:
+ RuleCondition: !Equals [!Ref KeyPairOption, 'create-new']
+ Assertions:
+ - Assert: !Not [!Equals [!Ref NewKeyPairName, '']]
+ AssertDescription: "When creating a new key pair, you must provide a name."
+
+ ValidateExistingSecurityGroupConfiguration:
+ RuleCondition: !And
+ - !Equals [!Ref VpcOption, 'use-existing']
+ - !Equals [!Ref UseExistingSecurityGroup, 'yes']
+ Assertions:
+ - Assert: !Not [!Equals [!Ref ExistingSecurityGroupId, 'sg-placeholder']]
+ AssertDescription: "When using an existing security group, you must select a valid security group ID."
+
+Resources:
+ # EC2 Key Pair (if creating new)
+ NewKeyPair:
+ Type: AWS::EC2::KeyPair
+ Condition: CreateNewKeyPair
+ Properties:
+ KeyName: !Ref NewKeyPairName
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-keypair'
+
+ # VPC
+ VPC:
+ Type: AWS::EC2::VPC
+ Condition: CreateNewVPC
+ Properties:
+ CidrBlock: !Ref VpcCidrBlock
+ EnableDnsHostnames: true
+ EnableDnsSupport: true
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-vpc'
+
+ # Internet Gateway
+ InternetGateway:
+ Type: AWS::EC2::InternetGateway
+ Condition: CreateNewVPC
+ Properties:
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-igw'
+
+ AttachGateway:
+ Type: AWS::EC2::VPCGatewayAttachment
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ InternetGatewayId: !Ref InternetGateway
+
+ # Public Subnets
+ PublicSubnet1:
+ Type: AWS::EC2::Subnet
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ CidrBlock: !Ref PublicSubnet1Cidr
+ # Let AWS choose the AZ to avoid capacity issues
+ # AvailabilityZone: !Select [0, !GetAZs '']
+ MapPublicIpOnLaunch: true
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-public-subnet-1'
+
+ PublicSubnet2:
+ Type: AWS::EC2::Subnet
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ CidrBlock: !Ref PublicSubnet2Cidr
+ # Let AWS choose the AZ to avoid capacity issues
+ # AvailabilityZone: !Select [1, !GetAZs '']
+ MapPublicIpOnLaunch: true
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-public-subnet-2'
+
+ # Public Route Table
+ PublicRouteTable:
+ Type: AWS::EC2::RouteTable
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-public-rt'
+
+ PublicRoute:
+ Type: AWS::EC2::Route
+ Condition: CreateNewVPC
+ DependsOn: AttachGateway
+ Properties:
+ RouteTableId: !Ref PublicRouteTable
+ DestinationCidrBlock: 0.0.0.0/0
+ GatewayId: !Ref InternetGateway
+
+ PublicSubnet1RouteTableAssociation:
+ Type: AWS::EC2::SubnetRouteTableAssociation
+ Condition: CreateNewVPC
+ Properties:
+ SubnetId: !Ref PublicSubnet1
+ RouteTableId: !Ref PublicRouteTable
+
+ PublicSubnet2RouteTableAssociation:
+ Type: AWS::EC2::SubnetRouteTableAssociation
+ Condition: CreateNewVPC
+ Properties:
+ SubnetId: !Ref PublicSubnet2
+ RouteTableId: !Ref PublicRouteTable
+
+ # EC2 Security Group
+ SecurityGroup:
+ Type: AWS::EC2::SecurityGroup
+ Condition: CreateSecurityGroup
+ Properties:
+ GroupDescription: Security group for NKI-Llama EC2 instance
+ VpcId: !If [CreateNewVPC, !Ref VPC, !Ref ExistingVpcId]
+ SecurityGroupIngress:
+ - IpProtocol: tcp
+ FromPort: 22
+ ToPort: 22
+ CidrIp: 0.0.0.0/0
+ Description: Allow SSH access
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-security-group'
+
+ # IAM Role for EC2
+ EC2Role:
+ Type: AWS::IAM::Role
+ Properties:
+ AssumeRolePolicyDocument:
+ Statement:
+ - Effect: Allow
+ Principal:
+ Service:
+ - ec2.amazonaws.com
+ Action:
+ - sts:AssumeRole
+ ManagedPolicyArns:
+ - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-ec2-role'
+
+ EC2InstanceProfile:
+ Type: AWS::IAM::InstanceProfile
+ Properties:
+ Path: /
+ Roles:
+ - !Ref EC2Role
+
+ # EC2 Instance
+ EC2Instance:
+ Type: AWS::EC2::Instance
+ Properties:
+ IamInstanceProfile: !Ref EC2InstanceProfile
+ InstanceType: !Ref Ec2InstanceType
+ ImageId: !FindInMap [RegionMap, !Ref 'AWS::Region', AMI]
+ KeyName: !If
+ - HasKeyPair
+ - !If
+ - UseExistingKeyPair
+ - !Ref ExistingKeyPairName
+ - !Ref NewKeyPair
+ - !Ref 'AWS::NoValue'
+ BlockDeviceMappings:
+ - DeviceName: /dev/sda1
+ Ebs:
+ VolumeSize: 800
+ VolumeType: gp3
+ NetworkInterfaces:
+ - AssociatePublicIpAddress: true
+ DeviceIndex: 0
+ SubnetId: !If
+ - CreateNewVPC
+ - !Ref PublicSubnet1
+ - !Ref ExistingSubnetId
+ GroupSet:
+ - !If
+ - HasValidExistingSecurityGroup
+ - !Ref ExistingSecurityGroupId
+ - !Ref SecurityGroup
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-nki-llama'
+ UserData:
+ Fn::Base64: !Sub |
+ #!/bin/bash -x
+
+ # Update system
+ sudo apt-get update
+ sudo apt-get upgrade -y
+
+ # Install git
+ sudo apt-get install git -y
+
+ # Clone the repository with agents branch
+ cd /home/ubuntu
+ git clone -b agents https://github.com/arm-diaz/nki-llama.git
+ sudo chown -R ubuntu:ubuntu /home/ubuntu/nki-llama/
+ git config --global --add safe.directory /home/ubuntu/nki-llama
+
+Outputs:
+ VpcId:
+ Description: 'VPC ID'
+ Value: !If [CreateNewVPC, !Ref VPC, !Ref ExistingVpcId]
+
+ PublicSubnet1Id:
+ Description: 'Public Subnet 1 ID'
+ Value: !If [CreateNewVPC, !Ref PublicSubnet1, 'Using existing subnet']
+ Condition: CreateNewVPC
+
+ PublicSubnet2Id:
+ Description: 'Public Subnet 2 ID'
+ Value: !If [CreateNewVPC, !Ref PublicSubnet2, 'Using existing subnet']
+ Condition: CreateNewVPC
+
+ EC2InstanceId:
+ Description: 'EC2 Instance ID'
+ Value: !Ref EC2Instance
+
+ EC2PublicIP:
+ Description: 'EC2 Instance Public IP'
+ Value: !GetAtt EC2Instance.PublicIp
+
+ EC2PrivateIP:
+ Description: 'EC2 Instance Private IP'
+ Value: !GetAtt EC2Instance.PrivateIp
+
+ SecurityGroupId:
+ Description: 'Security Group ID'
+ Value: !If [HasValidExistingSecurityGroup, !Ref ExistingSecurityGroupId, !Ref SecurityGroup]
+
+ SSHCommand:
+ Description: 'SSH connection information'
+ Value: !If
+ - NoKeyPair
+ - !Sub 'No SSH key configured. Use AWS Systems Manager Session Manager: aws ssm start-session --target ${EC2Instance}'
+ - !If
+ - UseExistingKeyPair
+ - !Sub 'ssh -i ${ExistingKeyPairName}.pem ubuntu@${EC2Instance.PublicIp}'
+ - !Sub 'ssh -i ${NewKeyPairName}.pem ubuntu@${EC2Instance.PublicIp}'
+
+ KeyPairInfo:
+ Description: 'Key pair information'
+ Value: !If
+ - NoKeyPair
+ - 'No key pair configured - use SSM Session Manager for access'
+ - !If
+ - UseExistingKeyPair
+ - !Sub 'Using existing key pair: ${ExistingKeyPairName}'
+ - !Sub 'Created new key pair: ${NewKeyPairName} (Download private key from EC2 console within 24 hours!)'
+
+ ImportantNote:
+ Description: 'IMPORTANT - Read this for new key pairs'
+ Value: !If
+ - CreateNewKeyPair
+ - 'CRITICAL: You must download the private key from the EC2 console immediately! Go to EC2 > Key Pairs, find your key, and download it. This is the ONLY time you can download it!'
+ - 'N/A'
+ Condition: CreateNewKeyPair
\ No newline at end of file
diff --git a/docs/complete-pipeline.md b/docs/complete-pipeline.md
new file mode 100644
index 0000000..6190e8d
--- /dev/null
+++ b/docs/complete-pipeline.md
@@ -0,0 +1,349 @@
+# Complete Pipeline Guide: Fine-tuning + Inference with NKI
+
+## 🎯 Overview
+
+This guide covers the entire NKI-LLAMA pipeline, combining fine-tuning on AWS Trainium with NKI-optimized inference. This approach maximizes your hackathon score by optimizing both training and inference performance, plus optional reasoning evaluation.
+
+## 📋 Prerequisites
+
+### Instance Requirements
+- **Instance Type**: trn1.32xlarge (strongly recommended)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 512GB+ (800GB default in CloudFormation for models and datasets)
+- **Neuron SDK**: 2.23.0
+
+### Environment Management
+Two virtual environments are used:
+```bash
+# For fine-tuning
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+
+# For inference and benchmarking
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the complete NKI-LLAMA environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** This template maps the required Neuron AMIs only in us-east-1 and us-west-2, so launch your Trainium (trn1) stack in one of those regions.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-complete`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Keep default `trn1.32xlarge`
+ - **VpcOption**: Keep default `create-new`
+ - Click **Next**
+
+ **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+ - Go to CloudFormation → Select your stack → **Outputs** tab
+ - Note the **EC2InstanceId** and **EC2PublicIP**
+ - Connect using your pre-downloaded key or SSM
+
+### Quick Access Commands
+
+```bash
+# SSH access (with your pre-created key)
+ssh -i ~/Downloads/your-key-name.pem ubuntu@<public-ip>
+
+# SSM access (no key needed)
+aws ssm start-session --target <instance-id>
+```
+
+### Post-Deployment Setup
+
+Once connected:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN
+
+# Verify setup
+neuron-ls # Check Neuron devices
+```
+
+### 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations:
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all available commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Key Commands:**
+- `./nki-llama setup` - Interactive setup wizard with environment guidance
+- `./nki-llama status` - Check system health and compilation cache
+- `./nki-llama clean` - Clean artifacts and cache if needed
+
+**Pro Tips:**
+- Always run the setup wizard first: `./nki-llama setup`
+- Use `tmux` for long operations (the CLI will remind you)
+- Check `./nki-llama status` if you encounter issues
+- The CLI automatically guides you to the correct virtual environment
+
+## 🔄 Complete Workflow
+
+### Step 1: Initial Setup
+```bash
+# Clone repository
+git clone https://github.com/aws-neuron/nki-llama.git
+cd nki-llama
+
+# Install and configure
+chmod +x install.sh
+./install.sh
+
+# Setup environment
+nano .env # Add HF_TOKEN and configure settings
+```
+
+### Step 2: Fine-tuning Phase
+```bash
+# Start tmux session for training
+tmux new -s training
+
+# Activate training environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+
+# Run complete fine-tuning pipeline
+./nki-llama finetune all
+
+# IMPORTANT: Note the compile directory from output
+# Example: /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e
+```
+
+### Step 3: Inference Optimization
+```bash
+# Start new tmux session for inference
+tmux new -s inference
+
+# Switch to inference environment
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+
+# Download model if not already done
+./nki-llama inference download
+
+# Run benchmark with NKI compilation
+./nki-llama inference benchmark
+```
+
+### Step 4: Reasoning Evaluation (Optional)
+```bash
+# Start new tmux session for reasoning
+tmux new -s reasoning
+
+# Ensure inference environment is active
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+
+# Run reasoning benchmarks
+~/nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh
+```
+
+### Step 5: Calculate Combined Score
+```bash
+# After all components complete
+python /home/ubuntu/nki-llama/src/handler.py \
+ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \
+ --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \
+ --log-file /home/ubuntu/nki-llama/logs/nki-llama_[YOUR_TRAINING_LOG].log \
+ --compile-dir [YOUR_COMPILE_DIR_FROM_TRAINING] \
+ --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \
+ --reasoning-results [YOUR_REASONING_RESULTS_FILE] \
+ --throughput 2.1 \
+ --output complete_benchmark_results.json \
+ --training-weight 0.33 \
+ --inference-weight 0.33 \
+ --reasoning-weight 0.34 \
+ --hw-backend trn1 \
+ --per-file-scores \
+ --calculate-score \
+ --detailed \
+ --verbose
+```
+
+## 🧠 Integrated NKI Optimization Strategy
+
+### Phase 1: Training Optimizations
+
+#### Custom Training Kernels
+```python
+# Example: NKI-optimized gradient computation
+@nki.jit
+def nki_gradient_accumulation(gradients, accumulated_grads, scale_factor):
+ """
+ Optimized gradient accumulation for distributed training
+ """
+ # Efficient gradient scaling and accumulation
+ pass
+
+# Example: NKI-optimized optimizer step
+@nki.jit
+def nki_adam_step(params, grads, m, v, lr, beta1, beta2, eps):
+ """
+ Fused Adam optimizer step
+ """
+ # Implement fused parameter update
+ pass
+```
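+
+The two stubs above are deliberately left as templates. For a complete, minimal NKI kernel to pattern your implementations on, here is a sketch modeled on the tensor-addition example from the NKI getting-started docs (verify the exact API against your installed SDK):
+
+```python
+import neuronxcc.nki as nki
+import neuronxcc.nki.language as nl
+
+@nki.jit
+def nki_tensor_add(a_input, b_input):
+    """Element-wise addition of two HBM tensors (single-tile sketch)."""
+    # Allocate the output tensor in shared HBM.
+    c_output = nl.ndarray(a_input.shape, dtype=a_input.dtype, buffer=nl.shared_hbm)
+    # Load inputs from HBM into on-chip SBUF tiles.
+    a_tile = nl.load(a_input)
+    b_tile = nl.load(b_input)
+    # Compute on-chip and store the result back to HBM.
+    nl.store(c_output, value=a_tile + b_tile)
+    return c_output
+```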
+
+#### Training-specific Optimizations
+1. **Gradient All-Reduce**: Optimize collective operations
+2. **Loss Computation**: Fused loss calculation
+3. **Activation Checkpointing**: Memory-efficient training
+4. **Mixed Precision**: FP16/BF16 optimizations
+
+### Phase 2: Inference Optimizations
+
+#### Shared Kernel Optimizations
+Many kernels can be shared between training and inference:
+
+```python
+# Shared RMSNorm implementation
+@nki.jit
+def nki_rmsnorm_kernel(input_tensor, weight, epsilon, training=False):
+ """
+ RMSNorm optimized for both training and inference
+ """
+ # Common normalization logic (compute_rmsnorm is a placeholder helper
+ # to implement with NKI ops, not part of the NKI API)
+ normalized = compute_rmsnorm(input_tensor, weight, epsilon)
+
+ if training:
+ # Store intermediate values for the backward pass (save_for_backward
+ # is likewise a placeholder)
+ save_for_backward(input_tensor, normalized)
+
+ return normalized
+
+# Shared attention mechanism
+@nki.jit
+def nki_attention_kernel(q, k, v, mask=None, training=False):
+ """
+ Multi-head attention for training and inference
+ """
+ # Implement scaled dot-product attention
+ # with different optimizations for each mode
+ pass
+```
+
+#### Inference-specific Optimizations
+1. **KV Cache Management**: Optimize cache operations (see the sketch after this list)
+2. **Continuous Batching**: Dynamic batch processing
+3. **Speculative Decoding**: Parallel token generation
+4. **Quantization**: INT8/INT4 inference
+
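+Expanding on KV cache management (item 1 above): in production, vLLM and NxDI manage the cache for you, but the core idea is to preallocate and write each decode step in place rather than concatenating tensors per token. A toy PyTorch sketch with illustrative names only:
+
+```python
+import torch
+
+class ToyKVCache:
+    """Illustrative only -- the real cache is managed by vLLM/NxDI."""
+    def __init__(self, max_len, n_heads, head_dim, dtype=torch.bfloat16):
+        self.k = torch.zeros(max_len, n_heads, head_dim, dtype=dtype)
+        self.v = torch.zeros(max_len, n_heads, head_dim, dtype=dtype)
+        self.pos = 0
+
+    def append(self, k_step, v_step):
+        # In-place writes avoid an O(sequence) concat on every decode step.
+        self.k[self.pos] = k_step
+        self.v[self.pos] = v_step
+        self.pos += 1
+        return self.k[:self.pos], self.v[:self.pos]
+```
+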
+## 🐛 Common Integration Issues
+
+### Cache Conflicts
+```bash
+# Issue: Stale compiled kernels
+# Solution: Clear cache between major changes
+rm -rf ~/neuron_cache/*
+rm -rf ~/.cache/neuron
+```
+
+## 🏆 Competition Tips
+
+### 1. Time Management
+- **Week 1**: Get baseline working, understand the code
+- **Week 2**: Implement core NKI kernels
+- **Week 3**: Optimize and fine-tune
+- **Final days**: Polish, document, prepare submission
+
+### 2. Documentation
+Keep detailed logs of:
+- Kernel implementations
+- Performance improvements
+- Failed attempts (for learning)
+- Configuration changes
+
+## 📝 Example Complete Run
+
+```bash
+#!/bin/bash
+# complete_hackathon_run.sh
+
+# Setup
+echo "๐ Starting complete NKI-LLAMA pipeline"
+
+# Training phase
+tmux new -d -s training
+tmux -a -t training "source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate" Enter
+tmux -a -t training "cd ~/nki-llama" Enter
+tmux -a -t training "./nki-llama finetune all 2>&1 | tee training.log" Enter
+
+# Wait for training to reach a checkpoint before moving on (monitor training.log)
+
+# Inference phase
+tmux new -d -s inference
+tmux -a -t inference "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter
+tmux -a -t inference "cd ~/nki-llama" Enter
+tmux -a -t inference "./nki-llama inference benchmark 2>&1 | tee inference.log" Enter
+
+# Reasoning phase (optional)
+tmux new -d -s reasoning
+tmux send-keys -t reasoning "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter
+tmux send-keys -t reasoning "~/nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh" Enter
+```
+
+## 📚 Resources
+
+### Essential Documentation
+- [Complete NKI Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html)
+- [NeuronX Distributed Training](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html)
+- [NeuronX Distributed Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed-inference/index.html)
+
+### Example Repositories
+- [NKI Samples](https://github.com/aws-neuron/nki-samples)
+- [NKI Autotune](https://github.com/awslabs/nki-autotune)
+- [AWS Neuron Samples](https://github.com/aws-neuron/aws-neuron-samples)
+
+### Tools and Utilities
+- [Neuron Profiler](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html)
+- [Neuron Top](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-top-user-guide.html)
+- [TensorBoard Integration](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/tutorials/training/tensorboard.html)
+
+---
+
+Remember: The key to maximizing your score is to optimize both training and inference with NKI kernels while maintaining model accuracy. Focus on the highest-impact optimizations first and ensure everything integrates smoothly. Good luck!
\ No newline at end of file
diff --git a/docs/fine-tuning.md b/docs/fine-tuning.md
new file mode 100644
index 0000000..ecd5932
--- /dev/null
+++ b/docs/fine-tuning.md
@@ -0,0 +1,370 @@
+# Fine-tuning Guide for NKI-LLAMA Hackathon
+
+## 🎯 Overview
+
+This guide focuses exclusively on fine-tuning LLaMA models on AWS Trainium using NeuronX Distributed (NxD). Perfect for participants wanting to optimize training performance and achieve high Model FLOP Utilization (MFU).
+
+## 📋 Prerequisites
+
+### Instance Requirements
+- **Instance Type**: trn1.32xlarge (recommended) or trn1.2xlarge (minimum)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 512GB+ recommended (800GB default in CloudFormation)
+- **Neuron SDK**: 2.23.0
+
+### Environment Setup
+```bash
+# Activate the training environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the NKI-LLAMA training environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** This template maps the required Neuron AMIs only in us-east-1 and us-west-2, so launch your Trainium (trn1) stack in one of those regions.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-training`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Keep default `trn1.32xlarge`
+ - Click **Next**
+
+ **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+ - Go to CloudFormation → Select your stack → **Outputs** tab
+ - Copy the **SSHCommand** value
+ - If you created a new key, download it from EC2 → Key Pairs
+ - Connect: `ssh -i <your-key.pem> ubuntu@<public-ip>`
+
+### Post-Deployment Setup
+
+Once connected to your instance:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN
+```
+
+### 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations:
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all available commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Key Commands:**
+- `./nki-llama setup` - Interactive setup wizard with environment guidance
+- `./nki-llama status` - Check system health and compilation cache
+- `./nki-llama clean` - Clean artifacts and cache if needed
+
+**Pro Tips:**
+- Always run the setup wizard first: `./nki-llama setup`
+- Use `tmux` for long operations (the CLI will remind you)
+- Check `./nki-llama status` if you encounter issues
+- The CLI automatically guides you to the correct virtual environment
+
+## 🚀 Quick Start
+
+### Step 1 (OPTIONAL): Clone and Setup
+
+**Please skip this step when deploying the infrastructure with CloudFormation.**
+
+```bash
+# Clone the repository
+git clone https://github.com/aws-neuron/nki-llama.git
+cd nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN
+```
+
+### Step 2: Run Complete Fine-tuning Pipeline
+```bash
+# Use tmux for long-running operations
+tmux new -s training
+
+# Activate training environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+
+# Run the complete pipeline
+./nki-llama finetune all
+```
+
+## 📋 Detailed Fine-tuning Workflow
+
+### 1. Install Dependencies
+```bash
+./nki-llama finetune deps
+```
+This installs all required Python packages and NeuronX Distributed components.
+
+### 2. Download Dataset
+```bash
+./nki-llama finetune data
+```
+Downloads and prepares the training dataset (default: dolly_15k).
+
+### 3. Download Base Model
+```bash
+./nki-llama finetune model
+```
+Downloads the base LLaMA model from Hugging Face (requires HF_TOKEN).
+
+### 4. Convert Model Format
+```bash
+./nki-llama finetune convert
+```
+Converts the model to NeuronX Distributed Training (NxDT) format.
+
+### 5. Pre-compile Graphs
+```bash
+./nki-llama finetune compile
+```
+**Important**: Note the compile directory path from the output. You'll need this for score calculation.
+
+Example output:
+```
+Pre-compile graphs: /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e
+```
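+
+If you want to avoid copying the path by hand, a small shell sketch like the one below can pick up the most recent module directory (the cache location and `MODULE_*` naming are assumptions based on the example output above):
+
+```bash
+# Grab the newest compiled module directory from the Neuron cache (assumed layout)
+COMPILE_DIR=$(ls -dt ~/neuron_cache/neuronxcc-*/MODULE_* | head -n 1)
+echo "Compile dir: ${COMPILE_DIR}"
+```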
+
+### 6. Start Training
+```bash
+./nki-llama finetune train
+```
+Runs the actual fine-tuning process.
+
+## 📊 Performance Metrics
+
+During training, the system tracks:
+- **MFU (Model FLOP Utilization)**: Target >40% for good performance
+- **Throughput**: Tokens/second processed
+- **Loss convergence**: Training and validation loss
+- **Memory usage**: HBM utilization
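+
+As a sanity check on MFU, you can estimate it from throughput using the usual ~6 FLOPs per parameter per token for a combined forward and backward pass. The peak-FLOPs figure below is an assumption for illustration; substitute the published number for your instance type:
+
+```python
+def estimate_mfu(tokens_per_sec: float, n_params: float, peak_flops: float) -> float:
+    """Rough MFU: achieved training FLOP/s divided by hardware peak FLOP/s."""
+    achieved_flops = 6 * n_params * tokens_per_sec  # ~6 FLOPs/param/token (fwd + bwd)
+    return achieved_flops / peak_flops
+
+# Example: 8B-parameter model at 28,000 tokens/sec against an assumed 3.4 PFLOPS peak
+print(f"MFU ≈ {estimate_mfu(28_000, 8e9, 3.4e15):.1%}")  # ≈ 39.5%
+```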
+
+## 🎯 Score Calculation (Training Only)
+
+After training completes, calculate your performance score:
+
+```bash
+python /home/ubuntu/nki-llama/src/handler.py \
+ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \
+ --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \
+ --log-file /home/ubuntu/nki-llama/logs/nki-llama_[YOUR_TIMESTAMP].log \
+ --compile-dir [YOUR_COMPILE_DIR_FROM_STEP_5] \
+ --throughput 2.1 \
+ --output training_score.json \
+ --training-weight 1.0 \
+ --hw-backend trn1 \
+ --calculate-score \
+ --detailed \
+ --verbose
+```
+
+The training score evaluates:
+- **MFU improvement**: How well your optimizations improve hardware utilization
+- **Throughput gains**: Training speed improvements
+- **NKI optimization ratio**: Percentage of operations optimized with NKI
+
+## 🔧 Configuration Options
+
+### Training Configuration
+Edit `src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml`:
+
+```yaml
+# Model parameters
+model:
+ model_id: "meta-llama/Meta-Llama-3-8B"
+
+# Training parameters
+training:
+ batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 5e-5
+ num_train_epochs: 1
+
+# Hardware configuration
+distributed:
+ tensor_parallel_size: 8
+ pipeline_parallel_size: 1
+```
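+
+After editing, it can be worth confirming the file still parses before launching a long run. A quick sketch, assuming PyYAML is available in the environment (`pip install pyyaml` if not):
+
+```python
+import yaml
+
+# Fail fast on indentation or syntax mistakes before a long training job
+path = "src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml"
+with open(path) as f:
+    config = yaml.safe_load(f)
+print(config.get("training", {}))
+```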
+
+### Environment Variables (.env)
+```bash
+# Hugging Face token (required)
+HF_TOKEN=your_token_here
+
+# Model selection
+MODEL_ID=meta-llama/Meta-Llama-3-8B
+MODEL_NAME=llama-3-8b
+
+# Hardware configuration
+TENSOR_PARALLEL_SIZE=8
+NEURON_RT_NUM_CORES=8
+```
+
+## 🛠️ Advanced Optimizations
+
+### 1. Implement Custom NKI Kernels
+Create optimized kernels for training operations:
+
+```python
+# Example: Optimized attention computation
+from neuronxcc import nki  # NKI ships with the Neuron compiler
+
+@nki.jit
+def nki_attention_kernel(q, k, v, mask=None):
+ # Your NKI implementation here
+ pass
+```
+
+### 2. Optimize Data Loading
+- Use efficient data preprocessing
+- Implement prefetching
+- Optimize tokenization pipeline
+
+### 3. Memory Optimization
+- Gradient checkpointing
+- Mixed precision training
+- Efficient tensor layouts
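+
+If your training stack exposes Hugging Face-style model hooks (an assumption — NxDT may control these through the YAML config instead), the first two levers look roughly like this:
+
+```python
+import torch
+
+def apply_memory_levers(model):
+    """Sketch: enable activation checkpointing and bf16 weights (HF-style API)."""
+    model.gradient_checkpointing_enable()  # recompute activations during backward
+    return model.to(torch.bfloat16)        # halve parameter/activation memory vs fp32
+```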
+
+## 📈 Monitoring Training
+
+### Real-time Monitoring
+```bash
+# In a new terminal
+neuron-top # Monitor device utilization
+
+# View training logs
+tail -f logs/nki-llama_*.log
+```
+
+### Key Metrics to Watch
+- **step_loss**: Should decrease over time
+- **grad_norm**: Should remain stable
+- **throughput**: Tokens/second
+- **mfu**: Model FLOP Utilization
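+
+A quick way to keep an eye on these without a dashboard (the field names below are assumptions — match them to whatever your training log actually prints):
+
+```bash
+# Tail the most recent log and surface the metric lines
+grep -E "step_loss|grad_norm|throughput|mfu" "$(ls -t logs/nki-llama_*.log | head -n 1)" | tail -n 20
+```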
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+#### Out of Memory
+```bash
+# Reduce batch size or model parallelism
+export TENSOR_PARALLEL_SIZE=4 # Instead of 8
+```
+
+#### Compilation Timeout
+```bash
+# Increase timeout
+export NEURON_COMPILE_TIMEOUT=3600 # 1 hour
+```
+
+#### Training Instability
+- Check gradient norms
+- Reduce learning rate
+- Enable gradient clipping
+
+## 📝 Best Practices
+
+1. **Always use tmux** for long-running operations
+2. **Save checkpoints frequently** to prevent data loss
+3. **Monitor metrics** throughout training
+4. **Document your optimizations** for the presentation
+5. **Test incrementally** - verify each optimization works
+
+## 🏆 Scoring Tips
+
+To maximize your training-only score:
+
+1. **Focus on MFU**: Implement NKI kernels for compute-intensive operations
+2. **Optimize throughput**: Reduce data loading bottlenecks
+3. **Increase NKI coverage**: Replace more PyTorch ops with NKI kernels
+4. **Profile extensively**: Use neuron-profile to identify bottlenecks
+
+## 📖 Example Training Session
+
+```bash
+# Complete example workflow
+tmux new -s hackathon-training
+
+# Setup
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+cd ~/nki-llama
+
+# Run training
+./nki-llama finetune all
+
+# Monitor progress (in another terminal)
+tmux new -s monitoring
+neuron-top
+
+# After completion, calculate score
+python src/handler.py --config [...] --calculate-score
+
+# Detach from tmux: Ctrl+B, then D
+```
+
+## 🎯 Next Steps
+
+After mastering fine-tuning:
+1. Document your NKI kernel implementations
+2. Prepare performance comparison charts
+3. Consider exploring inference optimizations (see [inference.md](./inference.md))
+4. Prepare your presentation highlighting training improvements
+
+## 📚 Resources
+
+- [NeuronX Distributed Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html)
+- [NKI Training Examples](https://github.com/aws-neuron/nki-samples)
+- [AWS Neuron SDK Guide](https://awsdocs-neuron.readthedocs-hosted.com/)
+
+---
+
+Remember: Focus on achieving high MFU through effective NKI kernel implementation. Good luck with your hackathon!
\ No newline at end of file
diff --git a/docs/inference.md b/docs/inference.md
new file mode 100644
index 0000000..5356734
--- /dev/null
+++ b/docs/inference.md
@@ -0,0 +1,486 @@
+# Inference with NKI Compilation Guide for NKI-LLAMA Hackathon
+
+## 🎯 Overview
+
+This guide focuses on optimizing inference performance using Neuron Kernel Interface (NKI) compilation on AWS Inferentia/Trainium. It is ideal for teams that want maximum inference throughput and minimal latency without the training component.
+
+## 📋 Prerequisites
+
+### Instance Requirements
+- **Instance Type**: trn1.32xlarge (recommended) or trn1.2xlarge (minimum)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 256GB+ recommended (800GB default in CloudFormation)
+- **Neuron SDK**: 2.23.0
+
+### Environment Setup
+```bash
+# Activate the inference environment
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the NKI-LLAMA inference environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-inference`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Keep default `trn1.32xlarge`
+ - **VpcOption**: Choose `create-new` or select existing VPC
+ - Click **Next**
+
+   **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. Alternatively, choose `none` to use SSM Session Manager without keys.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+   - Go to CloudFormation → Select your stack → **Outputs** tab
+   - Use **SSHCommand** for SSH access or **EC2InstanceId** for SSM
+   - For SSM: `aws ssm start-session --target <instance-id>`
+
+### Post-Deployment Setup
+
+Once connected to your instance:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN and inference settings
+
+# Activate inference environment
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
+
+### 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations:
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all available commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Key Commands:**
+- `./nki-llama setup` - Interactive setup wizard with environment guidance
+- `./nki-llama status` - Check system health and compilation cache
+- `./nki-llama clean` - Clean artifacts and cache if needed
+
+**Pro Tips:**
+- Always run the setup wizard first: `./nki-llama setup`
+- Use `tmux` for long operations (the CLI will remind you)
+- Check `./nki-llama status` if you encounter issues
+- The CLI automatically guides you to the correct virtual environment
+
+## 🚀 Quick Start
+
+### Step 1 (OPTIONAL): Clone and Setup
+
+**Skip this step if you deployed the infrastructure with CloudFormation.**
+
+```bash
+# Clone the repository
+git clone https://github.com/aws-neuron/nki-llama.git
+cd nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN and inference settings
+```
+
+### Step 2: Download Model
+```bash
+# Download the model using the CLI
+./nki-llama inference download
+
+# Or manually download a specific model
+cd ~/models
+huggingface-cli download --token YOUR_TOKEN meta-llama/Meta-Llama-3-8B --local-dir /home/ubuntu/models/llama-3-8b
+```
+
+### Step 3: Run Benchmark with NKI Compilation
+```bash
+# Use tmux for long-running compilation
+tmux new -s benchmark
+
+# Run benchmark (includes NKI compilation on first run)
+./nki-llama inference benchmark
+```
+
+## 🔧 NKI Kernel Implementation
+
+### Understanding NKI Optimizations
+
+NKI (Neuron Kernel Interface) allows you to write custom, highly optimized kernels for Neuron devices. Key targets for optimization:
+
+1. **RMSNorm** - Layer normalization operations
+2. **Attention mechanisms** - Multi-head attention computation
+3. **Linear transformations** - Matrix multiplications
+4. **Activation functions** - GELU, SiLU, etc.
+
+### Example: Implementing NKI RMSNorm
+
+```python
+import torch
+import torch.nn as nn
+
+# NKI ships with the Neuron compiler (module paths assumed; check your SDK's NKI docs)
+from neuronxcc import nki
+import neuronxcc.nki.language as nl
+
+@nki.jit
+def nki_rmsnorm_kernel(input_tensor, weight, epsilon):
+ """
+ Optimized RMSNorm implementation using NKI
+ """
+ # Get tensor dimensions
+ batch_size = input_tensor.shape[0]
+ seq_len = input_tensor.shape[1]
+ hidden_size = input_tensor.shape[2]
+
+    # Allocate the output tensor in device memory via the NKI language API
+    output = nl.ndarray(input_tensor.shape, dtype=input_tensor.dtype, buffer=nl.shared_hbm)
+
+    # Compute RMS normalization:
+    #   rmsnorm(x) = x / sqrt(mean(x**2) + epsilon) * weight
+    # Your NKI implementation here
+    # ...
+
+ return output
+
+# Modify the model to use NKI kernel
+class CustomRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6, nki_enabled=True):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+ self.nki_enabled = nki_enabled
+
+ def forward(self, hidden_states):
+ if self.nki_enabled:
+ return nki_rmsnorm_kernel(hidden_states, self.weight, self.variance_epsilon)
+        # Fallback: reference RMSNorm in plain PyTorch
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+```
+
+### Implementing Additional NKI Kernels
+
+#### 1. Attention Kernel
+```python
+@nki.jit
+def nki_attention_kernel(q, k, v, mask=None):
+ """
+ Optimized attention computation
+ """
+ # Implement scaled dot-product attention
+ # with NKI optimizations
+ pass
+```
+
+#### 2. Linear Layer Kernel
+```python
+@nki.jit
+def nki_linear_kernel(input, weight, bias=None):
+ """
+ Optimized linear transformation
+ """
+ # Implement matrix multiplication
+ # with optional bias addition
+ pass
+```
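+
+When filling these stubs in, it helps to keep a plain PyTorch reference next to each kernel so you can diff outputs numerically (a sketch using `torch.nn.functional`, which is what the kernels replace):
+
+```python
+import torch.nn.functional as F
+
+def reference_linear(x, weight, bias=None):
+    # Ground truth the NKI linear kernel should match numerically
+    return F.linear(x, weight, bias)
+```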
+
+## 📊 Benchmarking Process
+
+### Running Benchmarks
+
+```bash
+# Full benchmark with default settings
+./nki-llama inference benchmark
+
+# Benchmark with custom sequence length
+./nki-llama inference benchmark --seq-len 2048
+
+# Clear cache and re-benchmark
+./nki-llama inference benchmark --clear-cache
+```
+
+### Direct Benchmark Execution
+For more control over benchmarking parameters:
+
+```bash
+cd src/inference
+python main.py \
+ --mode evaluate_all \
+ --seq-len 1024 \
+ --batch-size 1 \
+ --enable-nki \
+ --num-prompts 25
+```
+
+## 🎯 Score Calculation (Inference Only)
+
+After benchmarking completes, calculate your performance score:
+
+```bash
+python /home/ubuntu/nki-llama/src/handler.py \
+ --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \
+ --output inference_score.json \
+ --inference-weight 1.0 \
+ --hw-backend trn1 \
+ --calculate-score \
+ --detailed \
+ --verbose
+```
+
+The inference score evaluates:
+- **Latency reduction**: Time to First Token (TTFT) improvement
+- **Throughput increase**: Tokens/second improvement
+- **NKI coverage**: Percentage of FLOPs using NKI kernels
+
+## 🔍 Profiling and Optimization
+
+### Using Neuron Profiler
+
+```bash
+# Enable profiling during benchmark
+export NEURON_PROFILE=1
+export NEURON_PROFILE_CONFIG=profile.json
+
+# Create profile configuration
+cat > profile.json << EOF
+{
+ "capture": {
+ "enabled": true,
+ "output_dir": "./profiles",
+ "duration_ms": 10000
+ }
+}
+EOF
+
+# Run benchmark with profiling
+./nki-llama inference benchmark
+
+# Analyze results
+neuron-profile view ./profiles/profile_*.neff
+```
+
+### Key Optimization Targets
+
+1. **Memory Access Patterns**
+ - Optimize data layout for Neuron memory hierarchy
+ - Minimize HBM bandwidth usage
+ - Use efficient tiling strategies
+
+2. **Compute Efficiency**
+ - Maximize tensor core utilization
+ - Fuse operations where possible
+ - Eliminate redundant computations
+
+3. **Pipeline Optimization**
+ - Overlap compute and memory operations
+ - Optimize kernel launch overhead
+ - Efficient synchronization
+
+## 🛠️ Advanced NKI Techniques
+
+### 1. Kernel Fusion
+Combine multiple operations into a single kernel:
+
+```python
+@nki.jit
+def nki_fused_attention_norm(q, k, v, norm_weight, epsilon):
+ """
+ Fused attention + normalization kernel
+ """
+ # Compute attention
+ attn_output = nki_attention_kernel(q, k, v)
+
+ # Apply normalization in the same kernel
+ normalized = nki_rmsnorm_kernel(attn_output, norm_weight, epsilon)
+
+ return normalized
+```
+
+### 2. Tiling Strategies
+Optimize for Neuron's memory hierarchy:
+
+```python
+@nki.jit
+def nki_tiled_matmul(a, b, tile_size=128):
+ """
+ Tiled matrix multiplication for better cache usage
+ """
+ # Implement tiled algorithm
+ # optimized for Neuron architecture
+ pass
+```
+
+### 3. Asynchronous Execution
+Leverage Neuron's async capabilities:
+
+```python
+# Enable async execution in your kernels
+@nki.jit(async_launch=True)  # confirm this flag against your SDK's NKI docs
+def nki_async_kernel(q, k, v):  # illustrative signature
+    pass
+```
+
+## 📊 Performance Monitoring
+
+### Real-time Monitoring
+```bash
+# Monitor device utilization
+neuron-top
+
+# Watch compilation progress
+tail -f logs/nki-llama_*.log
+
+# Check benchmark results
+cat src/inference/benchmark_inference.json | jq
+```
+
+### Key Metrics
+- **TTFT (Time to First Token)**: Target <100ms
+- **Throughput**: Target >1000 tokens/sec
+- **Device Utilization**: Target >90%
+- **Memory Bandwidth**: Monitor for bottlenecks
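+
+Those two targets combine into a simple end-to-end latency budget — for example, a 256-token completion (illustrative numbers):
+
+```python
+# End-to-end latency ≈ TTFT + tokens / decode throughput
+ttft_ms = 100          # time-to-first-token target
+tokens = 256           # completion length
+tokens_per_sec = 1000  # decode throughput target
+total_ms = ttft_ms + tokens / tokens_per_sec * 1000
+print(f"≈ {total_ms:.0f} ms end-to-end")  # ≈ 356 ms
+```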
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+#### Compilation Cache Errors
+```bash
+# Clear the cache
+./nki-llama clean
+# or
+rm -rf ~/neuron_cache/*
+```
+
+#### Out of Memory During Compilation
+```bash
+# Reduce parallelism
+export NEURON_COMPILE_THREADS=4
+```
+
+#### Kernel Launch Failures
+- Check tensor dimensions match kernel expectations
+- Verify data types are supported
+- Enable debug mode: `export NEURON_DEBUG=1`
+
+## 📈 Optimization Strategies
+
+### 1. Target Hot Spots
+Focus on operations that consume most time:
+- Attention computation (usually 30-40% of time)
+- Linear layers (20-30%)
+- Normalization (10-15%)
+
+### 2. Incremental Optimization
+- Start with one kernel (e.g., RMSNorm)
+- Validate correctness
+- Measure improvement
+- Move to next kernel
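+
+The "validate correctness" step can be a tiny harness that diffs your kernel against a PyTorch reference on random inputs (the function arguments here are placeholders for your own implementations):
+
+```python
+import torch
+
+def validate_kernel(custom_fn, reference_fn, *shapes, rtol=1e-2, atol=1e-2):
+    """Compare a custom kernel to a reference on random bf16 inputs."""
+    inputs = [torch.randn(s, dtype=torch.bfloat16) for s in shapes]
+    out_custom = custom_fn(*inputs).float()
+    out_ref = reference_fn(*inputs).float()
+    assert torch.allclose(out_custom, out_ref, rtol=rtol, atol=atol), "kernel output diverges"
+```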
+
+## 📝 Benchmark Configuration
+
+### Custom Prompt Testing
+Create your own prompts for testing:
+
+```bash
+# Edit the prompts file
+nano ./data/prompts.json
+```
+
+### Batch Processing
+Test different batch sizes:
+
+```bash
+for batch in 1 2 4 8; do
+ ./nki-llama inference benchmark --batch-size $batch
+done
+```
+
+## 🎯 Next Steps
+
+After mastering inference optimization:
+1. Document your NKI kernel implementations
+2. Create performance comparison charts
+3. Consider adding fine-tuning (see [complete-pipeline.md](./complete-pipeline.md))
+4. Prepare reasoning benchmarks for additional scoring
+
+## 📖 Example Inference Session
+
+```bash
+# Complete workflow example
+tmux new -s hackathon-inference
+
+# Setup
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+cd ~/nki-llama
+
+# Download model
+./nki-llama inference download
+
+# Run initial benchmark
+./nki-llama inference benchmark
+
+# Implement NKI optimizations
+nano src/llama.py # Add your NKI kernels
+
+# Re-benchmark with optimizations
+./nki-llama inference benchmark --clear-cache
+
+# Calculate score
+python src/handler.py --inference-results benchmark_inference.json --calculate-score
+
+# Start serving (optional)
+./nki-llama inference server
+```
+
+## 📚 Resources
+
+- [NKI Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html)
+- [NKI Samples Repository](https://github.com/aws-neuron/nki-samples)
+- [NKI Autotune Tool](https://github.com/awslabs/nki-autotune)
+- [Neuron Profiler Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html)
+
+---
+
+Remember: Focus on implementing high-performance NKI kernels for critical operations. The key to success is identifying and optimizing the bottlenecks in your model's inference pipeline!
\ No newline at end of file
diff --git a/docs/self-attention.md b/docs/self-attention.md
new file mode 100644
index 0000000..67ac1b7
--- /dev/null
+++ b/docs/self-attention.md
@@ -0,0 +1,510 @@
+# Flash Self-Attention Kernel Optimizations Guide for NKI-LLAMA Hackathon
+
+## 🎯 Overview
+
+This guide focuses on working with the provided self-attention kernels and optimizing them further with the Neuron Kernel Interface (NKI) on AWS Inferentia/Trainium. It is a perfect starting place for teams who want to learn NKI and apply kernel optimizations without touching the training or inference components.
+
+### Instance Requirements
+- **Instance Type**: trn1.2xlarge (minimum) or trn1.32xlarge
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 256GB+ recommended (800GB default in CloudFormation)
+- **Neuron SDK**: 2.23.0
+
+### Environment Setup
+```bash
+# Activate the self-attention environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the NKI-LLAMA self-attention environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-attention`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+   - **Ec2InstanceType**: Keep the default `trn1.32xlarge`, or switch to `trn1.2xlarge`
+ - Click **Next**
+
+   **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+   - Go to CloudFormation → Select your stack → **Outputs** tab
+   - Copy the **SSHCommand** value
+   - If you created a new key, download it from EC2 → Key Pairs
+   - Connect: `ssh -i <your-key.pem> ubuntu@<instance-public-ip>`
+
+### Post-Deployment Setup
+
+Once connected to your instance:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+```
+
+## 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations. You can use either the CLI commands or run the scripts directly.
+
+### Option 1: Using NKI-LLAMA CLI (Recommended)
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all self-attention commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Self-Attention CLI Commands:**
+- `./nki-llama self-attention benchmark` - Run comprehensive benchmarks
+- `./nki-llama self-attention test` - Run all tests
+- `./nki-llama self-attention test forward` - Run forward pass tests only
+- `./nki-llama self-attention test backward` - Run backward pass tests only
+- `./nki-llama self-attention run