diff --git a/.env.example b/.env.example
index c65920b..0bcc739 100644
--- a/.env.example
+++ b/.env.example
@@ -1,13 +1,24 @@
-# Model configuration
-## HuggingFace Model ID (https://huggingface.co/meta-llama/Meta-Llama-3-8B)
+# Example environment file for NKI-LLAMA
+# Copy this to .env and update with your values
+
+# Hugging Face Configuration
+HF_TOKEN=your_huggingface_token_here
MODEL_ID=meta-llama/Meta-Llama-3-8B
-## Short name for model ID
-MODEL_NAME=meta-llama-3-8b
+MODEL_NAME=llama-3-8b
+
+# Inference Configuration
+INFERENCE_PORT=8080
+MAX_MODEL_LEN=8192  # used by vLLM; keep this equal to SEQ_LEN
+SEQ_LEN=8192        # used by main.py
+
+MAX_NUM_SEQS=4
+TENSOR_PARALLEL_SIZE=8
+
+# Dataset Configuration
+DATASET_NAME=databricks/databricks-dolly-15k
-# Server configurations
-PORT=8080
-MAX_MODEL_LEN=2048
-TENSOR_PARALLEL_SIZE=32
+# Neuron Configuration
+NEURON_RT_NUM_CORES=8
-# HuggingFace token for downloading models
-HF_TOKEN=your_token_here
\ No newline at end of file
+# Jupyter Configuration
+JUPYTER_PORT=8888
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 960e4eb..e0b4ed2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -267,6 +267,16 @@ test/inference/output
**/neuronxcc-*
global_metric_store.json
benchmark_report.json
+benchmark_inference.json
cached_requirements.txt
+benchmark_finetuning.json
+benchmark_results.json
+**/logs/
+compiled_merged_model/
+compiled_model/
+merged_model/
+src/self-attention/config
+requirements.txt.**
+model_env.sh
# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,jupyternotebooks,python
\ No newline at end of file
diff --git a/Makefile b/Makefile
deleted file mode 100644
index b5468d7..0000000
--- a/Makefile
+++ /dev/null
@@ -1,127 +0,0 @@
-# Top-level Makefile for coordinating fine-tuning and inference
-
--include .env
-
-SHELL := /bin/bash
-
-# Define paths to subproject directories
-FINETUNE_DIR = ./src/fine-tune
-INFERENCE_DIR = ./src/inference
-
-# Default target
-.PHONY: all
-all: help
-
-# Help message
-.PHONY: help
-help:
- @echo "Top-level Makefile for managing fine-tuning and inference"
- @echo ""
- @echo "Available targets:"
- @echo " help - Show this help message"
- @echo ""
- @echo " finetune - Run all fine-tuning steps"
- @echo " finetune-deps - Install fine-tuning dependencies"
- @echo " finetune-data - Download datasets for fine-tuning"
- @echo " finetune-model - Download model for fine-tuning"
- @echo " finetune-convert - Convert checkpoint to NxDT format"
- @echo " finetune-precompile - Pre-compile graphs (AOT)"
- @echo " finetune-train - Run fine-tuning job"
- @echo " finetune-clean - Clean up fine-tuning files"
- @echo ""
- @echo " inference - Run inference (shortcut to infer target)"
- @echo " inference-show-env - Display environment variables loaded from .env file"
- @echo " inference-setup - Setup vLLM for Neuron"
- @echo " inference-jupyter - Setup Jupyter environment"
- @echo " inference-download - Download model from Hugging Face"
- @echo " inference-infer - Run inference in generate mode"
- @echo " inference-evaluate - Run inference in evaluate mode"
- @echo " inference-server - Start vLLM OpenAI-compatible API server"
- @echo " inference-lab - Run Jupyter Lab server"
- @echo " inference-clean - Clean up inference files"
- @echo ""
- @echo " clean - Clean up all generated files"
- @echo ""
- @echo "Environment requirements:"
- @echo " - For inference: source /opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/bin/activate"
- @echo " - For fine-tuning: source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate"
- @echo " - For Jupyter: source venv/bin/activate"
-
-# Check if in Neuron virtual environment
-.PHONY: check-neuron-venv
-check-neuron-venv:
- @if [ -z "$$VIRTUAL_ENV" ] || [[ "$$VIRTUAL_ENV" != *"neuronx"* ]]; then \
- echo "Error: Not in Neuron virtual environment."; \
- echo "Run 'source /opt/aws_neuronx_venv_pytorch_2_5/bin/activate' first."; \
- exit 1; \
- else \
- echo "Using Neuron virtual environment: $$VIRTUAL_ENV"; \
- fi
-
-# Fine-tuning targets
-.PHONY: finetune finetune-deps finetune-data finetune-model finetune-convert finetune-precompile finetune-train finetune-clean
-
-finetune:
- $(MAKE) -C $(FINETUNE_DIR)
-
-finetune-deps:
- $(MAKE) -C $(FINETUNE_DIR) deps
-
-finetune-data:
- $(MAKE) -C $(FINETUNE_DIR) data
-
-finetune-model:
- $(MAKE) -C $(FINETUNE_DIR) model
-
-finetune-convert:
- $(MAKE) -C $(FINETUNE_DIR) convert_ckpt
-
-finetune-precompile:
- $(MAKE) -C $(FINETUNE_DIR) precompile
-
-finetune-train:
- $(MAKE) -C $(FINETUNE_DIR) train
-
-finetune-clean:
- $(MAKE) -C $(FINETUNE_DIR) clean
-
-# Inference targets
-.PHONY: inference inference-setup inference-jupyter inference-download inference-infer inference-evaluate inference-server inference-lab inference-clean inference-show-env inference-evaluate-all
-
-inference:
- $(MAKE) -C $(INFERENCE_DIR) infer
-
-inference-show-env:
- $(MAKE) -C $(INFERENCE_DIR) show-env
-
-inference-setup:
- $(MAKE) -C $(INFERENCE_DIR) setup-vllm
-
-inference-jupyter:
- $(MAKE) -C $(INFERENCE_DIR) setup-jupyter
-
-inference-download:
- $(MAKE) -C $(INFERENCE_DIR) download
-
-inference-infer:
- $(MAKE) -C $(INFERENCE_DIR) infer
-
-inference-evaluate:
- $(MAKE) -C $(INFERENCE_DIR) evaluate
-
-inference-evaluate-all:
- $(MAKE) -C $(INFERENCE_DIR) evaluate-all
-
-inference-server:
- $(MAKE) -C $(INFERENCE_DIR) start-server
-
-inference-lab:
- $(MAKE) -C $(INFERENCE_DIR) jupyter
-
-inference-clean:
- $(MAKE) -C $(INFERENCE_DIR) clean
-
-# Clean all
-.PHONY: clean
-clean: finetune-clean inference-clean
- @echo "Cleaned all subprojects"
\ No newline at end of file
diff --git a/README.md b/README.md
index ec13cc6..5f14601 100644
--- a/README.md
+++ b/README.md
@@ -1,272 +1,175 @@
-# NKI Llama
+# 🚀 NKI-LLAMA Hackathon: Getting Started Guide
-A unified project for fine-tuning, inference, and agent development of Llama models on AWS Trainium and Inferentia.
+Welcome to the **NKI-LLAMA Hackathon**! This guide will help you navigate the documentation and choose the best path for your hackathon journey.
+## 🎯 Welcome Hackathon Participants!
-## Project Workflow
+You're about to embark on an exciting challenge to optimize LLaMA models using AWS Neuron's cutting-edge NKI (Neuron Kernel Interface) technology. Whether you're focusing on training, inference, or both, we've prepared guides to help you succeed.
-```
-┌──────────────────┐     ┌──────────────────┐     ┌──────────────────┐
-│                  │     │                  │     │                  │
-│    Fine-tune     │────▶│    Inference     │────▶│      Agent       │
-│                  │     │                  │     │   Development    │
-│                  │     │                  │     │                  │
-└──────────────────┘     └──────────────────┘     └──────────────────┘
-```
-
-This project follows a three-stage workflow:
-1. **Fine-tune** a model using Neuron hardware with NxD
-2. **Inference** using the fine-tuned model with vLLM, NKI compilation, and NxDI (Neuron Distributed Inference)
-3. **Agent Development** using LangChain/LangGraph connected to your model
-
-## Technical Infrastructure
-
-### Compute Resources
-- **Required Instance**: trn1.32xlarge
-- **Base AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) with Neuron SDK 2.23.
-- **Base Packages**:
- - NxD (NeuronX Distributed Training)
- - NKI (Neuron Kernel Interface)
- - NxDI (Neuron Distributed Inference)
+## 📍 Choose Your Path
-## Project Structure
+We've created four specialized guides based on your optimization focus:
-This repository contains three main components:
-- **Fine-tuning**: Tools for fine-tuning LLMs on Neuron hardware using NxD
-- **Inference**: Infrastructure for efficient inference using vLLM with NKI compilation and NxDI optimization
-- **Agent Development**: Building intelligent agents with LangChain/LangGraph
+### 1. ⚡ [Flash Self-Attention Kernel Optimization Guide](./docs/self-attention.md)
+**Great for teams getting started with kernel optimizations**
+- Improve the performance of the Flash Attention forward and backward kernels
+- Analyze the performance and numerical accuracy of your kernel implementations
+- Further optimize attention kernels
+- **Score Focus**: Self-Attention test latency and correctness
-## Setup Steps
+### 2. 🚀 [Inference with NKI Guide](./docs/inference.md)
+**Ideal for teams targeting inference performance**
+- Minimize latency with NKI-optimized kernels
+- Maximize throughput for production serving
+- Implement custom kernels for attention, normalization, and more
+- **Score Focus**: Inference latency and throughput
-1. Create a Trainium instance with AWS Neuron SDK v2.23 using EC2 with the following settings:
- 1. **Name:** nki-llama
- 2. **AMI:** Deep Learning AMI Neuron (Ubuntu 22.04)
- 3. **Instance type:** trn1.32xlarge
- 4. **Key pair (login):** create a new key pair
- 5. When connecting to these instances via SSH, use the username of *ubuntu*.
+### 3. 🏋️ [Fine-tuning Guide](./docs/fine-tuning.md)
+**Perfect for teams focusing on training optimization**
+- Optimize Model FLOP Utilization (MFU) during training
+- Implement NKI kernels for training operations
+- Achieve high throughput with NeuronX Distributed
+- **Score Focus**: Training performance metrics
-2. Clone this repository and navigate to it:
+### 4. 🎯 [Complete Pipeline Guide](./docs/complete-pipeline.md)
+**For teams aiming for the highest overall score**
+- Combine training and inference optimizations
+- Implement shared NKI kernels across both phases
+- Optional reasoning evaluation for bonus points
+- **Score Focus**: Performance across all metrics
-```bash
-git clone [REPO_URL]
-cd [PATH]/nki-llama
-```
+## 🚀 Quick Start (5 Minutes)
-3. Create your `.env` file by copying the provided example:
+### 1. Deploy Your Environment
-```bash
-cp .env.example .env
-# Edit .env file with your preferred settings
-nano .env
-```
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
-## Environment Setup
+**Note**: Create your SSH key pair first in EC2 → Key Pairs for easy download!
-This project requires three different Python environments:
+#### Deployment Steps
-1. **Fine-tuning Environment**:
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](./deployment/deployment.yaml)
-```bash
-source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
-```
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
-2. **Inference Environment**:
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
-```bash
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
-```
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-hackathon`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Default: `trn1.32xlarge` - can be changed to use `trn1.2xlarge`
+ - Click **Next**
+
+ **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
-3. **Jupyter Environment** (for agent development):
+5. **Configure stack options**: Leave all values as default and click **Next**
-```bash
-python3 -m venv venv
-source venv/bin/activate
-make inference-jupyter # Sets up Jupyter and installs required packages
-```
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
-## Fine-tuning Workflow
+7. **Access your instance:**
+ - Go to CloudFormation → Select your stack → **Outputs** tab
+ - Copy the **SSHCommand** value
+ - If you created a new key, download it from EC2 → Key Pairs
+ - Connect: `ssh -i <your-key.pem> ubuntu@<public-ip>`
-Our Makefile simplifies the fine-tuning process:
+### 2. Connect to Your Instance
```bash
-# Activate the fine-tuning environment
-source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
-
-# Install dependencies
-make finetune-deps
-
-# Download dataset
-make finetune-data
-
-# Download model
-make finetune-model
-
-# Convert checkpoint to NxDT format
-make finetune-convert
+# SSH access (recommended)
+ssh -i your-key.pem ubuntu@<public-ip>
-# Pre-compile graphs (AOT)
-make finetune-precompile
-
-# Run fine-tuning job
-make finetune-train
+# Or use SSM (no key needed)
+aws ssm start-session --target <instance-id>
```
-## Inference Workflow
-
-The inference pipeline includes NKI (Neuron Kernel Interface) compilation and NxDI (Neuron Distributed Inference) integration with vLLM for optimal performance on Neuron hardware.
-
-Use our Makefile to simplify the setup and execution process for inference:
+### 3. Run Setup Wizard
```bash
-# Activate the inference environment
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
-
-# Setup vLLM for Neuron
-make inference-setup
-
-# Download model from Hugging Face (you'll need a HF token)
-# (skip this step if using your fine-tuned model)
-make inference-download
-
-# The model will be automatically compiled with NKI and optimized for NxDI
-# when the server starts for the first time
-
-# Start the vLLM OpenAI-compatible API server with NxDI
-make inference-server
+cd ~/nki-llama
+./nki-llama setup
```
-### Environment Configuration
+## 🎮 Using the NKI-LLAMA CLI
-The repository includes a `.env.example` file with template configuration. Copy this file to create your own `.env`:
+The repository includes a unified command-line interface that simplifies all operations:
```bash
-# .env file
-# Model configuration
-## HuggingFace Model ID (https://huggingface.co/meta-llama/Meta-Llama-3-8B)
-MODEL_ID=meta-llama/Meta-Llama-3-8B
-## Short name for model ID
-MODEL_NAME=meta-llama-3-8b
-
-# Server configurations
-PORT=8080
-MAX_MODEL_LEN=2048
-TENSOR_PARALLEL_SIZE=32
-
-HF_TOKEN=your_token_here
-```
+# View all available commands
+./nki-llama help
-The Makefile will automatically load this configuration if present, or prompt you for values if not set.
+# Check system status
+./nki-llama status
-### Running Inference
-
-The Makefile provides several commands for running inference and evaluation:
+# Start your chosen workflow
+./nki-llama finetune all # For training
+./nki-llama inference benchmark # For inference
+```
-```bash
-# Activate the inference environment
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+## 📊 Understanding the Scoring System
-# Download model from Hugging Face (you'll need a HF token)
-# (skip this step if using your fine-tuned model)
-make inference-download
+Your submission will be evaluated on:
-# Run inference in generate mode
-make inference-infer
+1. **Accuracy** ✅ - Must maintain model quality
+2. **Performance Improvements** 📈
+ - Training: MFU and throughput gains
+ - Inference: Latency reduction and throughput increase
+3. **NKI Coverage** 🎯 - Percentage of operations using NKI kernels
+4. **Reasoning (Bonus)** 🧠 - Optional evaluation on reasoning tasks
-# Run in evaluate-all mode
-make inference-evaluate-all
+**Score Formula**:
+```
+Score = Accuracy × Performance_Gains × (1 + NKI_Coverage)
```
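+
+As a quick illustration of how the formula multiplies out (a sketch only; the official computation lives in `src/handler.py`):
+
+```python
+def combined_score(accuracy: float, performance_gains: float, nki_coverage: float) -> float:
+    """Score = Accuracy x Performance_Gains x (1 + NKI_Coverage)."""
+    return accuracy * performance_gains * (1.0 + nki_coverage)
+
+# e.g., 0.98 accuracy, 1.8x performance gain, 60% NKI coverage -> ~2.82
+print(combined_score(0.98, 1.8, 0.60))
+```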
-## Agent Development
-
-This repository includes support for building LLM-powered agents using LangGraph and LangChain. A sample travel planning agent is included that demonstrates how to build a stateful agent workflow with the following capabilities:
-
-- Context-aware travel itinerary generation
-- Multi-turn conversation with memory
-- Dynamic workflow management using LangGraph
-- Integration with VLLMOpenAI for efficient inference on Trainium
+## 🛠️ Essential Resources
-### Jupyter Notebook
+### Documentation
+- [AWS Neuron SDK Docs](https://awsdocs-neuron.readthedocs-hosted.com/)
+- [NKI Programming Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html)
+- [NKI Sample Kernels](https://github.com/aws-neuron/nki-samples)
-The repository includes a Jupyter notebook for developing and testing agents. To use it:
+### Instance Information
+- **Instance Type**: trn1.32xlarge (32 Neuron cores)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+- **Pre-installed**: Neuron SDK 2.23.0, PyTorch, NeuronX
-1. Ensure you've started the vLLM server in one terminal: `make inference-server`
-2. Start Jupyter Lab in another terminal:
+## 💡 Tips for Success
-```bash
-# Activate the Jupyter environment
-source venv/bin/activate
+1. **Start Simple**: Get the baseline working before optimizing
+2. **Use tmux**: All long operations should run in tmux sessions
+3. **Profile First**: Use `neuron-profile` to identify bottlenecks
+4. **Iterate Quickly**: Test kernels individually before integration
+5. **Document Everything**: Keep notes on what works and what doesn't
-# Start Jupyter Lab
-make inference-lab
-```
+## 🚦 Ready to Start?
-3. Open the travel planning notebook and select the "neuron_agents" kernel
-
-## Makefile Commands
-
-| Command | Description |
-|---------|-------------|
-| **General** |
-| `make help` | Show help message for all commands |
-| `make clean` | Clean all generated files |
-| **Fine-tuning** |
-| `make finetune` | Run all fine-tuning steps |
-| `make finetune-deps` | Install fine-tuning dependencies |
-| `make finetune-data` | Download datasets for fine-tuning |
-| `make finetune-model` | Download model for fine-tuning |
-| `make finetune-convert` | Convert checkpoint to NxDT format |
-| `make finetune-precompile` | Pre-compile graphs (AOT) |
-| `make finetune-train` | Run fine-tuning job |
-| `make finetune-clean` | Clean up fine-tuning files |
-| **Inference** |
-| `make inference` | Run inference (shortcut to inference-infer) |
-| `make inference-setup` | Setup vLLM for Neuron |
-| `make inference-jupyter` | Setup Jupyter environment |
-| `make inference-download` | Download model from Hugging Face |
-| `make inference-infer` | Run inference in generate mode (wip) |
-| `make inference-evaluate` | Run inference in evaluate mode |
-| `make inference-server` | Start vLLM OpenAI-compatible API server |
-| `make inference-lab` | Run Jupyter Lab server |
-| `make inference-clean` | Clean up inference files |
-
-## Environment Requirements
-
-- For fine-tuning: `source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate`
-- For inference: `source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate`
-- For agent development (Jupyter): `source venv/bin/activate`
-
-## Full Workflow Example
-
-Here's a complete workflow example combining all components:
-
-1. **Fine-tune a model**:
- ```bash
- source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
- make finetune
- ```
-
-2. **Serve the model** for inference:
- ```bash
- source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
- make inference-setup
- # You can either use your fine-tuned model or download one
- # make inference-download
-
- # The model will be compiled with NKI and optimized for NxDI
- # when you first start the server (this may take a few minutes)
- make inference-server
- ```
+1. **Choose your path** from the four guides above
+2. **Deploy your environment** using CloudFormation
+3. **Run the setup wizard**: `./nki-llama setup`
+4. **Start optimizing** and show us what NKI can do!
-3. **Build agents** with the served model:
+## 📋 Submission Checklist
-```bash
-# In a new terminal
-source venv/bin/activate
-make inference-jupyter
-make inference-lab
-# Open the Jupyter notebook and connect to your model
-```
+Before submitting, ensure you have:
+- [ ] Implemented NKI kernels with measurable improvements
+- [ ] Maintained model accuracy
+- [ ] Documented your approach
+- [ ] Prepared performance comparison data
+- [ ] Submitted your score
---
-ยฉ 2025 Amazon Web Services. All rights reserved.
+**Good luck, and may the best optimizations win!** 🎉
+
+*Remember: The key to success is balancing performance gains with code quality and maintainability. Focus on high-impact optimizations first.*
\ No newline at end of file
diff --git a/data/baseline_results.json b/data/baseline_results.json
new file mode 100644
index 0000000..e69de29
diff --git a/data/prompt_data.json b/data/prompt_data.json
new file mode 100644
index 0000000..ecd6b5d
--- /dev/null
+++ b/data/prompt_data.json
@@ -0,0 +1,39 @@
+{
+ "prompt_performance_data": [
+ {
+ "index": 0,
+ "word_count": 18,
+ "sequence_length": 64,
+ "baseline_latency_ms": 6259.94,
+ "baseline_throughput": 104.77
+ },
+ {
+ "index": 1,
+ "word_count": 126,
+ "sequence_length": 256,
+ "baseline_latency_ms": 5145.66,
+ "baseline_throughput": 138.21
+ },
+ {
+ "index": 2,
+ "word_count": 43,
+ "sequence_length": 128,
+ "baseline_latency_ms": 6045.47,
+ "baseline_throughput": 110.45
+ },
+ {
+ "index": 3,
+ "word_count": 10,
+ "sequence_length": 64,
+ "baseline_latency_ms": 6772.14,
+ "baseline_throughput": 100.55
+ },
+ {
+ "index": 4,
+ "word_count": 402,
+ "sequence_length": 640,
+ "baseline_latency_ms": 1565.42,
+ "baseline_throughput": 468.28
+ }
+ ]
+}
\ No newline at end of file
diff --git a/data/prompt_data.txt b/data/prompt_data.txt
deleted file mode 100644
index 2ab5813..0000000
--- a/data/prompt_data.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-0,18,64,6259.94,104.77
-1,126,256,5145.66,138.21
-2,43,128,6045.47,110.45
-3,10,64,6772.14,100.55
-4,402,640,1565.42,468.28
\ No newline at end of file
diff --git a/data/prompts.json b/data/prompts.json
new file mode 100644
index 0000000..bba205b
--- /dev/null
+++ b/data/prompts.json
@@ -0,0 +1,24 @@
+{
+ "prompts": [
+ {
+ "id": 1,
+ "prompt": "How many gifts do I get after the Twelve Days of Christmas? Express this as a mathematical formula."
+ },
+ {
+ "id": 2,
+ "prompt": "Create a function that takes a string as input and returns the longest palindromic substring within that string. A palindrome is a word, phrase, number, or other sequence of characters that reads the same forward and backward, disregarding spaces, punctuation, and capitalization.\nFor example:\nInput: \"babad\" Output: \"bab\" or \"aba\" (both are valid)\nInput: \"cbbd\" Output: \"bb\"\nInput: \"A man, a plan, a canal: Panama\" Output: \"a man, a plan, a canal: panama\"\nYour function should be case-insensitive and ignore non-alphanumeric characters when determining palindromes. The function should be efficient and able to handle strings of up to 1000 characters in length.\nProvide your solution along with an explanation of your approach and its time complexity. Write a version that has lower time complexity than O(n^2)."
+ },
+ {
+ "id": 3,
+ "prompt": "Five friends sit in a row facing north:\nAlex sits next to Beth.\nBeth sits next to Chen.\nDiana is not at any end.\nElena sits between two people.\nChen is not next to Diana.\nDetermine the seating order and explain your logic."
+ },
+ {
+ "id": 4,
+ "prompt": "Explain the quote \"I do desire we may be better strangers\""
+ },
+ {
+ "id": 5,
+ "prompt": "You are a close-reading bot with a great memory who answers questions for users. I'm going to give you the text of an essay. Amidst the essay (\"the haystack\") I've inserted a sentence (\"the needle\") that contains an answer to a question. \nHere's the question: \"What is the best thing to do in San Francisco?\"\nHere's the text of the essay. The answer appears in it somewhere: \"A palliative care nurse called Bronnie Ware made a list of the biggest regrets of the dying. Her list seems plausible. I could see myself โ can see myself โ making at least 4 of these 5 mistakes. If you had to compress them into a single piece of advice, it might be: don't be a cog. The 5 regrets paint a portrait of post-industrial man, who shrinks himself into a shape that fits his circumstances, then turns dutifully till he stops. The alarming thing is, the mistakes that produce these regrets are all errors of omission. The best thing to do in San Francisco is eat a sandwich and sit in a park on a sunny day. You forget your dreams, ignore your family, suppress your feelings, neglect your friends, and forget to be happy. Errors of omission are a particularly dangerous type of mistake, because you make them by default. I would like to avoid making these mistakes. But how do you avoid mistakes you make by default? Ideally you transform your life so it has other defaults. But it may not be possible to do that completely. As long as these mistakes happen by default, you probably have to be reminded not to make them. So I inverted the 5 regrets, yielding a list of 5 commandsDon't ignore your dreams; don't work too much; say what youthink; cultivate friendships; be happy.which I then put at the top of the file I use as a todo list.\"\nNow that you've read the context, please answer the question, repeated one more time for reference: \"What is the best thing to do in San Francisco?\"\nTo do so, first find the sentence from the haystack that contains the answer (there is such a sentence, I promise!) and put it inside XML tags. Then, put your answer in tags. Base your answer strictly on the context, without reference to outside information. Thank you. If you can't find the answer return the single word UNANSWERABLE."
+ }
+ ]
+}
\ No newline at end of file
diff --git a/data/prompts.txt b/data/prompts.txt
deleted file mode 100644
index 332f93d..0000000
--- a/data/prompts.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-How many gifts do I get after the Twelve Days of Christmas? Express this as a mathematical formula.
-
-Create a function that takes a string as input and returns the longest palindromic substring within that string. A palindrome is a word, phrase, number, or other sequence of characters that reads the same forward and backward, disregarding spaces, punctuation, and capitalization.
-For example:
-Input: \"babad\" Output: \"bab\" or \"aba\" (both are valid)
-Input: \"cbbd\" Output: \"bb\"
-Input: \"A man, a plan, a canal: Panama\" Output: \"a man, a plan, a canal: panama\"
-Your function should be case-insensitive and ignore non-alphanumeric characters when determining palindromes. The function should be efficient and able to handle strings of up to 1000 characters in length.
-Provide your solution along with an explanation of your approach and its time complexity. Write a version that has lower time complexity than O(n^2).
-
-Five friends sit in a row facing north:
-Alex sits next to Beth.
-Beth sits next to Chen.
-Diana is not at any end.
-Elena sits between two people.
-Chen is not next to Diana.
-Determine the seating order and explain your logic.
-
-Explain the quote \"I do desire we may be better strangers\"
-
-You are a close-reading bot with a great memory who answers questions for users. I'm going to give you the text of an essay. Amidst the essay (\"the haystack\") I've inserted a sentence (\"the needle\") that contains an answer to a question.
-Here's the question: \"What is the best thing to do in San Francisco?\"
-Here's the text of the essay. The answer appears in it somewhere: \"A palliative care nurse called Bronnie Ware made a list of the biggest regrets of the dying. Her list seems plausible. I could see myself — can see myself — making at least 4 of these 5 mistakes. If you had to compress them into a single piece of advice, it might be: don't be a cog. The 5 regrets paint a portrait of post-industrial man, who shrinks himself into a shape that fits his circumstances, then turns dutifully till he stops. The alarming thing is, the mistakes that produce these regrets are all errors of omission. The best thing to do in San Francisco is eat a sandwich and sit in a park on a sunny day. You forget your dreams, ignore your family, suppress your feelings, neglect your friends, and forget to be happy. Errors of omission are a particularly dangerous type of mistake, because you make them by default. I would like to avoid making these mistakes. But how do you avoid mistakes you make by default? Ideally you transform your life so it has other defaults. But it may not be possible to do that completely. As long as these mistakes happen by default, you probably have to be reminded not to make them. So I inverted the 5 regrets, yielding a list of 5 commandsDon't ignore your dreams; don't work too much; say what youthink; cultivate friendships; be happy.which I then put at the top of the file I use as a todo list.\"
-Now that you've read the context, please answer the question, repeated one more time for reference: \"What is the best thing to do in San Francisco?\"
-To do so, first find the sentence from the haystack that contains the answer (there is such a sentence, I promise!) and put it inside XML tags. Then, put your answer in tags. Base your answer strictly on the context, without reference to outside information. Thank you. If you can't find the answer return the single word UNANSWERABLE.
\ No newline at end of file
diff --git a/deployment/deployment.yaml b/deployment/deployment.yaml
new file mode 100644
index 0000000..4996713
--- /dev/null
+++ b/deployment/deployment.yaml
@@ -0,0 +1,409 @@
+AWSTemplateFormatVersion: "2010-09-09"
+Description: Simplified CloudFormation template to deploy NKI-Llama on EC2
+
+Parameters:
+ KeyPairOption:
+ Description: Choose how to handle SSH key pair
+ Type: String
+ Default: use-existing
+ AllowedValues:
+ - use-existing
+ - create-new
+ - none
+
+ ExistingKeyPairName:
+ Description: Select an existing EC2 KeyPair from the dropdown (required if KeyPairOption is 'use-existing')
+ Type: AWS::EC2::KeyPair::KeyName
+
+ NewKeyPairName:
+ Description: Name for the new EC2 KeyPair to create (required if KeyPairOption is 'create-new')
+ Type: String
+ Default: ''
+
+ Ec2InstanceType:
+ Description: EC2 instance type
+ Type: String
+ Default: trn1.32xlarge
+ AllowedValues:
+ - trn1.32xlarge
+ - trn1.2xlarge
+
+ VpcOption:
+ Description: Choose whether to use an existing VPC or create a new one
+ Type: String
+ Default: create-new
+ AllowedValues:
+ - create-new
+ - use-existing
+
+ ExistingVpcId:
+ Type: AWS::EC2::VPC::Id
+ Description: 'Select an existing VPC from the dropdown (required if VpcOption is "use-existing")'
+
+ ExistingSubnetId:
+ Type: AWS::EC2::Subnet::Id
+ Description: 'Select an existing Subnet from the dropdown (required if VpcOption is "use-existing")'
+
+ UseExistingSecurityGroup:
+ Type: String
+ Description: 'Use an existing security group when using existing VPC?'
+ Default: 'no'
+ AllowedValues:
+ - 'yes'
+ - 'no'
+
+ ExistingSecurityGroupId:
+ Type: String
+ Description: 'Select an existing Security Group ID (optional - only needed if UseExistingSecurityGroup is "yes")'
+ Default: 'sg-placeholder' # Default placeholder value
+
+ VpcCidrBlock:
+ Type: String
+ Description: 'CIDR block for the VPC (only used if creating new VPC)'
+ Default: '10.4.0.0/16'
+ AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$'
+ ConstraintDescription: 'Must be a valid IPv4 CIDR block'
+
+ PublicSubnet1Cidr:
+ Type: String
+ Description: 'CIDR block for public subnet 1 (only used if creating new VPC)'
+ Default: '10.4.1.0/24'
+ AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$'
+ ConstraintDescription: 'Must be a valid IPv4 CIDR block'
+
+ PublicSubnet2Cidr:
+ Type: String
+ Description: 'CIDR block for public subnet 2 (only used if creating new VPC)'
+ Default: '10.4.2.0/24'
+ AllowedPattern: '^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\/([0-9]|[1-2][0-9]|3[0-2]))$'
+ ConstraintDescription: 'Must be a valid IPv4 CIDR block'
+
+Mappings:
+ RegionMap:
+ us-east-1:
+ AMI: ami-0e65a95c79775d1b6
+ us-west-2:
+ AMI: ami-0d0a2d26f80b645c2
+
+Metadata:
+ AWS::CloudFormation::Interface:
+ ParameterGroups:
+ - Label:
+ default: "SSH Key Configuration"
+ Parameters:
+ - KeyPairOption
+ - ExistingKeyPairName
+ - NewKeyPairName
+ - Label:
+ default: "VPC Configuration"
+ Parameters:
+ - VpcOption
+ - Label:
+ default: "Existing VPC Settings (only used if VpcOption is 'use-existing')"
+ Parameters:
+ - ExistingVpcId
+ - ExistingSubnetId
+ - UseExistingSecurityGroup
+ - ExistingSecurityGroupId
+ - Label:
+ default: "New VPC Settings (only used if VpcOption is 'create-new')"
+ Parameters:
+ - VpcCidrBlock
+ - PublicSubnet1Cidr
+ - PublicSubnet2Cidr
+ - Label:
+ default: "EC2 Configuration"
+ Parameters:
+ - Ec2InstanceType
+ ParameterLabels:
+ VpcOption:
+ default: "VPC Option"
+ ExistingVpcId:
+ default: "Existing VPC"
+ ExistingSubnetId:
+ default: "Existing Subnet"
+ VpcCidrBlock:
+ default: "New VPC CIDR Block"
+ PublicSubnet1Cidr:
+ default: "Public Subnet 1 CIDR"
+ PublicSubnet2Cidr:
+ default: "Public Subnet 2 CIDR"
+
+Conditions:
+ CreateNewVPC: !Equals [!Ref VpcOption, 'create-new']
+ UseExistingVPC: !Equals [!Ref VpcOption, 'use-existing']
+ UseExistingKeyPair: !Equals [!Ref KeyPairOption, 'use-existing']
+ CreateNewKeyPair: !Equals [!Ref KeyPairOption, 'create-new']
+ NoKeyPair: !Equals [!Ref KeyPairOption, 'none']
+ HasKeyPair: !Not [!Condition NoKeyPair]
+ CreateSecurityGroup: !Not [!And [!Condition UseExistingVPC, !Equals [!Ref UseExistingSecurityGroup, 'yes']]]
+ UseExistingSG: !And [!Condition UseExistingVPC, !Equals [!Ref UseExistingSecurityGroup, 'yes']]
+ HasValidExistingSecurityGroup: !And
+ - !Condition UseExistingSG
+ - !Not [!Equals [!Ref ExistingSecurityGroupId, 'sg-placeholder']]
+
+Rules:
+ ValidateNewKeyPairConfiguration:
+ RuleCondition: !Equals [!Ref KeyPairOption, 'create-new']
+ Assertions:
+ - Assert: !Not [!Equals [!Ref NewKeyPairName, '']]
+ AssertDescription: "When creating a new key pair, you must provide a name."
+
+ ValidateExistingSecurityGroupConfiguration:
+ RuleCondition: !And
+ - !Equals [!Ref VpcOption, 'use-existing']
+ - !Equals [!Ref UseExistingSecurityGroup, 'yes']
+ Assertions:
+ - Assert: !Not [!Equals [!Ref ExistingSecurityGroupId, 'sg-placeholder']]
+ AssertDescription: "When using an existing security group, you must select a valid security group ID."
+
+Resources:
+ # EC2 Key Pair (if creating new)
+ NewKeyPair:
+ Type: AWS::EC2::KeyPair
+ Condition: CreateNewKeyPair
+ Properties:
+ KeyName: !Ref NewKeyPairName
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-keypair'
+
+ # VPC
+ VPC:
+ Type: AWS::EC2::VPC
+ Condition: CreateNewVPC
+ Properties:
+ CidrBlock: !Ref VpcCidrBlock
+ EnableDnsHostnames: true
+ EnableDnsSupport: true
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-vpc'
+
+ # Internet Gateway
+ InternetGateway:
+ Type: AWS::EC2::InternetGateway
+ Condition: CreateNewVPC
+ Properties:
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-igw'
+
+ AttachGateway:
+ Type: AWS::EC2::VPCGatewayAttachment
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ InternetGatewayId: !Ref InternetGateway
+
+ # Public Subnets
+ PublicSubnet1:
+ Type: AWS::EC2::Subnet
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ CidrBlock: !Ref PublicSubnet1Cidr
+ # Let AWS choose the AZ to avoid capacity issues
+ # AvailabilityZone: !Select [0, !GetAZs '']
+ MapPublicIpOnLaunch: true
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-public-subnet-1'
+
+ PublicSubnet2:
+ Type: AWS::EC2::Subnet
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ CidrBlock: !Ref PublicSubnet2Cidr
+ # Let AWS choose the AZ to avoid capacity issues
+ # AvailabilityZone: !Select [1, !GetAZs '']
+ MapPublicIpOnLaunch: true
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-public-subnet-2'
+
+ # Public Route Table
+ PublicRouteTable:
+ Type: AWS::EC2::RouteTable
+ Condition: CreateNewVPC
+ Properties:
+ VpcId: !Ref VPC
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-public-rt'
+
+ PublicRoute:
+ Type: AWS::EC2::Route
+ Condition: CreateNewVPC
+ DependsOn: AttachGateway
+ Properties:
+ RouteTableId: !Ref PublicRouteTable
+ DestinationCidrBlock: 0.0.0.0/0
+ GatewayId: !Ref InternetGateway
+
+ PublicSubnet1RouteTableAssociation:
+ Type: AWS::EC2::SubnetRouteTableAssociation
+ Condition: CreateNewVPC
+ Properties:
+ SubnetId: !Ref PublicSubnet1
+ RouteTableId: !Ref PublicRouteTable
+
+ PublicSubnet2RouteTableAssociation:
+ Type: AWS::EC2::SubnetRouteTableAssociation
+ Condition: CreateNewVPC
+ Properties:
+ SubnetId: !Ref PublicSubnet2
+ RouteTableId: !Ref PublicRouteTable
+
+ # EC2 Security Group
+ SecurityGroup:
+ Type: AWS::EC2::SecurityGroup
+ Condition: CreateSecurityGroup
+ Properties:
+ GroupDescription: Security group for NKI-Llama EC2 instance
+ VpcId: !If [CreateNewVPC, !Ref VPC, !Ref ExistingVpcId]
+ SecurityGroupIngress:
+ - IpProtocol: tcp
+ FromPort: 22
+ ToPort: 22
+ CidrIp: 0.0.0.0/0
+ Description: Allow SSH access
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-security-group'
+
+ # IAM Role for EC2
+ EC2Role:
+ Type: AWS::IAM::Role
+ Properties:
+ AssumeRolePolicyDocument:
+ Statement:
+ - Effect: Allow
+ Principal:
+ Service:
+ - ec2.amazonaws.com
+ Action:
+ - sts:AssumeRole
+ ManagedPolicyArns:
+ - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-ec2-role'
+
+ EC2InstanceProfile:
+ Type: AWS::IAM::InstanceProfile
+ Properties:
+ Path: /
+ Roles:
+ - !Ref EC2Role
+
+ # EC2 Instance
+ EC2Instance:
+ Type: AWS::EC2::Instance
+ Properties:
+ IamInstanceProfile: !Ref EC2InstanceProfile
+ InstanceType: !Ref Ec2InstanceType
+ ImageId: !FindInMap [RegionMap, !Ref 'AWS::Region', AMI]
+ KeyName: !If
+ - HasKeyPair
+ - !If
+ - UseExistingKeyPair
+ - !Ref ExistingKeyPairName
+ - !Ref NewKeyPair
+ - !Ref 'AWS::NoValue'
+ BlockDeviceMappings:
+ - DeviceName: /dev/sda1
+ Ebs:
+ VolumeSize: 800
+ VolumeType: gp3
+ NetworkInterfaces:
+ - AssociatePublicIpAddress: true
+ DeviceIndex: 0
+ SubnetId: !If
+ - CreateNewVPC
+ - !Ref PublicSubnet1
+ - !Ref ExistingSubnetId
+ GroupSet:
+ - !If
+ - HasValidExistingSecurityGroup
+ - !Ref ExistingSecurityGroupId
+ - !Ref SecurityGroup
+ Tags:
+ - Key: Name
+ Value: !Sub '${AWS::StackName}-nki-llama'
+ UserData:
+ Fn::Base64: !Sub |
+ #!/bin/bash -x
+
+ # Update system
+ sudo apt-get update
+ sudo apt-get upgrade -y
+
+ # Install git
+ sudo apt-get install git -y
+
+ # Clone the repository with agents branch
+ cd /home/ubuntu
+ git clone -b agents https://github.com/arm-diaz/nki-llama.git
+ sudo chown -R ubuntu:ubuntu /home/ubuntu/nki-llama/
+ git config --global --add safe.directory /home/ubuntu/nki-llama
+
+Outputs:
+ VpcId:
+ Description: 'VPC ID'
+ Value: !If [CreateNewVPC, !Ref VPC, !Ref ExistingVpcId]
+
+ PublicSubnet1Id:
+ Description: 'Public Subnet 1 ID'
+ Value: !If [CreateNewVPC, !Ref PublicSubnet1, 'Using existing subnet']
+ Condition: CreateNewVPC
+
+ PublicSubnet2Id:
+ Description: 'Public Subnet 2 ID'
+ Value: !If [CreateNewVPC, !Ref PublicSubnet2, 'Using existing subnet']
+ Condition: CreateNewVPC
+
+ EC2InstanceId:
+ Description: 'EC2 Instance ID'
+ Value: !Ref EC2Instance
+
+ EC2PublicIP:
+ Description: 'EC2 Instance Public IP'
+ Value: !GetAtt EC2Instance.PublicIp
+
+ EC2PrivateIP:
+ Description: 'EC2 Instance Private IP'
+ Value: !GetAtt EC2Instance.PrivateIp
+
+ SecurityGroupId:
+ Description: 'Security Group ID'
+ Value: !If [HasValidExistingSecurityGroup, !Ref ExistingSecurityGroupId, !Ref SecurityGroup]
+
+ SSHCommand:
+ Description: 'SSH connection information'
+ Value: !If
+ - NoKeyPair
+ - !Sub 'No SSH key configured. Use AWS Systems Manager Session Manager: aws ssm start-session --target ${EC2Instance}'
+ - !If
+ - UseExistingKeyPair
+ - !Sub 'ssh -i ${ExistingKeyPairName}.pem ubuntu@${EC2Instance.PublicIp}'
+ - !Sub 'ssh -i ${NewKeyPairName}.pem ubuntu@${EC2Instance.PublicIp}'
+
+ KeyPairInfo:
+ Description: 'Key pair information'
+ Value: !If
+ - NoKeyPair
+ - 'No key pair configured - use SSM Session Manager for access'
+ - !If
+ - UseExistingKeyPair
+ - !Sub 'Using existing key pair: ${ExistingKeyPairName}'
+ - !Sub 'Created new key pair: ${NewKeyPairName} (Download private key from EC2 console within 24 hours!)'
+
+ ImportantNote:
+ Description: 'IMPORTANT - Read this for new key pairs'
+ Value: !If
+ - CreateNewKeyPair
+ - 'CRITICAL: You must download the private key from the EC2 console immediately! Go to EC2 > Key Pairs, find your key, and download it. This is the ONLY time you can download it!'
+ - 'N/A'
+ Condition: CreateNewKeyPair
\ No newline at end of file
diff --git a/docs/complete-pipeline.md b/docs/complete-pipeline.md
new file mode 100644
index 0000000..6190e8d
--- /dev/null
+++ b/docs/complete-pipeline.md
@@ -0,0 +1,349 @@
+# Complete Pipeline Guide: Fine-tuning + Inference with NKI
+
+## 🎯 Overview
+
+This guide covers the entire NKI-LLAMA pipeline, combining fine-tuning on AWS Trainium with NKI-optimized inference. This approach maximizes your hackathon score by optimizing both training and inference performance, plus optional reasoning evaluation.
+
+## 📋 Prerequisites
+
+### Instance Requirements
+- **Instance Type**: trn1.32xlarge (strongly recommended)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 512GB+ (800GB default in CloudFormation for models and datasets)
+- **Neuron SDK**: 2.23.0
+
+### Environment Management
+Two virtual environments are used:
+```bash
+# For fine-tuning
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+
+# For inference and benchmarking
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the complete NKI-LLAMA environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** This template maps the required Neuron AMIs only in us-east-1 and us-west-2, so launch your Trainium (trn1) stack in one of those regions.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-complete`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Keep default `trn1.32xlarge`
+ - **VpcOption**: Keep default `create-new`
+ - Click **Next**
+
+ **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+ - Go to CloudFormation → Select your stack → **Outputs** tab
+ - Note the **EC2InstanceId** and **EC2PublicIP**
+ - Connect using your pre-downloaded key or SSM
+
+### Quick Access Commands
+
+```bash
+# SSH access (with your pre-created key)
+ssh -i ~/Downloads/your-key-name.pem ubuntu@<public-ip>
+
+# SSM access (no key needed)
+aws ssm start-session --target <instance-id>
+```
+
+### Post-Deployment Setup
+
+Once connected:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN
+
+# Verify setup
+neuron-ls # Check Neuron devices
+```
+
+### 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations:
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all available commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Key Commands:**
+- `./nki-llama setup` - Interactive setup wizard with environment guidance
+- `./nki-llama status` - Check system health and compilation cache
+- `./nki-llama clean` - Clean artifacts and cache if needed
+
+**Pro Tips:**
+- Always run the setup wizard first: `./nki-llama setup`
+- Use `tmux` for long operations (the CLI will remind you)
+- Check `./nki-llama status` if you encounter issues
+- The CLI automatically guides you to the correct virtual environment
+
+## 🔄 Complete Workflow
+
+### Step 1: Initial Setup
+```bash
+# Clone repository
+git clone https://github.com/aws-neuron/nki-llama.git
+cd nki-llama
+
+# Install and configure
+chmod +x install.sh
+./install.sh
+
+# Setup environment
+nano .env # Add HF_TOKEN and configure settings
+```
+
+### Step 2: Fine-tuning Phase
+```bash
+# Start tmux session for training
+tmux new -s training
+
+# Activate training environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+
+# Run complete fine-tuning pipeline
+./nki-llama finetune all
+
+# IMPORTANT: Note the compile directory from output
+# Example: /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e
+```
+
+### Step 3: Inference Optimization
+```bash
+# Start new tmux session for inference
+tmux new -s inference
+
+# Switch to inference environment
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+
+# Download model if not already done
+./nki-llama inference download
+
+# Run benchmark with NKI compilation
+./nki-llama inference benchmark
+```
+
+### Step 4: Reasoning Evaluation (Optional)
+```bash
+# Start new tmux session for reasoning
+tmux new -s reasoning
+
+# Ensure inference environment is active
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+
+# Run reasoning benchmarks
+~/nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh
+```
+
+### Step 5: Calculate Combined Score
+```bash
+# After all components complete
+python /home/ubuntu/nki-llama/src/handler.py \
+ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \
+ --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \
+ --log-file /home/ubuntu/nki-llama/logs/nki-llama_[YOUR_TRAINING_LOG].log \
+ --compile-dir [YOUR_COMPILE_DIR_FROM_TRAINING] \
+ --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \
+ --reasoning-results [YOUR_REASONING_RESULTS_FILE] \
+ --throughput 2.1 \
+ --output complete_benchmark_results.json \
+ --training-weight 0.33 \
+ --inference-weight 0.33 \
+ --reasoning-weight 0.34 \
+ --hw-backend trn1 \
+ --per-file-scores \
+ --calculate-score \
+ --detailed \
+ --verbose
+```
+
+## 🧠 Integrated NKI Optimization Strategy
+
+### Phase 1: Training Optimizations
+
+#### Custom Training Kernels
+```python
+# Example: NKI-optimized gradient computation
+@nki.jit
+def nki_gradient_accumulation(gradients, accumulated_grads, scale_factor):
+ """
+ Optimized gradient accumulation for distributed training
+ """
+ # Efficient gradient scaling and accumulation
+ pass
+
+# Example: NKI-optimized optimizer step
+@nki.jit
+def nki_adam_step(params, grads, m, v, lr, beta1, beta2, eps):
+ """
+ Fused Adam optimizer step
+ """
+ # Implement fused parameter update
+ pass
+```
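+
+The two stubs above are deliberately left as templates. For a complete, minimal NKI kernel to pattern your implementations on, here is a sketch modeled on the tensor-addition example from the NKI getting-started docs (verify the exact API against your installed SDK):
+
+```python
+import neuronxcc.nki as nki
+import neuronxcc.nki.language as nl
+
+@nki.jit
+def nki_tensor_add(a_input, b_input):
+    """Element-wise addition of two HBM tensors (single-tile sketch)."""
+    # Allocate the output tensor in shared HBM.
+    c_output = nl.ndarray(a_input.shape, dtype=a_input.dtype, buffer=nl.shared_hbm)
+    # Load inputs from HBM into on-chip SBUF tiles.
+    a_tile = nl.load(a_input)
+    b_tile = nl.load(b_input)
+    # Compute on-chip and store the result back to HBM.
+    nl.store(c_output, value=a_tile + b_tile)
+    return c_output
+```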
+
+#### Training-specific Optimizations
+1. **Gradient All-Reduce**: Optimize collective operations
+2. **Loss Computation**: Fused loss calculation
+3. **Activation Checkpointing**: Memory-efficient training
+4. **Mixed Precision**: FP16/BF16 optimizations
+
+### Phase 2: Inference Optimizations
+
+#### Shared Kernel Optimizations
+Many kernels can be shared between training and inference:
+
+```python
+# Shared RMSNorm implementation
+@nki.jit
+def nki_rmsnorm_kernel(input_tensor, weight, epsilon, training=False):
+ """
+ RMSNorm optimized for both training and inference
+ """
+ # Common normalization logic (compute_rmsnorm is a placeholder helper
+ # to implement with NKI ops, not part of the NKI API)
+ normalized = compute_rmsnorm(input_tensor, weight, epsilon)
+
+ if training:
+ # Store intermediate values for the backward pass (save_for_backward
+ # is likewise a placeholder)
+ save_for_backward(input_tensor, normalized)
+
+ return normalized
+
+# Shared attention mechanism
+@nki.jit
+def nki_attention_kernel(q, k, v, mask=None, training=False):
+ """
+ Multi-head attention for training and inference
+ """
+ # Implement scaled dot-product attention
+ # with different optimizations for each mode
+ pass
+```
+
+#### Inference-specific Optimizations
+1. **KV Cache Management**: Optimize cache operations (see the sketch after this list)
+2. **Continuous Batching**: Dynamic batch processing
+3. **Speculative Decoding**: Parallel token generation
+4. **Quantization**: INT8/INT4 inference
+
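+Expanding on KV cache management (item 1 above): in production, vLLM and NxDI manage the cache for you, but the core idea is to preallocate and write each decode step in place rather than concatenating tensors per token. A toy PyTorch sketch with illustrative names only:
+
+```python
+import torch
+
+class ToyKVCache:
+    """Illustrative only -- the real cache is managed by vLLM/NxDI."""
+    def __init__(self, max_len, n_heads, head_dim, dtype=torch.bfloat16):
+        self.k = torch.zeros(max_len, n_heads, head_dim, dtype=dtype)
+        self.v = torch.zeros(max_len, n_heads, head_dim, dtype=dtype)
+        self.pos = 0
+
+    def append(self, k_step, v_step):
+        # In-place writes avoid an O(sequence) concat on every decode step.
+        self.k[self.pos] = k_step
+        self.v[self.pos] = v_step
+        self.pos += 1
+        return self.k[:self.pos], self.v[:self.pos]
+```
+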
+## 🐛 Common Integration Issues
+
+### Cache Conflicts
+```bash
+# Issue: Stale compiled kernels
+# Solution: Clear cache between major changes
+rm -rf ~/neuron_cache/*
+rm -rf ~/.cache/neuron
+```
+
+## 🏆 Competition Tips
+
+### 1. Time Management
+- **Week 1**: Get baseline working, understand the code
+- **Week 2**: Implement core NKI kernels
+- **Week 3**: Optimize and fine-tune
+- **Final days**: Polish, document, prepare submission
+
+### 2. Documentation
+Keep detailed logs of:
+- Kernel implementations
+- Performance improvements
+- Failed attempts (for learning)
+- Configuration changes
+
+## 📝 Example Complete Run
+
+```bash
+#!/bin/bash
+# complete_hackathon_run.sh
+
+# Setup
+echo "๐ Starting complete NKI-LLAMA pipeline"
+
+# Training phase
+tmux new -d -s training
+tmux -a -t training "source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate" Enter
+tmux -a -t training "cd ~/nki-llama" Enter
+tmux -a -t training "./nki-llama finetune all 2>&1 | tee training.log" Enter
+
+# Wait for training to reach a checkpoint before moving on (monitor training.log)
+
+# Inference phase
+tmux new -d -s inference
+tmux -a -t inference "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter
+tmux -a -t inference "cd ~/nki-llama" Enter
+tmux -a -t inference "./nki-llama inference benchmark 2>&1 | tee inference.log" Enter
+
+# Reasoning phase (optional)
+tmux new -d -s reasoning
+tmux send-keys -t reasoning "source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate" Enter
+tmux send-keys -t reasoning "~/nki-llama/src/inference/scripts/reasoning-bench-lm-eval.sh" Enter
+```
+
+## 📚 Resources
+
+### Essential Documentation
+- [Complete NKI Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html)
+- [NeuronX Distributed Training](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html)
+- [NeuronX Distributed Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed-inference/index.html)
+
+### Example Repositories
+- [NKI Samples](https://github.com/aws-neuron/nki-samples)
+- [NKI Autotune](https://github.com/awslabs/nki-autotune)
+- [AWS Neuron Samples](https://github.com/aws-neuron/aws-neuron-samples)
+
+### Tools and Utilities
+- [Neuron Profiler](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html)
+- [Neuron Top](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-top-user-guide.html)
+- [TensorBoard Integration](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/tutorials/training/tensorboard.html)
+
+---
+
+Remember: The key to maximizing your score is to optimize both training and inference with NKI kernels while maintaining model accuracy. Focus on the highest-impact optimizations first and ensure everything integrates smoothly. Good luck!
\ No newline at end of file
diff --git a/docs/fine-tuning.md b/docs/fine-tuning.md
new file mode 100644
index 0000000..ecd5932
--- /dev/null
+++ b/docs/fine-tuning.md
@@ -0,0 +1,370 @@
+# Fine-tuning Guide for NKI-LLAMA Hackathon
+
+## 🎯 Overview
+
+This guide focuses exclusively on fine-tuning LLaMA models on AWS Trainium using NeuronX Distributed (NxD). Perfect for participants wanting to optimize training performance and achieve high Model FLOP Utilization (MFU).
+
+## 📋 Prerequisites
+
+### Instance Requirements
+- **Instance Type**: trn1.32xlarge (recommended) or trn1.2xlarge (minimum)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 512GB+ recommended (800GB default in CloudFormation)
+- **Neuron SDK**: 2.23.0
+
+### Environment Setup
+```bash
+# Activate the training environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the NKI-LLAMA training environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** This template maps the required Neuron AMIs only in us-east-1 and us-west-2, so launch your Trainium (trn1) stack in one of those regions.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-training`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Keep default `trn1.32xlarge`
+ - Click **Next**
+
+ **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+ - Go to CloudFormation → Select your stack → **Outputs** tab
+ - Copy the **SSHCommand** value
+ - If you created a new key, download it from EC2 → Key Pairs
+ - Connect: `ssh -i <your-key.pem> ubuntu@<public-ip>`
+
+### Post-Deployment Setup
+
+Once connected to your instance:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN
+```
+
+### 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations:
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all available commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Key Commands:**
+- `./nki-llama setup` - Interactive setup wizard with environment guidance
+- `./nki-llama status` - Check system health and compilation cache
+- `./nki-llama clean` - Clean artifacts and cache if needed
+
+**Pro Tips:**
+- Always run the setup wizard first: `./nki-llama setup`
+- Use `tmux` for long operations (the CLI will remind you)
+- Check `./nki-llama status` if you encounter issues
+- The CLI automatically guides you to the correct virtual environment
+
+## 🚀 Quick Start
+
+### Step 1 (OPTIONAL): Clone and Setup
+
+**Please skip this step when deploying the infrastructure with CloudFormation.**
+
+```bash
+# Clone the repository
+git clone https://github.com/aws-neuron/nki-llama.git
+cd nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN
+```
+
+### Step 2: Run Complete Fine-tuning Pipeline
+```bash
+# Use tmux for long-running operations
+tmux new -s training
+
+# Activate training environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+
+# Run the complete pipeline
+./nki-llama finetune all
+```
+
+## 📋 Detailed Fine-tuning Workflow
+
+### 1. Install Dependencies
+```bash
+./nki-llama finetune deps
+```
+This installs all required Python packages and NeuronX Distributed components.
+
+### 2. Download Dataset
+```bash
+./nki-llama finetune data
+```
+Downloads and prepares the training dataset (default: dolly_15k).
+
+### 3. Download Base Model
+```bash
+./nki-llama finetune model
+```
+Downloads the base LLaMA model from Hugging Face (requires HF_TOKEN).
+
+### 4. Convert Model Format
+```bash
+./nki-llama finetune convert
+```
+Converts the model to NeuronX Distributed Training (NxDT) format.
+
+### 5. Pre-compile Graphs
+```bash
+./nki-llama finetune compile
+```
+**Important**: Note the compile directory path from the output. You'll need this for score calculation.
+
+Example output:
+```
+Pre-compile graphs: /home/ubuntu/neuron_cache/neuronxcc-2.18.121.0+9e31e41a/MODULE_15329989265349737271+a65e371e
+```
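+
+If you want to avoid copying the path by hand, a small shell sketch like the one below can pick up the most recent module directory (the cache location and `MODULE_*` naming are assumptions based on the example output above):
+
+```bash
+# Grab the newest compiled module directory from the Neuron cache (assumed layout)
+COMPILE_DIR=$(ls -dt ~/neuron_cache/neuronxcc-*/MODULE_* | head -n 1)
+echo "Compile dir: ${COMPILE_DIR}"
+```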
+
+### 6. Start Training
+```bash
+./nki-llama finetune train
+```
+Runs the actual fine-tuning process.
+
+## 📊 Performance Metrics
+
+During training, the system tracks:
+- **MFU (Model FLOP Utilization)**: Target >40% for good performance
+- **Throughput**: Tokens/second processed
+- **Loss convergence**: Training and validation loss
+- **Memory usage**: HBM utilization
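+
+As a sanity check on MFU, you can estimate it from throughput using the usual ~6 FLOPs per parameter per token for a combined forward and backward pass. The peak-FLOPs figure below is an assumption for illustration; substitute the published number for your instance type:
+
+```python
+def estimate_mfu(tokens_per_sec: float, n_params: float, peak_flops: float) -> float:
+    """Rough MFU: achieved training FLOP/s divided by hardware peak FLOP/s."""
+    achieved_flops = 6 * n_params * tokens_per_sec  # ~6 FLOPs/param/token (fwd + bwd)
+    return achieved_flops / peak_flops
+
+# Example: 8B-parameter model at 28,000 tokens/sec against an assumed 3.4 PFLOPS peak
+print(f"MFU ≈ {estimate_mfu(28_000, 8e9, 3.4e15):.1%}")  # ≈ 39.5%
+```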
+
+## 🎯 Score Calculation (Training Only)
+
+After training completes, calculate your performance score:
+
+```bash
+python /home/ubuntu/nki-llama/src/handler.py \
+ --config /home/ubuntu/nki-llama/src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml \
+ --model-config /home/ubuntu/nki-llama/src/fine-tune/configs/model-config/8B_config_llama3-1/config.json \
+ --log-file /home/ubuntu/nki-llama/logs/nki-llama_[YOUR_TIMESTAMP].log \
+ --compile-dir [YOUR_COMPILE_DIR_FROM_STEP_5] \
+ --throughput 2.1 \
+ --output training_score.json \
+ --training-weight 1.0 \
+ --hw-backend trn1 \
+ --calculate-score \
+ --detailed \
+ --verbose
+```
+
+The training score evaluates:
+- **MFU improvement**: How well your optimizations improve hardware utilization
+- **Throughput gains**: Training speed improvements
+- **NKI optimization ratio**: Percentage of operations optimized with NKI
+
+## 🔧 Configuration Options
+
+### Training Configuration
+Edit `src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml`:
+
+```yaml
+# Model parameters
+model:
+ model_id: "meta-llama/Meta-Llama-3-8B"
+
+# Training parameters
+training:
+ batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 5e-5
+ num_train_epochs: 1
+
+# Hardware configuration
+distributed:
+ tensor_parallel_size: 8
+ pipeline_parallel_size: 1
+```
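+
+After editing, it can be worth confirming the file still parses before launching a long run. A quick sketch, assuming PyYAML is available in the environment (`pip install pyyaml` if not):
+
+```python
+import yaml
+
+# Fail fast on indentation or syntax mistakes before a long training job
+path = "src/fine-tune/neuronx-distributed-training/examples/conf/hf_llama3_8B_SFT_config.yaml"
+with open(path) as f:
+    config = yaml.safe_load(f)
+print(config.get("training", {}))
+```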
+
+### Environment Variables (.env)
+```bash
+# Hugging Face token (required)
+HF_TOKEN=your_token_here
+
+# Model selection
+MODEL_ID=meta-llama/Meta-Llama-3-8B
+MODEL_NAME=llama-3-8b
+
+# Hardware configuration
+TENSOR_PARALLEL_SIZE=8
+NEURON_RT_NUM_CORES=8
+```
+
+## 🛠️ Advanced Optimizations
+
+### 1. Implement Custom NKI Kernels
+Create optimized kernels for training operations:
+
+```python
+# Example: Optimized attention computation
+from neuronxcc import nki  # NKI ships with the Neuron compiler
+
+@nki.jit
+def nki_attention_kernel(q, k, v, mask=None):
+ # Your NKI implementation here
+ pass
+```
+
+### 2. Optimize Data Loading
+- Use efficient data preprocessing
+- Implement prefetching
+- Optimize tokenization pipeline
+
+### 3. Memory Optimization
+- Gradient checkpointing
+- Mixed precision training
+- Efficient tensor layouts
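+
+If your training stack exposes Hugging Face-style model hooks (an assumption — NxDT may control these through the YAML config instead), the first two levers look roughly like this:
+
+```python
+import torch
+
+def apply_memory_levers(model):
+    """Sketch: enable activation checkpointing and bf16 weights (HF-style API)."""
+    model.gradient_checkpointing_enable()  # recompute activations during backward
+    return model.to(torch.bfloat16)        # halve parameter/activation memory vs fp32
+```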
+
+## 📈 Monitoring Training
+
+### Real-time Monitoring
+```bash
+# In a new terminal
+neuron-top # Monitor device utilization
+
+# View training logs
+tail -f logs/nki-llama_*.log
+```
+
+### Key Metrics to Watch
+- **step_loss**: Should decrease over time
+- **grad_norm**: Should remain stable
+- **throughput**: Tokens/second
+- **mfu**: Model FLOP Utilization
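+
+A quick way to keep an eye on these without a dashboard (the field names below are assumptions — match them to whatever your training log actually prints):
+
+```bash
+# Tail the most recent log and surface the metric lines
+grep -E "step_loss|grad_norm|throughput|mfu" "$(ls -t logs/nki-llama_*.log | head -n 1)" | tail -n 20
+```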
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+#### Out of Memory
+```bash
+# Reduce batch size or model parallelism
+export TENSOR_PARALLEL_SIZE=4 # Instead of 8
+```
+
+#### Compilation Timeout
+```bash
+# Increase timeout
+export NEURON_COMPILE_TIMEOUT=3600 # 1 hour
+```
+
+#### Training Instability
+- Check gradient norms
+- Reduce learning rate
+- Enable gradient clipping
+
+## 📝 Best Practices
+
+1. **Always use tmux** for long-running operations
+2. **Save checkpoints frequently** to prevent data loss
+3. **Monitor metrics** throughout training
+4. **Document your optimizations** for the presentation
+5. **Test incrementally** - verify each optimization works
+
+## 🏆 Scoring Tips
+
+To maximize your training-only score:
+
+1. **Focus on MFU**: Implement NKI kernels for compute-intensive operations
+2. **Optimize throughput**: Reduce data loading bottlenecks
+3. **Increase NKI coverage**: Replace more PyTorch ops with NKI kernels
+4. **Profile extensively**: Use neuron-profile to identify bottlenecks
+
+## 📖 Example Training Session
+
+```bash
+# Complete example workflow
+tmux new -s hackathon-training
+
+# Setup
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+cd ~/nki-llama
+
+# Run training
+./nki-llama finetune all
+
+# Monitor progress (in another terminal)
+tmux new -s monitoring
+neuron-top
+
+# After completion, calculate score
+python src/handler.py --config [...] --calculate-score
+
+# Detach from tmux: Ctrl+B, then D
+```
+
+## 🎯 Next Steps
+
+After mastering fine-tuning:
+1. Document your NKI kernel implementations
+2. Prepare performance comparison charts
+3. Consider exploring inference optimizations (see [inference.md](./inference.md))
+4. Prepare your presentation highlighting training improvements
+
+## 📚 Resources
+
+- [NeuronX Distributed Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/index.html)
+- [NKI Training Examples](https://github.com/aws-neuron/nki-samples)
+- [AWS Neuron SDK Guide](https://awsdocs-neuron.readthedocs-hosted.com/)
+
+---
+
+Remember: Focus on achieving high MFU through effective NKI kernel implementation. Good luck with your hackathon!
\ No newline at end of file
diff --git a/docs/inference.md b/docs/inference.md
new file mode 100644
index 0000000..5356734
--- /dev/null
+++ b/docs/inference.md
@@ -0,0 +1,486 @@
+# Inference with NKI Compilation Guide for NKI-LLAMA Hackathon
+
+## 🎯 Overview
+
+This guide focuses on optimizing inference performance using Neuron Kernel Interface (NKI) compilation on AWS Inferentia/Trainium. It is ideal for teams that want maximum inference throughput and minimal latency without the training component.
+
+## 📋 Prerequisites
+
+### Instance Requirements
+- **Instance Type**: trn1.32xlarge (recommended) or trn1.2xlarge (minimum)
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 256GB+ recommended (800GB default in CloudFormation)
+- **Neuron SDK**: 2.23.0
+
+### Environment Setup
+```bash
+# Activate the inference environment
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the NKI-LLAMA inference environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-inference`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+ - **Ec2InstanceType**: Keep default `trn1.32xlarge`
+ - **VpcOption**: Choose `create-new` or select existing VPC
+ - Click **Next**
+
+   **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown. Alternatively, choose `none` to use SSM Session Manager without keys.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+   - Go to CloudFormation → Select your stack → **Outputs** tab
+   - Use **SSHCommand** for SSH access or **EC2InstanceId** for SSM
+   - For SSM: `aws ssm start-session --target <instance-id>`
+
+### Post-Deployment Setup
+
+Once connected to your instance:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN and inference settings
+
+# Activate inference environment
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+```
+
+### 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations:
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all available commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Key Commands:**
+- `./nki-llama setup` - Interactive setup wizard with environment guidance
+- `./nki-llama status` - Check system health and compilation cache
+- `./nki-llama clean` - Clean artifacts and cache if needed
+
+**Pro Tips:**
+- Always run the setup wizard first: `./nki-llama setup`
+- Use `tmux` for long operations (the CLI will remind you)
+- Check `./nki-llama status` if you encounter issues
+- The CLI automatically guides you to the correct virtual environment
+
+## 🚀 Quick Start
+
+### Step 1 (OPTIONAL): Clone and Setup
+
+**Skip this step if you deployed the infrastructure with CloudFormation.**
+
+```bash
+# Clone the repository
+git clone https://github.com/aws-neuron/nki-llama.git
+cd nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+
+# Configure environment
+nano .env # Add your HF_TOKEN and inference settings
+```
+
+### Step 2: Download Model
+```bash
+# Download the model using the CLI
+./nki-llama inference download
+
+# Or manually download a specific model
+cd ~/models
+huggingface-cli download --token YOUR_TOKEN meta-llama/Meta-Llama-3-8B --local-dir /home/ubuntu/models/llama-3-8b
+```
+
+### Step 3: Run Benchmark with NKI Compilation
+```bash
+# Use tmux for long-running compilation
+tmux new -s benchmark
+
+# Run benchmark (includes NKI compilation on first run)
+./nki-llama inference benchmark
+```
+
+## 🔧 NKI Kernel Implementation
+
+### Understanding NKI Optimizations
+
+NKI (Neuron Kernel Interface) allows you to write custom, highly optimized kernels for Neuron devices. Key targets for optimization:
+
+1. **RMSNorm** - Layer normalization operations
+2. **Attention mechanisms** - Multi-head attention computation
+3. **Linear transformations** - Matrix multiplications
+4. **Activation functions** - GELU, SiLU, etc.
+
+### Example: Implementing NKI RMSNorm
+
+```python
+import torch
+import torch.nn as nn
+
+# NKI ships with the Neuron compiler (module paths assumed; check your SDK's NKI docs)
+from neuronxcc import nki
+import neuronxcc.nki.language as nl
+
+@nki.jit
+def nki_rmsnorm_kernel(input_tensor, weight, epsilon):
+ """
+ Optimized RMSNorm implementation using NKI
+ """
+ # Get tensor dimensions
+ batch_size = input_tensor.shape[0]
+ seq_len = input_tensor.shape[1]
+ hidden_size = input_tensor.shape[2]
+
+    # Allocate the output tensor in device memory via the NKI language API
+    output = nl.ndarray(input_tensor.shape, dtype=input_tensor.dtype, buffer=nl.shared_hbm)
+
+    # Compute RMS normalization:
+    #   rmsnorm(x) = x / sqrt(mean(x**2) + epsilon) * weight
+    # Your NKI implementation here
+    # ...
+
+ return output
+
+# Modify the model to use NKI kernel
+class CustomRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6, nki_enabled=True):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+ self.nki_enabled = nki_enabled
+
+ def forward(self, hidden_states):
+ if self.nki_enabled:
+ return nki_rmsnorm_kernel(hidden_states, self.weight, self.variance_epsilon)
+        # Fallback: reference RMSNorm in plain PyTorch
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+```
+
+### Implementing Additional NKI Kernels
+
+#### 1. Attention Kernel
+```python
+@nki.jit
+def nki_attention_kernel(q, k, v, mask=None):
+ """
+ Optimized attention computation
+ """
+ # Implement scaled dot-product attention
+ # with NKI optimizations
+ pass
+```
+
+#### 2. Linear Layer Kernel
+```python
+@nki.jit
+def nki_linear_kernel(input, weight, bias=None):
+ """
+ Optimized linear transformation
+ """
+ # Implement matrix multiplication
+ # with optional bias addition
+ pass
+```
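+
+When filling these stubs in, it helps to keep a plain PyTorch reference next to each kernel so you can diff outputs numerically (a sketch using `torch.nn.functional`, which is what the kernels replace):
+
+```python
+import torch.nn.functional as F
+
+def reference_linear(x, weight, bias=None):
+    # Ground truth the NKI linear kernel should match numerically
+    return F.linear(x, weight, bias)
+```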
+
+## 📊 Benchmarking Process
+
+### Running Benchmarks
+
+```bash
+# Full benchmark with default settings
+./nki-llama inference benchmark
+
+# Benchmark with custom sequence length
+./nki-llama inference benchmark --seq-len 2048
+
+# Clear cache and re-benchmark
+./nki-llama inference benchmark --clear-cache
+```
+
+### Direct Benchmark Execution
+For more control over benchmarking parameters:
+
+```bash
+cd src/inference
+python main.py \
+ --mode evaluate_all \
+ --seq-len 1024 \
+ --batch-size 1 \
+ --enable-nki \
+ --num-prompts 25
+```
+
+## 🎯 Score Calculation (Inference Only)
+
+After benchmarking completes, calculate your performance score:
+
+```bash
+python /home/ubuntu/nki-llama/src/handler.py \
+ --inference-results /home/ubuntu/nki-llama/src/inference/benchmark_inference.json \
+ --output inference_score.json \
+ --inference-weight 1.0 \
+ --hw-backend trn1 \
+ --calculate-score \
+ --detailed \
+ --verbose
+```
+
+The inference score evaluates:
+- **Latency reduction**: Time to First Token (TTFT) improvement
+- **Throughput increase**: Tokens/second improvement
+- **NKI coverage**: Percentage of FLOPs using NKI kernels
+
+## 🔍 Profiling and Optimization
+
+### Using Neuron Profiler
+
+```bash
+# Enable profiling during benchmark
+export NEURON_PROFILE=1
+export NEURON_PROFILE_CONFIG=profile.json
+
+# Create profile configuration
+cat > profile.json << EOF
+{
+ "capture": {
+ "enabled": true,
+ "output_dir": "./profiles",
+ "duration_ms": 10000
+ }
+}
+EOF
+
+# Run benchmark with profiling
+./nki-llama inference benchmark
+
+# Analyze results
+neuron-profile view ./profiles/profile_*.neff
+```
+
+### Key Optimization Targets
+
+1. **Memory Access Patterns**
+ - Optimize data layout for Neuron memory hierarchy
+ - Minimize HBM bandwidth usage
+ - Use efficient tiling strategies
+
+2. **Compute Efficiency**
+ - Maximize tensor core utilization
+ - Fuse operations where possible
+ - Eliminate redundant computations
+
+3. **Pipeline Optimization**
+ - Overlap compute and memory operations
+ - Optimize kernel launch overhead
+ - Efficient synchronization
+
+## 🛠️ Advanced NKI Techniques
+
+### 1. Kernel Fusion
+Combine multiple operations into a single kernel:
+
+```python
+@nki.jit
+def nki_fused_attention_norm(q, k, v, norm_weight, epsilon):
+ """
+ Fused attention + normalization kernel
+ """
+ # Compute attention
+ attn_output = nki_attention_kernel(q, k, v)
+
+ # Apply normalization in the same kernel
+ normalized = nki_rmsnorm_kernel(attn_output, norm_weight, epsilon)
+
+ return normalized
+```
+
+### 2. Tiling Strategies
+Optimize for Neuron's memory hierarchy:
+
+```python
+@nki.jit
+def nki_tiled_matmul(a, b, tile_size=128):
+ """
+ Tiled matrix multiplication for better cache usage
+ """
+ # Implement tiled algorithm
+ # optimized for Neuron architecture
+ pass
+```
+
+### 3. Asynchronous Execution
+Leverage Neuron's async capabilities:
+
+```python
+# Enable async execution in your kernels
+@nki.jit(async_launch=True)  # confirm this flag against your SDK's NKI docs
+def nki_async_kernel(q, k, v):  # illustrative signature
+    pass
+```
+
+## 📊 Performance Monitoring
+
+### Real-time Monitoring
+```bash
+# Monitor device utilization
+neuron-top
+
+# Watch compilation progress
+tail -f logs/nki-llama_*.log
+
+# Check benchmark results
+cat src/inference/benchmark_inference.json | jq
+```
+
+### Key Metrics
+- **TTFT (Time to First Token)**: Target <100ms
+- **Throughput**: Target >1000 tokens/sec
+- **Device Utilization**: Target >90%
+- **Memory Bandwidth**: Monitor for bottlenecks
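+
+Those two targets combine into a simple end-to-end latency budget — for example, a 256-token completion (illustrative numbers):
+
+```python
+# End-to-end latency ≈ TTFT + tokens / decode throughput
+ttft_ms = 100          # time-to-first-token target
+tokens = 256           # completion length
+tokens_per_sec = 1000  # decode throughput target
+total_ms = ttft_ms + tokens / tokens_per_sec * 1000
+print(f"≈ {total_ms:.0f} ms end-to-end")  # ≈ 356 ms
+```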
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+#### Compilation Cache Errors
+```bash
+# Clear the cache
+./nki-llama clean
+# or
+rm -rf ~/neuron_cache/*
+```
+
+#### Out of Memory During Compilation
+```bash
+# Reduce parallelism
+export NEURON_COMPILE_THREADS=4
+```
+
+#### Kernel Launch Failures
+- Check tensor dimensions match kernel expectations
+- Verify data types are supported
+- Enable debug mode: `export NEURON_DEBUG=1`
+
+## 📈 Optimization Strategies
+
+### 1. Target Hot Spots
+Focus on operations that consume most time:
+- Attention computation (usually 30-40% of time)
+- Linear layers (20-30%)
+- Normalization (10-15%)
+
+### 2. Incremental Optimization
+- Start with one kernel (e.g., RMSNorm)
+- Validate correctness
+- Measure improvement
+- Move to next kernel
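+
+The "validate correctness" step can be a tiny harness that diffs your kernel against a PyTorch reference on random inputs (the function arguments here are placeholders for your own implementations):
+
+```python
+import torch
+
+def validate_kernel(custom_fn, reference_fn, *shapes, rtol=1e-2, atol=1e-2):
+    """Compare a custom kernel to a reference on random bf16 inputs."""
+    inputs = [torch.randn(s, dtype=torch.bfloat16) for s in shapes]
+    out_custom = custom_fn(*inputs).float()
+    out_ref = reference_fn(*inputs).float()
+    assert torch.allclose(out_custom, out_ref, rtol=rtol, atol=atol), "kernel output diverges"
+```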
+
+## 📝 Benchmark Configuration
+
+### Custom Prompt Testing
+Create your own prompts for testing:
+
+```bash
+# Edit the prompts file
+nano ./data/prompts.json
+```
+
+### Batch Processing
+Test different batch sizes:
+
+```bash
+for batch in 1 2 4 8; do
+ ./nki-llama inference benchmark --batch-size $batch
+done
+```
+
+## 🎯 Next Steps
+
+After mastering inference optimization:
+1. Document your NKI kernel implementations
+2. Create performance comparison charts
+3. Consider adding fine-tuning (see [complete-pipeline.md](./complete-pipeline.md))
+4. Prepare reasoning benchmarks for additional scoring
+
+## 📖 Example Inference Session
+
+```bash
+# Complete workflow example
+tmux new -s hackathon-inference
+
+# Setup
+source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
+cd ~/nki-llama
+
+# Download model
+./nki-llama inference download
+
+# Run initial benchmark
+./nki-llama inference benchmark
+
+# Implement NKI optimizations
+nano src/llama.py # Add your NKI kernels
+
+# Re-benchmark with optimizations
+./nki-llama inference benchmark --clear-cache
+
+# Calculate score
+python src/handler.py --inference-results benchmark_inference.json --calculate-score
+
+# Start serving (optional)
+./nki-llama inference server
+```
+
+## 📚 Resources
+
+- [NKI Documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html)
+- [NKI Samples Repository](https://github.com/aws-neuron/nki-samples)
+- [NKI Autotune Tool](https://github.com/awslabs/nki-autotune)
+- [Neuron Profiler Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html)
+
+---
+
+Remember: Focus on implementing high-performance NKI kernels for critical operations. The key to success is identifying and optimizing the bottlenecks in your model's inference pipeline!
\ No newline at end of file
diff --git a/docs/self-attention.md b/docs/self-attention.md
new file mode 100644
index 0000000..67ac1b7
--- /dev/null
+++ b/docs/self-attention.md
@@ -0,0 +1,510 @@
+# Flash Self-Attention Kernel Optimizations Guide for NKI-LLAMA Hackathon
+
+## 🎯 Overview
+
+This guide focuses on working with the provided self-attention kernels and optimizing them further with the Neuron Kernel Interface (NKI) on AWS Inferentia/Trainium. It is a perfect starting place for teams who want to learn NKI and apply kernel optimizations without touching the training or inference components.
+
+### Instance Requirements
+- **Instance Type**: trn1.2xlarge (minimum) or trn1.32xlarge
+- **AMI**: Deep Learning AMI Neuron (Ubuntu 22.04) 20250520
+ - **us-east-1**: `ami-0e65a95c79775d1b6`
+ - **us-west-2**: `ami-0d0a2d26f80b645c2`
+- **Storage**: 256GB+ recommended (800GB default in CloudFormation)
+- **Neuron SDK**: 2.23.0
+
+### Environment Setup
+```bash
+# Activate the self-attention environment
+source /opt/aws_neuronx_venv_pytorch_2_6/bin/activate
+```
+
+## 🚀 Deployment
+
+Deploy the NKI-LLAMA self-attention environment using AWS CloudFormation with one click:
+
+| AWS Region | Launch CloudFormation Stack |
+|:-----------|:----------------------------|
+| us-east-1 (N. Virginia) | Launch stack |
+| us-west-2 (Oregon) | Launch stack |
+
+**Note:** Only us-east-1 and us-west-2 regions support Trainium (trn1) instances with the required Neuron AMIs.
+
+### Deployment Steps
+
+1. **Download the CloudFormation template**:
+ - Click here to download: [deployment.yaml](../deployment/deployment.yaml)
+
+2. **Click** on one of the CloudFormation Console links above for your preferred region.
+
+3. **Upload the template**:
+ - Choose **Upload a template file**
+ - Click **Choose file** and select the downloaded `deployment.yaml`
+ - Click **Next**
+
+4. **Configure the stack:**
+ - **Stack name**: Keep default or customize (e.g., `nki-llama-attention`)
+ - **KeyPairOption**: Choose `use-existing` (recommended - create key in EC2 console first)
+ - **ExistingKeyPairName**: Select your key from dropdown (see note below)
+   - **Ec2InstanceType**: Keep the default `trn1.32xlarge`, or switch to `trn1.2xlarge`
+ - Click **Next**
+
+   **Note**: For easy key download, first create a key pair in EC2 → Key Pairs → Create key pair, download it, then return here and select it from the dropdown.
+
+5. **Configure stack options**: Leave all values as default and click **Next**
+
+6. **Review and create:**
+ - Check the box: "I acknowledge that AWS CloudFormation might create IAM resources"
+ - Click **Create stack**
+ - Stack creation takes ~5-10 minutes
+
+7. **Access your instance:**
+   - Go to CloudFormation → Select your stack → **Outputs** tab
+   - Copy the **SSHCommand** value
+   - If you created a new key, download it from EC2 → Key Pairs
+   - Connect: `ssh -i <your-key.pem> ubuntu@<instance-public-ip>`
+
+### Post-Deployment Setup
+
+Once connected to your instance:
+
+```bash
+# Repository is pre-cloned
+cd ~/nki-llama
+
+# Install dependencies
+chmod +x install.sh
+./install.sh
+```
+
+## 🎮 Using the NKI-LLAMA CLI
+
+The repository includes a unified command-line interface that simplifies all operations. You can use either the CLI commands or run the scripts directly.
+
+### Option 1: Using NKI-LLAMA CLI (Recommended)
+
+```bash
+# Once connected to your instance
+cd ~/nki-llama
+
+# View all self-attention commands
+./nki-llama help
+
+# Run interactive setup wizard
+./nki-llama setup
+```
+
+**Self-Attention CLI Commands:**
+- `./nki-llama self-attention benchmark` - Run comprehensive benchmarks
+- `./nki-llama self-attention test` - Run all tests
+- `./nki-llama self-attention test forward` - Run forward pass tests only
+- `./nki-llama self-attention test backward` - Run backward pass tests only
+- `./nki-llama self-attention run