diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml index 41c8d9a6..cd84a896 100644 --- a/.github/workflows/build-and-run.yml +++ b/.github/workflows/build-and-run.yml @@ -5,48 +5,66 @@ on: branches: [ main ] pull_request: branches: [ main ] - types: [opened, synchronize, reopened] + types: [opened, synchronize, reopened] +env: + JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3 + TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm + LLAMA_ROOT: ${{ github.workspace }} + GRAAL_JARS: /opt/graalJars + MODELS_DIR: /opt/models jobs: - build-and-run: + code-quality: runs-on: self-hosted - - env: - JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3 - TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm - LLAMA_ROOT: ${{ github.workspace }} - + timeout-minutes: 30 + steps: - name: Checkout GPULlama3 uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Check code formatting (Spotless) run: | cd ${{ github.workspace }} - #./mvnw -T12C -Pspotless spotless:check - - - name: Clone TornadoVM explicitly + # ./mvnw -T12C -Pspotless spotless:check + + build-and-run: + runs-on: [self-hosted] + needs: code-quality + timeout-minutes: 30 + + strategy: + fail-fast: true + matrix: + backend: + - name: opencl + - name: ptx + + steps: + - name: Checkout GPULlama3 + uses: actions/checkout@v4 + + - name: Clone TornadoVM master run: | - git clone --depth 1 --branch develop \ + git clone --depth 1 --branch master \ https://github.com/beehive-lab/TornadoVM.git \ - GPULlama3.java/external/tornadovm + $TORNADO_ROOT - name: Set up Python venv for TornadoVM run: | - python3 -m venv GPULlama3.java/external/tornadovm/venv - source GPULlama3.java/external/tornadovm/venv/bin/activate + python3 -m venv $TORNADO_ROOT/venv + source $TORNADO_ROOT/venv/bin/activate python --version - name: Build TornadoVM run: | - set -x - cd GPULlama3.java/external/tornadovm + cd $TORNADO_ROOT + mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/ source venv/bin/activate echo "=== Building TornadoVM ===" - make + + make BACKEND=${{ matrix.backend.name }} + echo "=== Searching for TornadoVM SDK directory ===" - SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-opencl" | head -n 1) + SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ matrix.backend.name }}" | head -n 1) if [ -z "$SDK_DIR" ]; then echo "::error::Could not locate TornadoVM SDK directory!" 
find dist -maxdepth 5 -type d @@ -66,59 +84,80 @@ jobs: echo "=== Checking tornado CLI ===" which tornado || { echo "::error::tornado not in PATH"; exit 1; } tornado --devices - - name: Build GPULlama3 + - name: Build GPULlama3.java run: | - set -x cd ${{ github.workspace }} echo "Using TORNADO_SDK=$TORNADO_SDK" export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" - which tornado || { echo "::error::tornado unavailable during GPULlama3 build"; exit 1; } tornado --version - make - - test-models: - runs-on: self-hosted - needs: build-and-run - - strategy: - fail-fast: false - matrix: - model: - - /opt/models/DeepSeek-R1-Distill-Qwen-1.5B-F16.gguf - - /opt/models/DeepSeek-R1-Distill-Qwen-1.5B-Q8_0.gguf - - /opt/models/Llama-3.2-1B-Instruct-F16.gguf - - /opt/models/Llama-3.2-1B-Instruct-Q8_0.gguf - - /opt/models/Llama-3.2-3B-Instruct-F16.gguf - - /opt/models/Llama-3.2-3B-Instruct-Q8_0.gguf - - /opt/models/Mistral-7B-Instruct-v0.3.fp16.gguf - - /opt/models/Mistral-7B-Instruct-v0.3.Q8_0.gguf - - /opt/models/Phi-3-mini-4k-instruct-fp16.gguf - - /opt/models/Phi-3-mini-4k-instruct-Q8_0.gguf - - /opt/models/Qwen2.5-0.5B-Instruct-f16.gguf - - /opt/models/Qwen2.5-0.5B-Instruct-Q8_0.gguf - - /opt/models/qwen2.5-1.5b-instruct-fp16.gguf - - /opt/models/qwen2.5-1.5b-instruct-q8_0.gguf - - /opt/models/Qwen3-0.6B-f16.gguf - - /opt/models/Qwen3-0.6B-Q8_0.gguf - - /opt/models/Qwen3-4B-f16.gguf - - /opt/models/Qwen3-4B-Q8_0.gguf - - env: - JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3 - TORNADO_SDK: ${{ needs.build-and-run.outputs.tornado_sdk }} - - steps: - - name: Checkout GPULlama3 - uses: actions/checkout@v4 - - - name: Run inference for ${{ matrix.model }} + ./mvnw clean package -DskipTests + - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf run: | - set -x cd ${{ github.workspace }} - export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" - echo "Using Tornado SDK: $TORNADO_SDK" - - ./llama-tornado --gpu --opencl \ - --model "${{ matrix.model }}" \ + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Qwen3-4B-f16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Qwen3-4B-f16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.fp16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/qwen2.5-1.5b-instruct-fp16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model /$MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \ + --prompt "Say hello" + - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf + run: | + cd ${{ github.workspace 
}} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Qwen3-0.6B-Q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Phi-3-mini-4k-instruct-Q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/qwen2.5-1.5b-instruct-q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Mistral-7B-Instruct-v0.3.Q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \ --prompt "Say hello" diff --git a/.github/workflows/rerun-workflow.yml b/.github/workflows/rerun-workflow.yml new file mode 100644 index 00000000..a44eae98 --- /dev/null +++ b/.github/workflows/rerun-workflow.yml @@ -0,0 +1,149 @@ +name: Rerun Workflows + +on: + issue_comment: + types: [created] + +jobs: + rerun: + name: Rerun CI Workflows + # Only run on PR comments (not issue comments) with /rerun command + if: | + github.event.issue.pull_request && + contains(github.event.comment.body, '/rerun') + runs-on: ubuntu-latest + permissions: + actions: write + pull-requests: read + contents: read + + steps: + - name: Get PR SHA + id: pr + uses: actions/github-script@v7 + with: + script: | + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number + }); + core.setOutput('sha', pr.head.sha); + core.setOutput('head_ref', pr.head.ref); + console.log(`PR #${context.issue.number} SHA: ${pr.head.sha}`); + console.log(`PR head ref: ${pr.head.ref}`); + + - name: Add reaction to comment + uses: actions/github-script@v7 + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'rocket' + }); + + - name: Post start comment + uses: actions/github-script@v7 + with: + script: | + const comment = context.payload.comment.body; + const rerunMatch = comment.match(/\/rerun\s*(\S+)?/); + const rerunArg = rerunMatch && rerunMatch[1] ? 
rerunMatch[1] : 'failed'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `🚀 **Workflow rerun started**\n\nMode: \`${rerunArg}\`\nTriggered by: @${context.payload.comment.user.login}\n\n[View Actions](https://github.com/${context.repo.owner}/${context.repo.repo}/actions)` + }); + + - name: Rerun failed workflows + uses: actions/github-script@v7 + with: + script: | + const sha = '${{ steps.pr.outputs.sha }}'; + const headRef = '${{ steps.pr.outputs.head_ref }}'; + + // Get all workflow runs for this PR's head SHA + const { data: runs } = await github.rest.actions.listWorkflowRunsForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + head_sha: sha, + per_page: 100 + }); + + console.log(`Found ${runs.total_count} workflow runs for SHA ${sha}`); + + if (runs.total_count === 0) { + console.log('No workflow runs found for this PR'); + return; + } + + // Parse command for specific workflow filter + // Supports: /rerun, /rerun all, /rerun failed, /rerun + const comment = context.payload.comment.body; + const rerunMatch = comment.match(/\/rerun\s*(\S+)?/); + const rerunArg = rerunMatch && rerunMatch[1] ? rerunMatch[1].toLowerCase() : 'failed'; + + console.log(`Rerun mode: ${rerunArg}`); + + let rerunCount = 0; + + for (const run of runs.workflow_runs) { + const shouldRerun = + rerunArg === 'all' || + (rerunArg === 'failed' && ['failure', 'cancelled', 'timed_out'].includes(run.conclusion)) || + run.name.toLowerCase().includes(rerunArg); + + if (!shouldRerun) { + console.log(`Skipping ${run.name} (status: ${run.status}, conclusion: ${run.conclusion})`); + continue; + } + + // Only rerun completed workflows + if (run.status !== 'completed') { + console.log(`Skipping ${run.name} - still ${run.status}`); + continue; + } + + try { + console.log(`Rerunning workflow: ${run.name} (ID: ${run.id})`); + + // Use rerun-failed-jobs if available and workflow failed, otherwise full rerun + if (['failure', 'cancelled', 'timed_out'].includes(run.conclusion)) { + await github.rest.actions.reRunWorkflowFailedJobs({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + } else { + await github.rest.actions.reRunWorkflow({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + } + rerunCount++; + } catch (error) { + console.log(`Failed to rerun ${run.name}: ${error.message}`); + } + } + + console.log(`Reran ${rerunCount} workflow(s)`); + + - name: Post completion comment + if: always() + uses: actions/github-script@v7 + with: + script: | + const status = '${{ job.status }}'; + const emoji = status === 'success' ? 
'✅' : '❌'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `${emoji} **Workflow rerun ${status}**\n\n[View Actions](https://github.com/${context.repo.owner}/${context.repo.repo}/actions)` + }); diff --git a/README.md b/README.md index 6b8e8167..585be1e5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# GPULlama3.java powered by TornadoVM +# GPULlama3.java powered by TornadoVM [![GPULlama3 Build & Run Inference](https://github.com/beehive-lab/GPULlama3.java/actions/workflows/build-and-run.yml/badge.svg)](https://github.com/beehive-lab/GPULlama3.java/actions/workflows/build-and-run.yml) ![Java Version](https://img.shields.io/badge/java-21+-blue?style=for-the-badge&logo=openjdk) ![OpenCL](https://img.shields.io/badge/OpenCL-supported-blue?style=for-the-badge&logo=khronos) ![CUDA](https://img.shields.io/badge/CUDA/PTX-supported-76B900?style=for-the-badge&logo=nvidia) @@ -99,7 +99,6 @@ Ensure you have the following installed and configured: - **Java 21**: Required for Vector API support & TornadoVM. - [TornadoVM](https://github.com/beehive-lab/TornadoVM) with OpenCL or PTX backends. -- [Maven](https://maven.apache.org/): For building the Java project. ### Install, Build, and Run @@ -264,82 +263,8 @@ Check models below. ## Download Model Files -Download `FP16` quantized `Llama-3` .gguf files from: -- https://huggingface.co/beehive-lab/Llama-3.2-1B-Instruct-GGUF-FP16 -- https://huggingface.co/beehive-lab/Llama-3.2-3B-Instruct-GGUF-FP16 -- https://huggingface.co/beehive-lab/Llama-3.2-8B-Instruct-GGUF-FP16 - -Download `FP16` quantized `Mistral` .gguf files from: -- https://huggingface.co/collections/beehive-lab/mistral-gpullama3java-684afabb206136d2e9cd47e0 - -Download `FP16` quantized `Qwen3` .gguf files from: -- https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF -- https://huggingface.co/ggml-org/Qwen3-1.7B-GGUF -- https://huggingface.co/ggml-org/Qwen3-4B-GGUF -- https://huggingface.co/ggml-org/Qwen3-8B-GGUF - -Download `FP16` quantized `Qwen2.5` .gguf files from: -- https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF -- https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF - -Download `FP16` quantized `DeepSeek-R1-Distill-Qwen` .gguf files from: -- https://huggingface.co/hdnh2006/DeepSeek-R1-Distill-Qwen-1.5B-GGUF - -Please be gentle with [huggingface.co](https://huggingface.co) servers: - -**Note** FP16 models are first-class citizens for the current version. 
-``` -# Llama 3.2 (1B) - FP16 -wget https://huggingface.co/beehive-lab/Llama-3.2-1B-Instruct-GGUF-FP16/resolve/main/beehive-llama-3.2-1b-instruct-fp16.gguf - -# Llama 3.2 (3B) - FP16 -wget https://huggingface.co/beehive-lab/Llama-3.2-3B-Instruct-GGUF-FP16/resolve/main/beehive-llama-3.2-3b-instruct-fp16.gguf - -# Llama 3 (8B) - FP16 -wget https://huggingface.co/beehive-lab/Llama-3.2-8B-Instruct-GGUF-FP16/resolve/main/beehive-llama-3.2-8b-instruct-fp16.gguf - -# Mistral (7B) - FP16 -wget https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.fp16.gguf - -# Qwen3 (0.6B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-f16.gguf - -# Qwen3 (1.7B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-1.7B-f16.gguf - -# Qwen3 (4B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-4B-f16.gguf - -# Qwen3 (8B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-8B-f16.gguf - -# Phi-3-mini-4k - FP16 -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf - -# Qwen2.5 (0.5B) -wget https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/Qwen2.5-0.5B-Instruct-f16.gguf - -# Qwen2.5 (1.5B) -wget https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-fp16.gguf - -# DeepSeek-R1-Distill-Qwen (1.5B) -wget https://huggingface.co/hdnh2006/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-F16.gguf -``` - -**[Experimental]** you can download the Q8 and Q4 used in the original implementation of Llama3.java, but for now are going to be dequanted to FP16 for TornadoVM support: -``` -# Llama 3.2 (1B) - Q4_0 -curl -L -O https://huggingface.co/mukel/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -# Llama 3.2 (3B) - Q4_0 -curl -L -O https://huggingface.co/mukel/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf -# Llama 3 (8B) - Q4_0 -curl -L -O https://huggingface.co/mukel/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf -# Llama 3.2 (1B) - Q8_0 -curl -L -O https://huggingface.co/mukel/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf -# Llama 3.1 (8B) - Q8_0 -curl -L -O https://huggingface.co/mukel/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_0.gguf -``` - +We provide a collection of models that we have tested on [Hugging Face](https://huggingface.co/beehive-lab/collections). +However, any Llama 3, Mistral, Qwen2, Qwen3, or Phi-3 model in GGUF format can be used with **GPULlama3.java**. ----------- ## Running `llama-tornado` @@ -387,146 +312,12 @@ docker run --rm -it --gpus all \ --model /data/Llama-3.2-1B-Instruct.FP16.gguf \ --prompt "Tell me a joke" ``` ------------ - -## Troubleshooting GPU Memory Issues - -### Out of Memory Error - -You may encounter an out-of-memory error like: -``` -Exception in thread "main" uk.ac.manchester.tornado.api.exceptions.TornadoOutOfMemoryException: Unable to allocate 100663320 bytes of memory. -To increase the maximum device memory, use -Dtornado.device.memory=GB -``` - -This indicates that the default GPU memory allocation (7GB) is insufficient for your model. - -### Solution - -First, check your GPU specifications.
If your GPU has high memory capacity, you can increase the GPU memory allocation using the `--gpu-memory` flag: - -```bash -# For 3B models, try increasing to 15GB -./llama-tornado --gpu --model beehive-llama-3.2-3b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 15GB - -# For 8B models, you may need even more (20GB or higher) -./llama-tornado --gpu --model beehive-llama-3.2-8b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 20GB -``` - -### GPU Memory Requirements by Model Size - -| Model Size | Recommended GPU Memory | -|-------------|------------------------| -| 1B models | 7GB (default) | -| 3-7B models | 15GB | -| 8B models | 20GB+ | - -**Note**: If you still encounter memory issues, try: - -1. Using Q4_0 instead of Q8_0 quantization (requires less memory). -2. Closing other GPU-intensive applications in your system. ----------- -## Command Line Options - -Supported command-line options include: - -```bash -cmd ➜ llama-tornado --help -usage: llama-tornado [-h] --model MODEL_PATH [--prompt PROMPT] [-sp SYSTEM_PROMPT] [--temperature TEMPERATURE] [--top-p TOP_P] [--seed SEED] [-n MAX_TOKENS] - [--stream STREAM] [--echo ECHO] [-i] [--instruct] [--gpu] [--opencl] [--ptx] [--gpu-memory GPU_MEMORY] [--heap-min HEAP_MIN] [--heap-max HEAP_MAX] - [--debug] [--profiler] [--profiler-dump-dir PROFILER_DUMP_DIR] [--print-bytecodes] [--print-threads] [--print-kernel] [--full-dump] - [--show-command] [--execute-after-show] [--opencl-flags OPENCL_FLAGS] [--max-wait-events MAX_WAIT_EVENTS] [--verbose] - -GPU-accelerated LLaMA.java model runner using TornadoVM - -options: - -h, --help show this help message and exit - --model MODEL_PATH Path to the LLaMA model file (e.g., beehive-llama-3.2-8b-instruct-fp16.gguf) (default: None) - -LLaMA Configuration: - --prompt PROMPT Input prompt for the model (default: None) - -sp SYSTEM_PROMPT, --system-prompt SYSTEM_PROMPT - System prompt for the model (default: None) - --temperature TEMPERATURE - Sampling temperature (0.0 to 2.0) (default: 0.1) - --top-p TOP_P Top-p sampling parameter (default: 0.95) - --seed SEED Random seed (default: current timestamp) (default: None) - -n MAX_TOKENS, --max-tokens MAX_TOKENS - Maximum number of tokens to generate (default: 512) - --stream STREAM Enable streaming output (default: True) - --echo ECHO Echo the input prompt (default: False) - --suffix SUFFIX Suffix for fill-in-the-middle request (Codestral) (default: None) - -Mode Selection: - -i, --interactive Run in interactive/chat mode (default: False) - --instruct Run in instruction mode (default) (default: True) - -Hardware Configuration: - --gpu Enable GPU acceleration (default: False) - --opencl Use OpenCL backend (default) (default: None) - --ptx Use PTX/CUDA backend (default: None) - --gpu-memory GPU_MEMORY - GPU memory allocation (default: 7GB) - --heap-min HEAP_MIN Minimum JVM heap size (default: 20g) - --heap-max HEAP_MAX Maximum JVM heap size (default: 20g) - -Debug and Profiling: - --debug Enable debug output (default: False) - --profiler Enable TornadoVM profiler (default: False) - --profiler-dump-dir PROFILER_DUMP_DIR - Directory for profiler output (default: /home/mikepapadim/repos/gpu-llama3.java/prof.json) - -TornadoVM Execution Verbose: - --print-bytecodes Print bytecodes (tornado.print.bytecodes=true) (default: False) - --print-threads Print thread information (tornado.threadInfo=true) (default: False) - --print-kernel Print kernel information (tornado.printKernel=true) (default: False) - --full-dump Enable full debug dump 
(tornado.fullDebug=true) (default: False) - --verbose-init Enable timers for TornadoVM initialization (llama.EnableTimingForTornadoVMInit=true) (default: False) - -Command Display Options: - --show-command Display the full Java command that will be executed (default: False) - --execute-after-show Execute the command after showing it (use with --show-command) (default: False) - -Advanced Options: - --opencl-flags OPENCL_FLAGS - OpenCL compiler flags (default: -cl-denorms-are-zero -cl-no-signed-zeros -cl-finite-math-only) - --max-wait-events MAX_WAIT_EVENTS - Maximum wait events for TornadoVM event pool (default: 32000) - --verbose, -v Verbose output (default: False) - -``` - -## Debug & Profiling Options -View TornadoVM's internal behavior: -```bash -# Print thread information during execution -./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads - -# Show bytecode compilation details -./llama-tornado --gpu --model model.gguf --prompt "..." --print-bytecodes - -# Display generated GPU kernel code -./llama-tornado --gpu --model model.gguf --prompt "..." --print-kernel - -# Enable full debug output with all details -./llama-tornado --gpu --model model.gguf --prompt "..." --debug --full-dump - -# Combine debug options -./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads --print-bytecodes --print-kernel -``` - -## Current Features & Roadmap +## Miscellaneous - - **Support for GGUF format models** with full FP16 and partial support for Q8_0 and Q4_0 quantization. - - **Instruction-following and chat modes** for various use cases. - - **Interactive CLI** with `--interactive` and `--instruct` modes. - - **Flexible backend switching** - choose OpenCL or PTX at runtime (need to build TornadoVM with both enabled). - - **Cross-platform compatibility**: - - ✅ NVIDIA GPUs (OpenCL & PTX ) - - ✅ Intel GPUs (OpenCL) - - ✅ Apple GPUs (OpenCL) +Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/RUN_DEBUG.md) for more tips on running and debugging, as well as how to use the `./llama-tornado` CLI to run models with different flags. Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/TORNADOVM_TRANSFORMER_OPTIMIZATIONS.md) to view a more detailed list of the transformer optimizations implemented in TornadoVM. diff --git a/docs/ChatGPT Image Apr 27, 2025, 02_45_40 PM.png b/docs/ChatGPT Image Apr 27, 2025, 02_45_40 PM.png deleted file mode 100644 index 467f3316..00000000 Binary files a/docs/ChatGPT Image Apr 27, 2025, 02_45_40 PM.png and /dev/null differ diff --git a/docs/RUN_DEBUG.md b/docs/RUN_DEBUG.md new file mode 100644 index 00000000..c5ae967f --- /dev/null +++ b/docs/RUN_DEBUG.md @@ -0,0 +1,125 @@ +## Troubleshooting GPU Memory Issues + +### Out of Memory Error + +You may encounter an out-of-memory error like: +``` +Exception in thread "main" uk.ac.manchester.tornado.api.exceptions.TornadoOutOfMemoryException: Unable to allocate 100663320 bytes of memory. +To increase the maximum device memory, use -Dtornado.device.memory=GB +``` + +This indicates that the default GPU memory allocation (7GB) is insufficient for your model. + +### Solution + +First, check your GPU specifications.
If your GPU has high memory capacity, you can increase the GPU memory allocation using the `--gpu-memory` flag: + +```bash +# For 3B models, try increasing to 15GB +./llama-tornado --gpu --model beehive-llama-3.2-3b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 15GB + +# For 8B models, you may need even more (20GB or higher) +./llama-tornado --gpu --model beehive-llama-3.2-8b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 20GB +``` + +### GPU Memory Requirements by Model Size + +| Model Size | Recommended GPU Memory | +|-------------|------------------------| +| 1B models | 7GB (default) | +| 3-7B models | 15GB | +| 8B models | 20GB+ | + +**Note**: If you still encounter memory issues, try: + +1. Using Q4_0 instead of Q8_0 quantization (requires less memory). +2. Closing other GPU-intensive applications in your system. + +## Command Line Options + +Supported command-line options include: + +```bash +cmd ➜ llama-tornado --help +usage: llama-tornado [-h] --model MODEL_PATH [--prompt PROMPT] [-sp SYSTEM_PROMPT] [--temperature TEMPERATURE] [--top-p TOP_P] [--seed SEED] [-n MAX_TOKENS] + [--stream STREAM] [--echo ECHO] [-i] [--instruct] [--gpu] [--opencl] [--ptx] [--gpu-memory GPU_MEMORY] [--heap-min HEAP_MIN] [--heap-max HEAP_MAX] + [--debug] [--profiler] [--profiler-dump-dir PROFILER_DUMP_DIR] [--print-bytecodes] [--print-threads] [--print-kernel] [--full-dump] + [--show-command] [--execute-after-show] [--opencl-flags OPENCL_FLAGS] [--max-wait-events MAX_WAIT_EVENTS] [--verbose] + +GPU-accelerated LLaMA.java model runner using TornadoVM + +options: + -h, --help show this help message and exit + --model MODEL_PATH Path to the LLaMA model file (e.g., beehive-llama-3.2-8b-instruct-fp16.gguf) (default: None) + +LLaMA Configuration: + --prompt PROMPT Input prompt for the model (default: None) + -sp SYSTEM_PROMPT, --system-prompt SYSTEM_PROMPT + System prompt for the model (default: None) + --temperature TEMPERATURE + Sampling temperature (0.0 to 2.0) (default: 0.1) + --top-p TOP_P Top-p sampling parameter (default: 0.95) + --seed SEED Random seed (default: current timestamp) (default: None) + -n MAX_TOKENS, --max-tokens MAX_TOKENS + Maximum number of tokens to generate (default: 512) + --stream STREAM Enable streaming output (default: True) + --echo ECHO Echo the input prompt (default: False) + --suffix SUFFIX Suffix for fill-in-the-middle request (Codestral) (default: None) + +Mode Selection: + -i, --interactive Run in interactive/chat mode (default: False) + --instruct Run in instruction mode (default) (default: True) + +Hardware Configuration: + --gpu Enable GPU acceleration (default: False) + --opencl Use OpenCL backend (default) (default: None) + --ptx Use PTX/CUDA backend (default: None) + --gpu-memory GPU_MEMORY + GPU memory allocation (default: 7GB) + --heap-min HEAP_MIN Minimum JVM heap size (default: 20g) + --heap-max HEAP_MAX Maximum JVM heap size (default: 20g) + +Debug and Profiling: + --debug Enable debug output (default: False) + --profiler Enable TornadoVM profiler (default: False) + --profiler-dump-dir PROFILER_DUMP_DIR + Directory for profiler output (default: /home/mikepapadim/repos/gpu-llama3.java/prof.json) + +TornadoVM Execution Verbose: + --print-bytecodes Print bytecodes (tornado.print.bytecodes=true) (default: False) + --print-threads Print thread information (tornado.threadInfo=true) (default: False) + --print-kernel Print kernel information (tornado.printKernel=true) (default: False) + --full-dump Enable full debug dump (tornado.fullDebug=true) 
(default: False) + --verbose-init Enable timers for TornadoVM initialization (llama.EnableTimingForTornadoVMInit=true) (default: False) + +Command Display Options: + --show-command Display the full Java command that will be executed (default: False) + --execute-after-show Execute the command after showing it (use with --show-command) (default: False) + +Advanced Options: + --opencl-flags OPENCL_FLAGS + OpenCL compiler flags (default: -cl-denorms-are-zero -cl-no-signed-zeros -cl-finite-math-only) + --max-wait-events MAX_WAIT_EVENTS + Maximum wait events for TornadoVM event pool (default: 32000) + --verbose, -v Verbose output (default: False) + +``` + +## Debug & Profiling Options +View TornadoVM's internal behavior: +```bash +# Print thread information during execution +./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads + +# Show bytecode compilation details +./llama-tornado --gpu --model model.gguf --prompt "..." --print-bytecodes + +# Display generated GPU kernel code +./llama-tornado --gpu --model model.gguf --prompt "..." --print-kernel + +# Enable full debug output with all details +./llama-tornado --gpu --model model.gguf --prompt "..." --debug --full-dump + +# Combine debug options +./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads --print-bytecodes --print-kernel +``` \ No newline at end of file diff --git a/docs/diagrams/gpullama3-architecture-light.svg b/docs/diagrams/gpullama3-architecture-light.svg new file mode 100644 index 00000000..700f33b2 --- /dev/null +++ b/docs/diagrams/gpullama3-architecture-light.svg @@ -0,0 +1,195 @@ [SVG markup not reproduced here: "GPULlama3.java Architecture" diagram (light variant) — ☕ Your Java Application (Quarkus • Spring Boot • Micronaut • Any JVM App) → 🔗 LangChain4j / Quarkus-LangChain4j (GPULlama3ChatModel • StreamingChatModel • AI Services • Agents • RAG) → ⚡ GPULlama3.java Engine (Transformer • Multi-Head Attention • RoPE • RMSNorm • SwiGLU FFN • KV-Cache) → 🌪️ TornadoVM Runtime (JIT Compilation • Task Graphs • Automatic Parallelization • Memory Management) → OpenCL / PTX / CUDA / Metal (WIP) → 🔷 Intel Arc / Iris • 🟢 NVIDIA GPUs • Apple Silicon — "100% Pure Java"] diff --git a/docs/diagrams/gpullama3-architecture.svg b/docs/diagrams/gpullama3-architecture.svg new file mode 100644 index 00000000..b2ce0370 --- /dev/null +++ b/docs/diagrams/gpullama3-architecture.svg @@ -0,0 +1,199 @@ [SVG markup not reproduced here: the same "GPULlama3.java Architecture" diagram with identical labels to the light variant above] diff --git a/llama-tornado b/llama-tornado index b59473f2..9c0d6ba8 100755 --- a/llama-tornado +++ b/llama-tornado @@ -410,7 +410,7 @@ def create_parser() -> argparse.ArgumentParser: const=Backend.PTX, help="Use PTX/CUDA backend", ) - hw_group.add_argument("--gpu-memory", default="7GB", help="GPU memory allocation") + hw_group.add_argument("--gpu-memory", default="14GB", help="GPU memory allocation") hw_group.add_argument("--heap-min", default="20g", help="Minimum JVM heap size") hw_group.add_argument("--heap-max", default="20g", help="Maximum JVM heap size") diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java b/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java index 9f1c335a..75f9f531 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java @@ -156,12 +156,12 @@ TaskGraph setupSinglePhi3FFNLayer(Phi3TornadoWeights weights, int layerIndex) { unifiedLayer.consumeFromDevice(phi3State.wrapX); unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION, // Copy-in weights per layer for batched-layered layout - weights.rms_att_weightLayered[layerIndex], - weights.wqkvLayered[layerIndex], - weights.woLayered[layerIndex], - weights.rms_ffn_weightLayered[layerIndex], - weights.wUpLayered[layerIndex], - weights.wDownLayered[layerIndex] + weights.rms_att_weightLayered[layerIndex].asFloatArray(), + weights.wqkvLayered[layerIndex].asHalfFloatArray(), + weights.woLayered[layerIndex].asHalfFloatArray(), + weights.rms_ffn_weightLayered[layerIndex].asFloatArray(), + weights.wUpLayered[layerIndex].asHalfFloatArray(), + weights.wDownLayered[layerIndex].asHalfFloatArray() ); unifiedLayer = configureLayerDataTransfers(unifiedLayer, layerIndex);