diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml index 41c8d9a6..cd84a896 100644 --- a/.github/workflows/build-and-run.yml +++ b/.github/workflows/build-and-run.yml @@ -5,48 +5,66 @@ on: branches: [ main ] pull_request: branches: [ main ] - types: [opened, synchronize, reopened] + types: [opened, synchronize, reopened] +env: + JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3 + TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm + LLAMA_ROOT: ${{ github.workspace }} + GRAAL_JARS: /opt/graalJars + MODELS_DIR: /opt/models jobs: - build-and-run: + code-quality: runs-on: self-hosted - - env: - JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3 - TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm - LLAMA_ROOT: ${{ github.workspace }} - + timeout-minutes: 30 + steps: - name: Checkout GPULlama3 uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Check code formatting (Spotless) run: | cd ${{ github.workspace }} - #./mvnw -T12C -Pspotless spotless:check - - - name: Clone TornadoVM explicitly + # ./mvnw -T12C -Pspotless spotless:check + + build-and-run: + runs-on: [self-hosted] + needs: code-quality + timeout-minutes: 30 + + strategy: + fail-fast: true + matrix: + backend: + - name: opencl + - name: ptx + + steps: + - name: Checkout GPULlama3 + uses: actions/checkout@v4 + + - name: Clone TornadoVM master run: | - git clone --depth 1 --branch develop \ + git clone --depth 1 --branch master \ https://github.com/beehive-lab/TornadoVM.git \ - GPULlama3.java/external/tornadovm + $TORNADO_ROOT - name: Set up Python venv for TornadoVM run: | - python3 -m venv GPULlama3.java/external/tornadovm/venv - source GPULlama3.java/external/tornadovm/venv/bin/activate + python3 -m venv $TORNADO_ROOT/venv + source $TORNADO_ROOT/venv/bin/activate python --version - name: Build TornadoVM run: | - set -x - cd GPULlama3.java/external/tornadovm + cd $TORNADO_ROOT + mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/ source venv/bin/activate echo "=== Building TornadoVM ===" - make + + make BACKEND=${{ matrix.backend.name }} + echo "=== Searching for TornadoVM SDK directory ===" - SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-opencl" | head -n 1) + SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ matrix.backend.name }}" | head -n 1) if [ -z "$SDK_DIR" ]; then echo "::error::Could not locate TornadoVM SDK directory!" 
find dist -maxdepth 5 -type d @@ -66,59 +84,80 @@ jobs: echo "=== Checking tornado CLI ===" which tornado || { echo "::error::tornado not in PATH"; exit 1; } tornado --devices - - name: Build GPULlama3 + - name: Build GPULlama3.java run: | - set -x cd ${{ github.workspace }} echo "Using TORNADO_SDK=$TORNADO_SDK" export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" - which tornado || { echo "::error::tornado unavailable during GPULlama3 build"; exit 1; } tornado --version - make - - test-models: - runs-on: self-hosted - needs: build-and-run - - strategy: - fail-fast: false - matrix: - model: - - /opt/models/DeepSeek-R1-Distill-Qwen-1.5B-F16.gguf - - /opt/models/DeepSeek-R1-Distill-Qwen-1.5B-Q8_0.gguf - - /opt/models/Llama-3.2-1B-Instruct-F16.gguf - - /opt/models/Llama-3.2-1B-Instruct-Q8_0.gguf - - /opt/models/Llama-3.2-3B-Instruct-F16.gguf - - /opt/models/Llama-3.2-3B-Instruct-Q8_0.gguf - - /opt/models/Mistral-7B-Instruct-v0.3.fp16.gguf - - /opt/models/Mistral-7B-Instruct-v0.3.Q8_0.gguf - - /opt/models/Phi-3-mini-4k-instruct-fp16.gguf - - /opt/models/Phi-3-mini-4k-instruct-Q8_0.gguf - - /opt/models/Qwen2.5-0.5B-Instruct-f16.gguf - - /opt/models/Qwen2.5-0.5B-Instruct-Q8_0.gguf - - /opt/models/qwen2.5-1.5b-instruct-fp16.gguf - - /opt/models/qwen2.5-1.5b-instruct-q8_0.gguf - - /opt/models/Qwen3-0.6B-f16.gguf - - /opt/models/Qwen3-0.6B-Q8_0.gguf - - /opt/models/Qwen3-4B-f16.gguf - - /opt/models/Qwen3-4B-Q8_0.gguf - - env: - JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3 - TORNADO_SDK: ${{ needs.build-and-run.outputs.tornado_sdk }} - - steps: - - name: Checkout GPULlama3 - uses: actions/checkout@v4 - - - name: Run inference for ${{ matrix.model }} + ./mvnw clean package -DskipTests + - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf run: | - set -x cd ${{ github.workspace }} - export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" - echo "Using Tornado SDK: $TORNADO_SDK" - - ./llama-tornado --gpu --opencl \ - --model "${{ matrix.model }}" \ + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Qwen3-4B-f16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Qwen3-4B-f16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.fp16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/qwen2.5-1.5b-instruct-fp16.gguf \ + --prompt "Say hello" + - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model /$MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \ + --prompt "Say hello" + - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf + run: | + cd ${{ github.workspace 
}} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Qwen3-0.6B-Q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Phi-3-mini-4k-instruct-Q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/qwen2.5-1.5b-instruct-q8_0.gguf \ + --prompt "Say hello" + - name: Q8 - Mistral-7B-Instruct-v0.3.Q8_0.gguf + run: | + cd ${{ github.workspace }} + export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH" + ./llama-tornado --gpu --${{ matrix.backend.name }} \ + --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \ --prompt "Say hello" diff --git a/.github/workflows/rerun-workflow.yml b/.github/workflows/rerun-workflow.yml new file mode 100644 index 00000000..a44eae98 --- /dev/null +++ b/.github/workflows/rerun-workflow.yml @@ -0,0 +1,149 @@ +name: Rerun Workflows + +on: + issue_comment: + types: [created] + +jobs: + rerun: + name: Rerun CI Workflows + # Only run on PR comments (not issue comments) with /rerun command + if: | + github.event.issue.pull_request && + contains(github.event.comment.body, '/rerun') + runs-on: ubuntu-latest + permissions: + actions: write + pull-requests: read + contents: read + + steps: + - name: Get PR SHA + id: pr + uses: actions/github-script@v7 + with: + script: | + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number + }); + core.setOutput('sha', pr.head.sha); + core.setOutput('head_ref', pr.head.ref); + console.log(`PR #${context.issue.number} SHA: ${pr.head.sha}`); + console.log(`PR head ref: ${pr.head.ref}`); + + - name: Add reaction to comment + uses: actions/github-script@v7 + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'rocket' + }); + + - name: Post start comment + uses: actions/github-script@v7 + with: + script: | + const comment = context.payload.comment.body; + const rerunMatch = comment.match(/\/rerun\s*(\S+)?/); + const rerunArg = rerunMatch && rerunMatch[1] ? 
rerunMatch[1] : 'failed'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `🚀 **Workflow rerun started**\n\nMode: \`${rerunArg}\`\nTriggered by: @${context.payload.comment.user.login}\n\n[View Actions](https://github.com/${context.repo.owner}/${context.repo.repo}/actions)` + }); + + - name: Rerun failed workflows + uses: actions/github-script@v7 + with: + script: | + const sha = '${{ steps.pr.outputs.sha }}'; + const headRef = '${{ steps.pr.outputs.head_ref }}'; + + // Get all workflow runs for this PR's head SHA + const { data: runs } = await github.rest.actions.listWorkflowRunsForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + head_sha: sha, + per_page: 100 + }); + + console.log(`Found ${runs.total_count} workflow runs for SHA ${sha}`); + + if (runs.total_count === 0) { + console.log('No workflow runs found for this PR'); + return; + } + + // Parse command for specific workflow filter + // Supports: /rerun, /rerun all, /rerun failed, /rerun + const comment = context.payload.comment.body; + const rerunMatch = comment.match(/\/rerun\s*(\S+)?/); + const rerunArg = rerunMatch && rerunMatch[1] ? rerunMatch[1].toLowerCase() : 'failed'; + + console.log(`Rerun mode: ${rerunArg}`); + + let rerunCount = 0; + + for (const run of runs.workflow_runs) { + const shouldRerun = + rerunArg === 'all' || + (rerunArg === 'failed' && ['failure', 'cancelled', 'timed_out'].includes(run.conclusion)) || + run.name.toLowerCase().includes(rerunArg); + + if (!shouldRerun) { + console.log(`Skipping ${run.name} (status: ${run.status}, conclusion: ${run.conclusion})`); + continue; + } + + // Only rerun completed workflows + if (run.status !== 'completed') { + console.log(`Skipping ${run.name} - still ${run.status}`); + continue; + } + + try { + console.log(`Rerunning workflow: ${run.name} (ID: ${run.id})`); + + // Use rerun-failed-jobs if available and workflow failed, otherwise full rerun + if (['failure', 'cancelled', 'timed_out'].includes(run.conclusion)) { + await github.rest.actions.reRunWorkflowFailedJobs({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + } else { + await github.rest.actions.reRunWorkflow({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + } + rerunCount++; + } catch (error) { + console.log(`Failed to rerun ${run.name}: ${error.message}`); + } + } + + console.log(`Reran ${rerunCount} workflow(s)`); + + - name: Post completion comment + if: always() + uses: actions/github-script@v7 + with: + script: | + const status = '${{ job.status }}'; + const emoji = status === 'success' ? 
'✅' : '❌'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `${emoji} **Workflow rerun ${status}**\n\n[View Actions](https://github.com/${context.repo.owner}/${context.repo.repo}/actions)` + }); diff --git a/README.md b/README.md index 6b8e8167..585be1e5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# GPULlama3.java powered by TornadoVM +# GPULlama3.java powered by TornadoVM [![GPULlama3 Build & Run Inference](https://github.com/beehive-lab/GPULlama3.java/actions/workflows/build-and-run.yml/badge.svg)](https://github.com/beehive-lab/GPULlama3.java/actions/workflows/build-and-run.yml) ![Java Version](https://img.shields.io/badge/java-21+-blue?style=for-the-badge&logo=openjdk) ![OpenCL](https://img.shields.io/badge/OpenCL-supported-blue?style=for-the-badge&logo=khronos) ![CUDA](https://img.shields.io/badge/CUDA/PTX-supported-76B900?style=for-the-badge&logo=nvidia) @@ -99,7 +99,6 @@ Ensure you have the following installed and configured: - **Java 21**: Required for Vector API support & TornadoVM. - [TornadoVM](https://github.com/beehive-lab/TornadoVM) with OpenCL or PTX backends. -- [Maven](https://maven.apache.org/): For building the Java project. ### Install, Build, and Run @@ -264,82 +263,8 @@ Check models below. ## Download Model Files -Download `FP16` quantized `Llama-3` .gguf files from: -- https://huggingface.co/beehive-lab/Llama-3.2-1B-Instruct-GGUF-FP16 -- https://huggingface.co/beehive-lab/Llama-3.2-3B-Instruct-GGUF-FP16 -- https://huggingface.co/beehive-lab/Llama-3.2-8B-Instruct-GGUF-FP16 - -Download `FP16` quantized `Mistral` .gguf files from: -- https://huggingface.co/collections/beehive-lab/mistral-gpullama3java-684afabb206136d2e9cd47e0 - -Download `FP16` quantized `Qwen3` .gguf files from: -- https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF -- https://huggingface.co/ggml-org/Qwen3-1.7B-GGUF -- https://huggingface.co/ggml-org/Qwen3-4B-GGUF -- https://huggingface.co/ggml-org/Qwen3-8B-GGUF - -Download `FP16` quantized `Qwen2.5` .gguf files from: -- https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF -- https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF - -Download `FP16` quantized `DeepSeek-R1-Distill-Qwen` .gguf files from: -- https://huggingface.co/hdnh2006/DeepSeek-R1-Distill-Qwen-1.5B-GGUF - -Please be gentle with [huggingface.co](https://huggingface.co) servers: - -**Note** FP16 models are first-class citizens for the current version. 
-``` -# Llama 3.2 (1B) - FP16 -wget https://huggingface.co/beehive-lab/Llama-3.2-1B-Instruct-GGUF-FP16/resolve/main/beehive-llama-3.2-1b-instruct-fp16.gguf - -# Llama 3.2 (3B) - FP16 -wget https://huggingface.co/beehive-lab/Llama-3.2-3B-Instruct-GGUF-FP16/resolve/main/beehive-llama-3.2-3b-instruct-fp16.gguf - -# Llama 3 (8B) - FP16 -wget https://huggingface.co/beehive-lab/Llama-3.2-8B-Instruct-GGUF-FP16/resolve/main/beehive-llama-3.2-8b-instruct-fp16.gguf - -# Mistral (7B) - FP16 -wget https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.fp16.gguf - -# Qwen3 (0.6B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-f16.gguf - -# Qwen3 (1.7B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-1.7B-f16.gguf - -# Qwen3 (4B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-4B-f16.gguf - -# Qwen3 (8B) - FP16 -wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-8B-f16.gguf - -# Phi-3-mini-4k - FP16 -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf - -# Qwen2.5 (0.5B) -wget https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/Qwen2.5-0.5B-Instruct-f16.gguf - -# Qwen2.5 (1.5B) -wget https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-fp16.gguf - -# DeepSeek-R1-Distill-Qwen (1.5B) -wget https://huggingface.co/hdnh2006/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-F16.gguf -``` - -**[Experimental]** you can download the Q8 and Q4 used in the original implementation of Llama3.java, but for now are going to be dequanted to FP16 for TornadoVM support: -``` -# Llama 3.2 (1B) - Q4_0 -curl -L -O https://huggingface.co/mukel/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -# Llama 3.2 (3B) - Q4_0 -curl -L -O https://huggingface.co/mukel/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf -# Llama 3 (8B) - Q4_0 -curl -L -O https://huggingface.co/mukel/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf -# Llama 3.2 (1B) - Q8_0 -curl -L -O https://huggingface.co/mukel/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf -# Llama 3.1 (8B) - Q8_0 -curl -L -O https://huggingface.co/mukel/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_0.gguf -``` - +We provide a collection of models that we have tested on [Hugging Face](https://huggingface.co/beehive-lab/collections). +However, any Llama 3, Mistral, Qwen2, Qwen3, or Phi-3 model in GGUF format can be used with **GPULlama3.java**. ----------- ## Running `llama-tornado` @@ -387,146 +312,12 @@ docker run --rm -it --gpus all \ --model /data/Llama-3.2-1B-Instruct.FP16.gguf \ --prompt "Tell me a joke" ``` ------------ - -## Troubleshooting GPU Memory Issues - -### Out of Memory Error - -You may encounter an out-of-memory error like: -``` -Exception in thread "main" uk.ac.manchester.tornado.api.exceptions.TornadoOutOfMemoryException: Unable to allocate 100663320 bytes of memory. -To increase the maximum device memory, use -Dtornado.device.memory=GB -``` - -This indicates that the default GPU memory allocation (7GB) is insufficient for your model. - -### Solution - -First, check your GPU specifications.
If your GPU has high memory capacity, you can increase the GPU memory allocation using the `--gpu-memory` flag: - -```bash -# For 3B models, try increasing to 15GB -./llama-tornado --gpu --model beehive-llama-3.2-3b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 15GB - -# For 8B models, you may need even more (20GB or higher) -./llama-tornado --gpu --model beehive-llama-3.2-8b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 20GB -``` - -### GPU Memory Requirements by Model Size - -| Model Size | Recommended GPU Memory | -|-------------|------------------------| -| 1B models | 7GB (default) | -| 3-7B models | 15GB | -| 8B models | 20GB+ | - -**Note**: If you still encounter memory issues, try: - -1. Using Q4_0 instead of Q8_0 quantization (requires less memory). -2. Closing other GPU-intensive applications in your system. ----------- -## Command Line Options - -Supported command-line options include: - -```bash -cmd ➜ llama-tornado --help -usage: llama-tornado [-h] --model MODEL_PATH [--prompt PROMPT] [-sp SYSTEM_PROMPT] [--temperature TEMPERATURE] [--top-p TOP_P] [--seed SEED] [-n MAX_TOKENS] - [--stream STREAM] [--echo ECHO] [-i] [--instruct] [--gpu] [--opencl] [--ptx] [--gpu-memory GPU_MEMORY] [--heap-min HEAP_MIN] [--heap-max HEAP_MAX] - [--debug] [--profiler] [--profiler-dump-dir PROFILER_DUMP_DIR] [--print-bytecodes] [--print-threads] [--print-kernel] [--full-dump] - [--show-command] [--execute-after-show] [--opencl-flags OPENCL_FLAGS] [--max-wait-events MAX_WAIT_EVENTS] [--verbose] - -GPU-accelerated LLaMA.java model runner using TornadoVM - -options: - -h, --help show this help message and exit - --model MODEL_PATH Path to the LLaMA model file (e.g., beehive-llama-3.2-8b-instruct-fp16.gguf) (default: None) - -LLaMA Configuration: - --prompt PROMPT Input prompt for the model (default: None) - -sp SYSTEM_PROMPT, --system-prompt SYSTEM_PROMPT - System prompt for the model (default: None) - --temperature TEMPERATURE - Sampling temperature (0.0 to 2.0) (default: 0.1) - --top-p TOP_P Top-p sampling parameter (default: 0.95) - --seed SEED Random seed (default: current timestamp) (default: None) - -n MAX_TOKENS, --max-tokens MAX_TOKENS - Maximum number of tokens to generate (default: 512) - --stream STREAM Enable streaming output (default: True) - --echo ECHO Echo the input prompt (default: False) - --suffix SUFFIX Suffix for fill-in-the-middle request (Codestral) (default: None) - -Mode Selection: - -i, --interactive Run in interactive/chat mode (default: False) - --instruct Run in instruction mode (default) (default: True) - -Hardware Configuration: - --gpu Enable GPU acceleration (default: False) - --opencl Use OpenCL backend (default) (default: None) - --ptx Use PTX/CUDA backend (default: None) - --gpu-memory GPU_MEMORY - GPU memory allocation (default: 7GB) - --heap-min HEAP_MIN Minimum JVM heap size (default: 20g) - --heap-max HEAP_MAX Maximum JVM heap size (default: 20g) - -Debug and Profiling: - --debug Enable debug output (default: False) - --profiler Enable TornadoVM profiler (default: False) - --profiler-dump-dir PROFILER_DUMP_DIR - Directory for profiler output (default: /home/mikepapadim/repos/gpu-llama3.java/prof.json) - -TornadoVM Execution Verbose: - --print-bytecodes Print bytecodes (tornado.print.bytecodes=true) (default: False) - --print-threads Print thread information (tornado.threadInfo=true) (default: False) - --print-kernel Print kernel information (tornado.printKernel=true) (default: False) - --full-dump Enable full debug dump 
(tornado.fullDebug=true) (default: False) - --verbose-init Enable timers for TornadoVM initialization (llama.EnableTimingForTornadoVMInit=true) (default: False) - -Command Display Options: - --show-command Display the full Java command that will be executed (default: False) - --execute-after-show Execute the command after showing it (use with --show-command) (default: False) - -Advanced Options: - --opencl-flags OPENCL_FLAGS - OpenCL compiler flags (default: -cl-denorms-are-zero -cl-no-signed-zeros -cl-finite-math-only) - --max-wait-events MAX_WAIT_EVENTS - Maximum wait events for TornadoVM event pool (default: 32000) - --verbose, -v Verbose output (default: False) - -``` - -## Debug & Profiling Options -View TornadoVM's internal behavior: -```bash -# Print thread information during execution -./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads - -# Show bytecode compilation details -./llama-tornado --gpu --model model.gguf --prompt "..." --print-bytecodes - -# Display generated GPU kernel code -./llama-tornado --gpu --model model.gguf --prompt "..." --print-kernel - -# Enable full debug output with all details -./llama-tornado --gpu --model model.gguf --prompt "..." --debug --full-dump - -# Combine debug options -./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads --print-bytecodes --print-kernel -``` - -## Current Features & Roadmap +## Miscellaneous - - **Support for GGUF format models** with full FP16 and partial support for Q8_0 and Q4_0 quantization. - - **Instruction-following and chat modes** for various use cases. - - **Interactive CLI** with `--interactive` and `--instruct` modes. - - **Flexible backend switching** - choose OpenCL or PTX at runtime (need to build TornadoVM with both enabled). - - **Cross-platform compatibility**: - - ✅ NVIDIA GPUs (OpenCL & PTX ) - - ✅ Intel GPUs (OpenCL) - - ✅ Apple GPUs (OpenCL) +Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/RUN_DEBUG.md) for more tips on running and debugging, as well as how to use the `./llama-tornado` CLI to run models with different flags. Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/TORNADOVM_TRANSFORMER_OPTIMIZATIONS.md) to view a more detailed list of the transformer optimizations implemented in TornadoVM. diff --git a/docs/ChatGPT Image Apr 27, 2025, 02_45_40 PM.png b/docs/ChatGPT Image Apr 27, 2025, 02_45_40 PM.png deleted file mode 100644 index 467f3316..00000000 Binary files a/docs/ChatGPT Image Apr 27, 2025, 02_45_40 PM.png and /dev/null differ diff --git a/docs/RUN_DEBUG.md b/docs/RUN_DEBUG.md new file mode 100644 index 00000000..c5ae967f --- /dev/null +++ b/docs/RUN_DEBUG.md @@ -0,0 +1,125 @@ +## Troubleshooting GPU Memory Issues + +### Out of Memory Error + +You may encounter an out-of-memory error like: +``` +Exception in thread "main" uk.ac.manchester.tornado.api.exceptions.TornadoOutOfMemoryException: Unable to allocate 100663320 bytes of memory. +To increase the maximum device memory, use -Dtornado.device.memory=GB +``` + +This indicates that the default GPU memory allocation (7GB) is insufficient for your model. + +### Solution + +First, check your GPU specifications.
If your GPU has high memory capacity, you can increase the GPU memory allocation using the `--gpu-memory` flag: + +```bash +# For 3B models, try increasing to 15GB +./llama-tornado --gpu --model beehive-llama-3.2-3b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 15GB + +# For 8B models, you may need even more (20GB or higher) +./llama-tornado --gpu --model beehive-llama-3.2-8b-instruct-fp16.gguf --prompt "Tell me a joke" --gpu-memory 20GB +``` + +### GPU Memory Requirements by Model Size + +| Model Size | Recommended GPU Memory | +|-------------|------------------------| +| 1B models | 7GB (default) | +| 3-7B models | 15GB | +| 8B models | 20GB+ | + +**Note**: If you still encounter memory issues, try: + +1. Using Q4_0 instead of Q8_0 quantization (requires less memory). +2. Closing other GPU-intensive applications in your system. + +## Command Line Options + +Supported command-line options include: + +```bash +cmd ➜ llama-tornado --help +usage: llama-tornado [-h] --model MODEL_PATH [--prompt PROMPT] [-sp SYSTEM_PROMPT] [--temperature TEMPERATURE] [--top-p TOP_P] [--seed SEED] [-n MAX_TOKENS] + [--stream STREAM] [--echo ECHO] [-i] [--instruct] [--gpu] [--opencl] [--ptx] [--gpu-memory GPU_MEMORY] [--heap-min HEAP_MIN] [--heap-max HEAP_MAX] + [--debug] [--profiler] [--profiler-dump-dir PROFILER_DUMP_DIR] [--print-bytecodes] [--print-threads] [--print-kernel] [--full-dump] + [--show-command] [--execute-after-show] [--opencl-flags OPENCL_FLAGS] [--max-wait-events MAX_WAIT_EVENTS] [--verbose] + +GPU-accelerated LLaMA.java model runner using TornadoVM + +options: + -h, --help show this help message and exit + --model MODEL_PATH Path to the LLaMA model file (e.g., beehive-llama-3.2-8b-instruct-fp16.gguf) (default: None) + +LLaMA Configuration: + --prompt PROMPT Input prompt for the model (default: None) + -sp SYSTEM_PROMPT, --system-prompt SYSTEM_PROMPT + System prompt for the model (default: None) + --temperature TEMPERATURE + Sampling temperature (0.0 to 2.0) (default: 0.1) + --top-p TOP_P Top-p sampling parameter (default: 0.95) + --seed SEED Random seed (default: current timestamp) (default: None) + -n MAX_TOKENS, --max-tokens MAX_TOKENS + Maximum number of tokens to generate (default: 512) + --stream STREAM Enable streaming output (default: True) + --echo ECHO Echo the input prompt (default: False) + --suffix SUFFIX Suffix for fill-in-the-middle request (Codestral) (default: None) + +Mode Selection: + -i, --interactive Run in interactive/chat mode (default: False) + --instruct Run in instruction mode (default) (default: True) + +Hardware Configuration: + --gpu Enable GPU acceleration (default: False) + --opencl Use OpenCL backend (default) (default: None) + --ptx Use PTX/CUDA backend (default: None) + --gpu-memory GPU_MEMORY + GPU memory allocation (default: 7GB) + --heap-min HEAP_MIN Minimum JVM heap size (default: 20g) + --heap-max HEAP_MAX Maximum JVM heap size (default: 20g) + +Debug and Profiling: + --debug Enable debug output (default: False) + --profiler Enable TornadoVM profiler (default: False) + --profiler-dump-dir PROFILER_DUMP_DIR + Directory for profiler output (default: /home/mikepapadim/repos/gpu-llama3.java/prof.json) + +TornadoVM Execution Verbose: + --print-bytecodes Print bytecodes (tornado.print.bytecodes=true) (default: False) + --print-threads Print thread information (tornado.threadInfo=true) (default: False) + --print-kernel Print kernel information (tornado.printKernel=true) (default: False) + --full-dump Enable full debug dump (tornado.fullDebug=true) 
(default: False) + --verbose-init Enable timers for TornadoVM initialization (llama.EnableTimingForTornadoVMInit=true) (default: False) + +Command Display Options: + --show-command Display the full Java command that will be executed (default: False) + --execute-after-show Execute the command after showing it (use with --show-command) (default: False) + +Advanced Options: + --opencl-flags OPENCL_FLAGS + OpenCL compiler flags (default: -cl-denorms-are-zero -cl-no-signed-zeros -cl-finite-math-only) + --max-wait-events MAX_WAIT_EVENTS + Maximum wait events for TornadoVM event pool (default: 32000) + --verbose, -v Verbose output (default: False) + +``` + +## Debug & Profiling Options +View TornadoVM's internal behavior: +```bash +# Print thread information during execution +./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads + +# Show bytecode compilation details +./llama-tornado --gpu --model model.gguf --prompt "..." --print-bytecodes + +# Display generated GPU kernel code +./llama-tornado --gpu --model model.gguf --prompt "..." --print-kernel + +# Enable full debug output with all details +./llama-tornado --gpu --model model.gguf --prompt "..." --debug --full-dump + +# Combine debug options +./llama-tornado --gpu --model model.gguf --prompt "..." --print-threads --print-bytecodes --print-kernel +``` \ No newline at end of file diff --git a/docs/diagrams/gpullama3-architecture-light.svg b/docs/diagrams/gpullama3-architecture-light.svg new file mode 100644 index 00000000..700f33b2 --- /dev/null +++ b/docs/diagrams/gpullama3-architecture-light.svg @@ -0,0 +1,195 @@ [SVG markup not reproduced here: "GPULlama3.java Architecture" diagram (light variant) — ☕ Your Java Application (Quarkus • Spring Boot • Micronaut • Any JVM App) → 🔗 LangChain4j / Quarkus-LangChain4j (GPULlama3ChatModel • StreamingChatModel • AI Services • Agents • RAG) → ⚡ GPULlama3.java Engine (Transformer • Multi-Head Attention • RoPE • RMSNorm • SwiGLU FFN • KV-Cache) → 🌪️ TornadoVM Runtime (JIT Compilation • Task Graphs • Automatic Parallelization • Memory Management) → OpenCL / PTX / CUDA / Metal (WIP) → 🔷 Intel Arc / Iris • 🟢 NVIDIA GPUs • Apple Silicon — "100% Pure Java"] diff --git a/docs/diagrams/gpullama3-architecture.svg b/docs/diagrams/gpullama3-architecture.svg new file mode 100644 index 00000000..b2ce0370 --- /dev/null +++ b/docs/diagrams/gpullama3-architecture.svg @@ -0,0 +1,199 @@ [SVG markup not reproduced here: the same "GPULlama3.java Architecture" diagram with identical labels to the light variant above] diff --git a/llama-tornado b/llama-tornado index b59473f2..9c0d6ba8 100755 --- a/llama-tornado +++ b/llama-tornado @@ -410,7 +410,7 @@ def create_parser() -> argparse.ArgumentParser: const=Backend.PTX, help="Use PTX/CUDA backend", ) - hw_group.add_argument("--gpu-memory", default="7GB", help="GPU memory allocation") + hw_group.add_argument("--gpu-memory", default="14GB", help="GPU memory allocation") hw_group.add_argument("--heap-min", default="20g", help="Minimum JVM heap size") hw_group.add_argument("--heap-max", default="20g", help="Maximum JVM heap size") diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java b/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java index 9f1c335a..75f9f531 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/layers/type/fp16/Phi3FP16FFNLayers.java @@ -156,12 +156,12 @@ TaskGraph setupSinglePhi3FFNLayer(Phi3TornadoWeights weights, int layerIndex) { unifiedLayer.consumeFromDevice(phi3State.wrapX); unifiedLayer.transferToDevice(DataTransferMode.FIRST_EXECUTION, // Copy-in weights per layer for batched-layered layout - weights.rms_att_weightLayered[layerIndex], - weights.wqkvLayered[layerIndex], - weights.woLayered[layerIndex], - weights.rms_ffn_weightLayered[layerIndex], - weights.wUpLayered[layerIndex], - weights.wDownLayered[layerIndex] + weights.rms_att_weightLayered[layerIndex].asFloatArray(), + weights.wqkvLayered[layerIndex].asHalfFloatArray(), + weights.woLayered[layerIndex].asHalfFloatArray(), + weights.rms_ffn_weightLayered[layerIndex].asFloatArray(), + weights.wUpLayered[layerIndex].asHalfFloatArray(), + weights.wDownLayered[layerIndex].asHalfFloatArray() ); unifiedLayer = configureLayerDataTransfers(unifiedLayer, layerIndex);