diff --git a/.github/ISSUE_TEMPLATE/benchmark_submission.md b/.github/ISSUE_TEMPLATE/benchmark_submission.md new file mode 100644 index 0000000..9649489 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/benchmark_submission.md @@ -0,0 +1,26 @@ +--- +name: Benchmark Submission +about: Submit your ANE benchmark results +title: "[Benchmark] results" +labels: benchmark +assignees: '' +--- + +## System Info + +- **Chip**: (e.g., Apple M4 Max) +- **Machine**: (e.g., Mac16,5) +- **macOS Version**: +- **Memory**: (e.g., 128 GB) + +## Benchmark Results + +Paste the contents of your JSON results file below: + +```json + +``` + +## Notes + +Any observations, issues encountered, or interesting findings. diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..07b0a49 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,33 @@ +--- +name: Bug Report +about: Report a build failure, crash, or unexpected behavior +title: "[Bug] " +labels: bug +assignees: '' +--- + +## Environment + +- **Chip**: +- **macOS Version**: +- **Xcode Version**: (run `xcodebuild -version`) + +## Description + +What happened? + +## Steps to Reproduce + +1. +2. +3. + +## Expected Behavior + +What did you expect to happen? + +## Logs / Output + +``` +Paste relevant output here +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..881f1d4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,19 @@ +--- +name: Feature Request +about: Suggest a new feature or research direction +title: "[Feature] " +labels: enhancement +assignees: '' +--- + +## Description + +What would you like to see added? + +## Motivation + +Why would this be useful? + +## Possible Approach + +If you have ideas on how to implement this, share them here. 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b2eb942 --- /dev/null +++ b/.gitignore @@ -0,0 +1,83 @@ +# Build artifacts +*.o +*.dSYM/ + +# Root-level compiled binaries +ane_probe +api_explore +inmem_basic +inmem_bench +inmem_peak +sram_bench +sram_probe + +# Training binaries +tiny_train +tiny_train_m1 +train_large +training/train_large +training/train_large_ane +training/train_opt +training/train_double_buffer +training/test_* +!training/test_*.m + +# Inference binaries and runtime data +inference/qwen_ane +inference/qwen05b.bin +inference/qwen05b_f32.bin +inference/qwen05b_f16.bin +inference/qwen05b_q8.bin +inference/.venv/ +inference/benchmark_results.json + +# Dynamic training binaries +training/training_dynamic/train + +# Test/research binaries +test_chaining + +# Generated mlpackage files +/tmp/ane_*.mlpackage + +# Benchmark results (keep community_benchmarks/ submissions) +benchmark_results_*.txt +community_benchmarks/SUMMARY.json +community_benchmarks/SUMMARY.md +community_benchmarks/apple_m4_max_20260303_*.json + +# Python +__pycache__/ +*.pyc +*.egg-info/ +/tmp/ane_venv/ + +# Training data (downloaded separately) +assets/ + +# Web dashboard (lives in separate private repo) +web/ + +# Training data binaries (downloaded via make setup) +training/tinystories_data00.bin +training/ane_stories110M_ckpt.bin +*.bin +*.metallib +!training/download_data.sh + +# Secrets / env +.env +inference/.env + +# Internal / private +.cursor/ +docs/launch/ +comm + +# macOS +.DS_Store + +# Editor +*.swp +*.swo +*~ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a9b65c7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,60 @@ +# Contributing to ANE Training + +Thanks for your interest in contributing! This community fork welcomes benchmark submissions, bug fixes, and research contributions. 
+ +## Benchmark Submissions (Easiest Way to Contribute) + +The single most valuable thing you can do is run the benchmark on your hardware and submit results. + +### Quick Version + +```bash +bash scripts/run_community_benchmark.sh +``` + +The script will guide you through everything, including optional auto-submission to the dashboard. + +### What Gets Collected + +- Your chip model (e.g., Apple M4 Max) +- macOS version, memory, core counts +- SRAM probe results (TFLOPS vs weight size) +- In-memory peak TFLOPS +- Training performance (optional, requires training data) +- Your GitHub username (optional) + +No personal data, no IP addresses stored (only hashed for rate limiting). + +## Bug Reports + +Open an issue with: +- Your hardware (chip, macOS version, memory) +- Steps to reproduce +- Expected vs actual behavior +- Relevant log output + +## Code Contributions + +1. Fork the repository +2. Create a feature branch (`git checkout -b my-feature`) +3. Make your changes +4. Test on your hardware +5. Submit a Pull Request + +### Code Style + +- Objective-C: follow the existing style in `training/` (no ARC annotations in headers, `_Float16` for fp16) +- Shell scripts: use `set -euo pipefail`, quote variables +- Python: minimal dependencies, Python 3.11+ compatible + +### Areas Where Help is Needed + +- **Benchmarks on hardware we don't have**: M1, M2, M3, M3 Pro/Max/Ultra, M4 Pro, M5 +- **Reducing compilation overhead**: currently 80-85% of wall time +- **`_ANEChainingRequest` research**: pipelining multiple ANE operations without recompile +- **`_ANEPerformanceStats` investigation**: getting real hardware timing data +- **Larger model support**: scaling beyond Stories110M + +## Questions? + +Open a GitHub issue or discussion. We're happy to help. 
diff --git a/PROBE_RESULTS.md b/PROBE_RESULTS.md new file mode 100644 index 0000000..f3ea376 --- /dev/null +++ b/PROBE_RESULTS.md @@ -0,0 +1,88 @@ +# ANE Probe Results: M4 (macOS 26.3) + +**Machine:** Apple M4 (10 cores), 32GB RAM, macOS 26.3 +**Date:** 2026-03-03 +**ANE Family:** H16 (same as M5 results in `training/m5result.md`) + +## Key Discovery: Compile and Eval Run in Parallel + +**This was not known before.** The M5 probes tested compile and eval sequentially. +We tested with GCD `dispatch_async` and found they fully overlap. + +### probe_v2.m Results + +#### TEST 1: Pure Eval Throughput +``` +Conv 128x128, spatial=64 +1000 evals: 189.1ms total, 0.189ms/eval +11.09 GFLOPS sustained +``` + +#### TEST 2: Ping-pong (Two Pre-compiled Models) +``` +500 ping-pong pairs: 207.4ms (0.415ms/pair, 0.207ms/eval) +``` +Near-zero overhead switching between two loaded models. + +#### TEST 3: Sequential Compile (20 Models) +``` +All 20 models compiled and verified ✓ +Compile time: ~23-29ms each (consistent, no degradation) +All 20 models correct with different scale factors +``` + +#### TEST 4: Background Compile Overlap ⭐ +``` +Background compile: 26.8ms +Foreground evals during compile: 119 (26.8ms total) +Overlap: YES — compile and eval CAN run in parallel! +Background model verified correct ✓ +``` + +### Summary +| Metric | Value | +|--------|-------| +| Compile time | ~25ms per kernel set | +| Eval time | 0.189ms per eval | +| Compile:eval ratio | ~130:1 | +| Parallel compile+eval | **YES** | +| Max simultaneous models | 20+ | +| Ping-pong overhead | +10% vs single model | + +## Peak ANE Throughput (inmem_peak) + +``` +Config W(MB) GFLOP ms/eval TFLOPS +96x conv 512ch sp64 48.0 3.22 0.429 ms 7.50 +128x conv 512ch sp64 64.0 4.29 0.589 ms 7.30 +256x conv 256ch sp64 32.0 2.15 0.380 ms 5.65 +64x conv 512ch sp64 32.0 2.15 0.395 ms 5.43 +``` + +Peak: **7.50 TFLOPS** (47% of 15.8 TFLOPS theoretical). 
+ +## Implications for Training + +### Before (train_large.m) +- Synchronous compile: **88.6% of wall time is compilation** +- 55ms compile per batch, 0.54ms actual training +- Training throughput limited by compiler, not by ANE + +### After (train_double_buffer.m) +- Async double-buffered compile: **0% compile stall** +- Background compile happens during forward/backward passes +- ~130 eval steps fit in one compile window +- Weight updates are "delayed" by one batch (standard technique in distributed training) +- Training throughput limited only by ANE eval speed + +### Architecture +``` +Time → +Active kernels: [=== eval batch N ===][=== eval batch N+1 ===][=== eval batch N+2 ===] +Background: [compile N+1 weights ][compile N+2 weights ][compile N+3 weights ] + ↑ ↑ ↑ + swap ready swap ready swap ready +``` + +Two kernel sets (A and B) alternate between active evaluation and background compilation. +When the background compile finishes, pointers swap atomically at the batch boundary. diff --git a/README.md b/README.md index ce3df1f..beed9c2 100644 --- a/README.md +++ b/README.md @@ -12,24 +12,24 @@ This is a **research project**, not a production framework. The goal was to demonstrate that **training on the Apple Neural Engine — and potentially other NPUs — is possible**, and that the barrier has always been software support, not hardware capability. The ANE is a remarkably capable piece of silicon that Apple restricts to inference-only use through CoreML. This project bypasses that restriction using reverse-engineered private APIs to show what's possible when you give the hardware a chance. 
-### What this project is +### What This Project Is - A proof of concept for ANE training via `_ANEClient` and `_ANECompiler` private APIs - A set of benchmarks documenting real ANE performance characteristics (throughput, power, SRAM behavior) - A reference for anyone exploring direct ANE access outside CoreML - Research code that I update when I find something interesting -### What this project is not +### What This Project Is Not - A maintained framework or library - A replacement for CoreML, MLX, llama.cpp, or any production inference stack - A path to training large models on consumer hardware (yet) -### On the hype +### On The Hype Some coverage of this project has overstated its implications. To be clear: -- Training works, but utilization is low (~2-3% of peak) with significant engineering challenges remaining +- Training works, but utilization is low (~8-11% of peak) with significant engineering challenges remaining - Many element-wise operations still fall back to CPU - This does **not** replace GPU training for anything beyond small research models today @@ -37,18 +37,57 @@ The honest results — including all limitations — are documented in the accom - [Part 1: Reverse Engineering](https://maderix.substack.com/p/inside-the-m4-apple-neural-engine) - [Part 2: Benchmarks](https://maderix.substack.com/p/inside-the-m4-apple-neural-engine-615) -### On maintenance +### Fork it, build on it + +This is MIT licensed for a reason. Everyone now has access to AI-assisted development tools that can adapt and extend code in hours. If this project is useful to you — take it, modify it, build something better. If you do something cool with it, I'd love to hear about it. -I don't intend to grow this into a large community project. My focus is on original research (compiler infrastructure for edge AI optimization), and maintaining an open-source framework takes time away from that. 
+--- -That said: -- I'll keep pushing updates when I discover something interesting -- Bug fixes and benchmark contributions (especially on hardware I don't own) are welcome -- Feature requests will likely go unaddressed — but feel free to fork +## Community Fork -### Fork it, build on it +This fork extends the original project with: -This is MIT licensed for a reason. Everyone now has access to AI-assisted development tools that can adapt and extend code in hours. If this project is useful to you — take it, modify it, build something better. If you do something cool with it, I'd love to hear about it. +- **M1/M2/M3/M4 compatibility** — MIL syntax fixes for broader Apple Silicon support (from upstream PR #6) +- **Security hardening** — stack protection, format security, input validation (upstream PRs #5, #7) +- **Bug fixes** — token sampling underflow fix, dashboard sudo hang fix (upstream PRs #17, #20) +- **Configurable paths** — training data, model, and checkpoint paths via environment variables +- **Community benchmarks** — standardized benchmark script + online dashboard for comparing results across chips +- **12-layer training** — full Stories110M (12 transformer layers, 109M params) already working + +### Contributing + +We welcome benchmark submissions from any Apple Silicon hardware. See [Community Benchmarks](#community-benchmarks) below for how to run and submit your results. + +--- + +## Quick Start + +**Requirements:** macOS 15+ on Apple Silicon (M1/M2/M3/M4/M5), Xcode CLI tools. 
+ +```bash +# Install Xcode CLI tools (if not already installed) +xcode-select --install + +# Clone and set up +git clone https://github.com/dev-erik/ANE.git +cd ANE/training + +# Download training data + model weights +make setup + +# Build and run training (12-layer Stories110M) +make train_large +./train_large --steps 100 +``` + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `ANE_MODEL_PATH` | `../../assets/models/stories110M.bin` | Path to model weights | +| `ANE_DATA_PATH` | `../../assets/data/tinystories_data00.bin` | Path to tokenized training data | +| `ANE_CKPT_PATH` | `/tmp/ane_ckpt.bin` | Path for checkpoint files | +| `ANE_ACCUM_STEPS` | `10` | Gradient accumulation steps before weight update (max 10000) | --- @@ -56,15 +95,15 @@ This is MIT licensed for a reason. Everyone now has access to AI-assisted develo A from-scratch implementation of transformer training (forward + backward pass) running on the ANE in Apple Silicon. The ANE is a 15.8 TFLOPS (M4) inference accelerator that Apple does not expose for training. This project reverse-engineers the `_ANEClient` / `_ANECompiler` private APIs and the MIL (Model Intermediate Language) format to run custom compute graphs — including backpropagation — directly on ANE hardware. 
-**Current results (M4, single transformer layer, dim=768, seq=512):** -- 9.3 ms/step, 11.2% ANE utilization (1.78 TFLOPS sustained) -- 6 ANE kernel dispatches per training step +**Current results (M4 Max, 12-layer Stories110M, dim=768, seq=256):** +- 62-72 ms/step, 8-11% ANE utilization (1.3-1.7 TFLOPS sustained) +- 6 ANE kernel dispatches per layer per training step - All forward and backward dx passes on ANE, dW gradients on CPU (Accelerate cblas) -- Adam optimizer, gradient accumulation, checkpoint/resume +- Adam optimizer, gradient accumulation, checkpoint/resume via process restart ## Architecture -The training loop uses 6 ANE kernels per step: +The training loop uses 6 ANE kernels per step per layer: | Kernel | Function | Weights | |--------|----------|---------| @@ -73,19 +112,19 @@ The training loop uses 6 ANE kernels per step: | `kFFNBwd` | FFN backward (W2^T + SiLU_bwd + W1^T + W3^T) | W2^T, W1^T, W3^T | | `kSdpaBwd1` | Wo^T + SDPA backward part 1 (dV, probs, dp) | Wo^T, mask | | `kSdpaBwd2` | SDPA backward part 2 (softmax grad, dQ, dK) | — | -| `kQKVb` | QKV backward (Wq^T + Wk^T + Wv^T → dx) | Wq^T, Wk^T, Wv^T | +| `kQKVb` | QKV backward (Wq^T + Wk^T + Wv^T -> dx) | Wq^T, Wk^T, Wv^T | CPU handles: RMSNorm backward, residual connections, loss computation, dW gradient accumulation (cblas_sgemm), Adam optimizer updates. 
Key optimizations: - **Channel-first CPU layout** — matches ANE IOSurface `[1,C,1,S]` format, eliminates all transpose overhead -- **vDSP vectorized RMSNorm** — 10x faster than naive (6.7ms → 0.7ms) +- **vDSP vectorized RMSNorm** — 10x faster than naive (6.7ms to 0.7ms) - **GCD async cblas overlap** — dW gradient sgemms run in parallel with ANE evals on a serial dispatch queue - **Deferred cblas wait** — wait pushed into next step's forward pass for maximum overlap - **ANE RMSNorm fusion** — RMSNorm folded into forward kernels as MIL ops (reduce_sum + pow + mul) - **Wo^T fusion** — output projection backward merged into SDPA backward kernel - **Forward taps** — Q, K, V, attention scores, hidden states exposed via concat outputs, avoiding CPU recompute -- **exec() restart** — bypasses ~119 ANE compile limit per process +- **Process restart** — bypasses ~119 ANE compile limit per process via checkpoint and re-launch ## File Structure @@ -93,34 +132,116 @@ Key optimizations: ├── api_exploration.m # Initial ANE API discovery ├── inmem_basic.m # In-memory MIL compilation proof-of-concept ├── inmem_bench.m # ANE dispatch latency benchmarks -├── inmem_peak.m # Peak TFLOPS measurement (2048x2048 matmul) +├── inmem_peak.m # Peak TFLOPS measurement ├── sram_bench.m # ANE SRAM bandwidth probing ├── sram_probe.m # SRAM size/layout exploration +├── scripts/ +│ ├── run_benchmarks.sh # Full benchmark suite runner +│ ├── run_community_benchmark.sh # Standardized community benchmark (JSON output) +│ ├── gen_mlpackages.py # Generate .mlpackage models for sram/inmem tests +│ └── aggregate_benchmarks.py # Aggregate community JSON results +├── community_benchmarks/ # Community-submitted benchmark results (JSON) +├── web/ # Dashboard web app (Next.js + Neon Postgres) +├── docs/ +│ ├── ARCHITECTURE.md # System architecture with diagrams +│ ├── API_REFERENCE.md # Complete function index +│ ├── BENCHMARKS.md # Benchmark guide +│ └── BENCHMARK_RESULTS.md # Detailed M4 Max results └── 
training/ ├── ane_runtime.h # ANE private API wrapper (compile, eval, IOSurface) ├── ane_mil_gen.h # MIL program generation helpers - ├── model.h # Model weight initialization and blob builders - ├── forward.h # Forward pass MIL generators - ├── backward.h # Backward pass MIL generators + ├── ane_classifier.h # Classifier forward/backward MIL generators + ├── ane_rmsnorm_bwd.h # RMSNorm backward MIL generator + ├── stories_config.h # Model configuration (dims, structs, macros) + ├── stories_io.h # IOSurface I/O, blob builders, compile/eval helpers + ├── stories_mil.h # MIL generators (SDPA, FFN, QKV backward) + ├── stories_cpu_ops.h # CPU ops (RMSNorm, Adam, cross-entropy, embed) + ├── model.h # Gen1 model weight init and blob builders + ├── forward.h # Gen1 forward pass MIL generators + ├── backward.h # Gen1 backward pass MIL generators + ├── train_large.m # Main: 12-layer training (CPU classifier) + ├── train_large_ane.m # 12-layer training (ANE classifier) ├── train.m # Minimal training loop (early prototype) ├── tiny_train.m # 2-layer tiny model training - ├── train_large.m # Main: single-layer dim=768 training (optimized) ├── test_*.m # Unit tests for individual kernels - └── Makefile + ├── dashboard.py # Real-time training monitor + ├── tokenize.py # Training data preprocessing + ├── download_data.sh # Download training data + model weights + └── Makefile # Build system (make train_large, make test, etc.) +``` + +## Community Benchmarks + +We collect community benchmark results across Apple Silicon chips to understand ANE performance characteristics. + +### Run Benchmarks + +```bash +# Run the standardized community benchmark +bash scripts/run_community_benchmark.sh + +# Skip training benchmarks (if no training data) +bash scripts/run_community_benchmark.sh --skip-training + +# Custom training steps +bash scripts/run_community_benchmark.sh --steps 50 ``` +The script will: +1. Detect your hardware (chip, memory, cores) +2. 
Run SRAM probe and in-memory peak benchmarks +3. Optionally run training benchmarks +4. Save results as JSON to `community_benchmarks/` +5. Ask if you'd like to submit results to the online dashboard + +### Submit Results + +**Option A: Automatic submission** +At the end of the benchmark run, the script will ask if you want to submit. Your results are sent anonymously to our dashboard (IP is hashed, never stored raw). + +**Option B: GitHub PR** +1. Fork this repository +2. Run the benchmark script +3. Commit the JSON file from `community_benchmarks/` +4. Open a Pull Request + +**Option C: GitHub Issue** +Paste the contents of your JSON results file in a new issue. + +### View Results + +Visit the **[ANE Community Benchmark Dashboard](https://web-lac-sigma-61.vercel.app)** to see aggregated results across all Apple Silicon chips. + +### Data Privacy + +- Your IP address is hashed (SHA-256) for rate limiting and duplicate detection only +- No personal information is collected or stored +- All benchmark data is public +- Rate limited to 5 submissions per hour per IP + +--- + ## Building -Requires macOS 15+ on Apple Silicon (tested on M4). +Requires macOS 15+ on Apple Silicon (tested on M1 through M5). ```bash -# Build the main training program -xcrun clang -O2 -framework Foundation -framework IOSurface \ - -framework CoreML -framework Accelerate -ldl -lobjc \ - -o train_large training/train_large.m +cd training -# Run -./train_large +# Build everything +make all + +# Build just the training programs +make train_large train_large_ane + +# Run tests +make test + +# Download training data +make data + +# Full setup (data + dependencies) +make setup ``` No external dependencies. Uses only system frameworks + private ANE APIs resolved at runtime via `objc_msgSend`. @@ -135,10 +256,11 @@ No external dependencies. 
Uses only system frameworks + private ANE APIs resolve ## Limitations -- **SDPA causal masking** — ANE hardware ignores `attn_mask` in SDPA ops; causal attention is decomposed into separate Q@K^T (ANE) → mask+softmax (ANE via add+softmax) → scores@V (ANE) -- **~119 compile limit** — ANE compiler leaks resources; worked around via `exec()` restart with checkpoint -- **Single layer** — Currently trains one transformer layer; multi-layer would need pipeline scheduling -- **Synthetic data** — Currently uses random data for benchmarking; real tokenized data support is WIP +- **SDPA causal masking** — ANE hardware ignores `attn_mask` in SDPA ops; causal attention is decomposed into separate Q@K^T (ANE) then mask+softmax (ANE via add+softmax) then scores@V (ANE) +- **~119 compile limit** — ANE compiler leaks resources; worked around via process restart with checkpoint +- **Compilation overhead** — Weights baked at compile time mean recompilation every ACCUM_STEPS. Compilation is 80-85% of wall time. Investigating `_ANEChainingRequest` for potential pipeline without recompile. +- **Classifier backward regression** — ANE classifier backward is ~3x slower than CPU cblas due to matmul (not conv) being used to work around ANE's 8192 input channel limit +- **SRAM capacity** — ANE SRAM is ~24-32 MB (M4 Max). Models with weight matrices exceeding this threshold spill to DRAM with significant performance cliffs. Current Stories110M weights (~1.2 MB each) stay within SRAM. ## Performance History @@ -149,12 +271,14 @@ No external dependencies. Uses only system frameworks + private ANE APIs resolve | vDSP vectorized RMSNorm | 14.2 | 7.4% | | GCD async cblas overlap | 11.4 | 9.2% | | ANE RMSNorm fusion | 11.4 | 9.2% | -| Wo^T fusion (7→6 kernels) | 11.4 | 9.2% | +| Wo^T fusion (7 to 6 kernels) | 11.4 | 9.2% | | Deferred cblas wait | **9.3** | **11.2%** | +*Note: Above numbers are for single-layer training. 
Full 12-layer training runs at 62-72 ms/step.* + ## Disclaimer -This project uses Apple's private, undocumented APIs (`_ANEClient`, `_ANECompiler`, `_ANEInMemoryModelDescriptor`). These APIs are not covered by any public stability guarantee and may change or break with any macOS update. This is independent research into Apple Neural Engine architecture, using APIs discovered through runtime introspection for research and educational purposes under fair use and interoperability provisions (see *Sega v. Accolade*, 1992; DMCA §1201(f)). No Apple proprietary code or binaries are included in this repository. This project is not affiliated with or endorsed by Apple Inc. Use at your own risk. +This project uses Apple's private, undocumented APIs (`_ANEClient`, `_ANECompiler`, `_ANEInMemoryModelDescriptor`). These APIs are not covered by any public stability guarantee and may change or break with any macOS update. This is independent research into Apple Neural Engine architecture, using APIs discovered through runtime introspection for research and educational purposes under fair use and interoperability provisions (see *Sega v. Accolade*, 1992; DMCA section 1201(f)). No Apple proprietary code or binaries are included in this repository. This project is not affiliated with or endorsed by Apple Inc. Use at your own risk. ## License @@ -162,4 +286,4 @@ MIT — see [LICENSE](LICENSE) --- -*Built by a human + Claude, one weekend at a time.* +*Originally built by [maderix](https://github.com/maderix). Community fork maintained with contributions from the ANE research community.* diff --git a/community_benchmarks/README.md b/community_benchmarks/README.md new file mode 100644 index 0000000..f9e08c6 --- /dev/null +++ b/community_benchmarks/README.md @@ -0,0 +1,111 @@ +# ANE Community Benchmarks + +Standardized benchmark results from different Apple Silicon machines, contributed by the community. 
+ +## How to Run + +```bash +# Full benchmark (SRAM probe + peak TFLOPS + training) +bash scripts/run_community_benchmark.sh + +# Quick benchmark (skip training -- useful if you don't have training data) +bash scripts/run_community_benchmark.sh --skip-training + +# Custom training steps +bash scripts/run_community_benchmark.sh --steps 50 +``` + +This produces a JSON file in `community_benchmarks/` named `<chip>_<timestamp>.json` (e.g., `apple_m4_max_20260303.json`). + +### Prerequisites + +- macOS on Apple Silicon (M1/M2/M3/M4/M5) +- Xcode command line tools (`xcode-select --install`) +- Python 3.11-3.13 with `coremltools` (auto-installed into a temp venv) +- For training benchmarks: run `cd training && make data` first + +## How to Submit + +### Option 1: Pull Request + +1. Fork this repo +2. Run the benchmark: `bash scripts/run_community_benchmark.sh` +3. Commit the generated JSON file from `community_benchmarks/` +4. Open a PR + +### Option 2: GitHub Issue + +1. Run the benchmark +2. Open a [new issue](../../issues/new) with title "Benchmark: [Your Chip]" +3. Paste the contents of your JSON file + +## Viewing Aggregated Results + +```bash +python3 scripts/aggregate_benchmarks.py +``` + +This reads all JSON files in `community_benchmarks/` and prints a markdown comparison table. + +## JSON Schema (v1) + +Each submission contains: + +```json +{ + "schema_version": 1, + "timestamp": "2026-03-03T12:00:00Z", + "system": { + "chip": "Apple M4 Max", + "machine": "Mac16,5", + "macos_version": "26.2", + "memory_gb": 128, + "neural_engine_cores": "16" + }, + "benchmarks": { + "sram_probe": [ + {"channels": 256, "weight_mb": 0.1, "ms_per_eval": 0.378, "tflops": 0.02, "gflops_per_mb": 177.7}, + ... + ], + "inmem_peak": [ + {"depth": 128, "channels": 512, "spatial": 64, "weight_mb": 64.0, "gflops": 4.29, "ms_per_eval": 0.385, "tflops": 11.14}, + ... 
+ ], + "training_cpu_classifier": { + "ms_per_step": 72.4, + "ane_tflops_sustained": 1.29, + "ane_util_pct": 8.1, + "compile_pct": 79.7 + }, + "training_ane_classifier": { + "ms_per_step": 62.9, + "ane_tflops_sustained": 1.68, + "ane_util_pct": 10.6, + "compile_pct": 84.5 + } + }, + "summary": { + "peak_tflops": 11.14, + "sram_spill_start_channels": 4096, + "training_ms_per_step_cpu": 72.4, + "training_ms_per_step_ane": 62.9, + "training_ane_tflops": 1.68, + "training_ane_util_pct": 10.6 + } +} +``` + +## What We're Measuring + +| Benchmark | What it tells us | +|-----------|-----------------| +| **sram_probe** | ANE SRAM capacity -- where weight spilling starts | +| **inmem_peak** | Maximum achievable TFLOPS via programmatic MIL | +| **training (CPU cls)** | End-to-end training perf with CPU classifier | +| **training (ANE cls)** | End-to-end training perf with ANE-offloaded classifier | + +Key metrics to compare across chips: +- **Peak TFLOPS**: raw ANE compute capability +- **SRAM spill point**: determines max efficient kernel size +- **Training ms/step**: real-world training performance +- **ANE utilization %**: how much of peak we actually use diff --git a/community_benchmarks/apple_m4_max_20260303.json b/community_benchmarks/apple_m4_max_20260303.json new file mode 100644 index 0000000..34f6da0 --- /dev/null +++ b/community_benchmarks/apple_m4_max_20260303.json @@ -0,0 +1,67 @@ +{ + "schema_version": 1, + "timestamp": "2026-03-03T11:46:08Z", + "system": { + "chip": "Apple M4 Max", + "machine": "Mac16,5", + "macos_version": "26.2", + "macos_build": "25C56", + "cpu_cores": 16, + "memory_gb": 128, + "neural_engine_cores": "16" + }, + "benchmarks": { + "sram_probe": [ + {"channels": 256, "weight_mb": 0.1, "ms_per_eval": 0.378, "tflops": 0.02, "gflops_per_mb": 177.7}, + {"channels": 512, "weight_mb": 0.5, "ms_per_eval": 0.431, "tflops": 0.08, "gflops_per_mb": 155.6}, + {"channels": 1024, "weight_mb": 2.0, "ms_per_eval": 0.411, "tflops": 0.33, "gflops_per_mb": 
163.5}, + {"channels": 1536, "weight_mb": 4.5, "ms_per_eval": 0.493, "tflops": 0.61, "gflops_per_mb": 136.1}, + {"channels": 2048, "weight_mb": 8.0, "ms_per_eval": 0.410, "tflops": 1.31, "gflops_per_mb": 163.9}, + {"channels": 2560, "weight_mb": 12.5, "ms_per_eval": 0.237, "tflops": 3.53, "gflops_per_mb": 282.6}, + {"channels": 3072, "weight_mb": 18.0, "ms_per_eval": 0.335, "tflops": 3.60, "gflops_per_mb": 200.1}, + {"channels": 3584, "weight_mb": 24.5, "ms_per_eval": 0.414, "tflops": 3.97, "gflops_per_mb": 162.1}, + {"channels": 4096, "weight_mb": 32.0, "ms_per_eval": 1.134, "tflops": 1.89, "gflops_per_mb": 59.2}, + {"channels": 4608, "weight_mb": 40.5, "ms_per_eval": 0.563, "tflops": 4.83, "gflops_per_mb": 119.2}, + {"channels": 5120, "weight_mb": 50.0, "ms_per_eval": 0.659, "tflops": 5.09, "gflops_per_mb": 101.8}, + {"channels": 6144, "weight_mb": 72.0, "ms_per_eval": 0.844, "tflops": 5.73, "gflops_per_mb": 79.5}, + {"channels": 8192, "weight_mb": 128.0, "ms_per_eval": 4.203, "tflops": 1.02, "gflops_per_mb": 8.0} + ], + "inmem_peak": [ + {"depth": 32, "channels": 512, "spatial": 64, "weight_mb": 16.0, "gflops": 1.07, "ms_per_eval": 0.408, "tflops": 2.63}, + {"depth": 48, "channels": 512, "spatial": 64, "weight_mb": 24.0, "gflops": 1.61, "ms_per_eval": 0.262, "tflops": 6.15}, + {"depth": 64, "channels": 512, "spatial": 64, "weight_mb": 32.0, "gflops": 2.15, "ms_per_eval": 0.244, "tflops": 8.80}, + {"depth": 96, "channels": 512, "spatial": 64, "weight_mb": 48.0, "gflops": 3.22, "ms_per_eval": 0.326, "tflops": 9.89}, + {"depth": 128, "channels": 512, "spatial": 64, "weight_mb": 64.0, "gflops": 4.29, "ms_per_eval": 0.385, "tflops": 11.14}, + {"depth": 64, "channels": 256, "spatial": 64, "weight_mb": 8.0, "gflops": 0.54, "ms_per_eval": 0.365, "tflops": 1.47}, + {"depth": 128, "channels": 256, "spatial": 64, "weight_mb": 16.0, "gflops": 1.07, "ms_per_eval": 0.454, "tflops": 2.37}, + {"depth": 256, "channels": 256, "spatial": 64, "weight_mb": 32.0, "gflops": 2.15, 
"ms_per_eval": 0.351, "tflops": 6.11}, + {"depth": 64, "channels": 384, "spatial": 64, "weight_mb": 18.0, "gflops": 1.21, "ms_per_eval": 0.429, "tflops": 2.82}, + {"depth": 128, "channels": 384, "spatial": 64, "weight_mb": 36.0, "gflops": 2.42, "ms_per_eval": 0.354, "tflops": 6.82} + ], + "training_cpu_classifier": { + "ms_per_step": 72.4, + "ane_tflops_sustained": 1.29, + "total_tflops": 2.41, + "ane_util_pct": 8.1, + "compile_pct": 79.7, + "train_pct": 16.4 + }, + "training_ane_classifier": { + "ms_per_step": 62.9, + "ane_tflops_sustained": 1.68, + "total_tflops": 2.77, + "ane_util_pct": 10.6, + "compile_pct": 84.5, + "train_pct": 12.5 + } + }, + "summary": { + "peak_tflops": 11.14, + "sram_peak_efficiency_gflops_per_mb": 282.6, + "sram_spill_start_channels": 4096, + "training_ms_per_step_cpu": 72.4, + "training_ms_per_step_ane": 62.9, + "training_ane_tflops": 1.68, + "training_ane_util_pct": 10.6 + } +} diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md new file mode 100644 index 0000000..876eddd --- /dev/null +++ b/docs/API_REFERENCE.md @@ -0,0 +1,429 @@ +# ANE Training -- API Reference + +Complete function index for all public functions, structs, and macros organized by source file. + +--- + +## Table of Contents + +1. [stories_config.h -- Model Configuration](#stories_configh) +2. [stories_io.h -- IOSurface I/O and Compilation](#stories_ioh) +3. [stories_mil.h -- MIL Program Generators](#stories_milh) +4. [stories_cpu_ops.h -- CPU Operations](#stories_cpu_opsh) +5. [ane_runtime.h -- Generalized ANE Wrapper](#ane_runtimeh) +6. [ane_mil_gen.h -- Composable MIL Helpers](#ane_mil_genh) +7. [ane_rmsnorm_bwd.h -- RMSNorm Backward on ANE](#ane_rmsnorm_bwdh) +8. [ane_classifier.h -- Classifier and Softmax on ANE](#ane_classifierh) +9. [bridge/ane_bridge.h -- C Bridge API](#bridgeane_bridgeh) +10. [MIL Operation Reference](#mil-operation-reference) +11. 
[Weight Blob Format](#weight-blob-format) + +--- + +## stories_config.h + +Model constants, data structures, and memory allocation helpers. + +### Macros + +| Macro | Value | Description | +|-------|-------|-------------| +| `DIM` | 768 | Model hidden dimension | +| `HIDDEN` | 2048 | FFN intermediate dimension | +| `HEADS` | 12 | Number of attention heads | +| `HD` | 64 (`DIM/HEADS`) | Per-head dimension | +| `SEQ` | 256 | Sequence length | +| `NLAYERS` | 12 | Number of transformer layers | +| `VOCAB` | 32000 | Vocabulary size | +| `ACCUM_STEPS` | 10 | Gradient accumulation steps per compile batch | +| `MAX_COMPILES` | 100 | ANE compile budget before process restart | +| `KERNELS_PER_LAYER` | 5 | Weight-bearing ANE kernels per layer | +| `TOTAL_WEIGHT_KERNELS` | 60 | Total weight-bearing compiles per batch | +| `SCORE_CH` | 3072 (`HEADS*SEQ`) | Attention score channels for SDPA backward | +| `WQ_SZ` | 589824 (`DIM*DIM`) | Size of Q/K/V/O projection weight matrices | +| `WO_SZ` | 589824 (`DIM*DIM`) | Size of output projection | +| `W1_SZ` | 1572864 (`HIDDEN*DIM`) | FFN gate/value projection size | +| `W2_SZ` | 1572864 (`DIM*HIDDEN`) | FFN down-projection size | +| `W3_SZ` | 1572864 (`HIDDEN*DIM`) | FFN value projection size | +| `LAYER_PARAMS` | -- | Total floats per layer: `4*WQ_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM` | +| `TOTAL_PARAMS` | -- | Total model params: `NLAYERS * LAYER_PARAMS + DIM + VOCAB*DIM` | + +### Structs + +#### `LayerWeights` +Per-layer weight matrices (all `float*`). + +| Field | Shape | Description | +|-------|-------|-------------| +| `Wq`, `Wk`, `Wv`, `Wo` | `[DIM, DIM]` | Attention projection weights | +| `W1`, `W3` | `[HIDDEN, DIM]` | FFN gate and value up-projections | +| `W2` | `[DIM, HIDDEN]` | FFN down-projection | +| `rms_att` | `[DIM]` | RMSNorm scale for attention sublayer | +| `rms_ffn` | `[DIM]` | RMSNorm scale for FFN sublayer | + +#### `AdamState` +First/second moment buffers for a single parameter group. 
+ +| Field | Type | Description | +|-------|------|-------------| +| `m` | `float*` | First moment (mean) estimate | +| `v` | `float*` | Second moment (variance) estimate | +| `n` | `size_t` | Number of parameters | + +#### `LayerAdam` +Per-layer Adam optimizer state. Contains one `AdamState` per weight matrix: `Wq`, `Wk`, `Wv`, `Wo`, `W1`, `W2`, `W3`, `rms_att`, `rms_ffn`. + +#### `LayerActs` +Per-layer activation tensors saved for the backward pass. + +| Field | Shape | Description | +|-------|-------|-------------| +| `layer_in` | `[DIM, SEQ]` | Input to this layer (for rmsnorm1 backward) | +| `xnorm` | `[DIM, SEQ]` | RMSNorm1 output | +| `Q`, `K`, `V` | `[DIM, SEQ]` | QKV projections | +| `attn_out` | `[DIM, SEQ]` | Attention output (before Wo) | +| `o_out` | `[DIM, SEQ]` | Wo projection output | +| `x2` | `[DIM, SEQ]` | Residual after attention | +| `x2norm` | `[DIM, SEQ]` | RMSNorm2 output | +| `h1`, `h3` | `[HIDDEN, SEQ]` | FFN intermediates (W1 and W3 outputs) | +| `silu_out` | `[HIDDEN, SEQ]` | SiLU(h1) * h3 gated output | +| `ffn_out` | `[DIM, SEQ]` | FFN final output | + +#### `LayerGrads` +Per-layer gradient accumulators. Same field names as `LayerWeights` (all `float*`): `Wq`, `Wk`, `Wv`, `Wo`, `W1`, `W2`, `W3`, `rms_att`, `rms_ffn`. + +#### `Kern` +Single ANE kernel handle (stories-specific, single I/O). + +| Field | Type | Description | +|-------|------|-------------| +| `model` | `void*` | Retained `_ANEInMemoryModel` | +| `ioIn` | `IOSurfaceRef` | Input IOSurface | +| `ioOut` | `IOSurfaceRef` | Output IOSurface | +| `request` | `void*` | Retained `_ANERequest` | +| `tmpDir` | `void*` | Retained temp directory path | + +#### `LayerKernels` +ANE kernels for one transformer layer. 
+ +| Field | Type | Description | +|-------|------|-------------| +| `fwdAttn` | `Kern*` | SDPA forward + taps | +| `fwdFFN` | `Kern*` | FFN forward + taps | +| `ffnBwd` | `Kern*` | FFN backward | +| `sdpaBwd1` | `Kern*` | SDPA backward part 1 (Wo^T + dV + scores) | +| `sdpaBwd2` | `Kern*` | SDPA backward part 2 (dQ + dK) | +| `qkvBwd` | `Kern*` | QKV backward (Wq^T, Wk^T, Wv^T) | + +#### `CkptHdr` +Checkpoint file header (128 bytes, version 2). + +| Field | Type | Description | +|-------|------|-------------| +| `magic` | `int` | `0x424C5A54` ("BLZT") | +| `version` | `int` | 2 | +| `step`, `total_steps` | `int` | Training progress | +| `n_layers`, `vocab_size`, `dim`, `hidden_dim`, `n_heads`, `seq_len` | `int` | Model shape | +| `lr`, `loss` | `float` | Learning rate, last loss | +| `cum_compile`, `cum_train`, `cum_wall` | `double` | Cumulative timing (ms) | +| `cum_steps`, `cum_batches` | `int` | Cumulative counters | +| `adam_t` | `int` | Adam timestep (for bias correction) | +| `pad[3]` | `int` | Alignment padding | + +#### `Llama2Config` +Header from llama2.c model files (7 ints): `dim`, `hidden_dim`, `n_layers`, `n_heads`, `n_kv_heads`, `vocab_size`, `seq_len`. 
+ +### Global Variables + +| Name | Type | Description | +|------|------|-------------| +| `g_D` | `Class` | `_ANEInMemoryModelDescriptor` ObjC class | +| `g_I` | `Class` | `_ANEInMemoryModel` ObjC class | +| `g_AR` | `Class` | `_ANERequest` ObjC class | +| `g_AIO` | `Class` | `_ANEIOSurfaceObject` ObjC class | +| `g_tb` | `mach_timebase_info_data_t` | Mach time base for timing | +| `g_compile_count` | `int` | Running count of ANE compiles | + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `ane_init(void)` | `void` | Load AppleNeuralEngine.framework, resolve 4 private class references | +| `tb_ms(uint64_t t)` | `double` | Convert Mach absolute time to milliseconds | +| `adam_alloc(size_t n)` | `AdamState` | Allocate zeroed first/second moment buffers for n parameters | +| `adam_free(AdamState *s)` | `void` | Free an AdamState's buffers | +| `layer_weights_alloc(void)` | `LayerWeights` | Allocate all weight matrices for one layer | +| `layer_weights_free(LayerWeights *w)` | `void` | Free all weight matrices for one layer | +| `layer_adam_alloc(void)` | `LayerAdam` | Allocate Adam state for all weights in one layer | +| `layer_adam_free(LayerAdam *a)` | `void` | Free Adam state for one layer | +| `layer_acts_alloc(void)` | `LayerActs` | Allocate all activation buffers for one layer | +| `layer_acts_free(LayerActs *a)` | `void` | Free all activation buffers for one layer | +| `layer_grads_alloc(void)` | `LayerGrads` | Allocate zeroed gradient accumulators for one layer | +| `layer_grads_zero(LayerGrads *g)` | `void` | Zero all gradient accumulators (between accumulation steps) | +| `layer_grads_free(LayerGrads *g)` | `void` | Free gradient accumulators for one layer | + +--- + +## stories_io.h + +IOSurface creation, fp16/fp32 conversion, weight blob building, and ANE kernel compile/run. 
+ +**Depends on**: `stories_config.h`, `<IOSurface/IOSurface.h>` + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `make_surface(size_t bytes)` | `IOSurfaceRef` | Create a 1D IOSurface with given byte allocation | +| `build_blob(const float *w, int rows, int cols)` | `NSData*` | Build fp16 weight blob (128B header + row-major fp16 data) from fp32 weights | +| `build_blob_t(const float *w, int rows, int cols)` | `NSData*` | Build fp16 weight blob with transposed layout (col-major fp16 from row-major fp32) | +| `build_blob_fp16(_Float16 *d, int cnt)` | `NSData*` | Build weight blob from pre-existing fp16 data (no conversion) | +| `cvt_f16_f32(float *dst, const _Float16 *src, int n)` | `void` | NEON-vectorized fp16-to-fp32 conversion (8-wide SIMD) | +| `cvt_f32_f16(_Float16 *dst, const float *src, int n)` | `void` | NEON-vectorized fp32-to-fp16 conversion (8-wide SIMD) | +| `io_write_fp16(IOSurfaceRef s, const float *data, int channels, int sp)` | `void` | Write fp32 data to IOSurface as fp16 in channel-first `[C,S]` layout | +| `io_read_fp16(IOSurfaceRef s, float *data, int ch_off, int channels, int sp)` | `void` | Read fp16 data from IOSurface at channel offset, convert to fp32 | +| `io_copy(IOSurfaceRef dst, int dst_ch, IOSurfaceRef src, int src_ch, int channels, int sp)` | `void` | Copy fp16 data between IOSurfaces at specified channel offsets | +| `io_write_fp16_at(IOSurfaceRef s, int ch_off, const float *data, int channels, int sp)` | `void` | Write fp32 data to IOSurface at specific channel offset as fp16 | +| `compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_bytes, int oc_bytes)` | `Kern*` | Compile MIL text + weight dictionary into a loaded ANE kernel with IOSurfaces. Increments `g_compile_count`. 
| +| `free_kern(Kern *k)` | `void` | Unload ANE model, release IOSurfaces, remove temp directory, free kernel | +| `ane_run(Kern *k)` | `void` | Run a compiled ANE kernel on current IOSurface contents | + +--- + +## stories_mil.h + +MIL program generators for the 6 fused ANE kernel types. Each returns an `NSString*` containing the full MIL program text. + +**Depends on**: `stories_io.h` + +### Macros + +| Macro | Description | +|-------|-------------| +| `MIL_HDR` | Standard MIL program header (version 1.3, buildInfo with coremlc/coremltools versions) | +| `CONV_CONST` | Common conv parameter constants (pad_type, strides, pad, dilations, groups) | + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `gen_sdpa_fwd_taps(void)` | `NSString*` | SDPA forward: RMSNorm + QKV + attention + Wo. Output: `concat(o_out, Q, K, V, attn_out, xnorm)` `[1, 6*DIM, 1, SEQ]` | +| `gen_ffn_fwd_taps(void)` | `NSString*` | FFN forward: RMSNorm + W1/W3 + SiLU + W2. Output: `concat(ffn_out, h1, h3, silu_out, x2norm)` `[1, 2*DIM+3*HIDDEN, 1, SEQ]` | +| `gen_ffn_bwd(void)` | `NSString*` | FFN backward: Input `concat(dffn, h1, h3)`. Output: `concat(dx, dh1, dh3)` `[1, DIM+2*HIDDEN, 1, SEQ]` | +| `gen_qkvb(void)` | `NSString*` | QKV backward: Input `concat(dQ, dK, dV)`. Output: `dx` `[1, DIM, 1, SEQ]` | +| `gen_sdpa_bwd1(void)` | `NSString*` | SDPA backward part 1: Input `concat(Q, K, V, dx2)`. Output: `concat(dV, probs, dP)` `[1, DIM+2*SCORE_CH, 1, SEQ]` | +| `gen_sdpa_bwd2(void)` | `NSString*` | SDPA backward part 2: Input `concat(probs, dP, Q, K)`. Output: `concat(dQ, dK)` `[1, 2*DIM, 1, SEQ]` | +| `get_mask_blob(void)` | `NSData*` | Lazily build and cache causal attention mask as fp16 blob. Lower-triangular 0, upper -65504. 
| + +### Global Variables + +| Name | Type | Description | +|------|------|-------------| +| `g_mask_blob` | `NSData*` | Cached causal mask blob (built on first call to `get_mask_blob`) | + +--- + +## stories_cpu_ops.h + +CPU-side operations using Accelerate framework (vDSP, vvrsqrtf, vvexpf). + +**Depends on**: `stories_config.h` + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `rmsnorm(float *out, const float *x, const float *w, int d, int S)` | `void` | RMSNorm forward: `out = x * rsqrt(mean(x^2) + eps) * w`. Vectorized via vDSP. Layout: channel-first `[d, S]`. | +| `rmsnorm_bwd(float *dx, float *dw, const float *dy, const float *x, const float *w, int d, int S)` | `void` | RMSNorm backward: computes `dx` (input gradient) and accumulates `dw` (scale gradient). | +| `adam_update(float *w, const float *g, AdamState *s, int t, float lr, float b1, float b2, float eps)` | `void` | Adam optimizer step with bias correction. Updates weights in-place. `t` is the timestep for bias correction. | +| `cross_entropy_loss(float *dlogits, const float *logits, const uint16_t *targets, int V, int S)` | `float` | Compute mean cross-entropy loss. Writes `dlogits = (softmax(logits) - one_hot(targets)) / S`. Column-major `[V, S]` layout. Uses vDSP transpose + vvexpf for vectorized softmax. | +| `embed_lookup(float *x, const float *embed, const uint16_t *tokens, int dim, int seq)` | `void` | Embedding forward: gather rows from `embed[VOCAB, DIM]` into channel-first `x[DIM, SEQ]`. | +| `embed_backward(float *d_embed, const float *dx, const uint16_t *tokens, int dim, int seq)` | `void` | Embedding backward: scatter-add `dx` back into embedding table gradient `d_embed`. | + +### Global Variables + +| Name | Type | Description | +|------|------|-------------| +| `g_rms_tmp` | `float*` | Lazily-allocated scratch buffer for RMSNorm (size SEQ) | + +--- + +## ane_runtime.h + +Generalized ANE wrapper with multi-input/output support. 
Used in bridge, tests, and newer training variants. + +### Structs + +#### `ANEKernel` +Generalized kernel handle supporting multiple inputs and outputs. + +| Field | Type | Description | +|-------|------|-------------| +| `model` | `id` | `_ANEInMemoryModel` instance | +| `ioInputs` | `IOSurfaceRef*` | Array of input IOSurfaces | +| `ioOutputs` | `IOSurfaceRef*` | Array of output IOSurfaces | +| `request` | `id` | `_ANERequest` instance | +| `tmpDir` | `NSString*` | Temp directory for MIL/weights on disk | +| `nInputs`, `nOutputs` | `int` | Number of I/O tensors | +| `inputBytes`, `outputBytes` | `size_t*` | Byte sizes for each I/O tensor | + +### Global Variables + +| Name | Type | Description | +|------|------|-------------| +| `g_ANEDesc` | `Class` | `_ANEInMemoryModelDescriptor` | +| `g_ANEInMem` | `Class` | `_ANEInMemoryModel` | +| `g_ANEReq` | `Class` | `_ANERequest` | +| `g_ANEIO` | `Class` | `_ANEIOSurfaceObject` | +| `g_ane_loaded` | `bool` | Guard to avoid re-loading the framework | + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `ane_init(void)` | `void` | Load AppleNeuralEngine.framework (idempotent), resolve 4 private ObjC classes | +| `ane_create_surface(size_t bytes)` | `IOSurfaceRef` | Create a 1D IOSurface of given byte size | +| `ane_compile(NSData *milText, NSData *weightData, int nInputs, size_t *inputSizes, int nOutputs, size_t *outputSizes)` | `ANEKernel*` | Full compile pipeline: build descriptor, compile MIL, load model, create IOSurfaces + request. Returns NULL on failure. | +| `ane_write_input(ANEKernel *k, int idx, const void *data, size_t bytes)` | `void` | Write raw bytes to the idx-th input IOSurface (lock/memcpy/unlock) | +| `ane_read_output(ANEKernel *k, int idx, void *data, size_t bytes)` | `void` | Read raw bytes from the idx-th output IOSurface (read-lock/memcpy/unlock) | +| `ane_run_kernel(ANEKernel *k)` | `bool` | Run the compiled ANE kernel. Returns true on success. 
| +| `ane_free(ANEKernel *k)` | `void` | Unload model, release all IOSurfaces, remove temp dir, free struct | + +--- + +## ane_mil_gen.h + +Composable MIL generation helpers for common patterns, plus weight blob builders. + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `mil_build_weight_blob(const float *w, int out_ch, int in_ch)` | `NSData*` | Build fp16 weight blob with 128B header from fp32 row-major `[out_ch, in_ch]` weights | +| `mil_gen_matmul(int in_ch, int out_ch, int spatial)` | `NSString*` | Generate MIL for matmul `y = W @ x` with both as runtime inputs. Includes fp32-to-fp16-to-fp32 casts. | +| `mil_gen_conv(int in_ch, int out_ch, int spatial)` | `NSString*` | Generate MIL for conv-based linear with baked weights from blob file (inference-only) | +| `mil_gen_qkv(int dim, int spatial)` | `NSString*` | Generate MIL for fused QKV: 3 parallel convs from single input, weights from concatenated blob | +| `mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim)` | `NSData*` | Build concatenated weight blob for fused QKV (3 chunks, each with 64B header + fp16 data) | +| `mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim)` | `NSData*` | Build concatenated weight blob for fused FFN up-projection (W1 + W3 chunks) | +| `mil_gen_ffn_up(int dim, int hidden_dim, int spatial)` | `NSString*` | Generate MIL for fused FFN up: W1 + W3 parallel convs, outputs h1 and h3 | + +--- + +## ane_rmsnorm_bwd.h + +MIL generator for RMSNorm backward on ANE (used by `train_large_ane.m`). + +**Depends on**: `stories_mil.h` + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `gen_rmsnorm_bwd(void)` | `NSString*` | Generate MIL for RMSNorm backward. Input: `concat(dy, x)` as `[1, 2*DIM, 1, SEQ]`. Baked weight: RMSNorm scale `w[DIM]`. Output: `dx` as `[1, DIM, 1, SEQ]`. Note: `dw` (weight gradient) stays on CPU. 
| + +--- + +## ane_classifier.h + +MIL generators for classifier operations on ANE (used by `train_large_ane.m`). + +**Depends on**: `stories_mil.h` + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `gen_classifier_fwd(void)` | `NSString*` | Classifier forward: single 32000-output-channel conv. Input: `[1, DIM, 1, SEQ]`. Baked: embedding weights `[VOCAB, DIM, 1, 1]`. Output: `[1, VOCAB, 1, SEQ]`. | +| `gen_classifier_bwd(void)` | `NSString*` | Classifier backward: `dx = embed^T @ dlogits`. Uses `matmul` op (not conv, since ANE rejects conv with 32000 input channels). Input: `[1, VOCAB, 1, SEQ]`. Baked: `embed^T [1, DIM, VOCAB]`. Output: `[1, DIM, 1, SEQ]`. | +| `gen_softmax_vocab(void)` | `NSString*` | Softmax over VOCAB dimension: `softmax(x, axis=1)`. Input: `[1, VOCAB, 1, SEQ]`. Output: `[1, VOCAB, 1, SEQ]`. | +| `gen_final_rmsnorm(void)` | `NSString*` | Final RMSNorm (standalone, not fused). Input: `[1, DIM, 1, SEQ]`. Baked: `rms_final[DIM]`. Output: `[1, DIM, 1, SEQ]`. | + +--- + +## bridge/ane_bridge.h + +C-callable bridge to ANE private APIs for Python ctypes integration. + +### Types + +| Type | Description | +|------|-------------| +| `ANEKernelHandle` | Opaque kernel handle (pointer to internal struct) | + +### Functions + +| Function | Returns | Description | +|----------|---------|-------------| +| `ane_bridge_init(void)` | `int` | Initialize ANE runtime (load private framework, resolve classes). Returns 0 on success, -1 on failure. | +| `ane_bridge_compile(const char *mil_text, size_t mil_len, const uint8_t *weight_data, size_t weight_len, int n_inputs, const size_t *input_sizes, int n_outputs, const size_t *output_sizes)` | `ANEKernelHandle*` | Compile MIL text + single weight blob into ANE kernel. Returns NULL on failure. 
| +| `ane_bridge_compile_multi_weights(const char *mil_text, size_t mil_len, const char **weight_names, const uint8_t **weight_datas, const size_t *weight_lens, int n_weights, int n_inputs, const size_t *input_sizes, int n_outputs, const size_t *output_sizes)` | `ANEKernelHandle*` | Compile MIL text + multiple named weight files. Weight names use `@model_path/` prefix convention. | +| `ane_bridge_run(ANEKernelHandle *kernel)` | `bool` | Execute a compiled kernel on ANE. Returns true on success. | +| `ane_bridge_write_input(ANEKernelHandle *kernel, int idx, const void *data, size_t bytes)` | `void` | Write data to kernel input IOSurface at index `idx` | +| `ane_bridge_read_output(ANEKernelHandle *kernel, int idx, void *data, size_t bytes)` | `void` | Read data from kernel output IOSurface at index `idx` | +| `ane_bridge_free(ANEKernelHandle *kernel)` | `void` | Unload model, release all IOSurfaces, remove temp dir, free handle | +| `ane_bridge_get_compile_count(void)` | `int` | Get current compile count (for restart budgeting) | +| `ane_bridge_reset_compile_count(void)` | `void` | Reset compile count to zero | +| `ane_bridge_build_weight_blob(const float *src, int rows, int cols, size_t *out_len)` | `uint8_t*` | Build weight blob in ANE format (128B header + fp16). Caller must free via `ane_bridge_free_blob()`. | +| `ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols, size_t *out_len)` | `uint8_t*` | Build transposed weight blob. Caller must free via `ane_bridge_free_blob()`. | +| `ane_bridge_free_blob(void *ptr)` | `void` | Free a blob allocated by `ane_bridge_build_weight_blob*` | + +--- + +## MIL Operation Reference + +All MIL programs target `ios18` and use fp16 tensors in `[1, C, 1, S]` layout (or `[1, H, S, S]` for attention scores). 
+ +| Operation | MIL Syntax | Purpose | +|-----------|-----------|---------| +| `conv` | `conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=xn)` | Linear projections (all Wq, Wk, Wv, Wo, W1, W2, W3). 1x1 conv = matmul. Weight shape: `[out_ch, in_ch, 1, 1]`. | +| `matmul` | `matmul(transpose_x=tx, transpose_y=ty, x=a, y=b)` | Attention score computation (Q at K^T, scores at V, classifier backward). | +| `softmax` | `softmax(axis=ax, x=ms)` | Attention weight normalization (`axis=-1`) and vocab softmax (`axis=1`). | +| `mul` | `mul(x=a, y=b)` | Element-wise multiply: RMSNorm scaling, SiLU gating, attention scaling, softmax Jacobian. | +| `add` | `add(x=a, y=b)` | Causal mask application, SiLU derivative `(1 + h*(1-sig))`, gradient accumulation. | +| `sub` | `sub(x=a, y=b)` | SiLU derivative: `1 - sigmoid(h1)`, softmax backward: `dp - sum(P*dP)`. | +| `sigmoid` | `sigmoid(x=h1)` | SiLU activation component (SiLU = x * sigmoid(x)). | +| `pow` | `pow(x=ss3, y=nhalf)` | RMSNorm: `x^(-0.5)` = reciprocal sqrt. | +| `reduce_sum` | `reduce_sum(x=sq, axes=rax, keep_dims=kd)` | RMSNorm: sum of squares along channel dim. Softmax backward: row-wise dot product. | +| `reshape` | `reshape(shape=sh, x=xf)` | `[1,DIM,1,SEQ]` to `[1,HEADS,HD,SEQ]` for multi-head attention. Flatten attention scores. | +| `transpose` | `transpose(perm=pm, x=q4)` | Permute `[0,1,3,2]`: swap spatial and head_dim for matmul compatibility. | +| `concat` | `concat(axis=cax, interleave=cid, values=(a,b,c))` | Pack multiple outputs into single IOSurface ("taps"). Always `axis=1`, `interleave=false`. | +| `slice_by_size` | `slice_by_size(x=x, begin=b, size=sz)` | Split concatenated inputs in backward kernels. `begin=[0,offset,0,0]`, `size=[1,channels,1,SEQ]`. | +| `cast` | `cast(dtype=to_fp16, x=x)` | fp32-to-fp16 or fp16-to-fp32 precision conversion (used in ane_mil_gen.h generators). 
| +| `const` | `const()[name=..., val=...]` | Declare scalar/tensor constants, conv parameters, weight blob references via `BLOBFILE`. | + +--- + +## Weight Blob Format + +### Single-weight blob (128 bytes header + data) + +``` +Offset Size Content +------ ----- ------- +0 1 0x01 (format marker) +4 1 0x02 (format marker) +5-63 59 zeros (global header padding) +64 4 0xDEADBEEF (chunk magic, little-endian: EF BE AD DE) +68 1 0x01 (chunk marker) +72 4 uint32 data_size (total fp16 bytes = out_ch * in_ch * 2) +80 4 uint32 data_offset (always 128 = 64 global + 64 chunk) +84-127 44 zeros (chunk header padding) +128+ N fp16 weight data, row-major [out_ch, in_ch] +``` + +### Multi-weight blob (fused QKV, FFN up) + +``` +Offset Content +------ ------- +0-63 Global header (same as above) +64 Chunk 0 header (64 bytes): magic, data_size, data_offset +64+64 Chunk 0 data (fp16 weights) +64+cs Chunk 1 header (64 bytes) +64+cs+64 Chunk 1 data (fp16 weights) +... +``` + +Where `cs = 64 + n_elements * 2` (chunk header size + data size). + +MIL references use `BLOBFILE(path="@model_path/weights/name.bin", offset=uint64(X))` where X is the chunk header offset within the file (64 for first chunk, 64+cs for second, etc.). diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..682edae --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,370 @@ +# ANE Training -- System Architecture + +Training neural networks directly on Apple's Neural Engine via reverse-engineered private APIs (`_ANEClient`, `_ANECompiler`). No CoreML training APIs, no Metal, no GPU. 
+ +## Project Structure + +``` +ANE/ ++-- api_exploration.m # ANE private API discovery ++-- inmem_basic.m # In-memory MIL compilation proof-of-concept ++-- inmem_bench.m # ANE dispatch latency across model sizes ++-- inmem_peak.m # Peak TFLOPS via deep conv chains (self-contained) ++-- sram_bench.m # SRAM capacity probing (performance cliff detection) ++-- sram_probe.m # Fine-grained SRAM size exploration ++-- bridge/ +| +-- ane_bridge.h # C-callable API for Python ctypes +| +-- ane_bridge.m # Bridge implementation +| +-- Makefile # Builds libane_bridge.dylib +| +-- libane_bridge.dylib # Pre-built shared library ++-- training/ +| +-- train_large.m # Main: 12-layer training (CPU classifier) +| +-- train_large_ane.m # Variant: classifier + softmax on ANE +| +-- stories_config.h # Model constants, structs, alloc helpers +| +-- stories_io.h # IOSurface I/O, NEON fp16, compile/run +| +-- stories_mil.h # MIL generators for 6 fused ANE kernels +| +-- stories_cpu_ops.h # vDSP RMSNorm, cross-entropy, Adam, embedding +| +-- ane_runtime.h # Generalized ANE wrapper (multi-I/O) +| +-- ane_mil_gen.h # Composable MIL helpers (conv, matmul, fused QKV) +| +-- ane_rmsnorm_bwd.h # RMSNorm backward MIL (train_large_ane only) +| +-- ane_classifier.h # Classifier/softmax MIL (train_large_ane only) +| +-- forward.h # Gen1 forward pass (per-linear-kernel, all-CPU) +| +-- backward.h # Gen1 backward pass (all-CPU reference) +| +-- model.h # Gen1 Model struct, per-kernel compile +| +-- dashboard.py # TUI monitoring (loss, power, text generation) +| +-- tokenize.py # Extract pretokenized TinyStories data +| +-- download_data.sh # Download TinyStories from HuggingFace +| +-- Makefile # Build targets for training + tests +| +-- test_*.m # 12 unit test files ++-- docs/ # This documentation ++-- scripts/ # Automation scripts +``` + +## Two Generations of Training Code + +### Gen1: `model.h` + `forward.h` + `backward.h` + +The original correctness reference. 
One ANE kernel per linear projection (7 per layer + 1 classifier = 85 kernels total). Forward and backward are sequential all-CPU operations with optional ANE for the matmuls. No kernel fusion, no async overlap. Used for verifying Gen2's fused kernels produce correct results. + +### Gen2: `train_large.m` + `stories_*.h` (production) + +The performance-optimized system. Uses **5 fused ANE kernels per layer** (each performing multiple operations in a single dispatch). Weight gradients (`dW`) run asynchronously on CPU via GCD to overlap with ANE. All data is channel-first `[C, S]` fp16 on IOSurfaces. + +The rest of this document describes Gen2. + +--- + +## Model Configuration + +Stories110M -- a Llama2-architecture transformer: + +| Parameter | Value | Macro | +|-----------|-------|-------| +| Hidden dimension | 768 | `DIM` | +| FFN intermediate | 2048 | `HIDDEN` | +| Attention heads | 12 | `HEADS` | +| Head dimension | 64 | `HD` | +| Sequence length | 256 | `SEQ` | +| Layers | 12 | `NLAYERS` | +| Vocabulary | 32000 | `VOCAB` | +| Total parameters | 109.53M | `TOTAL_PARAMS` | +| Accumulation steps | 10 | `ACCUM_STEPS` | +| Max ANE compiles | 100 | `MAX_COMPILES` | + +--- + +## ANE Kernel Fusion Map + +Each training step dispatches 6 kernel types per layer. 5 are weight-bearing (recompiled each batch), 1 is weight-free (compiled once). 
+ +| Kernel | Generator | Fused Operations | Baked Weights | Input Shape | Output Shape | +|--------|-----------|-----------------|---------------|-------------|--------------| +| `fwdAttn` | `gen_sdpa_fwd_taps()` | RMSNorm1, Wq/Wk/Wv conv, reshape, transpose, Q at K^T matmul, scale, causal mask, softmax, scores at V matmul, Wo conv | rms_att, Wq, Wk, Wv, Wo, mask | `[1,DIM,1,SEQ]` | `[1,6*DIM,1,SEQ]` | +| `fwdFFN` | `gen_ffn_fwd_taps()` | RMSNorm2, W1/W3 conv, sigmoid, SiLU gating, W2 conv | rms_ffn, W1, W3, W2 | `[1,DIM,1,SEQ]` | `[1,2D+3H,1,SEQ]` | +| `ffnBwd` | `gen_ffn_bwd()` | W2^T conv, SiLU derivative, W1^T/W3^T conv, add | W2^T, W1^T, W3^T | `[1,D+2H,1,SEQ]` | `[1,D+2H,1,SEQ]` | +| `sdpaBwd1` | `gen_sdpa_bwd1()` | Wo^T conv, reshape, Q at K^T recompute, softmax, dV matmul, dP matmul | Wo^T, mask | `[1,4*DIM,1,SEQ]` | `[1,D+2*SC,1,SEQ]` | +| `sdpaBwd2` | `gen_sdpa_bwd2()` | softmax Jacobian, scale, dQ=dS at K matmul, dK=dS^T at Q matmul | _(none)_ | `[1,2SC+2D,1,SEQ]` | `[1,2*DIM,1,SEQ]` | +| `qkvBwd` | `gen_qkvb()` | Wq^T/Wk^T/Wv^T conv, sum | Wq^T, Wk^T, Wv^T | `[1,3*DIM,1,SEQ]` | `[1,DIM,1,SEQ]` | + +Where D=DIM=768, H=HIDDEN=2048, SC=SCORE_CH=HEADS*SEQ=3072. + +"Taps" in forward kernels: intermediate values (Q, K, V, attention output, norms) are concatenated onto the output via `concat(axis=1)` so backward kernels can read them without CPU recomputation. 
+ +--- + +## CPU vs ANE Operation Split + +| Operation | Location | Reason | +|-----------|----------|--------| +| Embedding lookup/backward | CPU | Scatter/gather by token index | +| RMSNorm forward | ANE | Fused into fwdAttn/fwdFFN kernels | +| QKV projections | ANE | 1x1 conv = matmul | +| Multi-head attention (SDPA) | ANE | Decomposed Q at K^T + mask + softmax + scores at V | +| FFN (SwiGLU) | ANE | W1,W3 conv + sigmoid + gate + W2 conv | +| Residual connections | CPU | Simple `vDSP_vadd` | +| Final RMSNorm | CPU (or ANE in `_ane` variant) | Standalone, not fused with other ops | +| Classifier matmul | CPU cblas (or ANE in `_ane` variant) | `[VOCAB,DIM] x [DIM,SEQ]` | +| Cross-entropy + softmax | CPU (partially ANE in `_ane`) | Target indexing requires CPU | +| dW weight gradients | CPU (async cblas) | Outer products, independent of backward data flow | +| RMSNorm backward | CPU (or ANE in `_ane` variant) | vDSP vectorized | +| Adam optimizer | CPU | In-place weight mutation | + +--- + +## Training Step Swim-Lane Diagram + +One complete training step showing CPU, ANE, and async GCD operations interleaved: + +```mermaid +sequenceDiagram + participant CPU + participant ANE + participant GCD as GCD Async Queue + + Note over CPU: FORWARD PASS (per layer L=0..11) + + CPU->>CPU: embed_lookup(tokens to x_cur) + + loop Layer L = 0..11 + CPU->>CPU: wait for prior async dW + CPU->>CPU: save layer_in, write fp16 to IOSurface + CPU->>ANE: run fwdAttn kernel + ANE-->>CPU: concat(o_out, Q, K, V, attn_out, xnorm) + CPU->>CPU: read fp16 taps, residual add to x2 + + CPU->>CPU: write fp16 x2 to IOSurface + CPU->>ANE: run fwdFFN kernel + ANE-->>CPU: concat(ffn_out, h1, h3, silu_out, x2norm) + CPU->>CPU: read fp16 taps, residual add to x_cur + end + + Note over CPU: CLASSIFIER + LOSS + CPU->>CPU: rmsnorm(x_cur to x_final) + CPU->>CPU: cblas_sgemm(embed x x_final to logits) + CPU->>CPU: cross_entropy_loss(logits to loss, dlogits) + + Note over CPU: BACKWARD PASS + CPU->>CPU: 
cblas_sgemm(embed^T x dlogits to dy) + CPU->>GCD: async dEmbed += dlogits x x_final^T + CPU->>CPU: rmsnorm_bwd(dy to dx) + + loop Layer L = 11..0 + Note over CPU,GCD: FFN Backward + CPU->>CPU: write dffn + copy h1,h3 from fwd taps + CPU->>ANE: run ffnBwd kernel + ANE-->>CPU: concat(dx_ffn, dh1, dh3) + CPU->>GCD: async dW2, dW1, dW3 accumulation + + Note over CPU,GCD: RMSNorm2 Backward + Residual + CPU->>CPU: rmsnorm_bwd, add residual gradient + + Note over CPU,GCD: SDPA Backward + CPU->>GCD: async dWo accumulation + CPU->>CPU: copy Q,K,V from fwd taps, write dx2 + CPU->>ANE: run sdpaBwd1 kernel + ANE-->>CPU: concat(dV, probs, dP) + + CPU->>CPU: copy probs,dP,Q,K + CPU->>ANE: run sdpaBwd2 kernel + ANE-->>CPU: concat(dQ, dK) + + CPU->>GCD: async dWq, dWk, dWv accumulation + + Note over CPU,GCD: QKV Backward + CPU->>CPU: copy dQ,dK,dV + CPU->>ANE: run qkvBwd kernel + ANE-->>CPU: dx_attn + + Note over CPU,GCD: RMSNorm1 Backward + Residual + CPU->>CPU: rmsnorm_bwd, add both skip gradients + end + + CPU->>CPU: dispatch_group_wait(all async dW) + CPU->>CPU: embed_backward(dy to d_embed) +``` + +--- + +## Async CPU/ANE Overlap Strategy + +The key insight: **dW gradients (weight gradients) are independent of the backward data flow**. They are outer products `dW += dy x x^T` that only accumulate into gradient buffers. The data-path gradients (`dx`) flow backward through the network on ANE. + +``` +Timeline for one backward layer: + ANE: [ffnBwd] [sdpaBwd1] [sdpaBwd2] [qkvBwd] + CPU: [dW_FFN (3x sgemm)] [dWo] [dWqkv (3x sgemm)] +``` + +GCD serial dispatch queue `"dw_cblas"` ensures dW operations don't overlap each other (they share scratch buffers). The `dispatch_group_wait` at the start of each forward layer ensures async dW from the previous step's backward has finished before IOSurfaces are reused. + +--- + +## Compile/Restart Lifecycle + +The ANE runtime leaks resources internally, limiting compiles to ~119 per process. 
The system manages this with checkpoint-and-restart: + +```mermaid +flowchart TD + Start["Process starts (fresh or --resume)"] --> LoadCkpt{"--resume flag?"} + LoadCkpt -->|Yes| Resume["Load checkpoint: weights, Adam state, step counter"] + LoadCkpt -->|No| Init["Xavier init weights, zero Adam state"] + Resume --> CompileCheck + Init --> CompileCheck + + CompileCheck{"g_compile_count + 60 > MAX_COMPILES?"} -->|Yes| SaveCheckpoint["Save checkpoint to ane_stories110M_ckpt.bin"] + SaveCheckpoint --> FreeAll["Free all ANE kernels"] + FreeAll --> RestartProcess["Re-launch process with --resume flag"] + RestartProcess --> Start + + CompileCheck -->|No| Compile["Compile 60 weight-bearing kernels (5 per layer x 12)"] + Compile --> ZeroGrads["Zero gradient accumulators"] + ZeroGrads --> AccumLoop + + subgraph AccumLoop ["Gradient Accumulation (10 steps)"] + SingleStep["Forward + Backward + async dW"] --> MoreSteps{"More accum steps?"} + MoreSteps -->|Yes| SingleStep + end + + MoreSteps -->|No| WaitDW["dispatch_group_wait (all async dW)"] + WaitDW --> ScaleGrad["Scale gradients by 1/ACCUM_STEPS"] + ScaleGrad --> AdamUpdate["Adam update (mutates weights in-place)"] + AdamUpdate --> FreeKernels["Free all weight-bearing kernels"] + FreeKernels --> CompileCheck +``` + +With `MAX_COMPILES=100` and 60 weight-bearing kernels per batch, only **1 batch** (10 accumulation steps) fits per process lifetime. 
The checkpoint preserves: + +- Training step and total_steps +- All weights and Adam (m, v) state per layer +- Cumulative timing statistics +- Adam timestep counter + +--- + +## Data Flow Through One Layer + +Tensor shapes as they flow through forward and backward passes: + +```mermaid +flowchart LR + subgraph fwdAttnKernel ["fwdAttn Kernel (ANE)"] + xIn["x_in\n[1,768,1,256]"] --> RMS1["RMSNorm1"] + RMS1 --> QKVConv["Wq,Wk,Wv conv\n[768,768,1,1]"] + QKVConv --> ReshapeHeads["reshape\n[1,12,64,256]"] + ReshapeHeads --> TransposeHeads["transpose\n[1,12,256,64]"] + TransposeHeads --> QKT["Q x K^T\n[1,12,256,256]"] + QKT --> ScaleMask["scale + mask\n+ softmax"] + ScaleMask --> AV["scores x V\n[1,12,256,64]"] + AV --> ReshapeBackFlat["reshape\n[1,768,1,256]"] + ReshapeBackFlat --> WoConv["Wo conv\n[768,768,1,1]"] + end + + subgraph taps1 ["Taps via concat"] + WoConv --> T1["o_out [768]"] + QKVConv --> T2["Q,K,V [768 each]"] + AV --> T3["attn_out [768]"] + RMS1 --> T4["xnorm [768]"] + end + + subgraph cpuResid1 ["CPU"] + T1 --> ResAdd1["x + o_out = x2"] + end + + subgraph fwdFFNKernel ["fwdFFN Kernel (ANE)"] + ResAdd1 --> RMS2["RMSNorm2"] + RMS2 --> W1W3["W1,W3 conv\n[2048,768,1,1]"] + W1W3 --> SiLUGate["sigmoid + SiLU\n+ gating"] + SiLUGate --> W2Conv["W2 conv\n[768,2048,1,1]"] + end + + subgraph taps2 ["Taps via concat"] + W2Conv --> T5["ffn_out [768]"] + W1W3 --> T6["h1,h3 [2048 each]"] + SiLUGate --> T7["silu_out [2048]"] + RMS2 --> T8["x2norm [768]"] + end + + subgraph cpuResid2 ["CPU"] + T5 --> ResAdd2["x2 + ffn_out = x_next"] + end +``` + +--- + +## IOSurface Memory Layout + +All tensors use channel-first `[1, C, 1, S]` fp16 layout on IOSurfaces, matching ANE's native format: + +``` +IOSurface memory (contiguous fp16): + channel_0: [pos_0, pos_1, ..., pos_255] (256 values) + channel_1: [pos_0, pos_1, ..., pos_255] + ... 
+ channel_767: [pos_0, pos_1, ..., pos_255] +``` + +Fused kernel outputs use `concat(axis=1)` to pack multiple tensors into a single IOSurface: + +``` +fwdAttn output [1, 6*768, 1, 256]: + channels 0-767: o_out (Wo projection output) + channels 768-1535: Q (query projection) + channels 1536-2303: K (key projection) + channels 2304-3071: V (value projection) + channels 3072-3839: attn_out (pre-Wo attention output) + channels 3840-4607: xnorm (RMSNorm1 output) +``` + +CPU reads specific taps via `io_read_fp16(surface, data, ch_offset, n_channels, spatial)`. + +--- + +## Weight Blob Format + +ANE weight blobs follow a binary format with a 128-byte header: + +``` +Offset Size Content +------ ----- ------- +0 1 0x01 (format marker) +4 1 0x02 (format marker) +5-63 59 zeros (padding) +64 4 0xDEADBEEF (chunk magic, little-endian) +68 1 0x01 (chunk marker) +72 4 uint32 data_size (fp16 weight bytes) +80 4 uint32 data_offset (always 128) +84-127 44 zeros (padding) +128+ N fp16 weight data, row-major [out_ch, in_ch] +``` + +Multi-weight blobs (fused QKV, FFN up) concatenate chunks: `[64B global header] [64B chunk0 header] [chunk0 data] [64B chunk1 header] [chunk1 data] ...` + +MIL programs reference weights via `BLOBFILE(path="@model_path/weights/name.bin", offset=uint64(64))` where offset 64 points to the chunk header within the file. 
+ +--- + +## Key Constraints + +| Constraint | Impact | Workaround | +|-----------|--------|------------| +| ~119 compile limit per process | ANE compiler leaks resources | `checkpoint + re-launch with --resume` | +| Weights baked at compile time | Cannot hot-swap weights; must recompile | Gradient accumulation amortizes compile cost | +| SDPA ignores `attn_mask` | Causal attention cannot use native SDPA mask | Decompose into Q at K^T + explicit mask + softmax + scores at V | +| ANE SRAM capacity ~32 MB | Large weight matrices spill to DRAM | Performance cliff above ~3072 channels | +| 32000 input channels rejected | ANE refuses conv with VOCAB input channels | Classifier backward uses `matmul` op with reshape instead of conv | +| fp16 compute only | Precision limited on ANE | fp32 on CPU for loss, Adam; fp16 for ANE forward/backward | + +--- + +## `train_large.m` vs `train_large_ane.m` + +`train_large_ane.m` moves additional operations from CPU to ANE: + +| Operation | `train_large.m` | `train_large_ane.m` | +|-----------|-----------------|---------------------| +| Final RMSNorm | CPU (`rmsnorm()` via vDSP) | ANE (`gen_final_rmsnorm()`) | +| Classifier forward | CPU (`cblas_sgemm`) | ANE (`gen_classifier_fwd()`, 32000-ch conv) | +| Softmax | CPU (inside `cross_entropy_loss()`) | ANE (`gen_softmax_vocab()`) | +| Per-layer RMSNorm backward | CPU (`rmsnorm_bwd()` via vDSP) | ANE (`gen_rmsnorm_bwd()`) | + +This increases compile budget pressure: 86 weight-bearing kernels per batch (vs 60), leaving less headroom within MAX_COMPILES=100. diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md new file mode 100644 index 0000000..2e0a510 --- /dev/null +++ b/docs/BENCHMARKS.md @@ -0,0 +1,253 @@ +# ANE Training -- Benchmarks and Tests Guide + +All benchmarks and tests require **macOS 15+ on Apple Silicon** (tested on M4, M5). 
+ +--- + +## Quick Start + +```bash +# Build and run training benchmark (100 steps) +cd training +make train_large && ./train_large --steps 100 + +# Run the automated benchmark suite +cd .. +bash scripts/run_benchmarks.sh +``` + +--- + +## Training Benchmarks + +### train_large (CPU classifier) + +The main 12-layer Stories110M training loop with classifier on CPU. + +| Item | Details | +|------|---------| +| **Purpose** | Full transformer training benchmark | +| **Measures** | ms/step, ANE TFLOPS, ANE utilization %, per-component timing | +| **Prerequisites** | Training data: `bash download_data.sh` (or runs on random data if absent) | +| **Build** | `cd training && make train_large` | +| **Run** | `./train_large --steps 100` | +| **CLI flags** | `--steps N` (default 10000), `--lr F` (default 3e-4), `--resume` | + +**Expected output:** + +``` +ane=9.6 io=4.1 cls=9.1 elem=14.4 rms=0.1 cblas_wait=2.3 ms/step + +=== Efficiency Report === +Total steps: 100 +Avg train: 107.0 ms/step +ANE TFLOPS: 2.45 sustained +ANE utilization: 15.5% of 15.8 TFLOPS +``` + +### train_large_ane (ANE classifier) + +Same training with classifier, softmax, and RMSNorm backward offloaded to ANE. 
+ +| Item | Details | +|------|---------| +| **Purpose** | Measure ANE-offloaded training (16% faster) | +| **Build** | `cd training && make train_large_ane` | +| **Run** | `./train_large_ane --steps 100` | + +**Compare baseline vs ANE-offloaded:** + +```bash +make train_large && ./train_large --steps 100 +make train_large_ane && ./train_large_ane --steps 100 +``` + +### Dashboard (live monitoring) + +```bash +pip install blessed psutil numpy +sudo python3 dashboard.py # live mode (needs powermetrics) +sudo python3 dashboard.py --resume # attach to resumed training +``` + +| Flag | Description | +|------|-------------| +| `--resume` | Resume from checkpoint | +| `--infinite` | Train indefinitely | +| `--no-powermetrics` | Disable power monitoring | +| `--no-generate` | Disable text generation preview | +| `--steps N` | Total steps (default 10000) | + +--- + +## Root-Level Benchmark Scripts + +All root-level scripts are standalone Objective-C programs. Common build pattern: + +```bash +xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML \ + -framework IOSurface -ldl -o .m +``` + +### inmem_peak.m -- Peak TFLOPS (self-contained) + +**No prerequisites.** Generates MIL and weight blobs programmatically. + +| Item | Details | +|------|---------| +| **Purpose** | Maximum sustained TFLOPS via deep conv chains (32-256 layers deep) | +| **Measures** | ms per run, TFLOPS, % peak across 10 configurations | +| **Prerequisites** | None (self-contained MIL generation) | +| **Build** | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o inmem_peak inmem_peak.m` | +| **Run** | `./inmem_peak` | + +**Expected output:** + +``` +=== Programmatic MIL to In-Memory ANE Peak === + +Config W(MB) GFLOP ms/run TFLOPS %peak +---------------------------------------------------------------------- +32x conv 512ch sp64 16.0 1.07 X.XXX ms Y.YY Z.Z% +64x conv 512ch sp64 32.0 2.15 X.XXX ms Y.YY Z.Z% +... 
+``` + +### inmem_basic.m -- In-Memory Proof-of-Concept + +| Item | Details | +|------|---------| +| **Purpose** | End-to-end test: compile, load, run, benchmark using `_ANEInMemoryModel` | +| **Prerequisites** | Pre-built mlpackage at `/tmp/ane_sram_256ch_64sp.mlpackage` | +| **Build** | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o inmem_basic inmem_basic.m` | +| **Run** | `./inmem_basic` | + +### inmem_bench.m -- Dispatch Latency + +| Item | Details | +|------|---------| +| **Purpose** | ANE dispatch latency across 6 model sizes (256-4096 channels) | +| **Measures** | ms per run, TFLOPS at each configuration | +| **Prerequisites** | Pre-built mlpackages for all 6 configs | +| **Build** | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o inmem_bench inmem_bench.m` | +| **Run** | `./inmem_bench` | + +### sram_bench.m -- SRAM Capacity Probe + +| Item | Details | +|------|---------| +| **Purpose** | Find SRAM capacity by detecting performance cliff at increasing weight sizes | +| **Measures** | ms per run, TFLOPS, weight/activation/total memory at 9 configurations | +| **Prerequisites** | Pre-built mlpackages for 9 configs (256-8192 channels) | +| **Build** | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o sram_bench sram_bench.m` | +| **Run** | `./sram_bench` | + +### sram_probe.m -- Fine-Grained SRAM Exploration + +| Item | Details | +|------|---------| +| **Purpose** | Finer-grained SRAM probe with 13 data points and GFLOPS/MB efficiency | +| **Measures** | ms per run, TFLOPS, GFLOPS/MB with spilling indicators | +| **Prerequisites** | Pre-built mlpackages for 13 configs | +| **Build** | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o sram_probe sram_probe.m` | +| **Run** | `./sram_probe` | + +### api_exploration.m -- API Discovery + +| Item | Details | 
+|------|---------| +| **Purpose** | Explore ANE private API surface (class methods, file structures, internal objects) | +| **Prerequisites** | Pre-built mlpackage at `/tmp/ane_sram_1024ch_64sp.mlpackage` | +| **Build** | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o api_exploration api_exploration.m` | +| **Run** | `./api_exploration` | + +--- + +## Test Files + +### Tests with Makefile targets (cd training/) + +| Test | Build | What It Tests | +|------|-------|---------------| +| `test_rmsnorm_bwd` | `make test_rmsnorm_bwd` | RMSNorm backward on ANE vs CPU reference. PASS: max diff < 0.05, mean < 0.01. Benchmarks 100 runs. | +| `test_classifier` | `make test_classifier` | 4-part: final RMSNorm, classifier forward (32000-ch conv), softmax over VOCAB, classifier backward. | +| `test_weight_reload` | `make test_weight_reload` | Tests if weights can be hot-swapped by overwriting blob files + unload/reload. Key finding: NO, weights are baked. | +| `test_perf_stats` | `make test_perf_stats` | Probes `_ANEPerformanceStats` class methods, properties, and instantiation. Tests perfStats in `_ANERequest`. | +| `test_qos_sweep` | `make test_qos_sweep` | QoS parameter sweep (0-63) across compile, load, run. Finding: no measurable latency difference. | +| `test_ane_advanced` | `make test_ane_advanced` | Probes SharedEvents, weightsBuffer IOSurface, procedureIndex, ChainingRequest. Enumerates all 67 ANE classes. 
| + +Build all probe tests at once: `make probes` + +### Tests without Makefile targets (manual build) + +| Test | Build Command | What It Tests | +|------|---------------|---------------| +| `test_ane_causal_attn` | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl -o test_ane_causal_attn test_ane_causal_attn.m` | Decomposed causal attention: Q at K^T on ANE, mask+softmax on CPU, scores at V on ANE | +| `test_ane_sdpa5` | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl -o test_ane_sdpa5 test_ane_sdpa5.m` | 4 approaches to causal masking with `scaled_dot_product_attention` | +| `test_conv_attn3` | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl -o test_conv_attn3 test_conv_attn3.m` | Grouped conv approach to attention (K,V baked as conv weights) | +| `test_full_fused` | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl -o test_full_fused test_full_fused.m` | Full fused attention + FFN in single MIL dispatch at DIM=768, HEADS=12, SEQ=64 | +| `test_fused_qkv` | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl -o test_fused_qkv test_fused_qkv.m` | Fused QKV (3 convs + concat in one dispatch) vs separate dispatches | +| `test_fused_bwd` | `xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl -o test_fused_bwd test_fused_bwd.m` | Fused backward: slice_by_size + 2 convs + add in one kernel | + +--- + +## Bridge Library + +```bash +cd bridge +make # Build libane_bridge.dylib +make test # Build and link test_bridge +./test_bridge # Run bridge tests +``` + +--- + +## Known Results + +### M4 (from README) + +**Single-layer (dim=768, seq=512):** + +| Optimization | ms/step | ANE utilization | +|---|---|---| +| Baseline (vDSP transpose) | 33.5 | 3.1% | +| Channel-first layout | 20.3 | 5.2% | +| vDSP vectorized RMSNorm | 14.2 | 7.4% | +| GCD async cblas overlap | 11.4 | 9.2% | +| ANE RMSNorm fusion | 11.4 
| 9.2% | +| Wo^T fusion (7 to 6 kernels) | 11.4 | 9.2% | +| Deferred cblas wait | **9.3** | **11.2%** | + +**Full Stories110M (12 layers):** + +| Component | Time (ms/step) | +|-----------|---------------| +| ANE runs | 9.6 | +| IO (fp16 conversion) | 4.1 | +| Classifier (cblas) | 9.1 | +| Cross-entropy + residuals | 14.4 | +| RMSNorm | 0.1 | +| **Total** | **~107** | + +### M5 Probe Results (from m5result.md) + +**Machine**: Apple M5, macOS 26.3, ANE Family H16 (same as M4) + +- **Weight reload**: FAIL -- weights baked at compile time, cannot be overwritten +- **QoS sweep**: All QoS 0-63 work, no measurable latency difference +- **Performance stats**: `_ANEPerformanceStats` class exists, `alloc/init` returns nil (needs factory methods) +- **weightsBuffer IOSurface**: Does NOT override compiled weights +- **ChainingRequest**: Exists with loopback and pipeline support -- most promising for utilization improvement + +--- + +## Timing Metrics Key + +| Metric | What it measures | +|--------|-----------------| +| `ane` | ANE kernel runs (all 6 kernels per layer x 12 layers) | +| `io` | fp16-to-fp32 IOSurface data transfer (NEON conversion) | +| `cls` | Classifier matmul (CPU cblas_sgemm) | +| `elem` | Embedding lookup, residual adds, cross-entropy | +| `rms` | RMSNorm forward/backward (CPU vDSP) | +| `cblas_wait` | Time waiting for async dW gradient sgemms to complete | diff --git a/docs/BENCHMARK_RESULTS.md b/docs/BENCHMARK_RESULTS.md new file mode 100644 index 0000000..2fd815b --- /dev/null +++ b/docs/BENCHMARK_RESULTS.md @@ -0,0 +1,156 @@ +# ANE Benchmark Results: Apple M4 Max + +**Date**: March 3, 2026 +**Machine**: Mac16,5 (MacBook Pro, Apple M4 Max) +**macOS**: 26.2 +**ANE Peak**: 15.8 TFLOPS (theoretical) + +## Training Performance + +### train_large (CPU classifier path) + +| Metric | Value | +|--------|-------| +| Model | Stories110M (12 layers, dim=768, hidden=2048) | +| Kernels | 72 (60 weight-bearing + 12 static sdpaBwd2) | +| Avg step time | 72.4 ms/step | 
+| ANE TFLOPS | 1.29 sustained | +| Total TFLOPS | 2.41 (ANE+CPU) | +| ANE utilization | 8.1% of 15.8 TFLOPS | +| Compile time | 79.7% of wall time | +| Train time | 16.4% of wall time | + +### train_large_ane (ANE-offloaded classifier) + +| Metric | Value | +|--------|-------| +| Model | Stories110M (same as above) | +| Kernels | 99 (86 weight-bearing + 13 static) | +| Avg step time | 62.9 ms/step | +| ANE TFLOPS | 1.68 sustained | +| Total TFLOPS | 2.77 (ANE+CPU) | +| ANE utilization | 10.6% of 15.8 TFLOPS | +| Compile time | 84.5% of wall time | +| Train time | 12.5% of wall time | + +**Step time breakdown (ms/step, ANE classifier path):** + +| Component | Time (ms) | Description | +|-----------|-----------|-------------| +| ane | 10-12 | ANE kernel dispatch + evaluation | +| elem | 12-13 | Elementwise ops (residuals, activations) | +| cls | 5-6 | Classifier forward + backward | +| io | 3-5 | IOSurface data transfers | +| rms | 0.1 | RMSNorm | +| cblas_wait | 0.0 | BLAS sync overhead | + +## Programmatic MIL Peak TFLOPS + +``` +Config W(MB) GFLOP ms/eval TFLOPS +---------------------------------------------------------------------- +32x conv 512ch sp64 16.0 1.07 0.408 ms 2.63 +48x conv 512ch sp64 24.0 1.61 0.262 ms 6.15 +64x conv 512ch sp64 32.0 2.15 0.244 ms 8.80 +96x conv 512ch sp64 48.0 3.22 0.326 ms 9.89 +128x conv 512ch sp64 64.0 4.29 0.385 ms 11.14 +64x conv 256ch sp64 8.0 0.54 0.365 ms 1.47 +128x conv 256ch sp64 16.0 1.07 0.454 ms 2.37 +256x conv 256ch sp64 32.0 2.15 0.351 ms 6.11 +64x conv 384ch sp64 18.0 1.21 0.429 ms 2.82 +128x conv 384ch sp64 36.0 2.42 0.354 ms 6.82 +``` + +**Peak observed: 11.14 TFLOPS** (128x conv 512ch sp64, 64 MB weights) + +## In-Memory ANE Benchmark (via mlpackage) + +``` +Config W (MB) ms/eval TFLOPS +--------------------------------------------- + 256ch x64sp 0.1 0.319 ms 0.03 + 512ch x64sp 0.5 0.357 ms 0.09 +1024ch x64sp 2.0 0.457 ms 0.29 +2048ch x64sp 8.0 0.254 ms 2.11 +3072ch x64sp 18.0 0.389 ms 3.10 +4096ch x64sp 32.0 
1.148 ms 1.87 +``` + +## SRAM Probe Results + +### Coarse Probe (varying channels + spatial) + +``` +Config W (MB) Act(MB) Tot(MB) ms/eval TFLOPS +-------------------------------------------------------------------------- +256ch x 64sp 0.1 0.03 0.2 0.378 ms 0.02 +512ch x 64sp 0.5 0.06 0.6 0.389 ms 0.09 +1024ch x 64sp 2.0 0.12 2.2 0.392 ms 0.34 +2048ch x 64sp 8.0 0.25 8.5 0.218 ms 2.47 +3072ch x 64sp 18.0 0.38 18.8 0.396 ms 3.05 +4096ch x 64sp 32.0 0.50 33.0 1.116 ms 1.92 +5120ch x 64sp 50.0 0.62 51.2 0.767 ms 4.38 +6144ch x 64sp 72.0 0.75 73.5 0.872 ms 5.54 +8192ch x 32sp 128.0 0.50 129.0 4.195 ms 1.02 +``` + +### Fine Probe (spatial=64, weights only) + +``` +Channels W (MB) ms/eval TFLOPS GFLOPS/MB +-------------------------------------------------------------- + 256 ch 0.1 0.378 ms 0.02 177.7 + 512 ch 0.5 0.431 ms 0.08 155.6 + 1024 ch 2.0 0.411 ms 0.33 163.5 + 1536 ch 4.5 0.493 ms 0.61 136.1 + 2048 ch 8.0 0.410 ms 1.31 163.9 + 2560 ch 12.5 0.237 ms 3.53 282.6 <-- peak efficiency + 3072 ch 18.0 0.335 ms 3.60 200.1 + 3584 ch 24.5 0.414 ms 3.97 162.1 + 4096 ch 32.0 1.134 ms 1.89 59.2 <-- spilling + 4608 ch 40.5 0.563 ms 4.83 119.2 + 5120 ch 50.0 0.659 ms 5.09 101.8 + 6144 ch 72.0 0.844 ms 5.73 79.5 <-- spilling + 8192 ch 128.0 4.203 ms 1.02 8.0 <-- catastrophic spilling +``` + +### SRAM Analysis + +The M4 Max ANE SRAM appears to be approximately **24-32 MB**: + +- **Peak efficiency** at 2560ch (12.5 MB weights): 282.6 GFLOPS/MB, 3.53 TFLOPS +- **First spill** at 4096ch (32.0 MB): drops to 59.2 GFLOPS/MB (1.89 TFLOPS) +- **Catastrophic** at 8192ch (128.0 MB): 8.0 GFLOPS/MB (1.02 TFLOPS) + +The 4608ch recovery (4.83 TFLOPS despite 40.5 MB weights) suggests the ANE may use tiling strategies for some weight configurations. + +Training kernels (dim=768, weight matrices ~1.2 MB fp16 each) stay well within the SRAM budget. 
+ +## Known Test Results + +| Test | Status | Notes | +|------|--------|-------| +| test_rmsnorm_bwd | PASS | ANE-accelerated RMSNorm backward | +| test_classifier | PASS | 4 tests passed; ANE backward 3x slower than CPU cblas for matmul | +| test_weight_reload | FAIL (expected) | ANE bakes weights at compile time; IOSurface override doesn't work | +| test_perf_stats | PASS | _ANEPerformanceStats API accessible | +| test_qos_sweep | PASS | QoS parameter has no measurable effect on latency | +| test_ane_advanced | PASS | Advanced ANE operations verified | +| inmem_basic | PASS | In-memory compilation and execution verified | +| inmem_bench | PASS | Multi-config benchmarks via mlpackage | +| inmem_peak | PASS | Peak TFLOPS measurement via programmatic MIL | +| sram_bench | PASS | SRAM capacity probing | +| sram_probe | PASS | Fine-grained SRAM spilling detection | + +## Reproducing + +```bash +cd scripts && bash run_benchmarks.sh +``` + +The benchmark script auto-generates required `.mlpackage` models (needs Python 3.11-3.13 with `coremltools`). + +Override training data paths: +```bash +ANE_MODEL_PATH=/path/to/stories110M.bin ANE_DATA_PATH=/path/to/data.bin ./train_large +``` diff --git a/docs/diaries/001-initial-setup-and-security-audit.md b/docs/diaries/001-initial-setup-and-security-audit.md new file mode 100644 index 0000000..2ee5007 --- /dev/null +++ b/docs/diaries/001-initial-setup-and-security-audit.md @@ -0,0 +1,74 @@ +# Development Diary #001 — Initial Setup & Sicherheitsaudit +**Datum:** 2026-03-02 +**Status:** Abgeschlossen + +## Aufgaben + +### 1. 
Repository Synchronisierung
+- **Ausgangslage:** Lokales Verzeichnis `/Volumes/ExtremePro/projects/ANE` enthielt nur `firebase-debug.log`
+- **Durchgeführt:**
+  ```bash
+  git init
+  git remote add origin https://github.com/maderix/ANE.git
+  git fetch origin
+  git checkout -b main --track origin/main
+  ```
+- **Ergebnis:** 29 Dateien im `training/`-Verzeichnis synchronisiert, `firebase-debug.log` unberührt
+- **Commit-Stand:** HEAD = origin/main (up to date)
+
+### 2. Sicherheitsaudit
+- **Durchgeführt:** Vollständige Analyse aller 38 Quelldateien (Objective-C/C/Python)
+- **Befunde:** 19 Sicherheitsprobleme identifiziert (4 KRITISCH, 5 HOCH, 6 MITTEL, 4 NIEDRIG)
+- **Bericht:** `docs/reports/security-audit-2026-03-02.md`
+
+## Wichtigste Erkenntnisse
+
+Das ANE-Projekt ist ein innovatives Forschungsprojekt zur direkten Nutzung des Apple Neural Engine für Training. Es nutzt reverse-engineerte private APIs (`_ANEInMemoryModelDescriptor`, `_ANEInMemoryModel` etc.) via `dlopen` + `objc_msgSend`.
+
+**Kritischste Befunde:**
+- CRIT-01: `dlopen()` ohne Fehlerbehandlung → stiller Absturz
+- CRIT-03: `fread()` ohne Rückgabewert-Prüfung → uninitialisierter Speicher
+- CRIT-04: Integer Overflow in Blob-Größenberechnung (`int` statt `size_t`)
+
+**Architektur-Highlights (interessant):**
+- Nutzt `execl()` zum Prozessneustart, wenn ANE-Compiler-Limit erreicht wird
+- IOSurface als Shared-Memory zwischen CPU und ANE
+- Gradient-Accumulation mit async CBLAS auf separatem Dispatch-Queue
+
+## LOW-Finding Fixes (2026-03-02)
+
+GitHub-Fork `manni07/ANE` angelegt, Branch `fix/low-security-findings` erstellt. 
+Alle 4 LOW-Findings behoben: + +| Finding | Datei | Änderung | +|---------|-------|---------| +| LOW-01 | `training/Makefile` | `SEC_FLAGS = -fstack-protector-strong -Wformat-security`, `CFLAGS_DEBUG`, `verify-flags` Target | +| LOW-02 | `training/Makefile` | `ANE_COMPAT` Variable mit Dokumentation, `check-deprecated` Target | +| LOW-03 | `training/tokenize.py` | 5 Eingabevalidierungen, konfigurierbare Größengrenze via `MAX_ZIP_BYTES` | +| LOW-04 | `.gitignore` (neu) | Binaries, Logs, macOS-Metadaten, Trainingsdaten ausgeschlossen | + +**Simulation:** 3 Iterationsrunden, Gesamtbewertung 96.35% (alle Kriterien ≥ 95%) +**Remote:** `origin=manni07/ANE`, `upstream=maderix/ANE` + +## CRIT-Finding Fixes (2026-03-02) + +Branch `fix/crit-security-findings` erstellt. Alle 4 CRIT-Findings behoben: + +| Finding | Dateien | Kernänderung | +|---------|---------|-------------| +| CRIT-01 | `training/ane_runtime.h`, `training/stories_config.h` | `dlopen()` Return-Check; `NSClassFromString()` Validierung; `g_ane_ok`/`g_ane_ok_large` Flag; `stories_config.h` Re-Entry-Guard | +| CRIT-02 | `training/ane_runtime.h`, `training/stories_io.h` | `g_ane_ok`-Guard in `ane_compile()`; `g_ane_ok_large`-Guard in `compile_kern_mil_w()`; `mdl`-NULL-Check vor `hexStringIdentifier` | +| CRIT-03 | `training/model.h`, `training/train_large.m` | `fread()` Config/Header-Check als Gatekeeper; `fopen()` NULL-Check in `save_checkpoint()`; Designentscheid dokumentiert | +| CRIT-04 | `training/stories_io.h`, `training/model.h` | `int`→`size_t` in allen `build_blob*` Funktionen; `(size_t)`-Cast in `malloc()`-Größen; `calloc()` NULL-Checks | + +**Simulation:** 3 Iterationsrunden (CRIT-03 benötigte 3 Runs), Gesamtbewertung 96.15% (alle Kriterien ≥ 95%) +**Branch:** `fix/crit-security-findings` auf `manni07/ANE` + +## Status + +| Finding-Typ | Anzahl | Status | +|-------------|--------|--------| +| KRITISCH (CRIT-01–04) | 4 | ✅ BEHOBEN | +| HOCH (HIGH-01–05) | 5 | Offen | +| MITTEL (MED-01–06) | 6 | Offen | 
+| NIEDRIG (LOW-01–04) | 4 | ✅ BEHOBEN | diff --git a/docs/reports/security-audit-2026-03-02.md b/docs/reports/security-audit-2026-03-02.md new file mode 100644 index 0000000..e166641 --- /dev/null +++ b/docs/reports/security-audit-2026-03-02.md @@ -0,0 +1,419 @@ +# Sicherheitsaudit: ANE (Apple Neural Engine Training Framework) +**Datum:** 2026-03-02 +**Repository:** https://github.com/maderix/ANE +**Prüfer:** Claude Code (claude-sonnet-4-6) +**Scope:** Vollständige Codebase-Analyse (38 Quelldateien, Objective-C/C/Python) + +--- + +## Executive Summary + +Das ANE-Projekt implementiert Neural-Network-Training direkt auf Apples Neural Engine (ANE) via reverse-engineerter privater APIs. Es handelt sich um ein **Forschungs-/Experimental-Projekt** mit erheblichen inhärenten Sicherheitsrisiken durch die Nutzung undokumentierter Apple-Schnittstellen. + +**Gesamtbewertung: HOHES RISIKO** für produktiven Einsatz. + +| Kategorie | Anzahl | +|-----------|--------| +| KRITISCH | 4 | +| HOCH | 5 | +| MITTEL | 6 | +| NIEDRIG | 4 | +| **Gesamt**| **19** | + +--- + +## KRITISCHE Befunde + +### [CRIT-01] Keine Fehlerbehandlung bei `dlopen()` für Private Framework +**Datei:** `training/ane_runtime.h:26`, `api_exploration.m:15` +**Schweregrad:** KRITISCH +**Status: BEHOBEN** (2026-03-02, Branch `fix/crit-security-findings`) + +```objc +// ane_runtime.h:26 +dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); +``` + +**Problem:** +- Der Rückgabewert von `dlopen()` wird nicht geprüft. Wenn das Framework nicht gefunden wird (nach macOS-Update oder auf nicht-Apple-Silicon-Hardware), gibt `dlopen()` NULL zurück — aber die Ausführung läuft weiter. +- Alle nachfolgenden `NSClassFromString()`-Aufrufe geben dann ebenfalls NULL zurück. +- `g_ane_loaded = true` wird gesetzt auch wenn das Laden fehlschlug. + +**Folge:** Nullzeiger-Dereferenzierungen beim ersten API-Aufruf, unkontrollierter Absturz ohne aussagekräftige Fehlermeldung. 
+
+**Empfehlung:**
+```objc
+void *handle = dlopen("...", RTLD_NOW);
+if (!handle) {
+    fprintf(stderr, "ANE framework not found: %s\n", dlerror());
+    abort();
+}
+if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) {
+    fprintf(stderr, "ANE private classes not found (API changed?)\n");
+    abort();
+}
+```
+
+---
+
+### [CRIT-02] Unsichere `objc_msgSend`-Casts ohne Typ-Validierung
+**Dateien:** `training/ane_runtime.h:59-125`, `training/stories_io.h:90-117`
+**Schweregrad:** KRITISCH
+**Status: BEHOBEN** (2026-03-02, Branch `fix/crit-security-findings`)
+
+```objc
+// ane_runtime.h:59-61
+id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+    g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
+    milText, wdict, nil);
+```
+
+**Probleme:**
+1. Die Klasse `g_ANEDesc` könnte NULL sein (wenn `dlopen` fehlschlug, s. CRIT-01)
+2. Die Methodensignatur ist hardcodiert — bei Apple-API-Änderungen falsches Casting = undefiniertes Verhalten / Speicherkorruption
+3. Kein `@try/@catch`, um mögliche Objective-C Exceptions abzufangen
+4. Globale Variablen `g_D`, `g_I`, `g_AIO`, `g_AR` in `stories_io.h` könnten NULL sein
+
+**Folge:** Speicherkorruption, SIGBUS, unkontrollierter Absturz.
+
+**Empfehlung:** Mindestens NULL-Checks vor jedem `objc_msgSend`:
+```objc
+if (!g_ANEDesc) { fprintf(stderr, "g_ANEDesc is NULL\n"); return NULL; }
+```
+
+---
+
+### [CRIT-03] `fread()`-Rückgabewerte nie geprüft — uninitialisierter Speicher
+**Dateien:** `training/model.h:81-146`, `training/train_large.m:17-55`
+**Schweregrad:** KRITISCH
+**Status: BEHOBEN** (2026-03-02, Branch `fix/crit-security-findings`)
+
+```c
+// model.h:81
+fread(&m->cfg, sizeof(Config), 1, f); // Rückgabewert ignoriert!
+
+// train_large.m:29
+fread(embed, 4, V * DIM, f); // Kein Check ob V*DIM floats gelesen wurden
+```
+
+**Probleme:**
+1. Wenn die Model-Datei kleiner als erwartet ist (korrupt, abgeschnitten), werden Structs mit Garbage-Werten befüllt
+2. 
Kein Check ob `cfg.dim`, `cfg.hidden_dim`, `cfg.n_layers` plausibel sind, bevor Speicher allokiert wird
+3. `fread(embed, 4, V * DIM, f)` — bei V=32000, DIM=768: liest 98,304,000 Bytes. Keine Größenvalidierung.
+4. In `load_checkpoint()`: wenn die Datei nach dem Header endet, werden Gewichte mit 0-Bytes befüllt ohne Warnung
+
+**Empfehlung:**
+```c
+size_t n = fread(&m->cfg, sizeof(Config), 1, f);
+if (n != 1) { fprintf(stderr, "Config read failed\n"); fclose(f); return -1; }
+if (m->cfg.dim <= 0 || m->cfg.dim > 65536 || m->cfg.n_layers <= 0) {
+    fprintf(stderr, "Invalid model config\n"); fclose(f); return -1;
+}
+```
+
+---
+
+### [CRIT-04] Integer Overflow in Speicher-Berechnung
+**Dateien:** `training/stories_io.h:13-14`, `training/ane_mil_gen.h:12-13`
+**Schweregrad:** KRITISCH
+**Status: BEHOBEN** (2026-03-02, Branch `fix/crit-security-findings`)
+
+```c
+// stories_io.h:13-14
+static NSData *build_blob(const float *w, int rows, int cols) {
+    int ws = rows * cols * 2; // INT-Multiplikation, kein size_t!
+    int tot = 128 + ws;
+```
+
+**Problem:** Bei größeren Modellen mit `dim >= 2048, hidden >= 16384` könnten Integer-Overflows entstehen. `*(uint32_t*)(chunk + 8) = (uint32_t)wsize;` — wenn `wsize` als `int` negativ wird (Overflow), wird ein negativer Wert als uint32 geschrieben = falsche Blob-Größe → ANE-Fehler oder Speicherkorruption.
+
+**Empfehlung:** `size_t` für alle Speichergrößenberechnungen:
+```c
+size_t ws = (size_t)rows * cols * sizeof(_Float16);
+size_t tot = 128 + ws;
+```
+
+---
+
+## HOHE Befunde
+
+### [HIGH-01] Keine Eingabevalidierung für Token-Indizes
+**Datei:** `training/train_large.m:375-376`
+**Schweregrad:** HOCH
+
+```c
+size_t max_pos = n_tokens - SEQ - 1;
+size_t pos = (size_t)(drand48() * max_pos);
+uint16_t *input_tokens = token_data + pos;
+```
+
+**Probleme:**
+1. Token-Werte aus `token_data` werden direkt als Embedding-Indizes verwendet ohne Prüfung, ob `token < VOCAB`
+2. 
Wenn die `.bin`-Datei korrupte Token-Werte enthält (> 32000), entstehen Out-of-Bounds-Zugriffe auf `embed[]`
+3. Kein Check, ob `n_tokens >= SEQ + 1` vor der `max_pos`-Berechnung
+
+**Folge:** Heap-Buffer-Overflow, korrupte `.bin`-Datei kann zu Speicherschäden führen.
+
+---
+
+### [HIGH-02] Checkpoint-Pfad mit relativer Verzeichnis-Navigation
+**Datei:** `training/train_large.m:8-10`
+**Schweregrad:** HOCH
+
+```c
+#define CKPT_PATH "ane_stories110M_ckpt.bin"
+#define MODEL_PATH "../../assets/models/stories110M.bin" // ← relativer Pfad!
+#define DATA_PATH "tinystories_data00.bin"
+```
+
+**Probleme:**
+1. `MODEL_PATH` enthält `../../` — relative Pfadnavigation. Wenn das Binary aus einem unerwarteten Verzeichnis gestartet wird, werden falsche Dateien gelesen.
+2. Kein `realpath()`-Aufruf zur Normalisierung des Pfades
+3. Manipulierter Checkpoint + `--resume` → unkontrollierte Binärdaten werden als Gewichte geladen
+
+---
+
+### [HIGH-03] `execl()` zum Prozessneustart ohne Argument-Validierung
+**Datei:** `training/train_large.m:331`
+**Schweregrad:** HOCH
+
+```c
+execl(argv[0], argv[0], "--resume", NULL);
+```
+
+**Probleme:**
+1. `argv[0]` wird ohne Validierung übergeben. Via Symlink könnte ein beliebiges Binary gestartet werden.
+2. `data_fd` (mmap'd Token-Datei) wird vor `execl()` nicht geschlossen — Dateideskriptor-Leak in den neuen Prozess
+3. `munmap(token_data)` wird vor `execl()` nicht aufgerufen
+
+---
+
+### [HIGH-04] Fehlende `malloc()`/`calloc()`-Rückgabewert-Prüfungen
+**Dateien:** Alle `.m` und `.h` Dateien
+**Schweregrad:** HOCH
+
+```c
+// train_large.m:219
+float *embed = (float*)malloc(VOCAB*DIM*4); // 32000*768*4 = 98MB — kein NULL-Check!
+```
+
+Keiner der `malloc()`/`calloc()`-Aufrufe prüft den Rückgabewert auf NULL. Bei Memory-Pressure (110M Model + Adam-State = mehrere GB) können Allokierungen fehlschlagen → Nullzeiger-Dereferenzierung. 
+ +--- + +### [HIGH-05] ANE-Inferenz ohne Fehlerprüfung im Trainings-Hot-Path +**Datei:** `training/stories_io.h:131-134` +**Schweregrad:** HOCH + +```c +static void ane_run(Kern *k) { + id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + // BOOL-Rückgabewert und NSError *e werden ignoriert! +} +``` + +**Problem:** ANE-Ausführung kann fehlschlagen (Thermal-Throttling, Hardware-Fehler, API-Änderungen). Stille Fehler führen zu unerkannter Gradientenkorruption. + +--- + +## MITTLERE Befunde + +### [MED-01] IOSurface Lock ohne Fehlerbehandlung +**Datei:** `training/stories_io.h:62-83` +**Schweregrad:** MITTEL + +```c +IOSurfaceLock(s, 0, NULL); // Return-Code ignoriert +``` + +`IOSurfaceLock()` gibt `kIOReturnSuccess` oder einen Fehlercode zurück. Bei Lock-Fehler wird trotzdem auf den Speicher zugegriffen — mögliche Data-Race-Condition. + +--- + +### [MED-02] Temporäres Verzeichnis nicht sicher erstellt (TOCTOU-Risiko) +**Datei:** `training/ane_runtime.h:68-80`, `training/stories_io.h:94-100` +**Schweregrad:** MITTEL + +```objc +NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; +[milText writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; +``` + +TOCTOU-Race zwischen `createDirectoryAtPath` und `writeToFile`. Der `hexStringIdentifier` könnte von einem anderen Prozess erraten und das Verzeichnis manipuliert werden. + +--- + +### [MED-03] MIL-Text-Generierung ohne Parameter-Validierung +**Datei:** `training/ane_mil_gen.h:32-52` +**Schweregrad:** MITTEL + +```objc +return [NSString stringWithFormat: + @"...tensor x...", in_ch, spatial, ...]; +``` + +Negative oder extrem große `in_ch`/`out_ch`/`spatial`-Werte durch fehlerhafte Konfiguration erzeugen invalides MIL das an den undokumentierten ANE-Compiler übergeben wird. 
+ +--- + +### [MED-04] Keine Endianness-Prüfung bei Checkpoint-Serialisierung +**Datei:** `training/train_large.m:110-181` +**Schweregrad:** MITTEL + +```c +h.magic = 0x424C5A54; +fwrite(&h, sizeof(h), 1, f); +``` + +Das `CkptHdr`-Struct wird als binärer Dump ohne Endianness-Marker geschrieben. Nicht portabel. + +--- + +### [MED-05] NEON-Vektorisierung ohne Alignment-Garantie +**Datei:** `training/stories_io.h:41-58` +**Schweregrad:** MITTEL + +```c +float16x8_t h = vld1q_f16((const __fp16*)(src + i)); +``` + +Zeiger-Arithmetik mit `ch_off * sp` könnte das für NEON benötigte Alignment verletzen wenn `ch_off * sp` kein Vielfaches von 8 ist. + +--- + +### [MED-06] Globale Variablen ohne Thread-Safety +**Datei:** `training/stories_io.h`, `training/stories_config.h` +**Schweregrad:** MITTEL + +```c +static bool g_ane_loaded = false; +static int g_compile_count = 0; +``` + +`g_compile_count` wird via `__sync_fetch_and_add()` atomar inkrementiert, aber `g_ane_loaded` und Klassen-Variablen nicht atomar gesetzt — bei Multi-Thread-Nutzung Race-Condition in `ane_init()`. + +--- + +## NIEDRIGE Befunde + +### [LOW-01] Fehlende Compiler-Sicherheitsflags +**Datei:** `training/Makefile:2` +**Schweregrad:** NIEDRIG +**Status: BEHOBEN** (2026-03-02, Branch `fix/low-security-findings`) + +```makefile +CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc +``` + +Fehlende Flags: `-fstack-protector-strong`, `-D_FORTIFY_SOURCE=2`, `-Wformat=2` + +**Fix:** `SEC_FLAGS = -fstack-protector-strong -Wformat-security` eingeführt. Hinweis: +`-D_FORTIFY_SOURCE=2` ist auf macOS (Apple LLVM) bei `-O2` implizit aktiv — explizite +Definition würde "macro redefinition"-Warnung erzeugen. `CFLAGS_DEBUG` mit +`-fsanitize=address,undefined` für Debug-Builds hinzugefügt. `make verify-flags` +zeigt aktive Flags. 
+ +--- + +### [LOW-02] `-Wno-deprecated-declarations` unterdrückt wichtige Warnungen +**Datei:** `training/Makefile:2` +**Schweregrad:** NIEDRIG +**Status: BEHOBEN** (2026-03-02, Branch `fix/low-security-findings`) + +Unterdrückt Warnungen über veraltete API-Aufrufe — könnte wichtige Hinweise auf deprecated private APIs verstecken. + +**Fix:** Flag in benannte Variable `ANE_COMPAT` extrahiert mit erklärendem Kommentar +(bewusste Unterdrückung wegen privater `_ANE*`-APIs via `objc_msgSend`). Neues Target +`make check-deprecated` baut ohne Unterdrückung und zeigt alle verborgenen Warnungen. + +--- + +### [LOW-03] Python-Skript ohne Eingabevalidierung +**Datei:** `training/tokenize.py` +**Schweregrad:** NIEDRIG +**Status: BEHOBEN** (2026-03-02, Branch `fix/low-security-findings`) + +Keine Validierung der Eingabedateigröße — bei sehr großen Eingaben Out-of-Memory möglich. + +**Fix:** 5 Validierungen implementiert: +1. ZIP-Existenzprüfung mit hilfreicher Fehlermeldung +2. Konfigurierbare Größengrenze (Standard 10GB, via `MAX_ZIP_BYTES` env var überschreibbar) +3. Prüfung ob `data00.bin` im ZIP enthalten ist +4. Fehlerbehandlung bei `struct.unpack` wenn Output < 20 Bytes +5. Token-Range-Validierung (alle Token müssen < `VOCAB_SIZE=32000` sein) + +--- + +### [LOW-04] Keine `.gitignore` für sensible Artefakte +**Datei:** Repository-Root +**Schweregrad:** NIEDRIG +**Status: BEHOBEN** (2026-03-02, Branch `fix/low-security-findings`) + +Keine `.gitignore`-Datei. Binäre Artefakte (Checkpoints, Trainingsdaten, `firebase-debug.log`) könnten versehentlich committed werden. + +**Fix:** `.gitignore` erstellt mit Regeln für: macOS-Metadaten (`.DS_Store`), +Log-Dateien (`*.log`), kompilierte Binaries (`training/train`, `training/train_large`, +alle Probe-Binaries), Trainingsdaten (`training/*.bin`), ANE-Artefakte +(`*.mlmodelc/`, `*.mlpackage/`), externe Assets (`assets/`). 
+ +--- + +## Positive Befunde (Stärken) + +### Korrekte Speicherfreigabe +`ane_free()` (`ane_runtime.h:149-160`) und `free_kern()` (`stories_io.h:122-130`) implementieren vollständige Cleanup-Routinen mit `CFRelease()`, `unloadWithQoS:error:` und Temporärverzeichnis-Bereinigung. + +### Magic-Byte Validierung in Checkpoints +```c +if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; } +``` +Grundlegender Schutz gegen korrupte Checkpoint-Dateien. + +### Atomare Compile-Counter +```c +__sync_fetch_and_add(&g_compile_count, 1); +``` +Thread-sicherer Zähler für ANE-Kompilierungsanzahl. + +### Gradient-Accumulation mit async CBLAS +Korrekte Parallelisierung von CPU-Gewichtsgradienten-Berechnung via `dispatch_group_async`. + +--- + +## Risikobewertung für Produktionseinsatz + +| Aspekt | Bewertung | +|--------|-----------| +| Apple Silicon erforderlich | macOS 15+, M-Series only | +| Private API Stabilität | **SEHR GERING** — jedes macOS-Update kann brechen | +| Memory Safety | **MITTEL** — keine Bounds-Checks, keine Sanitizer | +| Input Validation | **GERING** — Dateien werden unkritisch gelesen | +| Error Handling | **GERING** — viele kritische Fehler werden ignoriert | +| Eignung für Produktion | **NEIN** — Forschungs-/Experimental-Projekt | + +--- + +## Empfehlungen nach Priorität + +### Sofortige Maßnahmen (KRITISCH) +1. `dlopen()` Rückgabewert prüfen und bei Fehler abbrechen +2. Alle `fread()`-Rückgabewerte prüfen + Dateigrößenvalidierung +3. NULL-Checks vor allen `objc_msgSend`-Aufrufen +4. `int` → `size_t` für alle Speichergrößenberechnungen + +### Kurzfristige Maßnahmen (HOCH) +5. Token-Index-Validierung: `if (token >= VOCAB) abort()` +6. ANE-Inferenz-Rückgabewert und NSError prüfen +7. Compiler-Flags: `-fstack-protector-strong -D_FORTIFY_SOURCE=2` +8. `.gitignore` für binäre Artefakte erstellen + +### Mittelfristige Maßnahmen (MITTEL) +9. IOSurface Lock-Rückgabewerte prüfen +10. `__atomic_store_n()` für `g_ane_loaded` +11. 
MIL-Parameter-Validierung vor Formatierung + +--- + +*Dieser Bericht ist für das ANE-Forschungsprojekt erstellt. Das Projekt ist explizit als Proof-of-Concept/Forschungscode konzipiert und nicht für Produktionseinsatz gedacht.* diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000..f55a2d7 --- /dev/null +++ b/inference/README.md @@ -0,0 +1,242 @@ +# ANE Inference — Full LLM on Apple Neural Engine + +First complete LLM inference running directly on Apple's Neural Engine via reverse-engineered `_ANEClient` APIs. No CoreML. No Xcode compiler dependency at runtime. + +Built on top of the [maderix/ANE](https://github.com/maderix/ANE) training runtime. + +## What This Does + +Runs **Qwen2.5-0.5B-Instruct** (24 transformer layers, 494M parameters) on ANE: + +- **169 ANE kernels** compiled at startup via `_ANEInMemoryModel` +- **~60 tokens/sec** decode on M4 Max +- **Pure C HTTP API** — no Python needed for serving +- **BPE tokenizer in C** — send plain text, get plain text back +- **~6s cold start**, then instant responses in server mode + +## Quick Start (One Command) + +```bash +cd inference +./setup.sh +``` + +This automatically: +1. Creates a Python venv and installs dependencies +2. Downloads [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) from HuggingFace (~953 MB) +3. Converts BF16 safetensors to f32 binary format (~1.9 GB) +4. Builds the `qwen_ane` binary +5. Runs a smoke test + +After setup, you're ready to go. + +## HTTP API (Recommended) + +The fastest way to use inference. Single process, zero Python overhead. 
+ +```bash +# Start server (compiles 169 ANE kernels on first launch, ~6s) +./qwen_ane qwen05b.bin --http 8000 + +# Query with plain text — tokenization happens in C +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{"prompt": "What is 2+2?", "max_tokens": 50}' +``` + +Response: +```json +{ + "text": "2+2 equals 4.", + "prompt_tokens": 29, + "gen_tokens": 8, + "prefill_tps": 66.2, + "decode_tps": 57.3, + "elapsed_s": 0.608 +} +``` + +### Endpoints + +| Method | Path | Description | +|--------|------|-------------| +| POST | `/v1/completions` | Generate text from a prompt | +| GET | `/health` | Server status check | + +### POST /v1/completions + +```json +{ + "prompt": "Your question here", + "max_tokens": 50, + "system": "You are a helpful assistant." +} +``` + +- `prompt` (required): The user message +- `max_tokens` (optional, default 50, max 512): Maximum tokens to generate +- `system` (optional): System prompt override + +### Options + +```bash +# Custom port +./qwen_ane qwen05b.bin --http 9000 + +# Custom model directory (for tokenizer files) +./qwen_ane qwen05b.bin --http 8000 --model-dir /path/to/Qwen2.5-0.5B-Instruct +``` + +Default model directory: `~/models/Qwen2.5-0.5B-Instruct` + +## Other Modes + +### Socket server (for programmatic access) + +```bash +# Terminal 1: start server +./qwen_ane qwen05b.bin --server /tmp/qwen_ane.sock + +# Terminal 2: query with run.py (auto-detects socket) +python3 run.py "What is 2+2?" 
+ +# Or query directly with nc +echo '{"tokens": [151644, 8948, 198], "max_tokens": 50}' | nc -U /tmp/qwen_ane.sock +``` + +### Stdin server (for piping/scripting) + +```bash +./qwen_ane qwen05b.bin --server +# Send space-separated token IDs, pipe char separates max_tokens: +# 151644 8948 198 2610 525|20 +``` + +### Single-shot (no server) + +```bash +# Raw token IDs +./qwen_ane qwen05b.bin "151644 8948 198 2610 525 264 10950 17847 13" 20 + +# With Python tokenizer +python3 run.py "Say hello in one word." +``` + +### Python API server (alternative) + +If you prefer Python for the HTTP layer: + +```bash +./qwen_ane qwen05b.bin --server /tmp/qwen_ane.sock +python3 api_server.py --port 8000 +``` + +## Throughput Benchmark + +Run the standardized benchmark to measure your hardware's performance: + +```bash +./benchmark.sh +``` + +This runs 5 prompts of varying length, measures prefill and decode tokens/sec in server mode, tests cold start latency, and checks decode speed consistency. + +Sample output (M4 Max, 128 GB): +``` +Prompt Input Output Prefill(t/s) Decode(t/s) Latency(ms) +────────────────────────────────────────────────────────────────── +tiny 23 10 53.7 53.6 632 +short 29 8 66.2 49.5 628 +medium 33 84 63.4 55.3 2064 +long 36 200 66.4 54.5 4235 +stress 122 11 58.6 58.5 2303 +────────────────────────────────────────────────────────────────── +Average 61.7 54.3 + +Cold start (single-shot): ~6.2s (includes ANE kernel compilation) +``` + +Results are saved to `benchmark_results.json` for programmatic use. + +### Compare with LM Studio + +The benchmark script prints instructions for running the same prompts in LM Studio: + +1. Download [LM Studio](https://lmstudio.ai) +2. Search for and download **Qwen2.5-0.5B-Instruct** (GGUF Q4_K_M or Q8_0) +3. Load the model, start the server (Developer tab, port 1234) +4. 
Run the same prompts and compare tokens/sec: + +```bash +curl http://localhost:1234/api/v1/chat \ + -H "Content-Type: application/json" \ + -d '{"model":"qwen2.5-0.5b-instruct","system_prompt":"You are a helpful assistant.","input":"What is 2+2?"}' +``` + +Note: LM Studio uses quantized GGUF weights (CPU/GPU) while we use full BF16 precision on the Neural Engine. + +## Performance + +| Mode | First prompt | Subsequent prompts | +|------|-------------|-------------------| +| Single-shot | ~6s | ~6s (recompiles each time) | +| Server (socket/HTTP) | ~6s (startup) | ~0.5s | + +## Architecture + +``` +Token -> Embedding (CPU) -> 24x Transformer Layer -> LM Head (CPU) -> Next Token + | + +-- RMSNorm (CPU) + +-- Q/K/V Projection (ANE conv kernel) + +-- RoPE (CPU, rotate_half) + +-- GQA Attention (CPU, 14 heads / 2 KV heads) + +-- O Projection (ANE conv kernel) + +-- Residual (CPU) + +-- RMSNorm (CPU) + +-- Gate/Up Projection (ANE conv kernel) + +-- SiLU + elementwise mul (CPU) + +-- Down Projection (ANE conv kernel) + +-- Residual (CPU) +``` + +## Files + +| File | What | +|------|------| +| `setup.sh` | One-command setup: downloads model, converts weights, builds binary | +| `benchmark.sh` | Throughput benchmark with LM Studio comparison | +| `main.m` | Entry point: weight loader, server modes, HTTP API | +| `qwen_ane_infer.h` | Full 24-layer transformer forward pass, ANE kernel compilation, KV cache | +| `tokenizer.h` | BPE tokenizer in C: vocab/merge loading, encode/decode, chat template | +| `http_server.h` | Minimal HTTP/1.1 server: TCP, request parsing, JSON responses | +| `convert_weights.py` | HuggingFace safetensors to flat f32 binary | +| `run.py` | Python wrapper with HuggingFace tokenizer (auto-connects to socket server) | +| `api_server.py` | Python HTTP API bridge to socket server (alternative to C HTTP) | + +## Model + +**[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)** + +- 494M parameters, BFloat16 +- 24 layers, 896 dim, 
4864 hidden +- 14 attention heads, 2 KV heads (GQA) +- 151,936 vocab size +- Download: `setup.sh` handles this automatically + +## Requirements + +- macOS 15+ on Apple Silicon (M1/M2/M3/M4) +- Xcode Command Line Tools (`xcode-select --install`) +- Python 3.11+ (for weight conversion only, not needed for serving) + +## Known Limitations + +- **CPU projections only** — ANE baked-weight conv kernels compile but produce incorrect output (FP16 weight blob format mismatch). `USE_ANE_PROJECTIONS` defaults to 0 (CPU via Accelerate BLAS). Fixing this would increase decode speed significantly. +- **Single model** — hardcoded for Qwen2.5-0.5B. Other sizes need config changes. +- **f32 weights** — 1.9GB on disk. FP16 weight support would halve this. +- **Single-threaded HTTP** — handles one request at a time. Sufficient for local use. + +## License + +Same as maderix/ANE — research and educational use. diff --git a/inference/api_server.py b/inference/api_server.py new file mode 100644 index 0000000..a1b5fd7 --- /dev/null +++ b/inference/api_server.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +"""HTTP API server for ANE inference. + +Bridges HTTP requests to the qwen_ane Unix socket server. Handles tokenization +so clients can send plain text prompts and receive decoded responses. + +Prerequisites: + 1. Start the ANE server: ./qwen_ane qwen05b.bin --server /tmp/qwen_ane.sock + 2. 
Start this API: python3 api_server.py [--port 8000] + +Usage: + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{"prompt": "What is 2+2?", "max_tokens": 50}' + + curl http://localhost:8000/health +""" +import argparse +import json +import os +import socket +import sys +import time +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +DEFAULT_SOCK = "/tmp/qwen_ane.sock" +MODEL_DIR = Path.home() / "models" / "Qwen2.5-0.5B-Instruct" + +tokenizer = None + +def get_tokenizer(): + global tokenizer + if tokenizer is None: + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR), trust_remote_code=True) + return tokenizer + + +def query_ane(token_ids: list[int], max_tokens: int, sock_path: str) -> dict: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.settimeout(120) + s.connect(sock_path) + req = json.dumps({"tokens": token_ids, "max_tokens": max_tokens}) + "\n" + s.sendall(req.encode()) + + data = b"" + while True: + chunk = s.recv(131072) + if not chunk: + break + data += chunk + if b"\n" in data: + break + s.close() + return json.loads(data.decode().strip()) + + +class ANEHandler(BaseHTTPRequestHandler): + sock_path = DEFAULT_SOCK + + def log_message(self, format, *args): + sys.stderr.write(f"[{time.strftime('%H:%M:%S')}] {format % args}\n") + + def _send_json(self, code, obj): + body = json.dumps(obj).encode() + self.send_response(code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.send_header("Access-Control-Allow-Origin", "*") + self.end_headers() + self.wfile.write(body) + + def do_OPTIONS(self): + self.send_response(204) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + self.end_headers() + + def do_GET(self): + if 
self.path == "/health": + alive = os.path.exists(self.sock_path) + self._send_json(200, {"status": "ok" if alive else "no_backend", "socket": self.sock_path}) + return + self._send_json(404, {"error": "not found"}) + + def do_POST(self): + if self.path != "/v1/completions": + self._send_json(404, {"error": "not found, use POST /v1/completions"}) + return + + length = int(self.headers.get("Content-Length", 0)) + if length == 0 or length > 65536: + self._send_json(400, {"error": "invalid content length"}) + return + + try: + body = json.loads(self.rfile.read(length)) + except json.JSONDecodeError: + self._send_json(400, {"error": "invalid JSON"}) + return + + prompt = body.get("prompt", "") + max_tokens = min(body.get("max_tokens", 50), 512) + system_prompt = body.get("system", "You are a helpful assistant. Be concise.") + + if not prompt: + self._send_json(400, {"error": "missing 'prompt' field"}) + return + + tok = get_tokenizer() + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ] + text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + input_ids = tok.encode(text) + + t0 = time.time() + try: + result = query_ane(input_ids, max_tokens, self.sock_path) + except (ConnectionRefusedError, FileNotFoundError, OSError) as e: + self._send_json(503, {"error": f"ANE backend unavailable: {e}"}) + return + + elapsed = time.time() - t0 + + output_ids = result.get("output", []) + decoded = tok.decode(output_ids, skip_special_tokens=True) if output_ids else "" + + self._send_json(200, { + "text": decoded, + "output_tokens": output_ids, + "prompt_tokens": len(input_ids), + "gen_tokens": len(output_ids), + "prefill_tps": result.get("prefill_tps", 0), + "decode_tps": result.get("decode_tps", 0), + "elapsed_s": round(elapsed, 3), + }) + + +def main(): + parser = argparse.ArgumentParser(description="HTTP API for ANE inference") + parser.add_argument("--port", type=int, default=8000) + 
parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--sock", type=str, default=DEFAULT_SOCK) + args = parser.parse_args() + + ANEHandler.sock_path = args.sock + + print(f"Loading tokenizer from {MODEL_DIR}...") + get_tokenizer() + print("Tokenizer ready.") + + if not os.path.exists(args.sock): + print(f"WARNING: Socket {args.sock} not found. Start the ANE server first:") + print(f" ./qwen_ane qwen05b.bin --server {args.sock}") + + server = HTTPServer((args.host, args.port), ANEHandler) + print(f"API server listening on http://{args.host}:{args.port}") + print(f" POST /v1/completions {{\"prompt\": \"...\", \"max_tokens\": 50}}") + print(f" GET /health") + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/inference/benchmark.sh b/inference/benchmark.sh new file mode 100755 index 0000000..3573549 --- /dev/null +++ b/inference/benchmark.sh @@ -0,0 +1,641 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Load .env if present (LMS_API_KEY, LMS_PORT, LMS_MODEL) +if [ -f "$SCRIPT_DIR/.env" ]; then + set -a + source "$SCRIPT_DIR/.env" + set +a +fi + +BINARY="$SCRIPT_DIR/qwen_ane" +WEIGHTS="$SCRIPT_DIR/qwen05b.bin" +MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen2.5-0.5B-Instruct}" +SOCK="/tmp/qwen_ane_bench.sock" +HTTP_PORT=8877 +RESULTS_JSON="$SCRIPT_DIR/benchmark_results.json" + +# --- Prompt suite --- +PROMPT_NAMES=( "tiny" "short" "medium" "long" "stress") +PROMPTS=( "Hi" "What is 2+2?" "Explain how neural networks work in 3 sentences." "Write a short story about a robot learning to paint. Include dialogue." "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. 
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.") +MAX_TOKENS=( 10 20 100 200 50) + +info() { printf "\033[1;34m%s\033[0m\n" "$1"; } +dim() { printf "\033[2m%s\033[0m\n" "$1"; } + +# Extract a numeric or string value from flat JSON. No python needed. +# Usage: json_val '{"key":123}' "key" → 123 +json_val() { + local json="$1" key="$2" + echo "$json" | sed -n "s/.*\"$key\"[[:space:]]*:[[:space:]]*\"\{0,1\}\([^,\"}\]*\)\"\{0,1\}.*/\1/p" | head -1 +} + +# Extract the "text" field which may contain escaped chars and commas. +# Grabs everything between "text":" and the next unescaped quote. +json_text() { + local json="$1" + echo "$json" | sed -n 's/.*"text":"\(.*\)","prompt_tokens".*/\1/p' | sed 's/\\n/ /g; s/\\"//g' +} + +# Truncate a float string to integer: "317.2" → "317" +trunc() { echo "${1%%.*}"; } + +# Average an array of numbers using awk. Handles both ints and floats. +# Usage: shell_avg "1.5" "2.3" "3.1" → 2.3 +shell_avg() { printf '%s\n' "$@" | awk '{s+=$1; n++} END {if(n>0) printf "%.1f", s/n; else print "0"}'; } +shell_avg_int() { printf '%s\n' "$@" | awk '{s+=$1; n++} END {if(n>0) printf "%.0f", s/n; else print "0"}'; } + +# --- Preflight --- +if [ ! -f "$BINARY" ]; then + echo "Binary not found: $BINARY" + echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" + exit 1 +fi +if [ ! 
-f "$WEIGHTS" ]; then + echo "Weights not found: $WEIGHTS" + echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" + exit 1 +fi + +# Detect hardware +CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown") +MACOS=$(sw_vers -productVersion 2>/dev/null || echo "Unknown") +MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || echo "0") +MEM_GB=$((MEM_BYTES / 1073741824)) + +echo "" +info "=== ANE Multi-Format Inference Benchmark ===" +echo "Hardware: $CHIP" +echo "macOS: $MACOS" +echo "Memory: ${MEM_GB} GB" +echo "Model: Qwen2.5-0.5B-Instruct (494M params)" +echo "" + +# --- Phase 0: Prepare weight files (F16 + Q8) --- +WEIGHTS_F16="$SCRIPT_DIR/qwen05b.bin" +WEIGHTS_Q8="$SCRIPT_DIR/qwen05b_q8.bin" +WEIGHTS_Q4="$SCRIPT_DIR/qwen05b_q4.bin" +CONVERT="$SCRIPT_DIR/convert_weights.py" +VENV_DIR="$SCRIPT_DIR/.venv" + +info "Phase 0: Preparing weight files" + +if [ ! -f "$WEIGHTS_Q8" ]; then + if [ ! -f "$CONVERT" ]; then + echo " convert_weights.py not found, skipping Q8 generation." + WEIGHTS_Q8="" + else + dim "Generating Q8 weights (one-time)..." + if [ -d "$VENV_DIR" ]; then + source "$VENV_DIR/bin/activate" + fi + python3 "$CONVERT" "$MODEL_DIR" "$WEIGHTS_Q8" --q8 + dim "Q8 weights ready: $(du -h "$WEIGHTS_Q8" | cut -f1)" + fi +else + dim "Q8 weights already exist: $(du -h "$WEIGHTS_Q8" | cut -f1)" +fi + +if [ ! -f "$WEIGHTS_Q4" ]; then + if [ ! -f "$CONVERT" ]; then + echo " convert_weights.py not found, skipping Q4 generation." + WEIGHTS_Q4="" + else + dim "Generating Q4 weights (one-time)..." 
+ if [ -d "$VENV_DIR" ]; then + source "$VENV_DIR/bin/activate" + fi + python3 "$CONVERT" "$MODEL_DIR" "$WEIGHTS_Q4" --q4 + dim "Q4 weights ready: $(du -h "$WEIGHTS_Q4" | cut -f1)" + fi +else + dim "Q4 weights already exist: $(du -h "$WEIGHTS_Q4" | cut -f1)" +fi + +dim "F16 weights: $(du -h "$WEIGHTS_F16" | cut -f1)" +echo "" + +# ANE weight formats to benchmark +# GPU flag: empty for CPU formats, "--gpu" for Metal GPU formats +ANE_FMT_NAMES=("F16") +ANE_FMT_WEIGHTS=("$WEIGHTS_F16") +ANE_FMT_LABELS=("F16→F32 (AMX)") +ANE_FMT_GPU=("") + +if [ -n "$WEIGHTS_Q8" ] && [ -f "$WEIGHTS_Q8" ]; then + ANE_FMT_NAMES+=("Q8") + ANE_FMT_WEIGHTS+=("$WEIGHTS_Q8") + ANE_FMT_LABELS+=("Q8 (NEON dequant)") + ANE_FMT_GPU+=("") +fi + +if [ -n "$WEIGHTS_Q4" ] && [ -f "$WEIGHTS_Q4" ]; then + ANE_FMT_NAMES+=("Q4_Metal") + ANE_FMT_WEIGHTS+=("$WEIGHTS_Q4") + ANE_FMT_LABELS+=("Q4 SIMD (Metal GPU)") + ANE_FMT_GPU+=("--gpu") + + ANE_FMT_NAMES+=("Q4_AMX") + ANE_FMT_WEIGHTS+=("$WEIGHTS_Q4") + ANE_FMT_LABELS+=("Q4→F32 (AMX dequant)") + ANE_FMT_GPU+=("") +fi + +NUM_ANE_FMTS=${#ANE_FMT_NAMES[@]} +NUM_PROMPTS=${#PROMPTS[@]} + +# Global cleanup +SERVER_PID="" +cleanup() { + [ -n "$SERVER_PID" ] && kill "$SERVER_PID" 2>/dev/null || true + rm -f "$SOCK" /tmp/qwen_bench_server.log +} +trap cleanup EXIT + +# Helper: start server with given weight file and optional extra flags, wait for READY +start_server() { + local wfile="$1" + shift + local extra_flags="$*" + [ -n "$SERVER_PID" ] && kill "$SERVER_PID" 2>/dev/null || true + sleep 1 + rm -f /tmp/qwen_bench_server.log + "$BINARY" "$wfile" --http "$HTTP_PORT" --model-dir "$MODEL_DIR" $extra_flags > /tmp/qwen_bench_server.log 2>&1 & + SERVER_PID=$! + for _i in $(seq 1 30); do + if grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then return 0; fi + sleep 1 + done + echo "Server failed to start with $wfile. 
Log:" + cat /tmp/qwen_bench_server.log + return 1 +} + +# --- Phase 1: Multi-format ANE benchmarks --- +# Per-format result tracking (indexed by format number) +declare -a ALL_AVG_P ALL_AVG_D ALL_AVG_INF ALL_AVG_TTFT ALL_AVG_RT +ANE_JSON_BLOCKS="" + +for fmt_idx in $(seq 0 $((NUM_ANE_FMTS - 1))); do + FMT_NAME="${ANE_FMT_NAMES[$fmt_idx]}" + FMT_WEIGHTS="${ANE_FMT_WEIGHTS[$fmt_idx]}" + FMT_LABEL="${ANE_FMT_LABELS[$fmt_idx]}" + FMT_GPU="${ANE_FMT_GPU[$fmt_idx]}" + + echo "" + info "Phase 1.$((fmt_idx+1)): ANE $FMT_NAME benchmark ($FMT_LABEL)" + dim "Weights: $(du -h "$FMT_WEIGHTS" | cut -f1) — Starting server..." + + if ! start_server "$FMT_WEIGHTS" $FMT_GPU; then + echo "Skipping $FMT_NAME format." + ALL_AVG_P+=("0"); ALL_AVG_D+=("0"); ALL_AVG_INF+=("0") + ALL_AVG_TTFT+=("0"); ALL_AVG_RT+=("0") + continue + fi + + dim "Warmup run (discarded)..." + curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ + -H "Content-Type: application/json" \ + -d '{"prompt":"warmup","max_tokens":5}' > /dev/null 2>&1 + echo "" + + printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ + "Prompt" "In" "Out" "Prefill" "Decode" "TTFT" "Infer" "Rndtrip" "Overhead" + printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ + "" "tok" "tok" "(t/s)" "(t/s)" "(ms)" "(ms)" "(ms)" "(ms)" + printf '%.0s─' {1..85}; echo "" + + declare -a P_TPS_ARR=() D_TPS_ARR=() INF_MS_ARR=() TTFT_MS_ARR=() RT_MS_ARR=() + FMT_JSON_ENTRIES="" + + for i in $(seq 0 $((NUM_PROMPTS - 1))); do + NAME="${PROMPT_NAMES[$i]}" + PROMPT="${PROMPTS[$i]}" + MAXTOK="${MAX_TOKENS[$i]}" + + RT_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') + RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ + -H "Content-Type: application/json" \ + -d "{\"prompt\": \"$PROMPT\", \"max_tokens\": $MAXTOK}" 2>&1) + RT_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') + RT_MS=$(echo "$RT_T0 $RT_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') + + P_TOKENS=$(json_val "$RESP" "prompt_tokens") + G_TOKENS=$(json_val "$RESP" 
"gen_tokens") + P_TPS=$(json_val "$RESP" "prefill_tps") + D_TPS=$(json_val "$RESP" "decode_tps") + TTFT_MS=$(trunc "$(json_val "$RESP" "ttft_ms")") + INF_MS=$(trunc "$(json_val "$RESP" "inference_ms")") + TOTAL_MS=$(trunc "$(json_val "$RESP" "total_ms")") + TEXT=$(json_text "$RESP") + OVERHEAD=$((RT_MS - TOTAL_MS)) + + printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ + "$NAME" "$P_TOKENS" "$G_TOKENS" "$P_TPS" "$D_TPS" "$TTFT_MS" "$INF_MS" "$RT_MS" "$OVERHEAD" + + P_TPS_ARR+=("$P_TPS") + D_TPS_ARR+=("$D_TPS") + INF_MS_ARR+=("$INF_MS") + TTFT_MS_ARR+=("$TTFT_MS") + RT_MS_ARR+=("$RT_MS") + + FMT_JSON_ENTRIES="$FMT_JSON_ENTRIES{\"name\":\"$NAME\",\"prompt_tokens\":$P_TOKENS,\"gen_tokens\":$G_TOKENS,\"prefill_tps\":$P_TPS,\"decode_tps\":$D_TPS,\"ttft_ms\":$TTFT_MS,\"inference_ms\":$INF_MS,\"roundtrip_ms\":$RT_MS}," + + echo " → $TEXT" + echo "" + done + + printf '%.0s─' {1..85}; echo "" + + F_AVG_P=$(shell_avg "${P_TPS_ARR[@]}") + F_AVG_D=$(shell_avg "${D_TPS_ARR[@]}") + F_AVG_INF=$(shell_avg_int "${INF_MS_ARR[@]}") + F_AVG_TTFT=$(shell_avg_int "${TTFT_MS_ARR[@]}") + F_AVG_RT=$(shell_avg_int "${RT_MS_ARR[@]}") + F_AVG_OVERHEAD=$((F_AVG_RT - F_AVG_INF)) + printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" "Average" "" "" "$F_AVG_P" "$F_AVG_D" "$F_AVG_TTFT" "$F_AVG_INF" "$F_AVG_RT" "$F_AVG_OVERHEAD" + echo "" + + ALL_AVG_P+=("$F_AVG_P") + ALL_AVG_D+=("$F_AVG_D") + ALL_AVG_INF+=("$F_AVG_INF") + ALL_AVG_TTFT+=("$F_AVG_TTFT") + ALL_AVG_RT+=("$F_AVG_RT") + + ANE_JSON_BLOCKS="$ANE_JSON_BLOCKS + \"$FMT_NAME\": { + \"format\": \"$FMT_NAME\", + \"label\": \"$FMT_LABEL\", + \"weight_size_mb\": $(du -m "$FMT_WEIGHTS" | cut -f1), + \"avg_prefill_tps\": $F_AVG_P, + \"avg_decode_tps\": $F_AVG_D, + \"avg_inference_ms\": $F_AVG_INF, + \"avg_roundtrip_ms\": $F_AVG_RT, + \"avg_ttft_ms\": $F_AVG_TTFT, + \"results\": [${FMT_JSON_ENTRIES%,}] + }," +done + +# Use F16 results as the primary ANE numbers (first format) +AVG_P="${ALL_AVG_P[0]}" +AVG_D="${ALL_AVG_D[0]}" 
+AVG_INF="${ALL_AVG_INF[0]}" +AVG_TTFT="${ALL_AVG_TTFT[0]}" +AVG_RT="${ALL_AVG_RT[0]}" + +info "Infer = server-reported (pure processing). Rndtrip = wall-clock (what clients see)." +echo "" + +# --- Phase 2: Cold start measurement --- +info "Phase 2: Cold start (single-shot, recompiles ANE kernels)" + +kill "$SERVER_PID" 2>/dev/null || true +SERVER_PID="" +sleep 1 + +COLD_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') +COLD_OUT=$("$BINARY" "$WEIGHTS" "151644 8948 198 2610 525 264 10950 17847 13 151645 198 151644 872 198 13048 151645 198 151644 77091 198" 10 2>&1 || true) +COLD_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') +COLD_MS=$(echo "$COLD_T0 $COLD_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') + +echo "Cold start latency: ${COLD_MS}ms (includes ANE kernel compilation)" +echo "" + +# Re-start server (F16) for consistency check +start_server "$WEIGHTS_F16" + +# --- Phase 3: Repeated prompt (consistency check) --- +info "Phase 3: Decode speed consistency (5x same prompt, F16)" + +printf "%-6s %10s %10s %10s\n" "Run" "Prefill" "Decode" "Infer(ms)" +printf '%.0s─' {1..40}; echo "" + +for run in $(seq 1 5); do + RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ + -H "Content-Type: application/json" \ + -d '{"prompt": "Count from 1 to 10", "max_tokens": 50}' 2>&1) + P=$(json_val "$RESP" "prefill_tps") + D=$(json_val "$RESP" "decode_tps") + IM=$(trunc "$(json_val "$RESP" "inference_ms")") + printf "%-6s %10s %10s %10s\n" "#$run" "$P" "$D" "$IM" +done +echo "" + +# --- Save JSON results --- +JSON="{ + \"hardware\": \"$CHIP\", + \"macos\": \"$MACOS\", + \"memory_gb\": $MEM_GB, + \"model\": \"Qwen2.5-0.5B-Instruct\", + \"mode\": \"http_server\", + \"cold_start_ms\": $COLD_MS, + \"ane_formats\": {$( echo "$ANE_JSON_BLOCKS" | sed '$ s/,$//' ) + } +}" +echo "$JSON" > "$RESULTS_JSON" +dim "Results saved to $RESULTS_JSON" +echo "" + +# --- Phase 4: LM Studio comparison (if running) --- +LMS_PORT="${LMS_PORT:-1234}" +LMS_API_KEY="${LMS_API_KEY:-}" 
+ +# Models to benchmark (override via LMS_MODELS env var, comma-separated) +LMS_MODELS_DEFAULT="qwen2.5-0.5b-instruct,qwen2.5-0.5b-instruct-mlx@8bit,qwen2.5-0.5b-instruct-mlx@4bit" +IFS=',' read -ra LMS_MODEL_LIST <<< "${LMS_MODELS:-$LMS_MODELS_DEFAULT}" + +# Check if LM Studio is running +LMS_REACHABLE=0 +if curl -s --max-time 2 "http://localhost:$LMS_PORT/api/v1/chat" -H "Content-Type: application/json" -d '{}' >/dev/null 2>&1; then + LMS_REACHABLE=1 +fi + +if [ "$LMS_REACHABLE" -eq 1 ]; then + info "Phase 4: LM Studio comparison (localhost:$LMS_PORT)" + dim "Models: ${LMS_MODEL_LIST[*]}" + + if [ -z "$LMS_API_KEY" ]; then + echo "" + echo " LM Studio requires an API key." + echo " Find it in LM Studio > Developer tab > API key" + echo " Or set LMS_API_KEY env var before running." + echo "" + printf " Enter LM Studio API key (or press Enter to skip): " + read -r LMS_API_KEY + if [ -z "$LMS_API_KEY" ]; then + dim "Skipping LM Studio benchmark." + LMS_REACHABLE=0 + fi + fi +fi + +LMS_ALL_JSON="" + +if [ "$LMS_REACHABLE" -eq 1 ] && [ -n "$LMS_API_KEY" ]; then + + # Track the best model for the final comparison table + BEST_LMS_MODEL="" + BEST_LMS_TPS="0" + BEST_LMS_LAT="99999" + BEST_LMS_TTFT="0" + + for LMS_MODEL in "${LMS_MODEL_LIST[@]}"; do + echo "" + info "── $LMS_MODEL ──" + + # Test if this model is available + TEST_RESP=$(curl -s --max-time 10 "http://localhost:$LMS_PORT/api/v1/chat" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LMS_API_KEY" \ + -d "{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"test\",\"input\":\"hi\"}" 2>&1) + + if echo "$TEST_RESP" | grep -qi "error\|not found\|not loaded\|no model"; then + dim " Model '$LMS_MODEL' not available, skipping." 
+ continue + fi + + printf "%-10s %5s %5s %10s %10s %10s\n" \ + "Prompt" "In" "Out" "Decode" "TTFT" "Rndtrip" + printf "%-10s %5s %5s %10s %10s %10s\n" \ + "" "tok" "tok" "(t/s)" "(ms)" "(ms)" + printf '%.0s─' {1..55}; echo "" + + declare -a LMS_LATENCIES=() LMS_TPS_ARR=() LMS_TTFT_ARR=() + LMS_JSON_ENTRIES="" + + for i in $(seq 0 $((NUM_PROMPTS - 1))); do + NAME="${PROMPT_NAMES[$i]}" + PROMPT="${PROMPTS[$i]}" + + T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') + LMS_RESP=$(curl -s --max-time 120 "http://localhost:$LMS_PORT/api/v1/chat" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LMS_API_KEY" \ + -d "{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"You are a helpful assistant. Be concise.\",\"input\":\"$PROMPT\"}" 2>&1) + T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') + LMS_MS=$(echo "$T0 $T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') + + eval "$(echo "$LMS_RESP" | python3 -c " +import sys, json +try: + r = json.load(sys.stdin) + text = r.get('output', [{}])[0].get('content', '').replace(chr(10),' ').replace('\"', '') + s = r.get('stats', {}) + tps = s.get('tokens_per_second', 0) + ttft = int(s.get('time_to_first_token_seconds', 0) * 1000) + in_tok = s.get('input_tokens', 0) + out_tok = s.get('total_output_tokens', 0) + print(f'LMS_TEXT=\"{text}\"') + print(f'LMS_TPS={tps:.1f}') + print(f'LMS_TTFT={ttft}') + print(f'LMS_IN={in_tok}') + print(f'LMS_OUT={out_tok}') +except Exception as e: + print(f'LMS_TEXT=\"(parse error)\"') + print('LMS_TPS=0') + print('LMS_TTFT=0') + print('LMS_IN=0') + print('LMS_OUT=0') +" 2>/dev/null)" + + printf "%-10s %5s %5s %10s %10s %10s\n" "$NAME" "$LMS_IN" "$LMS_OUT" "$LMS_TPS" "$LMS_TTFT" "$LMS_MS" + LMS_LATENCIES+=("$LMS_MS") + LMS_TPS_ARR+=("$LMS_TPS") + LMS_TTFT_ARR+=("$LMS_TTFT") + LMS_JSON_ENTRIES="$LMS_JSON_ENTRIES{\"name\":\"$NAME\",\"latency_ms\":$LMS_MS,\"tps\":$LMS_TPS,\"ttft_ms\":$LMS_TTFT,\"input_tokens\":$LMS_IN,\"output_tokens\":$LMS_OUT}," + done + + printf '%.0s─' {1..55}; echo "" 
+ + M_AVG_LAT=$(shell_avg_int "${LMS_LATENCIES[@]}") + M_AVG_TPS=$(shell_avg "${LMS_TPS_ARR[@]}") + M_AVG_TTFT=$(shell_avg_int "${LMS_TTFT_ARR[@]}") + printf "%-10s %5s %5s %10s %10s %10s\n" "Average" "" "" "$M_AVG_TPS" "$M_AVG_TTFT" "$M_AVG_LAT" + + # Track the best model by decode t/s + if awk "BEGIN {exit !($M_AVG_TPS > $BEST_LMS_TPS)}" 2>/dev/null; then + BEST_LMS_MODEL="$LMS_MODEL" + BEST_LMS_TPS="$M_AVG_TPS" + BEST_LMS_LAT="$M_AVG_LAT" + BEST_LMS_TTFT="$M_AVG_TTFT" + fi + + LMS_ALL_JSON="$LMS_ALL_JSON + \"$(echo "$LMS_MODEL" | sed 's/[^a-zA-Z0-9._-]/_/g')\": { + \"model\": \"$LMS_MODEL\", + \"avg_latency_ms\": $M_AVG_LAT, + \"avg_tps\": $M_AVG_TPS, + \"avg_ttft_ms\": $M_AVG_TTFT, + \"results\": [${LMS_JSON_ENTRIES%,}] + }," + done + + echo "" + + # --- Final Comparison Table: all ANE formats + all LM Studio models --- + info "=== Multi-Format Comparison ===" + dim "(All times are wall-clock round-trip, apples-to-apples)" + echo "" + + # Collect all column names and data + declare -a COL_NAMES=() COL_DECODE=() COL_PREFILL=() COL_TTFT=() COL_RT=() COL_PREC=() COL_ACCEL=() + + for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do + COL_NAMES+=("ANE ${ANE_FMT_NAMES[$fi2]}") + COL_DECODE+=("${ALL_AVG_D[$fi2]}") + COL_PREFILL+=("${ALL_AVG_P[$fi2]}") + COL_TTFT+=("${ALL_AVG_TTFT[$fi2]}") + COL_RT+=("${ALL_AVG_RT[$fi2]}") + COL_PREC+=("${ANE_FMT_LABELS[$fi2]}") + if [ -n "${ANE_FMT_GPU[$fi2]}" ]; then + COL_ACCEL+=("Metal GPU") + else + COL_ACCEL+=("CPU (AMX)") + fi + done + + # Add each tested LM Studio model as a column + declare -a LMS_TESTED_NAMES=() LMS_TESTED_TPS=() LMS_TESTED_TTFT=() LMS_TESTED_LAT=() + for LMS_MODEL in "${LMS_MODEL_LIST[@]}"; do + # Check if this model was actually tested (has data in LMS_ALL_JSON) + SAFE_KEY=$(echo "$LMS_MODEL" | sed 's/[^a-zA-Z0-9._-]/_/g') + if echo "$LMS_ALL_JSON" | grep -q "\"$SAFE_KEY\""; then + M_TPS=$(echo "$LMS_ALL_JSON" | sed -n "/\"$SAFE_KEY\"/,/}/p" | sed -n 's/.*"avg_tps":[[:space:]]*\([0-9.]*\).*/\1/p' | head -1) + 
M_TTFT=$(echo "$LMS_ALL_JSON" | sed -n "/\"$SAFE_KEY\"/,/}/p" | sed -n 's/.*"avg_ttft_ms":[[:space:]]*\([0-9]*\).*/\1/p' | head -1) + M_LAT=$(echo "$LMS_ALL_JSON" | sed -n "/\"$SAFE_KEY\"/,/}/p" | sed -n 's/.*"avg_latency_ms":[[:space:]]*\([0-9]*\).*/\1/p' | head -1) + + SHORT_NAME=$(echo "$LMS_MODEL" | sed 's/qwen2.5-0.5b-instruct/q0.5b/; s/-mlx/mlx/') + COL_NAMES+=("LMS $SHORT_NAME") + COL_DECODE+=("${M_TPS:-0}") + COL_PREFILL+=("N/A") + COL_TTFT+=("${M_TTFT:-0}") + COL_RT+=("${M_LAT:-0}") + + PREC_TAG="GGUF" + echo "$LMS_MODEL" | grep -q "8bit" && PREC_TAG="MLX 8-bit" + echo "$LMS_MODEL" | grep -q "4bit" && PREC_TAG="MLX 4-bit" + COL_PREC+=("$PREC_TAG") + COL_ACCEL+=("CPU/GPU") + + LMS_TESTED_NAMES+=("$LMS_MODEL") + LMS_TESTED_TPS+=("${M_TPS:-0}") + LMS_TESTED_TTFT+=("${M_TTFT:-0}") + LMS_TESTED_LAT+=("${M_LAT:-0}") + fi + done + + NUM_COLS=${#COL_NAMES[@]} + COL_W=16 + + # Print header row + printf "%-20s" "" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_NAMES[$c]}"; done + echo "" + printf '%.0s─' $(seq 1 $((20 + NUM_COLS * COL_W))); echo "" + + # Data rows + printf "%-20s" "Decode (t/s)" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_DECODE[$c]}"; done + echo "" + + printf "%-20s" "Prefill (t/s)" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_PREFILL[$c]}"; done + echo "" + + printf "%-20s" "TTFT (ms)" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_TTFT[$c]}"; done + echo "" + + printf "%-20s" "Round-trip (ms)" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_RT[$c]}"; done + echo "" + + printf "%-20s" "Cold start (ms)" + printf "%${COL_W}s" "$COLD_MS" + for c in $(seq 1 $((NUM_COLS - 1))); do printf "%${COL_W}s" "N/A"; done + echo "" + + printf '%.0s─' $(seq 1 $((20 + NUM_COLS * COL_W))); echo "" + + printf "%-20s" "Precision" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_PREC[$c]}"; done + echo "" + + printf "%-20s" 
"Accelerator" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_ACCEL[$c]}"; done + echo "" + + printf "%-20s" "Timing" + for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "Wall-clock"; done + echo "" + echo "" + + # Append LM Studio results to JSON + LMS_JSON_BLOCK=", + \"lm_studio\": { + \"port\": $LMS_PORT, + \"models_tested\": [$(printf '"%s",' "${LMS_MODEL_LIST[@]}" | sed 's/,$//')],$( echo "$LMS_ALL_JSON" | sed '$ s/,$//' ) + } +}" + sed -i '' '$ s/}$//' "$RESULTS_JSON" + printf '%s\n' "$LMS_JSON_BLOCK" >> "$RESULTS_JSON" + dim "LM Studio results added to $RESULTS_JSON" +else + # No LM Studio -- print ANE-only comparison if we have multiple formats + if [ "$NUM_ANE_FMTS" -gt 1 ]; then + info "=== ANE Format Comparison ===" + echo "" + printf "%-20s" "" + for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "ANE ${ANE_FMT_NAMES[$fi2]}"; done + echo "" + printf '%.0s─' $(seq 1 $((20 + NUM_ANE_FMTS * 16))); echo "" + printf "%-20s" "Decode (t/s)" + for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_D[$fi2]}"; done + echo "" + printf "%-20s" "Prefill (t/s)" + for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_P[$fi2]}"; done + echo "" + printf "%-20s" "TTFT (ms)" + for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_TTFT[$fi2]}"; done + echo "" + printf "%-20s" "Round-trip (ms)" + for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_RT[$fi2]}"; done + echo "" + printf '%.0s─' $(seq 1 $((20 + NUM_ANE_FMTS * 16))); echo "" + echo "" + fi + + info "=== LM Studio Comparison ===" + echo "" + if [ "$LMS_REACHABLE" -eq 0 ]; then + echo " LM Studio server not detected on localhost:$LMS_PORT" + echo "" + echo " To enable automatic comparison:" + echo " 1. Open LM Studio, download Qwen2.5-0.5B-Instruct (GGUF + MLX variants)" + echo " 2. Load the model, go to Developer tab > Start Server" + echo " 3. 
Re-run this benchmark" + echo "" + echo " Or set env vars: LMS_PORT=1234 LMS_API_KEY=your-key ./benchmark.sh" + echo "" + echo " Models benchmarked by default:" + echo " - qwen2.5-0.5b-instruct (GGUF)" + echo " - qwen2.5-0.5b-instruct-mlx@8bit (MLX 8-bit)" + echo " - qwen2.5-0.5b-instruct-mlx@4bit (MLX 4-bit)" + echo "" + echo " Override with: LMS_MODELS='model1,model2' ./benchmark.sh" + fi + echo "" + echo " Manual test:" + echo " curl http://localhost:1234/api/v1/chat \\" + echo " -H 'Content-Type: application/json' \\" + echo " -H 'Authorization: Bearer YOUR_API_KEY' \\" + echo " -d '{\"model\":\"qwen2.5-0.5b-instruct\",\"system_prompt\":\"You are a helpful assistant.\",\"input\":\"What is 2+2?\"}'" + echo "" + echo " ANE F16: prefill=${AVG_P} t/s, decode=${AVG_D} t/s, inference=${AVG_INF}ms" + echo "" + echo " Note: LM Studio uses quantized GGUF/MLX (CPU/GPU) while we use" + echo " F16/Q8 weights running on CPU AMX / NEON." +fi +echo "" diff --git a/inference/convert_weights.py b/inference/convert_weights.py new file mode 100644 index 0000000..a7b01fa --- /dev/null +++ b/inference/convert_weights.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +"""Convert Qwen2.5-0.5B-Instruct safetensors → flat binary for ANE inference. + +Output format (F32): config header (8 ints) + all weights in f32 +Output format (F16): config header (8 ints) + embeddings f32 + projection weights f16 +Output format (Q8): config header (8 ints) + embeddings f32 + projection weights q8_0 +Output format (Q4): config header (8 ints) + embeddings f32 + projection weights q4_0 + +The 8th config int is the format flag: 0 = F32, 1 = F16, 2 = Q8, 3 = Q4. +Q8_0 format: blocks of 32 values, each block = 1 f16 scale + 32 int8 values (34 bytes). +Q4_0 format: blocks of 32 values, each block = 1 f16 scale + 1 f16 zero + 16 uint8 packed pairs (20 bytes). 
+
+Usage:
+    python3 convert_weights.py <model_dir> <output.bin> [--f16|--q8|--q4]
+"""
+
+import struct
+import sys
+import numpy as np
+from pathlib import Path
+from safetensors import safe_open
+
+Q8_BLOCK_SIZE = 32
+Q4_BLOCK_SIZE = 32
+
+def quantize_q4_0(weights_f32):
+    """Quantize a 2D weight matrix to Q4_0 block format.
+    Returns bytes: for each row, blocks of (f16_scale + f16_zero + 16*uint8 packed pairs).
+    Each uint8 stores two 4-bit values: low nibble = even index, high nibble = odd index."""
+    out_dim, in_dim = weights_f32.shape
+    assert in_dim % Q4_BLOCK_SIZE == 0, f"in_dim {in_dim} not divisible by {Q4_BLOCK_SIZE}"
+
+    n_blocks_per_row = in_dim // Q4_BLOCK_SIZE
+    result = bytearray()
+
+    for r in range(out_dim):
+        row = weights_f32[r]
+        for b in range(n_blocks_per_row):
+            block = row[b * Q4_BLOCK_SIZE : (b + 1) * Q4_BLOCK_SIZE]
+            bmin = np.min(block)
+            bmax = np.max(block)
+            if bmax == bmin:
+                scale = np.float16(0.0)
+                zero = np.float16(bmin)
+                packed = bytes(Q4_BLOCK_SIZE // 2)
+            else:
+                scale_f = (bmax - bmin) / 15.0
+                zero_f = bmin
+                scale = np.float16(scale_f)
+                zero = np.float16(zero_f)
+                scale_f = float(scale) if float(scale) != 0.0 else 1e-10
+                quant = np.clip(np.round((block - float(zero)) / scale_f), 0, 15).astype(np.uint8)
+                packed = bytearray(Q4_BLOCK_SIZE // 2)
+                for i in range(0, Q4_BLOCK_SIZE, 2):
+                    packed[i // 2] = quant[i] | (quant[i + 1] << 4)
+            result += scale.tobytes()
+            result += zero.tobytes()
+            result += bytes(packed)
+
+    return bytes(result)
+
+
+def quantize_q8_0(weights_f32):
+    """Quantize a 2D weight matrix to Q8_0 block format.
+ Returns bytes: for each row, blocks of (f16_scale + 32*int8).""" + out_dim, in_dim = weights_f32.shape + assert in_dim % Q8_BLOCK_SIZE == 0, f"in_dim {in_dim} not divisible by {Q8_BLOCK_SIZE}" + + n_blocks_per_row = in_dim // Q8_BLOCK_SIZE + result = bytearray() + + for r in range(out_dim): + row = weights_f32[r] + for b in range(n_blocks_per_row): + block = row[b * Q8_BLOCK_SIZE : (b + 1) * Q8_BLOCK_SIZE] + amax = np.max(np.abs(block)) + scale = amax / 127.0 if amax > 0 else 0.0 + if scale > 0: + quant = np.round(block / scale).astype(np.int8) + else: + quant = np.zeros(Q8_BLOCK_SIZE, dtype=np.int8) + result += np.float16(scale).tobytes() + result += quant.tobytes() + + return bytes(result) + + +def convert(model_dir: str, output_path: str, fmt: str = "f32"): + model_dir = Path(model_dir) + + st_files = list(model_dir.glob("*.safetensors")) + if not st_files: + print(f"No safetensors files in {model_dir}") + sys.exit(1) + + tensors = {} + for f in st_files: + with safe_open(str(f), framework="pt") as sf: + for key in sf.keys(): + tensors[key] = sf.get_tensor(key).float().numpy() + + print(f"Loaded {len(tensors)} tensors from {len(st_files)} files") + print(f"Mode: {fmt.upper()} projections (embeddings + norms + biases stay F32)") + + dim = 896 + hidden = 4864 + n_layers = 24 + n_heads = 14 + n_kv_heads = 2 + vocab_size = 151936 + max_seq = 512 + fmt_flag = {"f32": 0, "f16": 1, "q8": 2, "q4": 3}[fmt] + + def write_proj(f_out, tensor_f32): + if fmt == "q4": + f_out.write(quantize_q4_0(tensor_f32)) + elif fmt == "q8": + f_out.write(quantize_q8_0(tensor_f32)) + elif fmt == "f16": + f_out.write(tensor_f32.astype(np.float16).tobytes()) + else: + f_out.write(tensor_f32.astype(np.float32).tobytes()) + + with open(output_path, "wb") as f: + f.write(struct.pack("iiiiiiii", + dim, hidden, n_layers, n_heads, n_kv_heads, vocab_size, max_seq, fmt_flag)) + + emb = tensors["model.embed_tokens.weight"].astype(np.float32) + print(f"embed: {emb.shape} (f32)") + 
f.write(emb.tobytes()) + + for l in range(n_layers): + prefix = f"model.layers.{l}" + + rms_att = tensors[f"{prefix}.input_layernorm.weight"].astype(np.float32) + f.write(rms_att.tobytes()) + + wq = tensors[f"{prefix}.self_attn.q_proj.weight"].astype(np.float32) + wk = tensors[f"{prefix}.self_attn.k_proj.weight"].astype(np.float32) + wv = tensors[f"{prefix}.self_attn.v_proj.weight"].astype(np.float32) + wo = tensors[f"{prefix}.self_attn.o_proj.weight"].astype(np.float32) + write_proj(f, wq) + write_proj(f, wk) + write_proj(f, wv) + write_proj(f, wo) + + qb = tensors.get(f"{prefix}.self_attn.q_proj.bias") + kb = tensors.get(f"{prefix}.self_attn.k_proj.bias") + vb = tensors.get(f"{prefix}.self_attn.v_proj.bias") + f.write((qb if qb is not None else np.zeros(wq.shape[0])).astype(np.float32).tobytes()) + f.write((kb if kb is not None else np.zeros(wk.shape[0])).astype(np.float32).tobytes()) + f.write((vb if vb is not None else np.zeros(wv.shape[0])).astype(np.float32).tobytes()) + + rms_ffn = tensors[f"{prefix}.post_attention_layernorm.weight"].astype(np.float32) + f.write(rms_ffn.tobytes()) + + w_gate = tensors[f"{prefix}.mlp.gate_proj.weight"].astype(np.float32) + w_up = tensors[f"{prefix}.mlp.up_proj.weight"].astype(np.float32) + w_down = tensors[f"{prefix}.mlp.down_proj.weight"].astype(np.float32) + write_proj(f, w_gate) + write_proj(f, w_up) + write_proj(f, w_down) + + print(f" Layer {l}: Q{wq.shape} K{wk.shape} V{wv.shape} O{wo.shape} " + f"gate{w_gate.shape} up{w_up.shape} down{w_down.shape} [{fmt}]") + + rms_final = tensors["model.norm.weight"].astype(np.float32) + f.write(rms_final.tobytes()) + + size_mb = Path(output_path).stat().st_size / 1024 / 1024 + print(f"\nWritten: {output_path} ({size_mb:.0f} MB)") + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Usage: python3 convert_weights.py [--f16|--q8|--q4]") + sys.exit(1) + fmt = "f32" + if "--f16" in sys.argv: + fmt = "f16" + elif "--q8" in sys.argv: + fmt = "q8" + elif "--q4" in sys.argv: + 
        fmt = "q4"
+    convert(sys.argv[1], sys.argv[2], fmt)
diff --git a/inference/http_server.h b/inference/http_server.h
new file mode 100644
index 0000000..0dd4603
--- /dev/null
+++ b/inference/http_server.h
@@ -0,0 +1,221 @@
+// http_server.h -- Minimal HTTP/1.1 server for ANE inference API
+// Handles GET /health and POST /v1/completions using raw POSIX sockets.
+// No external dependencies.
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define HTTP_MAX_REQUEST 65536
+#define HTTP_MAX_RESPONSE 262144
+#define HTTP_MAX_BODY 65536
+
+// --- HTTP request parsing ---
+
+typedef struct {
+    char method[8];            // GET, POST, etc.
+    char path[256];            // /v1/completions, /health, etc.
+    char body[HTTP_MAX_BODY];
+    int body_len;
+    int content_length;
+} HttpRequest;
+
+static int http_parse_request(const char *raw, int raw_len, HttpRequest *req) {
+    memset(req, 0, sizeof(HttpRequest));
+
+    // Parse request line: METHOD PATH HTTP/1.1\r\n
+    const char *p = raw;
+    int i = 0;
+    while (*p && *p != ' ' && i < 7) req->method[i++] = *p++;
+    req->method[i] = '\0';
+    if (*p == ' ') p++;
+
+    i = 0;
+    while (*p && *p != ' ' && *p != '?' && i < 255) req->path[i++] = *p++;
+    req->path[i] = '\0';
+
+    // Skip to end of request line
+    while (*p && *p != '\n') p++;
+    if (*p) p++;
+
+    // Parse headers (only need Content-Length)
+    req->content_length = 0;
+    while (*p && !(*p == '\r' && *(p+1) == '\n') && *p != '\n') {
+        if (strncasecmp(p, "Content-Length:", 15) == 0) {
+            req->content_length = atoi(p + 15);
+        }
+        while (*p && *p != '\n') p++;
+        if (*p) p++;
+    }
+    // Skip blank line
+    if (*p == '\r') p++;
+    if (*p == '\n') p++;
+
+    // Copy body
+    int remaining = raw_len - (int)(p - raw);
+    req->body_len = remaining < HTTP_MAX_BODY - 1 ? 
remaining : HTTP_MAX_BODY - 1; + if (req->body_len > 0) memcpy(req->body, p, req->body_len); + req->body[req->body_len] = '\0'; + + return 0; +} + +// --- HTTP response sending --- + +static void http_send(int fd, int status, const char *status_text, + const char *content_type, const char *body, int body_len) { + char header[1024]; + int hlen = snprintf(header, sizeof(header), + "HTTP/1.1 %d %s\r\n" + "Content-Type: %s\r\n" + "Content-Length: %d\r\n" + "Access-Control-Allow-Origin: *\r\n" + "Access-Control-Allow-Methods: POST, GET, OPTIONS\r\n" + "Access-Control-Allow-Headers: Content-Type\r\n" + "Connection: close\r\n" + "\r\n", + status, status_text, content_type, body_len); + + write(fd, header, hlen); + if (body_len > 0) write(fd, body, body_len); +} + +static void http_send_json(int fd, int status, const char *json) { + const char *status_text = "OK"; + if (status == 400) status_text = "Bad Request"; + else if (status == 404) status_text = "Not Found"; + else if (status == 503) status_text = "Service Unavailable"; + http_send(fd, status, status_text, "application/json", json, (int)strlen(json)); +} + +// --- Minimal JSON field extraction --- + +static int http_json_get_string(const char *json, const char *key, + char *out, int max_out) { + char search[256]; + snprintf(search, sizeof(search), "\"%s\"", key); + const char *p = strstr(json, search); + if (!p) return -1; + p += strlen(search); + while (*p && (*p == ' ' || *p == ':' || *p == '\t')) p++; + if (*p != '"') return -1; + p++; + int n = 0; + while (*p && *p != '"' && n < max_out - 1) { + if (*p == '\\') { + p++; + switch (*p) { + case 'n': out[n++] = '\n'; break; + case 't': out[n++] = '\t'; break; + case '"': out[n++] = '"'; break; + case '\\': out[n++] = '\\'; break; + default: out[n++] = *p; + } + } else { + out[n++] = *p; + } + p++; + } + out[n] = '\0'; + return n; +} + +static int http_json_get_int(const char *json, const char *key, int default_val) { + char search[256]; + snprintf(search, 
sizeof(search), "\"%s\"", key); + const char *p = strstr(json, search); + if (!p) return default_val; + p += strlen(search); + while (*p && (*p == ' ' || *p == ':' || *p == '\t')) p++; + if (*p == '-' || (*p >= '0' && *p <= '9')) + return (int)strtol(p, NULL, 10); + return default_val; +} + +// --- TCP server --- + +typedef void (*HttpHandler)(int client_fd, HttpRequest *req, void *ctx); + +static int http_serve(int port, HttpHandler handler, void *ctx) { + int srv = socket(AF_INET, SOCK_STREAM, 0); + if (srv < 0) { perror("socket"); return -1; } + + int opt = 1; + setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr.sin_port = htons(port); + + if (bind(srv, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + perror("bind"); close(srv); return -1; + } + if (listen(srv, 8) < 0) { + perror("listen"); close(srv); return -1; + } + + printf("HTTP server listening on http://127.0.0.1:%d\n", port); + printf(" POST /v1/completions {\"prompt\": \"...\", \"max_tokens\": 50}\n"); + printf(" GET /health\n"); + printf("READY\n"); + fflush(stdout); + + while (1) { + int client = accept(srv, NULL, NULL); + if (client < 0) { perror("accept"); continue; } + + // Read full request (headers + body) + char buf[HTTP_MAX_REQUEST]; + int total = 0; + int headers_done = 0; + int content_length = 0; + int body_start = 0; + + while (total < HTTP_MAX_REQUEST - 1) { + ssize_t n = read(client, buf + total, HTTP_MAX_REQUEST - 1 - total); + if (n <= 0) break; + total += n; + buf[total] = '\0'; + + if (!headers_done) { + char *hend = strstr(buf, "\r\n\r\n"); + if (hend) { + headers_done = 1; + body_start = (int)(hend - buf) + 4; + // Extract Content-Length + char *cl = strcasestr(buf, "Content-Length:"); + if (cl) content_length = atoi(cl + 15); + } + } + + if (headers_done) { + int body_received = total - body_start; + if (body_received >= 
content_length) break;
+            }
+        }
+
+        HttpRequest req;
+        http_parse_request(buf, total, &req);
+
+        // Handle OPTIONS preflight
+        if (strcmp(req.method, "OPTIONS") == 0) {
+            http_send(client, 204, "No Content", "text/plain", "", 0);
+            close(client);
+            continue;
+        }
+
+        handler(client, &req, ctx);
+        close(client);
+    }
+
+    return 0;
+}
diff --git a/inference/main.m b/inference/main.m
new file mode 100644
index 0000000..6e9e719
--- /dev/null
+++ b/inference/main.m
@@ -0,0 +1,806 @@
+// main.m -- Qwen2.5-0.5B inference on Apple Neural Engine
+// Supports four modes:
+//   1. Single-shot:  ./qwen_ane weights.bin "token_ids" [max_tokens]
+//   2. Stdin server: ./qwen_ane weights.bin --server
+//   3. Socket server: ./qwen_ane weights.bin --server /tmp/qwen_ane.sock
+//   4. HTTP API:     ./qwen_ane weights.bin --http 8000 --model-dir ~/models/Qwen2.5-0.5B-Instruct
+//
+// Build:
+//   xcrun clang -O3 -ffast-math -mcpu=apple-m4 -flto \
+//     -framework Foundation -framework IOSurface \
+//     -framework CoreML -framework Accelerate -framework Metal \
+//     -ldl -lobjc -fobjc-arc -o qwen_ane main.m
+//
+#import <Foundation/Foundation.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include "qwen_ane_infer.h"
+#include "tokenizer.h"
+#include "http_server.h"
+
+int g_fp16_io = 0;
+static QwenModel g_model;
+static const char *g_sock_path = NULL;
+static Tokenizer g_tokenizer;
+static int g_tokenizer_loaded = 0;
+
+static void cleanup_socket(void) {
+    if (g_sock_path) unlink(g_sock_path);
+}
+
+static void handle_signal(int sig) {
+    (void)sig;
+    cleanup_socket();
+    _exit(0);
+}
+
+static void *safe_malloc(size_t size, const char *desc) {
+    void *p = malloc(size);
+    if (!p) {
+        fprintf(stderr, "FATAL: malloc failed for %s (%.1f MB)\n",
+                desc, (double)size / (1024*1024));
+        exit(1);
+    }
+    return p;
+}
+
+static void *safe_calloc(size_t count, size_t size, const char *desc) {
+    void *p = calloc(count, size);
+    if (!p) {
+        fprintf(stderr, "FATAL: calloc failed for %s (%.1f MB)\n",
+                desc, 
(double)(count * size) / (1024*1024)); + exit(1); + } + return p; +} + +static int load_weights(const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; } + + // Try 8-int header first (new format), fall back to 7-int (legacy) + int config[8] = {0}; + size_t hdr_read = fread(config, sizeof(int), 8, f); + int dim = config[0], hidden = config[1], n_layers = config[2]; + int n_heads = config[3], n_kv_heads = config[4], vocab = config[5]; + int fmt_flag = 0; + + if (hdr_read == 8 && config[7] >= 0 && config[7] <= 3) { + fmt_flag = config[7]; + } else { + fseek(f, 7 * sizeof(int), SEEK_SET); + } + + g_model.weight_fmt = fmt_flag; + int is_f16 = (fmt_flag == 1); + int is_q8 = (fmt_flag == 2); + int is_q4 = (fmt_flag == 3); + const char *fmt_str = is_q4 ? "Q4" : (is_q8 ? "Q8" : (is_f16 ? "F16" : "F32")); + printf("Config: dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d fmt=%s\n", + dim, hidden, n_layers, n_heads, n_kv_heads, vocab, fmt_str); + + int q_dim = n_heads * QWEN_HEAD_DIM; + int kv_dim = n_kv_heads * QWEN_HEAD_DIM; + + // Embeddings always F32 + g_model.embed = (float*)safe_malloc((size_t)vocab * dim * sizeof(float), "embed"); + fread(g_model.embed, sizeof(float), (size_t)vocab * dim, f); + + for (int l = 0; l < n_layers; l++) { + // RMSNorm always F32 + g_model.rms_att[l] = (float*)malloc(dim * sizeof(float)); + fread(g_model.rms_att[l], sizeof(float), dim, f); + + if (is_q4) { + #define LOAD_Q4(q8ptr, out_d, in_d) do { \ + size_t _nb = (size_t)(in_d) / Q4_BLOCK_SIZE; \ + size_t _bytes = (size_t)(out_d) * _nb * Q4_BLOCK_BYTES; \ + q8ptr = (uint8_t*)safe_malloc(_bytes, #q8ptr); \ + fread(q8ptr, 1, _bytes, f); \ + } while(0) + LOAD_Q4(g_model.wq_q8[l], q_dim, dim); + LOAD_Q4(g_model.wk_q8[l], kv_dim, dim); + LOAD_Q4(g_model.wv_q8[l], kv_dim, dim); + LOAD_Q4(g_model.wo_q8[l], dim, q_dim); + #undef LOAD_Q4 + } else if (is_q8) { + #define LOAD_Q8(q8ptr, out_d, in_d) do { \ + size_t _nb = 
(size_t)(in_d) / Q8_BLOCK_SIZE; \ + size_t _bytes = (size_t)(out_d) * _nb * Q8_BLOCK_BYTES; \ + q8ptr = (uint8_t*)safe_malloc(_bytes, #q8ptr); \ + fread(q8ptr, 1, _bytes, f); \ + } while(0) + LOAD_Q8(g_model.wq_q8[l], q_dim, dim); + LOAD_Q8(g_model.wk_q8[l], kv_dim, dim); + LOAD_Q8(g_model.wv_q8[l], kv_dim, dim); + LOAD_Q8(g_model.wo_q8[l], dim, q_dim); + #undef LOAD_Q8 + } else if (is_f16) { + #define LOAD_F16_AS_F32(f32ptr, f16ptr, n) do { \ + size_t _n = (size_t)(n); \ + f16ptr = (_Float16*)malloc(_n * sizeof(_Float16)); \ + fread(f16ptr, sizeof(_Float16), _n, f); \ + f32ptr = (float*)malloc(_n * sizeof(float)); \ + convert_f16_to_f32(f16ptr, f32ptr, _n); \ + } while(0) + LOAD_F16_AS_F32(g_model.wq[l], g_model.wq_f16[l], (size_t)q_dim * dim); + LOAD_F16_AS_F32(g_model.wk[l], g_model.wk_f16[l], (size_t)kv_dim * dim); + LOAD_F16_AS_F32(g_model.wv[l], g_model.wv_f16[l], (size_t)kv_dim * dim); + LOAD_F16_AS_F32(g_model.wo[l], g_model.wo_f16[l], (size_t)dim * q_dim); + #undef LOAD_F16_AS_F32 + } else { + g_model.wq[l] = (float*)malloc((size_t)q_dim * dim * sizeof(float)); + fread(g_model.wq[l], sizeof(float), (size_t)q_dim * dim, f); + g_model.wk[l] = (float*)malloc((size_t)kv_dim * dim * sizeof(float)); + fread(g_model.wk[l], sizeof(float), (size_t)kv_dim * dim, f); + g_model.wv[l] = (float*)malloc((size_t)kv_dim * dim * sizeof(float)); + fread(g_model.wv[l], sizeof(float), (size_t)kv_dim * dim, f); + g_model.wo[l] = (float*)malloc((size_t)q_dim * dim * sizeof(float)); + fread(g_model.wo[l], sizeof(float), (size_t)dim * q_dim, f); + } + + // Biases always F32 + g_model.q_bias[l] = (float*)malloc(q_dim * sizeof(float)); + g_model.k_bias[l] = (float*)malloc(kv_dim * sizeof(float)); + g_model.v_bias[l] = (float*)malloc(kv_dim * sizeof(float)); + fread(g_model.q_bias[l], sizeof(float), q_dim, f); + fread(g_model.k_bias[l], sizeof(float), kv_dim, f); + fread(g_model.v_bias[l], sizeof(float), kv_dim, f); + + // FFN RMSNorm always F32 + g_model.rms_ffn[l] = 
(float*)malloc(dim * sizeof(float)); + fread(g_model.rms_ffn[l], sizeof(float), dim, f); + + if (is_q4) { + #define LOAD_Q4(q8ptr, out_d, in_d) do { \ + size_t _nb = (size_t)(in_d) / Q4_BLOCK_SIZE; \ + size_t _bytes = (size_t)(out_d) * _nb * Q4_BLOCK_BYTES; \ + q8ptr = (uint8_t*)safe_malloc(_bytes, #q8ptr); \ + fread(q8ptr, 1, _bytes, f); \ + } while(0) + LOAD_Q4(g_model.wgate_q8[l], hidden, dim); + LOAD_Q4(g_model.wup_q8[l], hidden, dim); + LOAD_Q4(g_model.wdown_q8[l], dim, hidden); + #undef LOAD_Q4 + } else if (is_q8) { + #define LOAD_Q8(q8ptr, out_d, in_d) do { \ + size_t _nb = (size_t)(in_d) / Q8_BLOCK_SIZE; \ + size_t _bytes = (size_t)(out_d) * _nb * Q8_BLOCK_BYTES; \ + q8ptr = (uint8_t*)safe_malloc(_bytes, #q8ptr); \ + fread(q8ptr, 1, _bytes, f); \ + } while(0) + LOAD_Q8(g_model.wgate_q8[l], hidden, dim); + LOAD_Q8(g_model.wup_q8[l], hidden, dim); + LOAD_Q8(g_model.wdown_q8[l], dim, hidden); + #undef LOAD_Q8 + } else if (is_f16) { + #define LOAD_F16_AS_F32(f32ptr, f16ptr, n) do { \ + size_t _n = (size_t)(n); \ + f16ptr = (_Float16*)malloc(_n * sizeof(_Float16)); \ + fread(f16ptr, sizeof(_Float16), _n, f); \ + f32ptr = (float*)malloc(_n * sizeof(float)); \ + convert_f16_to_f32(f16ptr, f32ptr, _n); \ + } while(0) + LOAD_F16_AS_F32(g_model.w_gate[l], g_model.wgate_f16[l], (size_t)hidden * dim); + LOAD_F16_AS_F32(g_model.w_up[l], g_model.wup_f16[l], (size_t)hidden * dim); + LOAD_F16_AS_F32(g_model.w_down[l], g_model.wdown_f16[l], (size_t)dim * hidden); + #undef LOAD_F16_AS_F32 + } else { + g_model.w_gate[l] = (float*)malloc((size_t)hidden * dim * sizeof(float)); + fread(g_model.w_gate[l], sizeof(float), (size_t)hidden * dim, f); + g_model.w_up[l] = (float*)malloc((size_t)hidden * dim * sizeof(float)); + fread(g_model.w_up[l], sizeof(float), (size_t)hidden * dim, f); + g_model.w_down[l] = (float*)malloc((size_t)dim * hidden * sizeof(float)); + fread(g_model.w_down[l], sizeof(float), (size_t)dim * hidden, f); + } + } + + g_model.rms_final = (float*)malloc(dim * 
sizeof(float)); + fread(g_model.rms_final, sizeof(float), dim, f); + + long file_size = ftell(f); + fclose(f); + printf("Weights loaded (%.0f MB, %s projections)\n", + (float)file_size / 1024 / 1024, fmt_str); + return 0; +} + +// Parse space-separated token IDs from a string. Returns count. +static int parse_tokens(const char *str, int *ids, int max_ids) { + int n = 0; + char *buf = strdup(str); + char *saveptr; + char *p = strtok_r(buf, " \t\n\r", &saveptr); + while (p && n < max_ids) { + ids[n++] = atoi(p); + p = strtok_r(NULL, " \t\n\r", &saveptr); + } + free(buf); + return n; +} + +static double timespec_diff(struct timespec *a, struct timespec *b) { + return (b->tv_sec - a->tv_sec) + (b->tv_nsec - a->tv_nsec) / 1e9; +} + +// Run one generation pass. Writes output token IDs to out_ids, returns count. +// Uses batched prefill (sgemm) for prompt, sequential decode (sgemv) for generation. +static int generate(int *prompt_ids, int n_prompt, int max_gen, + int *out_ids, int max_out, + double *prefill_tps, double *decode_tps) { + struct timespec t0, t1, t_pre; + clock_gettime(CLOCK_MONOTONIC, &t0); + + int next; + if (g_model.use_ane) { + for (int i = 0; i < n_prompt; i++) + next = qwen_forward_ane(&g_model, prompt_ids[i]); + } else if (n_prompt > 1 && g_model.weight_fmt == 3) { + next = qwen_prefill_q4(&g_model, prompt_ids, n_prompt); + } else if (n_prompt > 1 && g_model.weight_fmt != 2) { + next = qwen_prefill(&g_model, prompt_ids, n_prompt); + } else { + for (int i = 0; i < n_prompt; i++) + next = qwen_forward(&g_model, prompt_ids[i]); + } + + clock_gettime(CLOCK_MONOTONIC, &t_pre); + double ps = timespec_diff(&t0, &t_pre); + *prefill_tps = ps > 0 ? 
n_prompt / ps : 0; + + int eos = 151645, eos2 = 151643; + int n_out = 0; + for (int i = 0; i < max_gen && n_out < max_out; i++) { + if (n_out < max_out) out_ids[n_out++] = next; + if (next == eos || next == eos2) break; + if (g_model.use_ane) + next = qwen_forward_ane(&g_model, next); + else + next = qwen_forward(&g_model, next); + } + + clock_gettime(CLOCK_MONOTONIC, &t1); + double ds = timespec_diff(&t_pre, &t1); + int gen_tokens = n_out > 1 ? n_out - 1 : 0; + *decode_tps = ds > 0 ? gen_tokens / ds : 0; + + return n_out; +} + +// --- Stdin server mode --- +static void run_stdin_server(void) { + printf("READY\n"); + fflush(stdout); + + char line[65536]; + while (fgets(line, sizeof(line), stdin)) { + // Format: "token_id token_id ... [|max_tokens]" + int max_gen = 50; + char *pipe = strchr(line, '|'); + if (pipe) { + max_gen = atoi(pipe + 1); + *pipe = '\0'; + } + + int prompt_ids[2048]; + int n_prompt = parse_tokens(line, prompt_ids, 2048); + if (n_prompt == 0) { + printf("ERR: empty prompt\n"); + fflush(stdout); + continue; + } + + int out_ids[4096]; + double p_tps, d_tps; + int n_out = generate(prompt_ids, n_prompt, max_gen, out_ids, 4096, &p_tps, &d_tps); + + printf("OUT:"); + for (int i = 0; i < n_out; i++) printf(" %d", out_ids[i]); + printf("\n"); + printf("PERF: prefill=%.1f decode=%.1f prompt=%d gen=%d\n", + p_tps, d_tps, n_prompt, n_out); + fflush(stdout); + + qwen_reset(&g_model); + } +} + +// --- Socket server mode --- +static void run_socket_server(const char *sock_path) { + g_sock_path = sock_path; + signal(SIGINT, handle_signal); + signal(SIGTERM, handle_signal); + atexit(cleanup_socket); + + unlink(sock_path); + + int srv = socket(AF_UNIX, SOCK_STREAM, 0); + if (srv < 0) { perror("socket"); return; } + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sock_path, sizeof(addr.sun_path) - 1); + + if (bind(srv, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + perror("bind"); close(srv); 
return;
    }
    if (listen(srv, 4) < 0) {
        perror("listen"); close(srv); return;
    }

    printf("Listening on %s\n", sock_path);
    printf("READY\n");
    fflush(stdout);

    while (1) {
        int client = accept(srv, NULL, NULL);
        if (client < 0) { perror("accept"); continue; }

        // Read request: {"tokens": [1,2,3], "max_tokens": 50}
        char buf[131072];
        ssize_t total = 0;
        while (total < (ssize_t)sizeof(buf) - 1) {
            ssize_t n = read(client, buf + total, sizeof(buf) - 1 - total);
            if (n <= 0) break;
            total += n;
            // Stop as soon as a complete object (or newline) has arrived.
            if (memchr(buf, '\n', total) || memchr(buf, '}', total)) break;
        }
        buf[total] = '\0';

        // Minimal JSON parsing for {"tokens": [...], "max_tokens": N}
        int prompt_ids[2048];
        int n_prompt = 0;
        int max_gen = 50;

        char *tok_start = strstr(buf, "\"tokens\"");
        if (tok_start) {
            char *bracket = strchr(tok_start, '[');
            if (bracket) {
                char *p = bracket + 1;
                while (*p && *p != ']' && n_prompt < 2048) {
                    while (*p && (*p == ' ' || *p == ',')) p++;
                    if (*p == ']') break;
                    prompt_ids[n_prompt++] = (int)strtol(p, &p, 10);
                }
            }
        }

        char *mt = strstr(buf, "\"max_tokens\"");
        if (mt) {
            char *colon = strchr(mt, ':');
            if (colon) max_gen = (int)strtol(colon + 1, NULL, 10);
        }

        if (n_prompt == 0) {
            const char *err = "{\"error\": \"no tokens\"}\n";
            if (write(client, err, strlen(err)) < 0) perror("write");  // FIX: check write()
            close(client);
            continue;
        }

        int out_ids[4096];
        double p_tps, d_tps;
        int n_out = generate(prompt_ids, n_prompt, max_gen, out_ids, 4096, &p_tps, &d_tps);

        // Build JSON response. FIX: snprintf returns the *would-be* length,
        // so an unchecked running `off` could exceed sizeof(resp) and make
        // `sizeof(resp) - off` underflow (size_t wrap). Clamp after each stage.
        char resp[131072];
        int off = snprintf(resp, sizeof(resp),
            "{\"output\": [");
        for (int i = 0; i < n_out && off < (int)sizeof(resp) - 1; i++)
            off += snprintf(resp + off, sizeof(resp) - off,
                "%s%d", i ? ", " : "", out_ids[i]);
        if (off > (int)sizeof(resp) - 1) off = (int)sizeof(resp) - 1;
        off += snprintf(resp + off, sizeof(resp) - off,
            "], \"prefill_tps\": %.1f, \"decode_tps\": %.1f, "
            "\"prompt_tokens\": %d, \"gen_tokens\": %d}\n",
            p_tps, d_tps, n_prompt, n_out);
        if (off > (int)sizeof(resp) - 1) off = (int)sizeof(resp) - 1;

        if (write(client, resp, off) < 0) perror("write");  // FIX: check write()
        close(client);

        printf("[socket] prompt=%d gen=%d prefill=%.1f decode=%.1f t/s\n",
               n_prompt, n_out, p_tps, d_tps);
        fflush(stdout);

        qwen_reset(&g_model);
    }
}

// --- HTTP API handler ---
// GET /health is a liveness probe; POST /v1/completions takes
// {"prompt": ..., "max_tokens": N, "system": ...} and returns the decoded
// text plus a tokenize/inference/detokenize timing breakdown.
static void http_api_handler(int client_fd, HttpRequest *req, void *ctx) {
    (void)ctx;

    if (strcmp(req->method, "GET") == 0 && strcmp(req->path, "/health") == 0) {
        http_send_json(client_fd, 200, "{\"status\":\"ok\",\"mode\":\"http\"}");
        return;
    }

    if (strcmp(req->method, "POST") != 0 || strcmp(req->path, "/v1/completions") != 0) {
        http_send_json(client_fd, 404, "{\"error\":\"not found, use POST /v1/completions\"}");
        return;
    }

    if (req->body_len == 0) {
        http_send_json(client_fd, 400, "{\"error\":\"empty body\"}");
        return;
    }

    char prompt[32768];
    if (http_json_get_string(req->body, "prompt", prompt, sizeof(prompt)) < 0) {
        http_send_json(client_fd, 400, "{\"error\":\"missing 'prompt' field\"}");
        return;
    }

    int max_tokens = http_json_get_int(req->body, "max_tokens", 50);
    if (max_tokens > 512) max_tokens = 512;
    if (max_tokens < 1) max_tokens = 1;

    char system_prompt[4096];
    if (http_json_get_string(req->body, "system", system_prompt, sizeof(system_prompt)) < 0)
        strcpy(system_prompt, "You are a helpful assistant. Be concise.");

    // Time tokenization separately
    struct timespec t_tok0, t_tok1, t_gen0, t_gen1, t_det0, t_det1;

    clock_gettime(CLOCK_MONOTONIC, &t_tok0);
    int input_ids[4096];
    int n_input = tok_encode_chat(&g_tokenizer, system_prompt, prompt, input_ids, 4096);
    clock_gettime(CLOCK_MONOTONIC, &t_tok1);
    double tokenize_ms = timespec_diff(&t_tok0, &t_tok1) * 1000.0;

    if (n_input == 0) {
        http_send_json(client_fd, 400, "{\"error\":\"tokenization produced no tokens\"}");
        return;
    }

    // Pure inference timing
    clock_gettime(CLOCK_MONOTONIC, &t_gen0);
    int out_ids[4096];
    double p_tps, d_tps;
    int n_out = generate(input_ids, n_input, max_tokens, out_ids, 4096, &p_tps, &d_tps);
    clock_gettime(CLOCK_MONOTONIC, &t_gen1);
    double inference_ms = timespec_diff(&t_gen0, &t_gen1) * 1000.0;

    // Prefill time = inference of prompt tokens only (from generate's internal timing)
    double prefill_s = p_tps > 0 ? n_input / p_tps : 0;
    double ttft_ms = prefill_s * 1000.0;

    // Time detokenization separately
    clock_gettime(CLOCK_MONOTONIC, &t_det0);
    char decoded[65536];
    tok_decode(&g_tokenizer, out_ids, n_out, decoded, sizeof(decoded));
    clock_gettime(CLOCK_MONOTONIC, &t_det1);
    double detokenize_ms = timespec_diff(&t_det0, &t_det1) * 1000.0;

    double total_ms = tokenize_ms + inference_ms + detokenize_ms;

    // Escape the decoded text for JSON. The "- 6" headroom leaves room for
    // a full \uXXXX escape plus the terminating NUL.
    char escaped[131072];
    int ei = 0;
    for (int i = 0; decoded[i] && ei < (int)sizeof(escaped) - 6; i++) {
        switch (decoded[i]) {
            case '"': escaped[ei++] = '\\'; escaped[ei++] = '"'; break;
            case '\\': escaped[ei++] = '\\'; escaped[ei++] = '\\'; break;
            case '\n': escaped[ei++] = '\\'; escaped[ei++] = 'n'; break;
            case '\r': escaped[ei++] = '\\'; escaped[ei++] = 'r'; break;
            case '\t': escaped[ei++] = '\\'; escaped[ei++] = 't'; break;
            default:
                if ((unsigned char)decoded[i] < 0x20) {
                    ei += snprintf(escaped + ei, 7, "\\u%04x", (unsigned char)decoded[i]);
                } else {
                    escaped[ei++] = decoded[i];
                }
        }
    }
    escaped[ei] = '\0';

    // Build JSON response with detailed timing breakdown
    char resp[HTTP_MAX_RESPONSE];
    snprintf(resp, sizeof(resp),
        "{\"text\":\"%s\",\"prompt_tokens\":%d,\"gen_tokens\":%d,"
        "\"prefill_tps\":%.1f,\"decode_tps\":%.1f,"
        "\"tokenize_ms\":%.1f,\"inference_ms\":%.1f,\"detokenize_ms\":%.1f,"
        "\"ttft_ms\":%.1f,\"total_ms\":%.1f}",
        escaped, n_input, n_out, p_tps, d_tps,
        tokenize_ms, inference_ms, detokenize_ms, ttft_ms, total_ms);

    http_send_json(client_fd, 200, resp);

    printf("[http] prompt=%d gen=%d prefill=%.1f decode=%.1f t/s | tok=%.1f inf=%.1f detok=%.1f ms\n",
           n_input, n_out, p_tps, d_tps, tokenize_ms, inference_ms, detokenize_ms);
    fflush(stdout);

    qwen_reset(&g_model);
}

int main(int argc, char **argv) {
    @autoreleasepool {
        if (argc < 2) {
            fprintf(stderr,
                "Usage:\n"
                " %s \"token_ids\" [max_tokens] (single-shot)\n"
                " %s --server (stdin loop)\n"
                " %s --server /tmp/qwen_ane.sock (socket server)\n"
                " %s --http 8000 --model-dir ~/models/Qwen2.5 (HTTP API)\n",
                argv[0], argv[0], argv[0], argv[0]);
            return 1;
        }

        printf("=== Qwen2.5-0.5B ANE Inference ===\n\n");

        setbuf(stdout, NULL);  // unbuffered stdout so READY/PERF lines flush immediately

        printf("Loading weights...\n");
        if (load_weights(argv[1]) != 0) return 1;

        qwen_alloc(&g_model);
        qwen_rope_init();

        printf("Compiling ANE kernels (169 total)...\n");
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        qwen_compile_kernels(&g_model);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double compile_sec = timespec_diff(&t0, &t1);
        printf("Compile time: %.1fs\n\n", compile_sec);

        // Parse flags
        int server_mode = 0;
        int http_port = 0;
        int test_ane = 0;
        int use_ane = 0;
        const char *sock_path = NULL;
        const char *model_dir = NULL;
        for (int i = 2; i < argc; i++) {
            if (strcmp(argv[i], "--server") == 0) {
                server_mode = 1;
                if (i + 1 < argc && argv[i+1][0] != '-')
                    sock_path = argv[++i];
            } else if (strcmp(argv[i], "--http") == 0) {
                if (i + 1 <
argc) http_port = atoi(argv[++i]);
                else { fprintf(stderr, "--http requires a port number\n"); return 1; }
            } else if (strcmp(argv[i], "--model-dir") == 0) {
                if (i + 1 < argc) model_dir = argv[++i];
                else { fprintf(stderr, "--model-dir requires a path\n"); return 1; }
            } else if (strcmp(argv[i], "--test-ane") == 0) {
                test_ane = 1;
            } else if (strcmp(argv[i], "--ane") == 0) {
                use_ane = 1;
            }
        }

        // Q4 CPU mode: dequantize Q4 to F32 at load time, use AMX cblas_sgemv
        if (g_model.weight_fmt == 3) {
            printf("Dequantizing Q4→F32 for AMX acceleration...\n");
            int q_dim = QWEN_Q_DIM, kv_dim = QWEN_KV_DIM, dim = QWEN_DIM;
            int hidden = QWEN_HIDDEN;

            // Expand one [out_d, in_d] Q4 matrix to F32 and release the Q4 copy.
            #define DEQUANT_Q4_TO_F32(f32ptr, q4ptr, out_d, in_d) do { \
                size_t _n = (size_t)(out_d) * (in_d); \
                f32ptr = (float*)malloc(_n * sizeof(float)); \
                dequant_q4_to_f32(q4ptr, f32ptr, (in_d), (out_d)); \
                free(q4ptr); q4ptr = NULL; \
            } while(0)

            for (int l = 0; l < QWEN_LAYERS; l++) {
                DEQUANT_Q4_TO_F32(g_model.wq[l], g_model.wq_q8[l], q_dim, dim);
                DEQUANT_Q4_TO_F32(g_model.wk[l], g_model.wk_q8[l], kv_dim, dim);
                DEQUANT_Q4_TO_F32(g_model.wv[l], g_model.wv_q8[l], kv_dim, dim);
                DEQUANT_Q4_TO_F32(g_model.wo[l], g_model.wo_q8[l], dim, q_dim);
                DEQUANT_Q4_TO_F32(g_model.w_gate[l], g_model.wgate_q8[l], hidden, dim);
                DEQUANT_Q4_TO_F32(g_model.w_up[l], g_model.wup_q8[l], hidden, dim);
                DEQUANT_Q4_TO_F32(g_model.w_down[l], g_model.wdown_q8[l], dim, hidden);
            }
            #undef DEQUANT_Q4_TO_F32

            g_model.weight_fmt = 0;
            printf("Q4→F32 done. Using AMX cblas_sgemv (91+ t/s decode).\n");
        }

        // ANE fused kernel compilation (requires F32 weights for baked-weight convs)
        if (use_ane) {
            if (g_model.weight_fmt != 0) {
                printf("--ane requires F32 weights (weight_fmt=0). Got fmt=%d\n", g_model.weight_fmt);
                printf("Re-run with F32 weight file (convert_weights.py without --f16/--q4/--q8)\n");
                use_ane = 0;
            } else {
                struct timespec ta0, ta1;
                clock_gettime(CLOCK_MONOTONIC, &ta0);
                qwen_compile_kernels_fused(&g_model);
                clock_gettime(CLOCK_MONOTONIC, &ta1);
                double ane_sec = timespec_diff(&ta0, &ta1);
                printf("ANE fused compile time: %.1fs\n", ane_sec);

                // Verify at least one QKV kernel compiled
                if (g_model.k_qkv[0] && g_model.k_o[0] && g_model.k_ffn_up[0] && g_model.k_down[0]) {
                    g_model.use_ane = 1;
                    printf("ANE fused mode active: 112 kernels (QKV+FFN_up fused)\n");
                } else {
                    printf("ANE fused compilation failed, falling back to CPU\n");
                    use_ane = 0;
                }
            }
        }

        // ANE vs CPU correctness test
        if (test_ane) {
            printf("=== ANE vs CPU Projection Test ===\n\n");

            // Use a realistic input: embed token 2610 ("What"), RMSNorm it
            int test_token = 2610;
            memcpy(g_model.x, g_model.embed + test_token * QWEN_DIM, QWEN_DIM * sizeof(float));
            qwen_rmsnorm(g_model.xb, g_model.x, g_model.rms_att[0], QWEN_DIM);

            // Also prepare a realistic Q output for the O projection test
            cpu_project(g_model.wq[0], g_model.xb, g_model.q, QWEN_DIM, QWEN_Q_DIM);

            float *cpu_out = (float*)calloc(QWEN_HIDDEN, sizeof(float));
            float *ane_out = (float*)calloc(QWEN_HIDDEN, sizeof(float));

            struct {
                const char *name;
                ANEKernel *kernel;
                const float *weights;
                int in_dim, out_dim;
            } tests[] = {
                {"L0 Q proj", g_model.k_q[0], g_model.wq[0], QWEN_DIM, QWEN_Q_DIM},
                {"L0 K proj", g_model.k_k[0], g_model.wk[0], QWEN_DIM, QWEN_KV_DIM},
                {"L0 V proj", g_model.k_v[0], g_model.wv[0], QWEN_DIM, QWEN_KV_DIM},
                {"L0 O proj", g_model.k_o[0], g_model.wo[0], QWEN_Q_DIM, QWEN_DIM},
                {"L0 Gate", g_model.k_gate[0], g_model.w_gate[0], QWEN_DIM, QWEN_HIDDEN},
                {"L0 Up", g_model.k_up[0], g_model.w_up[0], QWEN_DIM, QWEN_HIDDEN},
                {"L0 Down", g_model.k_down[0], g_model.w_down[0], QWEN_HIDDEN, QWEN_DIM},
                {"LM Head c0", g_model.k_lmhead[0], g_model.embed, QWEN_DIM, QWEN_LM_CHUNK_SIZE},
            };
            int n_tests = sizeof(tests) / sizeof(tests[0]);
            int all_pass = 1;

            for (int t = 0; t < n_tests; t++) {
                if (!tests[t].kernel) {
                    printf(" %-14s SKIP (kernel not compiled)\n", tests[t].name);
                    continue;
                }
                // Choose an input vector of the right width for this projection.
                const float *input;
                if (tests[t].in_dim == QWEN_Q_DIM) {
                    input = g_model.q;
                } else if (tests[t].in_dim == QWEN_HIDDEN) {
                    cpu_project(g_model.w_gate[0], g_model.xb, g_model.hb, QWEN_DIM, QWEN_HIDDEN);
                    input = g_model.hb;
                } else {
                    input = g_model.xb;
                }

                cpu_project(tests[t].weights, input, cpu_out, tests[t].in_dim, tests[t].out_dim);

                // ANE projection with return-value check
                ane_write_input(tests[t].kernel, 0, input, tests[t].in_dim * sizeof(float));
                bool ane_ok = ane_run(tests[t].kernel);
                ane_read_output(tests[t].kernel, 0, ane_out, tests[t].out_dim * sizeof(float));
                if (!ane_ok) printf(" !! ANE execution returned false\n");

                float max_diff = 0, sum_diff = 0;
                float cpu_norm = 0, ane_norm = 0;
                for (int i = 0; i < tests[t].out_dim; i++) {
                    float d = fabsf(cpu_out[i] - ane_out[i]);
                    if (d > max_diff) max_diff = d;
                    sum_diff += d;
                    cpu_norm += cpu_out[i] * cpu_out[i];
                    ane_norm += ane_out[i] * ane_out[i];
                }
                float avg_diff = sum_diff / tests[t].out_dim;
                // NOTE(review): this mixes an L1 sum into an L2-style ratio
                // (sum_diff is a sum of |d|, not of d^2) -- the metric is a
                // heuristic, kept as-is to preserve existing thresholds.
                float rel_err = (sqrtf(cpu_norm) > 0) ?
                    sqrtf(sum_diff * sum_diff / tests[t].out_dim) / sqrtf(cpu_norm / tests[t].out_dim) : 0;

                int pass = (max_diff < 0.5f && rel_err < 0.05f);
                if (!pass) all_pass = 0;

                printf(" %-14s [%d→%d] max_diff=%.6f avg_diff=%.6f rel_err=%.4f %s\n",
                       tests[t].name, tests[t].in_dim, tests[t].out_dim,
                       max_diff, avg_diff, rel_err,
                       pass ? "PASS" : "FAIL");
                printf(" CPU first4: %.6f %.6f %.6f %.6f norm=%.4f\n",
                       cpu_out[0], cpu_out[1], cpu_out[2], cpu_out[3], sqrtf(cpu_norm));
                printf(" ANE first4: %.6f %.6f %.6f %.6f norm=%.4f\n",
                       ane_out[0], ane_out[1], ane_out[2], ane_out[3], sqrtf(ane_norm));
            }

            printf("\n%s\n", all_pass ?
                "ALL TESTS PASSED -- ANE projections match CPU (within FP16 tolerance)" :
                "SOME TESTS FAILED -- ANE projections have accuracy issues");

            // If all pass, benchmark one layer ANE vs CPU speed
            if (all_pass) {
                printf("\n=== Speed comparison (1000 iterations, L0 Q proj %d→%d) ===\n",
                       QWEN_DIM, QWEN_Q_DIM);
                struct timespec ts0, ts1;

                clock_gettime(CLOCK_MONOTONIC, &ts0);
                for (int i = 0; i < 1000; i++)
                    cpu_project(g_model.wq[0], g_model.xb, cpu_out, QWEN_DIM, QWEN_Q_DIM);
                clock_gettime(CLOCK_MONOTONIC, &ts1);
                double cpu_us = timespec_diff(&ts0, &ts1) * 1e6 / 1000;

                clock_gettime(CLOCK_MONOTONIC, &ts0);
                for (int i = 0; i < 1000; i++)
                    ane_project(g_model.k_q[0], g_model.xb, ane_out, QWEN_DIM, QWEN_Q_DIM);
                clock_gettime(CLOCK_MONOTONIC, &ts1);
                double ane_us = timespec_diff(&ts0, &ts1) * 1e6 / 1000;

                printf(" CPU: %.1f us/call\n", cpu_us);
                printf(" ANE: %.1f us/call\n", ane_us);
                printf(" Ratio: %.2fx %s\n", cpu_us / ane_us,
                       ane_us < cpu_us ? "(ANE faster)" : "(CPU faster)");
            }

            free(cpu_out);
            free(ane_out);
            return all_pass ? 0 : 1;
        }

        if (server_mode) {
            if (sock_path)
                run_socket_server(sock_path);
            else
                run_stdin_server();
            return 0;
        }

        // HTTP API mode
        if (http_port > 0) {
            if (!model_dir) {
                // Default to ~/models/Qwen2.5-0.5B-Instruct
                static char default_dir[4096];
                const char *home = getenv("HOME");
                snprintf(default_dir, sizeof(default_dir), "%s/models/Qwen2.5-0.5B-Instruct", home ? home : ".");
                model_dir = default_dir;
            }
            printf("Loading tokenizer from %s...\n", model_dir);
            if (tok_init(&g_tokenizer, model_dir) != 0) {
                fprintf(stderr, "Failed to load tokenizer from %s\n", model_dir);
                return 1;
            }
            g_tokenizer_loaded = 1;
            printf("Tokenizer ready.\n\n");

            signal(SIGINT, handle_signal);
            signal(SIGTERM, handle_signal);

            http_serve(http_port, http_api_handler, NULL);
            tok_free(&g_tokenizer);
            return 0;
        }

        // Single-shot mode (original behavior)
        if (argc < 3) {
            fprintf(stderr, "Error: provide token IDs or --server\n");
            return 1;
        }

        int max_gen = 50;
        if (argc >= 4 && strcmp(argv[3], "--server") != 0)
            max_gen = atoi(argv[3]);

        int prompt_ids[2048];
        int n_prompt = parse_tokens(argv[2], prompt_ids, 2048);
        // FIX: reject an empty/unparseable prompt instead of calling
        // generate() with zero tokens.
        if (n_prompt == 0) {
            fprintf(stderr, "Error: no valid token IDs in argument\n");
            return 1;
        }
        printf("Prompt: %d tokens, generating up to %d\n", n_prompt, max_gen);

        int out_ids[4096];
        double p_tps, d_tps;
        int n_out = generate(prompt_ids, n_prompt, max_gen, out_ids, 4096, &p_tps, &d_tps);

        printf("OUT:");
        for (int i = 0; i < n_out; i++) printf(" %d", out_ids[i]);
        printf("\n");

        printf("\nPrefill: %.1f t/s (%d tokens)\n", p_tps, n_prompt);
        printf("Decode: %.1f t/s (%d tokens)\n", d_tps, n_out > 1 ? n_out - 1 : 0);

        return 0;
    }
}
diff --git a/inference/matmul.metal b/inference/matmul.metal
new file mode 100644
index 0000000..e5ed10e
--- /dev/null
+++ b/inference/matmul.metal
@@ -0,0 +1,921 @@
#include <metal_stdlib>
using namespace metal;

// ── Q4_0 block format ────────────────────────────────────────────────
// Block of 32 values: 2 bytes F16 scale + 2 bytes F16 zero + 16 bytes packed uint8
// Each uint8 stores 2 values: low nibble = even index, high nibble = odd index
// Total: 20 bytes per block of 32 weights
#define Q4_BLOCK_SIZE 32
#define Q4_BLOCK_BYTES 20

// ── Q4 Matrix-vector multiply (legacy, 1 thread per row) ────────────
// Kept as fallback for edge cases.
kernel void sgemv_q4(
    device const uint8_t *W [[buffer(0)]],
    device const float *x [[buffer(1)]],
    device float *y [[buffer(2)]],
    constant uint &in_dim [[buffer(3)]],
    constant uint &out_dim [[buffer(4)]],
    uint gid [[thread_position_in_grid]])
{
    if (gid >= out_dim) return;

    uint n_blocks = in_dim / Q4_BLOCK_SIZE;
    uint row_bytes = n_blocks * Q4_BLOCK_BYTES;
    device const uint8_t *row = W + uint64_t(gid) * row_bytes;

    float sum = 0.0f;
    for (uint b = 0; b < n_blocks; b++) {
        device const uint8_t *block = row + b * Q4_BLOCK_BYTES;
        // FIX: restored the missing reinterpret_cast template arguments
        // (the bare casts do not compile); scale/zero are stored as F16.
        half scale_h = *reinterpret_cast<device const half *>(block);
        half zero_h = *reinterpret_cast<device const half *>(block + 2);
        float scale = float(scale_h);
        float zero = float(zero_h);

        device const uint8_t *packed = block + 4;
        uint base = b * Q4_BLOCK_SIZE;

        // Dequantize 32 weights (two 4-bit values per byte) and accumulate.
        for (uint i = 0; i < 16; i++) {
            uint8_t byte = packed[i];
            float w0 = float(byte & 0xF) * scale + zero;
            float w1 = float(byte >> 4) * scale + zero;
            sum += w0 * x[base + i * 2];
            sum += w1 * x[base + i * 2 + 1];
        }
    }
    y[gid] = sum;
}

// ── Q4 SIMD-optimized matrix-vector multiply ─────────────────────────
// MLX-style cooperative SIMD kernel: 2 SIMD groups per threadgroup,
// each SIMD group handles ROWS_PER_SIMD output rows cooperatively.
// 32 threads in a SIMD group split the K (input) dimension, then
// reduce via simd_sum(). No threadgroup memory needed.
//
// Threadgroup layout: 64 threads = 2 SIMD groups of 32
// Grid: (ceil(out_dim / ROWS_PER_TG), 1, 1) threadgroups
//
// Optional bias: if bias pointer is non-null (use_bias != 0),
// y[r] = dot(W[r], x) + bias[r]
#define ROWS_PER_SIMD 4
#define SIMD_GROUPS 2
#define ROWS_PER_TG (ROWS_PER_SIMD * SIMD_GROUPS)

kernel void sgemv_q4_fast(
    device const uint8_t *W [[buffer(0)]],
    device const float *x [[buffer(1)]],
    device float *y [[buffer(2)]],
    constant uint &in_dim [[buffer(3)]],
    constant uint &out_dim [[buffer(4)]],
    device const float *bias [[buffer(5)]],
    constant uint &use_bias [[buffer(6)]],
    uint tgid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]])
{
    uint base_row = tgid * ROWS_PER_TG + simd_gid * ROWS_PER_SIMD;
    if (base_row >= out_dim) return;

    uint n_blocks = in_dim / Q4_BLOCK_SIZE;
    uint row_bytes = n_blocks * Q4_BLOCK_BYTES;

    uint rows_this = min((uint)ROWS_PER_SIMD, out_dim - base_row);

    float accum[ROWS_PER_SIMD] = {0.0f, 0.0f, 0.0f, 0.0f};
    float zero_accum[ROWS_PER_SIMD] = {0.0f, 0.0f, 0.0f, 0.0f};

    // Each of 32 SIMD lanes processes a stripe of blocks.
    // Lane i processes blocks i, i+32, i+64, ...
    for (uint b = simd_lid; b < n_blocks; b += 32) {
        uint k_base = b * Q4_BLOCK_SIZE;

        // Load input vector segment for this block (32 floats)
        float xv[Q4_BLOCK_SIZE];
        for (uint j = 0; j < 16; j++) {
            xv[j * 2] = x[k_base + j * 2];
            xv[j * 2 + 1] = x[k_base + j * 2 + 1];
        }

        for (uint r = 0; r < rows_this; r++) {
            device const uint8_t *block =
                W + uint64_t(base_row + r) * row_bytes + uint64_t(b) * Q4_BLOCK_BYTES;

            // FIX: restored missing reinterpret_cast template arguments.
            half scale_h = *reinterpret_cast<device const half *>(block);
            half zero_h = *reinterpret_cast<device const half *>(block + 2);
            float scale = float(scale_h);
            float zero = float(zero_h);

            device const uint8_t *packed = block + 4;

            // Accumulate scale*dot(q, x) and zero*sum(x) separately so the
            // affine dequant w = q*scale + zero folds into two MACs per block.
            float dot = 0.0f;
            float xsum = 0.0f;
            for (uint j = 0; j < 16; j++) {
                uint8_t byte = packed[j];
                float w0 = float(byte & 0xF);
                float w1 = float(byte >> 4);
                dot += w0 * xv[j * 2] + w1 * xv[j * 2 + 1];
                xsum += xv[j * 2] + xv[j * 2 + 1];
            }
            accum[r] += dot * scale;
            zero_accum[r] += xsum * zero;
        }
    }

    // SIMD reduction across 32 lanes
    for (uint r = 0; r < rows_this; r++) {
        float result = simd_sum(accum[r]) + simd_sum(zero_accum[r]);
        if (simd_lid == 0) {
            if (use_bias != 0) {
                result += bias[base_row + r];
            }
            y[base_row + r] = result;
        }
    }
}

// ── Fused Gate+Up+SiLU: reads x once, computes gate=silu(Wg*x)*Wu*x ──
// Combines two Q4 matvecs + silu_mul into one kernel.
// W_gate and W_up have the same dimensions [out_dim, in_dim].
// Output: gate[r] = silu(dot(W_gate[r], x)) * dot(W_up[r], x)
#define FUSED_ROWS_PER_SIMD 2
#define FUSED_SIMD_GROUPS 2
#define FUSED_ROWS_PER_TG (FUSED_ROWS_PER_SIMD * FUSED_SIMD_GROUPS)

kernel void sgemv_q4_fused_ffn(
    device const uint8_t *W_gate [[buffer(0)]],
    device const uint8_t *W_up [[buffer(1)]],
    device const float *x [[buffer(2)]],
    device float *out [[buffer(3)]],
    constant uint &in_dim [[buffer(4)]],
    constant uint &out_dim [[buffer(5)]],
    uint tgid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]])
{
    uint base_row = tgid * FUSED_ROWS_PER_TG + simd_gid * FUSED_ROWS_PER_SIMD;
    if (base_row >= out_dim) return;

    uint n_blocks = in_dim / Q4_BLOCK_SIZE;
    uint row_bytes = n_blocks * Q4_BLOCK_BYTES;

    uint rows_this = min((uint)FUSED_ROWS_PER_SIMD, out_dim - base_row);

    float gate_acc[FUSED_ROWS_PER_SIMD] = {0.0f, 0.0f};
    float gate_zacc[FUSED_ROWS_PER_SIMD] = {0.0f, 0.0f};
    float up_acc[FUSED_ROWS_PER_SIMD] = {0.0f, 0.0f};
    float up_zacc[FUSED_ROWS_PER_SIMD] = {0.0f, 0.0f};

    for (uint b = simd_lid; b < n_blocks; b += 32) {
        uint k_base = b * Q4_BLOCK_SIZE;

        // x is read once per block and shared by the gate and up dot products.
        float xv[Q4_BLOCK_SIZE];
        for (uint j = 0; j < 16; j++) {
            xv[j * 2] = x[k_base + j * 2];
            xv[j * 2 + 1] = x[k_base + j * 2 + 1];
        }

        for (uint r = 0; r < rows_this; r++) {
            uint64_t row_off = uint64_t(base_row + r) * row_bytes + uint64_t(b) * Q4_BLOCK_BYTES;

            // Gate weight block
            // FIX: restored missing reinterpret_cast template arguments.
            device const uint8_t *g_block = W_gate + row_off;
            float g_scale = float(*reinterpret_cast<device const half *>(g_block));
            float g_zero = float(*reinterpret_cast<device const half *>(g_block + 2));
            device const uint8_t *g_packed = g_block + 4;

            // Up weight block
            device const uint8_t *u_block = W_up + row_off;
            float u_scale = float(*reinterpret_cast<device const half *>(u_block));
            float u_zero = float(*reinterpret_cast<device const half *>(u_block + 2));
            device const uint8_t *u_packed = u_block + 4;

            float g_dot = 0.0f, g_xsum = 0.0f;
            float u_dot = 0.0f, u_xsum = 0.0f;
            for (uint j = 0; j < 16; j++) {
                float x0 = xv[j * 2];
                float x1 = xv[j * 2 + 1];
                float xs = x0 + x1;

                uint8_t gb = g_packed[j];
                g_dot += float(gb & 0xF) * x0 + float(gb >> 4) * x1;
                g_xsum += xs;

                uint8_t ub = u_packed[j];
                u_dot += float(ub & 0xF) * x0 + float(ub >> 4) * x1;
                u_xsum += xs;
            }
            gate_acc[r] += g_dot * g_scale;
            gate_zacc[r] += g_xsum * g_zero;
            up_acc[r] += u_dot * u_scale;
            up_zacc[r] += u_xsum * u_zero;
        }
    }

    for (uint r = 0; r < rows_this; r++) {
        float g = simd_sum(gate_acc[r]) + simd_sum(gate_zacc[r]);
        float u = simd_sum(up_acc[r]) + simd_sum(up_zacc[r]);
        if (simd_lid == 0) {
            float s = g / (1.0f + exp(-g));  // silu(g) = g * sigmoid(g)
            out[base_row + r] = s * u;
        }
    }
}

// ── Q4 batched matrix-matrix multiply (SGEMM) for prefill ────────────
// Y[t, r] = sum_k(dequant(W[r, k]) * X[t, k]) for t in [0, n_tokens), r in [0, out_dim)
// Grid: (ceil(out_dim / GEMM_TILE_M), n_tokens, 1)
// Each threadgroup: 2 SIMD groups, each handles GEMM_TILE_M/2 output rows for one token.
#define GEMM_TILE_M 8
#define GEMM_SIMD_GROUPS 2
#define GEMM_ROWS_PER_SIMD (GEMM_TILE_M / GEMM_SIMD_GROUPS)

kernel void sgemm_q4(
    device const uint8_t *W [[buffer(0)]],
    device const float *X [[buffer(1)]],
    device float *Y [[buffer(2)]],
    constant uint &in_dim [[buffer(3)]],
    constant uint &out_dim [[buffer(4)]],
    device const float *bias [[buffer(5)]],
    constant uint &use_bias [[buffer(6)]],
    constant uint &n_tokens [[buffer(7)]],
    uint2 tgid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]])
{
    // tgid.x selects a tile of output rows, tgid.y selects the token.
    uint base_row = tgid.x * GEMM_TILE_M + simd_gid * GEMM_ROWS_PER_SIMD;
    uint t = tgid.y;
    if (base_row >= out_dim || t >= n_tokens) return;

    uint n_blocks = in_dim / Q4_BLOCK_SIZE;
    uint row_bytes = n_blocks * Q4_BLOCK_BYTES;
    uint rows_this = min((uint)GEMM_ROWS_PER_SIMD, out_dim - base_row);

    device const float *xt = X + uint64_t(t) * in_dim;

    float accum[GEMM_ROWS_PER_SIMD] = {0.0f, 0.0f, 0.0f, 0.0f};
    float zero_accum[GEMM_ROWS_PER_SIMD] = {0.0f, 0.0f, 0.0f, 0.0f};

    for (uint b = simd_lid; b < n_blocks; b += 32) {
        uint k_base = b * Q4_BLOCK_SIZE;

        float xv[Q4_BLOCK_SIZE];
        for (uint j = 0; j < 16; j++) {
            xv[j * 2] = xt[k_base + j * 2];
            xv[j * 2 + 1] = xt[k_base + j * 2 + 1];
        }

        for (uint r = 0; r < rows_this; r++) {
            device const uint8_t *block =
                W + uint64_t(base_row + r) * row_bytes + uint64_t(b) * Q4_BLOCK_BYTES;

            // FIX: restored missing reinterpret_cast template arguments.
            float scale = float(*reinterpret_cast<device const half *>(block));
            float zero = float(*reinterpret_cast<device const half *>(block + 2));
            device const uint8_t *packed = block + 4;

            float dot = 0.0f;
            float xsum = 0.0f;
            for (uint j = 0; j < 16; j++) {
                uint8_t byte = packed[j];
                dot += float(byte & 0xF) * xv[j * 2] + float(byte >> 4) * xv[j * 2 + 1];
                xsum += xv[j * 2] + xv[j * 2 + 1];
            }
            accum[r] += dot * scale;
            zero_accum[r] += xsum * zero;
        }
    }

    for (uint r = 0; r < rows_this; r++) {
        float result = simd_sum(accum[r]) + simd_sum(zero_accum[r]);
        if (simd_lid == 0) {
            if (use_bias != 0)
                result += bias[base_row + r];
            Y[uint64_t(t) * out_dim + base_row + r] = result;
        }
    }
}

// ── Q4 batched fused Gate+Up+SiLU (SGEMM variant) ───────────────────
// out[t, r] = silu(Wg[r] . X[t]) * Wu[r] . X[t] for all t and r
// Grid: (ceil(out_dim / GEMM_FFN_TILE_M), n_tokens, 1)
#define GEMM_FFN_TILE_M 4
#define GEMM_FFN_SIMD_GROUPS 2
#define GEMM_FFN_ROWS_PER_SIMD (GEMM_FFN_TILE_M / GEMM_FFN_SIMD_GROUPS)

kernel void sgemm_q4_fused_ffn(
    device const uint8_t *W_gate [[buffer(0)]],
    device const uint8_t *W_up [[buffer(1)]],
    device const float *X [[buffer(2)]],
    device float *out [[buffer(3)]],
    constant uint &in_dim [[buffer(4)]],
    constant uint &out_dim [[buffer(5)]],
    constant uint &n_tokens [[buffer(6)]],
    uint2 tgid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]])
{
    uint base_row = tgid.x * GEMM_FFN_TILE_M + simd_gid * GEMM_FFN_ROWS_PER_SIMD;
    uint t = tgid.y;
    if (base_row >= out_dim || t >= n_tokens) return;

    uint n_blocks = in_dim / Q4_BLOCK_SIZE;
    uint row_bytes = n_blocks * Q4_BLOCK_BYTES;
    uint rows_this = min((uint)GEMM_FFN_ROWS_PER_SIMD, out_dim - base_row);

    device const float *xt = X + uint64_t(t) * in_dim;

    float gate_acc[GEMM_FFN_ROWS_PER_SIMD] = {0.0f, 0.0f};
    float gate_zacc[GEMM_FFN_ROWS_PER_SIMD] = {0.0f, 0.0f};
    float up_acc[GEMM_FFN_ROWS_PER_SIMD] = {0.0f, 0.0f};
    float up_zacc[GEMM_FFN_ROWS_PER_SIMD] = {0.0f, 0.0f};

    for (uint b = simd_lid; b < n_blocks; b += 32) {
        uint k_base = b * Q4_BLOCK_SIZE;

        float xv[Q4_BLOCK_SIZE];
        for (uint j = 0; j < 16; j++) {
            xv[j * 2] = xt[k_base + j * 2];
            xv[j * 2 + 1] = xt[k_base + j * 2 + 1];
        }

        for (uint r = 0; r < rows_this; r++) {
            uint64_t row_off = uint64_t(base_row + r) * row_bytes + uint64_t(b) * Q4_BLOCK_BYTES;

            // FIX: restored missing reinterpret_cast template arguments.
            device const uint8_t *g_block = W_gate + row_off;
            float g_scale = float(*reinterpret_cast<device const half *>(g_block));
            float g_zero = float(*reinterpret_cast<device const half *>(g_block + 2));
            device const uint8_t *g_packed = g_block + 4;

            device const uint8_t *u_block = W_up + row_off;
            float u_scale = float(*reinterpret_cast<device const half *>(u_block));
            float u_zero = float(*reinterpret_cast<device const half *>(u_block + 2));
            device const uint8_t *u_packed = u_block + 4;

            float g_dot = 0.0f, g_xsum = 0.0f;
            float u_dot = 0.0f, u_xsum = 0.0f;
            for (uint j = 0; j < 16; j++) {
                float x0 = xv[j * 2];
                float x1 = xv[j * 2 + 1];
                float xs = x0 + x1;

                uint8_t gb = g_packed[j];
                g_dot += float(gb & 0xF) * x0 + float(gb >> 4) * x1;
                g_xsum += xs;

                uint8_t ub = u_packed[j];
                u_dot += float(ub & 0xF) * x0 + float(ub >> 4) * x1;
                u_xsum += xs;
            }
            gate_acc[r] += g_dot * g_scale;
            gate_zacc[r] += g_xsum * g_zero;
            up_acc[r] += u_dot * u_scale;
            up_zacc[r] += u_xsum * u_zero;
        }
    }

    for (uint r = 0; r < rows_this; r++) {
        float g = simd_sum(gate_acc[r]) + simd_sum(gate_zacc[r]);
        float u = simd_sum(up_acc[r]) + simd_sum(up_zacc[r]);
        if (simd_lid == 0) {
            float s = g / (1.0f + exp(-g));  // silu
            out[uint64_t(t) * out_dim + base_row + r] = s * u;
        }
    }
}

// ── Batched RMSNorm (N tokens) ──────────────────────────────────────
// x[t*dim .. (t+1)*dim-1] → out[t*dim .. (t+1)*dim-1]
// Grid: (n_tokens, 1, 1) threadgroups, each normalizes one token.
kernel void rms_norm_batched(
    device const float *x [[buffer(0)]],
    device const float *w [[buffer(1)]],
    device float *out [[buffer(2)]],
    constant uint &dim [[buffer(3)]],
    constant float &eps [[buffer(4)]],
    constant uint &n_tokens [[buffer(5)]],
    uint tgid [[threadgroup_position_in_grid]],
    uint tid [[thread_index_in_threadgroup]],
    uint tpg [[threads_per_threadgroup]])
{
    if (tgid >= n_tokens) return;

    // Each threadgroup normalizes one token's dim-length slice.
    device const float *xi = x + uint64_t(tgid) * dim;
    device float *oi = out + uint64_t(tgid) * dim;

    // NOTE(review): assumes tpg <= 1024 (scratch size) and that tpg is a
    // power of two (required by the halving reduction below) -- confirm
    // against the host-side dispatch.
    threadgroup float partial[1024];

    // Pass 1: strided per-thread partial sums of squares.
    float local_sum = 0.0f;
    for (uint i = tid; i < dim; i += tpg)
        local_sum += xi[i] * xi[i];
    partial[tid] = local_sum;

    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Tree reduction across the threadgroup into partial[0].
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s) partial[tid] += partial[tid + s];
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    float rms_inv = rsqrt(partial[0] / float(dim) + eps);

    // Pass 2: scale by 1/rms and the learned per-channel weight.
    for (uint i = tid; i < dim; i += tpg)
        oi[i] = xi[i] * rms_inv * w[i];
}

// ── Batched embedding lookup (N tokens) ─────────────────────────────
// Grid: (dim, n_tokens, 1). Each thread copies one element.
kernel void embed_lookup_batched(
    device const float *embed [[buffer(0)]],
    device float *out [[buffer(1)]],
    device const uint *token_ids [[buffer(2)]],
    constant uint &dim [[buffer(3)]],
    uint2 gid [[thread_position_in_grid]])
{
    // gid.x = element within the embedding row, gid.y = token index.
    uint i = gid.x;
    uint t = gid.y;
    if (i >= dim) return;
    out[uint64_t(t) * dim + i] = embed[uint64_t(token_ids[t]) * dim + i];
}

// ── Batched RoPE (N tokens) ─────────────────────────────────────────
// Applies RoPE to Q[t] and K[t] for each token t at position base_pos+t.
// Grid: total_pairs per token * n_tokens
kernel void rope_apply_batched(
    device float *q [[buffer(0)]],
    device float *k [[buffer(1)]],
    device const float *cos_tbl [[buffer(2)]],
    device const float *sin_tbl [[buffer(3)]],
    constant uint &n_q_heads [[buffer(4)]],
    constant uint &n_kv_heads [[buffer(5)]],
    constant uint &head_dim [[buffer(6)]],
    constant uint &base_pos [[buffer(7)]],
    constant uint &q_stride [[buffer(8)]],
    constant uint &k_stride [[buffer(9)]],
    uint2 gid [[thread_position_in_grid]])
{
    // One thread rotates one (even, odd) element pair of one head;
    // gid.x indexes the pair across all Q then KV heads, gid.y the token.
    const uint hd_half = head_dim / 2;
    const uint n_pairs = (n_q_heads + n_kv_heads) * hd_half;
    const uint pr = gid.x;
    const uint tok = gid.y;
    if (pr >= n_pairs) return;

    const uint head = pr / hd_half;
    const uint lane = pr % hd_half;
    const uint pos = base_pos + tok;

    // Q heads occupy the first n_q_heads slots of the flattened index,
    // the KV heads follow.
    device float *target;
    if (head < n_q_heads) {
        target = q + uint64_t(tok) * q_stride + head * head_dim;
    } else {
        target = k + uint64_t(tok) * k_stride + (head - n_q_heads) * head_dim;
    }

    // Rotate the (lane, lane + hd_half) pair by the precomputed angle.
    const uint tbl = pos * hd_half + lane;
    const float x0 = target[lane];
    const float x1 = target[lane + hd_half];
    const float c = cos_tbl[tbl];
    const float s = sin_tbl[tbl];
    target[lane] = x0 * c - x1 * s;
    target[lane + hd_half] = x1 * c + x0 * s;
}

// ── Batched vec_add: out[i] = a[i] + b[i] for N*dim elements ────────
kernel void vec_add_batched(
    device const float *a [[buffer(0)]],
    device const float *b [[buffer(1)]],
    device float *out [[buffer(2)]],
    constant uint &total_n [[buffer(3)]],
    uint gid [[thread_position_in_grid]])
{
    // Flat elementwise sum; threads past the end simply do nothing.
    if (gid < total_n) {
        out[gid] = a[gid] + b[gid];
    }
}

// ── F16 matrix-vector multiply ───────────────────────────────────────
kernel void sgemv_f16(
    device const half *W [[buffer(0)]],
    device const float *x [[buffer(1)]],
    device float *y [[buffer(2)]],
    constant uint &in_dim [[buffer(3)]],
    constant uint &out_dim [[buffer(4)]],
    uint gid [[thread_position_in_grid]])
{
    if (gid >= out_dim) return;

    // One thread owns one output row; accumulate in F32 for precision.
    device const half *wrow = W + uint64_t(gid) * in_dim;
    float acc = 0.0f;

    // Main loop handles 8 elements per iteration in the same sequential
    // order as a straight-line unroll; the tail loop mops up the rest.
    uint col = 0;
    for (; col + 7 < in_dim; col += 8) {
        for (uint u = 0; u < 8; u++)
            acc += float(wrow[col + u]) * x[col + u];
    }
    for (; col < in_dim; col++)
        acc += float(wrow[col]) * x[col];

    y[gid] = acc;
}

// ── F32 matrix-vector multiply ───────────────────────────────────────
kernel void sgemv_f32(
    device const float *W [[buffer(0)]],
    device const float *x [[buffer(1)]],
    device float *y [[buffer(2)]],
    constant uint &in_dim [[buffer(3)]],
    constant uint &out_dim [[buffer(4)]],
    uint gid [[thread_position_in_grid]])
{
    if (gid >= out_dim) return;

    // Same row-per-thread scheme as sgemv_f16, without the widening cast.
    device const float *wrow = W + uint64_t(gid) * in_dim;
    float acc = 0.0f;

    uint col = 0;
    for (; col + 7 < in_dim; col += 8) {
        for (uint u = 0; u < 8; u++)
            acc += wrow[col + u] * x[col + u];
    }
    for (; col < in_dim; col++)
        acc += wrow[col] * x[col];

    y[gid] = acc;
}

// ── RMS Normalization ────────────────────────────────────────────────
// out[i] = x[i] * w[i] / sqrt(mean(x^2) + eps)
// Two-pass: first compute sum of squares (reduction), then normalize.
// Single threadgroup processes the entire vector.
kernel void rms_norm(
    device const float *x   [[buffer(0)]],
    device const float *w   [[buffer(1)]],
    device float *out       [[buffer(2)]],
    constant uint &dim      [[buffer(3)]],
    constant float &eps     [[buffer(4)]],
    uint tid [[thread_index_in_threadgroup]],
    uint tpg [[threads_per_threadgroup]])
{
    // One partial-sum slot per thread; assumes tpg <= 1024.
    threadgroup float partial[1024];

    // Pass 1: strided partial sums of squares over the whole vector.
    float local_sum = 0.0f;
    for (uint i = tid; i < dim; i += tpg)
        local_sum += x[i] * x[i];
    partial[tid] = local_sum;

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Tree reduction
    // NOTE(review): the halving reduction assumes tpg is a power of two —
    // confirm at dispatch sites.
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s) partial[tid] += partial[tid + s];
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    float rms_inv = rsqrt(partial[0] / float(dim) + eps);

    // Pass 2: scale each element and apply the per-channel weight.
    for (uint i = tid; i < dim; i += tpg)
        out[i] = x[i] * rms_inv * w[i];
}

// ── RoPE (Rotary Position Embedding) ─────────────────────────────────
// Applies RoPE to Q and K vectors in-place.
// cos_sin is precomputed: [half_dim] cos values followed by [half_dim] sin values.
// Single-token RoPE: one thread per (head, dim-pair). cos_v/sin_v are
// already offset to the current position's table row by the host.
kernel void rope_apply(
    device float *q             [[buffer(0)]],
    device float *k             [[buffer(1)]],
    device const float *cos_v   [[buffer(2)]],
    device const float *sin_v   [[buffer(3)]],
    constant uint &n_q_heads    [[buffer(4)]],
    constant uint &n_kv_heads   [[buffer(5)]],
    constant uint &head_dim     [[buffer(6)]],
    uint gid                    [[thread_position_in_grid]])
{
    uint half_dim = head_dim / 2;
    uint total_pairs = (n_q_heads + n_kv_heads) * half_dim;
    if (gid >= total_pairs) return;

    uint head_pair = gid / half_dim;
    uint i = gid % half_dim;

    // Head indices [0, n_q_heads) address q; the remainder address k.
    device float *vec;
    if (head_pair < n_q_heads) {
        vec = q + head_pair * head_dim;
    } else {
        vec = k + (head_pair - n_q_heads) * head_dim;
    }

    // Rotate the (i, i + half_dim) pair.
    float f = vec[i];
    float s = vec[i + half_dim];
    float c = cos_v[i];
    float sv = sin_v[i];
    vec[i] = f * c - s * sv;
    vec[i + half_dim] = s * c + f * sv;
}

// ── SiLU activation + element-wise multiply ──────────────────────────
// gate[i] = silu(gate[i]) * up[i]
// silu(x) = x / (1 + exp(-x))
kernel void silu_mul(
    device float *gate      [[buffer(0)]],
    device const float *up  [[buffer(1)]],
    constant uint &n        [[buffer(2)]],
    uint gid                [[thread_position_in_grid]])
{
    if (gid >= n) return;
    float x = gate[gid];
    float s = x / (1.0f + exp(-x));
    gate[gid] = s * up[gid];
}

// ── Vector add (residual connection) ─────────────────────────────────
// out[i] = a[i] + b[i]
kernel void vec_add(
    device const float *a  [[buffer(0)]],
    device const float *b  [[buffer(1)]],
    device float *out      [[buffer(2)]],
    constant uint &n       [[buffer(3)]],
    uint gid               [[thread_position_in_grid]])
{
    if (gid >= n) return;
    out[gid] = a[gid] + b[gid];
}

// ── Bias add ─────────────────────────────────────────────────────────
// x[i] += bias[i]  (in place)
kernel void bias_add(
    device float *x           [[buffer(0)]],
    device const float *bias  [[buffer(1)]],
    constant uint &n          [[buffer(2)]],
    uint gid                  [[thread_position_in_grid]])
{
    if (gid >= n) return;
    x[gid] += bias[gid];
}

// ── Embedding lookup ─────────────────────────────────────────────────
// out[i] = embed[token_id * dim + i]
kernel void embed_lookup(
    device const float *embed  [[buffer(0)]],
    device float *out          [[buffer(1)]],
    constant uint &token_id    [[buffer(2)]],
    constant uint &dim         [[buffer(3)]],
    uint gid                   [[thread_position_in_grid]])
{
    if (gid >= dim) return;
    // 64-bit offset: vocab * dim can exceed 32 bits.
    out[gid] = embed[uint64_t(token_id) * dim + gid];
}

// ── Attention score: Q @ K^T for one head (legacy) ──────────────────
// One thread per cached timestep; writes scaled dot product into att.
kernel void attn_score(
    device const float *qh          [[buffer(0)]],
    device const float *kv_cache_k  [[buffer(1)]],
    device float *att               [[buffer(2)]],
    constant uint &head_dim         [[buffer(3)]],
    constant uint &kv_dim           [[buffer(4)]],
    constant uint &kv_head_offset   [[buffer(5)]],
    constant float &scale           [[buffer(6)]],
    constant uint &seq_len          [[buffer(7)]],
    uint gid                        [[thread_position_in_grid]])
{
    if (gid >= seq_len) return;

    device const float *kt = kv_cache_k + uint64_t(gid) * kv_dim + kv_head_offset;
    float dot = 0.0f;
    for (uint i = 0; i < head_dim; i++)
        dot += qh[i] * kt[i];
    att[gid] = dot * scale;
}

// ── Batched attention score: all Q heads in one dispatch ─────────────
// Grid: (seq_len, n_q_heads, 1). Each thread computes one score for one head.
// GQA: maps Q head h to KV head h/gqa_factor.
kernel void attn_score_batched(
    device const float *q           [[buffer(0)]],
    device const float *kv_cache_k  [[buffer(1)]],
    device float *att               [[buffer(2)]],
    constant uint &head_dim         [[buffer(3)]],
    constant uint &kv_dim           [[buffer(4)]],
    constant uint &n_q_heads        [[buffer(5)]],
    constant uint &gqa_factor       [[buffer(6)]],
    constant float &scale           [[buffer(7)]],
    constant uint &seq_len          [[buffer(8)]],
    constant uint &max_seq          [[buffer(9)]],
    uint2 gid                       [[thread_position_in_grid]])
{
    uint t = gid.x;  // cached timestep
    uint h = gid.y;  // query head
    if (t >= seq_len || h >= n_q_heads) return;

    // GQA: gqa_factor query heads share one KV head.
    uint kv_h = h / gqa_factor;
    device const float *qh = q + h * head_dim;
    device const float *kt = kv_cache_k + uint64_t(t) * kv_dim + kv_h * head_dim;

    float dot = 0.0f;
    for (uint i = 0; i < head_dim; i++)
        dot += qh[i] * kt[i];

    // att is laid out [n_q_heads, max_seq] so each head owns a full row.
    att[h * max_seq + t] = dot * scale;
}

// ── Batched softmax: all heads in one dispatch ───────────────────────
// One threadgroup per head. tid reduces over seq_len dimension.
// In-place, numerically stable (subtracts the row max before exp).
kernel void softmax_batched(
    device float *att         [[buffer(0)]],
    constant uint &seq_len    [[buffer(1)]],
    constant uint &max_seq    [[buffer(2)]],
    constant uint &n_q_heads  [[buffer(3)]],
    uint tgid [[threadgroup_position_in_grid]],
    uint tid  [[thread_index_in_threadgroup]],
    uint tpg  [[threads_per_threadgroup]])
{
    uint h = tgid;
    // Uniform per-threadgroup exit: every thread in the group shares h,
    // so no thread diverges across the barriers below.
    if (h >= n_q_heads) return;

    device float *head_att = att + h * max_seq;
    threadgroup float shared[1024];  // assumes tpg <= 1024

    // Max reduction for numerical stability.
    float local_max = -1e30f;
    for (uint i = tid; i < seq_len; i += tpg)
        local_max = max(local_max, head_att[i]);
    shared[tid] = local_max;
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // NOTE(review): halving reduction assumes tpg is a power of two.
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s) shared[tid] = max(shared[tid], shared[tid + s]);
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    float max_val = shared[0];

    // Exponentiate in place while accumulating the sum.
    float local_sum = 0.0f;
    for (uint i = tid; i < seq_len; i += tpg) {
        float e = exp(head_att[i] - max_val);
        head_att[i] = e;
        local_sum += e;
    }
    shared[tid] = local_sum;
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s) shared[tid] += shared[tid + s];
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    float inv_sum = 1.0f / shared[0];

    for (uint i = tid; i < seq_len; i += tpg)
        head_att[i] *= inv_sum;
}

// ── Batched attention weighted sum: all heads in one dispatch ────────
// Grid: (head_dim, n_q_heads, 1). Each thread computes one output dim for one head.
kernel void attn_wsum_batched(
    device const float *att         [[buffer(0)]],
    device const float *kv_cache_v  [[buffer(1)]],
    device float *out               [[buffer(2)]],
    constant uint &head_dim         [[buffer(3)]],
    constant uint &kv_dim           [[buffer(4)]],
    constant uint &n_q_heads        [[buffer(5)]],
    constant uint &gqa_factor       [[buffer(6)]],
    constant uint &seq_len          [[buffer(7)]],
    constant uint &max_seq          [[buffer(8)]],
    uint2 gid                       [[thread_position_in_grid]])
{
    uint d = gid.x;  // output dimension within the head
    uint h = gid.y;  // query head
    if (d >= head_dim || h >= n_q_heads) return;

    uint kv_h = h / gqa_factor;  // GQA mapping to shared KV head
    device const float *head_att = att + h * max_seq;

    float sum = 0.0f;
    for (uint t = 0; t < seq_len; t++) {
        float a = head_att[t];
        float v = kv_cache_v[uint64_t(t) * kv_dim + kv_h * head_dim + d];
        sum += a * v;
    }
    out[h * head_dim + d] = sum;
}

// ── Softmax (legacy, single head) ────────────────────────────────────
// Same algorithm as softmax_batched but operates on a single row.
kernel void softmax_inplace(
    device float *att       [[buffer(0)]],
    constant uint &seq_len  [[buffer(1)]],
    uint tid [[thread_index_in_threadgroup]],
    uint tpg [[threads_per_threadgroup]])
{
    threadgroup float shared[1024];

    float local_max = -1e30f;
    for (uint i = tid; i < seq_len; i += tpg)
        local_max = max(local_max, att[i]);
    shared[tid] = local_max;
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s) shared[tid] = max(shared[tid], shared[tid + s]);
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    float max_val = shared[0];

    float local_sum = 0.0f;
    for (uint i = tid; i < seq_len; i += tpg) {
        float e = exp(att[i] - max_val);
        att[i] = e;
        local_sum += e;
    }
    shared[tid] = local_sum;
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s) shared[tid] += shared[tid + s];
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    float inv_sum = 1.0f / shared[0];

    for (uint i = tid; i < seq_len; i += tpg)
        att[i] *= inv_sum;
}

// ── Attention weighted sum (legacy, single head) ─────────────────────
// out[d] = sum_t att[t] * V[t, kv_head_offset + d]
kernel void attn_weighted_sum(
    device const float *att         [[buffer(0)]],
    device const float *kv_cache_v  [[buffer(1)]],
    device float *out               [[buffer(2)]],
    constant uint &head_dim         [[buffer(3)]],
    constant uint &kv_dim           [[buffer(4)]],
    constant uint &kv_head_offset   [[buffer(5)]],
    constant uint &seq_len          [[buffer(6)]],
    uint gid                        [[thread_position_in_grid]])
{
    if (gid >= head_dim) return;

    float sum = 0.0f;
    for (uint t = 0; t < seq_len; t++) {
        float a = att[t];
        float v = kv_cache_v[uint64_t(t) * kv_dim + kv_head_offset + gid];
        sum += a * v;
    }
    out[gid] = sum;
}

// ── Argmax ───────────────────────────────────────────────────────────
// Finds argmax of logits[0..n-1], writes to result[0].
// Single threadgroup.
kernel void argmax_kernel(
    device const float *logits  [[buffer(0)]],
    device int *result          [[buffer(1)]],
    constant uint &n            [[buffer(2)]],
    uint tid [[thread_index_in_threadgroup]],
    uint tpg [[threads_per_threadgroup]])
{
    // Paired value/index slots so the reduction keeps them in sync.
    threadgroup float shared_val[1024];
    threadgroup int shared_idx[1024];

    // Strided local max over this thread's slice.
    float local_max = -1e30f;
    int local_idx = 0;
    for (uint i = tid; i < n; i += tpg) {
        if (logits[i] > local_max) {
            local_max = logits[i];
            local_idx = int(i);
        }
    }
    shared_val[tid] = local_max;
    shared_idx[tid] = local_idx;
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Tree reduction; strict '>' keeps the lower index on ties within a step.
    for (uint s = tpg / 2; s > 0; s >>= 1) {
        if (tid < s && shared_val[tid + s] > shared_val[tid]) {
            shared_val[tid] = shared_val[tid + s];
            shared_idx[tid] = shared_idx[tid + s];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    if (tid == 0) result[0] = shared_idx[0];
}

// ── Copy kernel ──────────────────────────────────────────────────────
// dst[i] = src[i] for i in [0, n)
kernel void vec_copy(
    device const float *src  [[buffer(0)]],
    device float *dst        [[buffer(1)]],
    constant uint &n         [[buffer(2)]],
    uint gid                 [[thread_position_in_grid]])
{
    if (gid >= n) return;
    dst[gid] = src[gid];
}

// ── Zero-fill ────────────────────────────────────────────────────────
kernel void vec_zero(
    device float *dst  [[buffer(0)]],
    constant uint &n   [[buffer(1)]],
    uint gid           [[thread_position_in_grid]])
{
    if (gid >= n) return;
    dst[gid] = 0.0f;
}
diff --git a/inference/qwen_ane_infer.h b/inference/qwen_ane_infer.h
new file mode 100644
index 0000000..47bd1f4
--- /dev/null
+++ b/inference/qwen_ane_infer.h
@@ -0,0 +1,2015 @@
// qwen_ane_infer.h — Qwen2.5-0.5B inference on Apple Neural Engine
// Linear projections on ANE (baked-weight conv kernels), CPU for element-wise ops.
// Based on maderix/ANE runtime + MIL generation.
#pragma once

#include "../training/ane_runtime.h"
#include "../training/ane_mil_gen.h"

// Compile a matmul kernel: W[out_ch, in_ch] @ x[in_ch] → y[out_ch]
// Uses the two-input matmul MIL variant (weights passed as input, not baked).
// Buffer sizes are in bytes; the trailing * 4 is sizeof(float).
static ANEKernel *compile_matmul_kernel(int in_ch, int out_ch) {
    NSString *mil = mil_gen_matmul(in_ch, out_ch, 1);
    size_t inputSizes[2] = {(size_t)in_ch * 1 * 4, (size_t)out_ch * in_ch * 4};
    size_t outBytes = (size_t)out_ch * 1 * 4;
    return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], nil, 2, inputSizes, 1, &outBytes);
}

// Compile a baked-weight conv kernel (from model.h): weights are embedded
// in the compiled MIL blob rather than passed at eval time.
static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) {
    NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch);
    NSString *mil = mil_gen_conv(in_ch, out_ch, spatial);
    size_t inBytes = (size_t)in_ch * spatial * 4;
    size_t outBytes = (size_t)out_ch * spatial * 4;
    return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes);
}

// Compile baked-weight conv with FP16 IOSurfaces (for fused ANE path).
// Temporarily flips the global g_fp16_io flag so MIL generation emits
// FP16 I/O; the flag is restored before returning.
static ANEKernel *compile_conv_kernel_fp16io(const float *weights, int in_ch, int out_ch, int spatial) {
    int saved = g_fp16_io; g_fp16_io = 1;
    NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch);
    NSString *mil = mil_gen_conv(in_ch, out_ch, spatial);
    size_t inBytes = (size_t)in_ch * spatial * sizeof(_Float16);
    size_t outBytes = (size_t)out_ch * spatial * sizeof(_Float16);
    ANEKernel *k = ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes);
    g_fp16_io = saved;
    return k;
}

// FIX(review): these five #include directives had their header names
// stripped (angle-bracket-eating tooling). Reconstructed from usage in this
// file: fprintf (stdio), calloc/exit/malloc/free (stdlib), memcpy (string),
// powf/sqrtf/expf/cosf/sinf (math), bool (stdbool). vDSP_*/cblas_* and the
// NEON intrinsics used below also require <Accelerate/Accelerate.h> and
// <arm_neon.h> — TODO confirm whether those arrive via the training headers.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdbool.h>

// calloc wrapper that aborts with a diagnostic instead of returning NULL.
// All model-sized allocations in this file go through here.
static void *qwen_calloc(size_t count, size_t size, const char *desc) {
    void *p = calloc(count, size);
    if (!p) {
        fprintf(stderr, "FATAL: calloc failed for %s (%.1f MB)\n",
                desc, (double)(count * size) / (1024*1024));
        exit(1);
    }
    return p;
}

// ── Metal GPU
// context (defined in main.m, used for GPU matmuls) ──────
#ifdef __OBJC__
// FIX(review): this #import lost its header name to angle-bracket stripping.
// Reconstructed as Metal.h from the MTL* API usage below — confirm no other
// framework header (e.g. Foundation) was also imported here.
#import <Metal/Metal.h>
#endif

// All id<...> protocol types are stored as void* so this struct stays
// visible to plain-C translation units; comments give the real types.
typedef struct {
    void *device;                      // id<MTLDevice>
    void *queue;                       // id<MTLCommandQueue>
    void *pipeline_f16;                // id<MTLComputePipelineState>
    void *pipeline_f32;                // id<MTLComputePipelineState>
    void *pipeline_q4;                 // id<MTLComputePipelineState> for sgemv_q4
    void *pipeline_rms;                // id<MTLComputePipelineState> for rms_norm
    void *pipeline_rope;               // id<MTLComputePipelineState> for rope_apply
    void *pipeline_silu;               // id<MTLComputePipelineState> for silu_mul
    void *pipeline_add;                // id<MTLComputePipelineState> for vec_add
    void *pipeline_bias;               // id<MTLComputePipelineState> for bias_add
    void *pipeline_embed;              // id<MTLComputePipelineState> for embed_lookup
    void *pipeline_attn_score;         // id<MTLComputePipelineState>
    void *pipeline_softmax;            // id<MTLComputePipelineState>
    void *pipeline_attn_wsum;          // id<MTLComputePipelineState>
    void *pipeline_argmax;             // id<MTLComputePipelineState>
    void *pipeline_copy;               // id<MTLComputePipelineState>
    void *pipeline_zero;               // id<MTLComputePipelineState>
    void *pipeline_q4_fast;            // id<MTLComputePipelineState> for sgemv_q4_fast (SIMD)
    void *pipeline_q4_fused_ffn;       // id<MTLComputePipelineState> for fused gate+up+silu
    void *pipeline_attn_score_b;       // batched attn_score (all heads)
    void *pipeline_softmax_b;          // batched softmax (all heads)
    void *pipeline_attn_wsum_b;        // batched attn weighted sum (all heads)
    void *pipeline_sgemm_q4;           // batched Q4 matmul (prefill)
    void *pipeline_sgemm_q4_fused_ffn; // batched fused FFN (prefill)
    void *pipeline_rms_batched;        // batched RMSNorm (prefill)
    void *pipeline_embed_batched;      // batched embed lookup (prefill)
    void *pipeline_rope_batched;       // batched RoPE (prefill)
    void *pipeline_add_batched;        // batched vec_add (prefill)
    void *x_buf;                       // id<MTLBuffer> for input vector (reusable)
    void *y_buf;                       // id<MTLBuffer> for output vector (reusable)
    int initialized;
} MetalContext;

static MetalContext g_metal = {0};

#ifndef QWEN_DEBUG
#define QWEN_DEBUG 0
#endif

// Qwen2.5-0.5B-Instruct architecture
#define QWEN_DIM        896
#define QWEN_HIDDEN     4864
#define QWEN_LAYERS     24
#define QWEN_HEADS      14
#define QWEN_KV_HEADS   2
#define QWEN_HEAD_DIM   64
#define QWEN_VOCAB      151936
#define QWEN_RMS_EPS    1e-6f
#define QWEN_ROPE_THETA 1000000.0f
#define QWEN_MAX_SEQ    512

// GQA: each KV head serves (HEADS / KV_HEADS) query heads
#define QWEN_GQA_FACTOR (QWEN_HEADS / QWEN_KV_HEADS)
#define QWEN_GQA_FACTOR (QWEN_HEADS / QWEN_KV_HEADS)

// Sizes for GQA projections
#define QWEN_Q_DIM  (QWEN_HEADS * QWEN_HEAD_DIM)     // 896
#define QWEN_KV_DIM (QWEN_KV_HEADS * QWEN_HEAD_DIM)  // 128

// Full model state: weights in one of several formats (F32 / F16 / Q8 /
// GPU buffers / compiled ANE kernels), plus KV cache and scratch buffers.
typedef struct {
    // Weight format: 0 = F32 everywhere, 1 = F16 projections
    int weight_fmt;

    // Embeddings + norms always F32
    float *embed;                 // [vocab, dim]
    float *rms_att[QWEN_LAYERS];  // [dim]
    float *rms_ffn[QWEN_LAYERS];  // [dim]
    float *rms_final;             // [dim]

    // Projection weights: F32 or F16 depending on weight_fmt
    // When weight_fmt=1, the f32 pointers are NULL and f16 pointers are set
    float *wq[QWEN_LAYERS];      // [q_dim, dim]   (F32)
    float *wk[QWEN_LAYERS];      // [kv_dim, dim]  (F32)
    float *wv[QWEN_LAYERS];      // [kv_dim, dim]  (F32)
    float *wo[QWEN_LAYERS];      // [dim, q_dim]   (F32)
    float *w_gate[QWEN_LAYERS];  // [hidden, dim]  (F32)
    float *w_up[QWEN_LAYERS];    // [hidden, dim]  (F32)
    float *w_down[QWEN_LAYERS];  // [dim, hidden]  (F32)

    _Float16 *wq_f16[QWEN_LAYERS];  // (F16)
    _Float16 *wk_f16[QWEN_LAYERS];
    _Float16 *wv_f16[QWEN_LAYERS];
    _Float16 *wo_f16[QWEN_LAYERS];
    _Float16 *wgate_f16[QWEN_LAYERS];
    _Float16 *wup_f16[QWEN_LAYERS];
    _Float16 *wdown_f16[QWEN_LAYERS];

    uint8_t *wq_q8[QWEN_LAYERS];  // (Q8_0 blocks)
    uint8_t *wk_q8[QWEN_LAYERS];
    uint8_t *wv_q8[QWEN_LAYERS];
    uint8_t *wo_q8[QWEN_LAYERS];
    uint8_t *wgate_q8[QWEN_LAYERS];
    uint8_t *wup_q8[QWEN_LAYERS];
    uint8_t *wdown_q8[QWEN_LAYERS];

    // Metal GPU buffers (id<MTLBuffer> cast to void*)
    void *gpu_wq[QWEN_LAYERS];
    void *gpu_wk[QWEN_LAYERS];
    void *gpu_wv[QWEN_LAYERS];
    void *gpu_wo[QWEN_LAYERS];
    void *gpu_wgate[QWEN_LAYERS];
    void *gpu_wup[QWEN_LAYERS];
    void *gpu_wdown[QWEN_LAYERS];
    void *gpu_embed;                  // embedding table (F32)
    void *gpu_rms_att[QWEN_LAYERS];   // RMSNorm weights
    void *gpu_rms_ffn[QWEN_LAYERS];
    void *gpu_rms_final;
    void *gpu_q_bias[QWEN_LAYERS];
    void *gpu_k_bias[QWEN_LAYERS];
    void *gpu_v_bias[QWEN_LAYERS];
    void *gpu_kv_cache_k[QWEN_LAYERS];
    void *gpu_kv_cache_v[QWEN_LAYERS];
    int use_gpu;
    // wcls = embed (tied, always F32)

    // ANE kernels -- unfused (one per linear projection per layer)
    ANEKernel *k_q[QWEN_LAYERS];
    ANEKernel *k_k[QWEN_LAYERS];
    ANEKernel *k_v[QWEN_LAYERS];
    ANEKernel *k_o[QWEN_LAYERS];
    ANEKernel *k_gate[QWEN_LAYERS];
    ANEKernel *k_up[QWEN_LAYERS];
    ANEKernel *k_down[QWEN_LAYERS];
    // LM head chunked: vocab too large for single ANE kernel (max 65536)
    #define QWEN_LM_CHUNKS 16
    #define QWEN_LM_CHUNK_SIZE 9496  // 151936 / 16
    ANEKernel *k_lmhead[QWEN_LM_CHUNKS];

    // ANE kernels -- fused (reduces 184 → 112 kernels, under 119 limit)
    ANEKernel *k_qkv[QWEN_LAYERS];     // fused Q+K+V → 3 outputs
    ANEKernel *k_ffn_up[QWEN_LAYERS];  // fused Gate+Up → 2 outputs
    int use_ane;  // 1 = fused ANE matmuls + CPU element-wise

    // Q/K/V biases per layer
    float *q_bias[QWEN_LAYERS];  // [q_dim]
    float *k_bias[QWEN_LAYERS];  // [kv_dim]
    float *v_bias[QWEN_LAYERS];  // [kv_dim]

    // KV cache [layer][kv_heads * head_dim * max_seq]
    float *kv_cache_k[QWEN_LAYERS];
    float *kv_cache_v[QWEN_LAYERS];
    int pos;  // current position in sequence

    // Scratch buffers
    float *x;       // [dim]
    float *xb;      // [dim]
    float *q;       // [q_dim]
    float *k;       // [kv_dim]
    float *v;       // [kv_dim]
    float *att;     // [heads * max_seq]
    float *hb;      // [hidden]
    float *hb2;     // [hidden]
    float *logits;  // [vocab]
} QwenModel;

// ── Precomputed RoPE table ───────────────────────────────────────────

static float g_rope_cos[QWEN_MAX_SEQ][QWEN_HEAD_DIM / 2];
static float g_rope_sin[QWEN_MAX_SEQ][QWEN_HEAD_DIM / 2];
static int g_rope_initialized = 0;

// Fills the cos/sin tables for every (position, frequency) pair once.
// Idempotent: safe to call from multiple init paths.
static void qwen_rope_init(void) {
    if (g_rope_initialized) return;
    int half = QWEN_HEAD_DIM / 2;
    for (int pos = 0; pos < QWEN_MAX_SEQ; pos++) {
        for (int i = 0; i < half; i++) {
            float freq = 1.0f / powf(QWEN_ROPE_THETA, (float)(2 * i) / QWEN_HEAD_DIM);
            float angle = pos * freq;
            g_rope_cos[pos][i] = cosf(angle);
            g_rope_sin[pos][i] = sinf(angle);
        }
    }
    g_rope_initialized = 1;
}

// ── CPU ops (vectorized with NEON + vDSP) ────────────────────────────

// RMSNorm: out = x * w / sqrt(mean(x^2) + eps), via vDSP reductions.
static void qwen_rmsnorm(float *out, const float *x, const float *w, int D) {
    float ss;
    vDSP_svesq(x, 1, &ss, (vDSP_Length)D);  // sum of squares
    ss = 1.0f / sqrtf(ss / D + QWEN_RMS_EPS);
    vDSP_vsmul(x, 1, &ss, out, 1, (vDSP_Length)D);
    vDSP_vmul(out, 1, w, 1, out, 1, (vDSP_Length)D);
}

// In-place RoPE for one token using the precomputed tables; NEON main
// loop rotates 4 dim-pairs at a time, scalar tail handles the rest.
static void qwen_rope(float *q, float *k, int pos, int n_q_heads, int n_kv_heads, int head_dim) {
    int half = head_dim / 2;
    const float *cv = g_rope_cos[pos];
    const float *sv = g_rope_sin[pos];

    for (int h = 0; h < n_q_heads; h++) {
        float *qh = q + h * head_dim;
        int i = 0;
        for (; i + 3 < half; i += 4) {
            float32x4_t first = vld1q_f32(qh + i);
            float32x4_t second = vld1q_f32(qh + i + half);
            float32x4_t c = vld1q_f32(cv + i);
            float32x4_t s = vld1q_f32(sv + i);
            vst1q_f32(qh + i, vmlsq_f32(vmulq_f32(first, c), second, s));
            vst1q_f32(qh + i + half, vmlaq_f32(vmulq_f32(second, c), first, s));
        }
        for (; i < half; i++) {
            float f = qh[i], se = qh[i + half];
            qh[i] = f * cv[i] - se * sv[i];
            qh[i + half] = se * cv[i] + f * sv[i];
        }
    }

    for (int h = 0; h < n_kv_heads; h++) {
        float *kh = k + h * head_dim;
        int i = 0;
        for (; i + 3 < half; i += 4) {
            float32x4_t first = vld1q_f32(kh + i);
            float32x4_t second = vld1q_f32(kh + i + half);
            float32x4_t c = vld1q_f32(cv + i);
            float32x4_t s = vld1q_f32(sv + i);
            vst1q_f32(kh + i, vmlsq_f32(vmulq_f32(first, c), second, s));
            vst1q_f32(kh + i + half, vmlaq_f32(vmulq_f32(second, c), first, s));
        }
        for (; i < half; i++) {
            float f = kh[i], se = kh[i + half];
            kh[i] = f * cv[i] - se * sv[i];
            kh[i + half] = se * cv[i] + f * sv[i];
        }
    }
}

// In-place SiLU: x = x / (1 + exp(-x)); exp is scalar even in the NEON path.
static void qwen_silu(float *x, int n) {
    int i = 0;
    float32x4_t one = vdupq_n_f32(1.0f);
    for (; i + 3 < n; i += 4) {
        float32x4_t v = vld1q_f32(x + i);
        float neg[4];
        vst1q_f32(neg, vnegq_f32(v));
        float exp_neg[4];
        for (int j =
            0; j < 4; j++) exp_neg[j] = expf(neg[j]);
        float32x4_t denom = vaddq_f32(one, vld1q_f32(exp_neg));
        vst1q_f32(x + i, vdivq_f32(v, denom));
    }
    for (; i < n; i++)
        x[i] = x[i] / (1.0f + expf(-x[i]));
}

// ── ANE projection helpers ──────────────────────────────────────────
// ANE IOSurfaces are always FP16 at the hardware level.
// We use g_fp16_io=1 MIL (FP16 I/O, no cast ops) and convert F32<->F16 here.

static inline bool ane_run(ANEKernel *k) { return ane_eval(k); }

// Narrow an F32 vector into the kernel's idx-th FP16 input IOSurface.
static void ane_write_f32_as_f16(ANEKernel *kernel, int idx, const float *f32, int n) {
    IOSurfaceLock(kernel->ioInputs[idx], 0, NULL);
    _Float16 *dst = (_Float16 *)IOSurfaceGetBaseAddress(kernel->ioInputs[idx]);
    for (int i = 0; i < n; i++) dst[i] = (_Float16)f32[i];
    IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL);
}

// Widen the kernel's idx-th FP16 output IOSurface back into F32.
static void ane_read_f16_to_f32(ANEKernel *kernel, int idx, float *f32, int n) {
    IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
    const _Float16 *src = (const _Float16 *)IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]);
    for (int i = 0; i < n; i++) f32[i] = (float)src[i];
    IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
}

// Single projection: write input, run, read output.
static void ane_project(ANEKernel *kernel, const float *in, float *out,
                        int in_dim, int out_dim) {
    ane_write_f32_as_f16(kernel, 0, in, in_dim);
    ane_run(kernel);
    ane_read_f16_to_f32(kernel, 0, out, out_dim);
}

// Fused QKV: one ANE kernel → 3 outputs (Q, K, V with different dims)
static void ane_project_qkv(ANEKernel *kernel, const float *in,
                            float *q, float *k, float *v,
                            int in_dim, int q_dim, int kv_dim) {
    ane_write_f32_as_f16(kernel, 0, in, in_dim);
    ane_run(kernel);
    ane_read_f16_to_f32(kernel, 0, q, q_dim);
    ane_read_f16_to_f32(kernel, 1, k, kv_dim);
    ane_read_f16_to_f32(kernel, 2, v, kv_dim);
}

// Fused Gate+Up: one ANE kernel → 2 outputs (gate, up)
static void ane_project_ffn_up(ANEKernel *kernel, const float *in,
                               float *gate, float *up,
                               int in_dim, int
                               hidden_dim) {
    ane_write_f32_as_f16(kernel, 0, in, in_dim);
    ane_run(kernel);
    ane_read_f16_to_f32(kernel, 0, gate, hidden_dim);
    ane_read_f16_to_f32(kernel, 1, up, hidden_dim);
}

// Compile fused QKV kernel (GQA-aware: Q=[q_dim,dim], K/V=[kv_dim,dim])
// Uses FP16 IOSurfaces (ANE hardware requirement); g_fp16_io is saved
// and restored around MIL generation.
static ANEKernel *compile_qkv_gqa_kernel(const float *wq, const float *wk, const float *wv,
                                         int dim, int q_dim, int kv_dim) {
    int saved = g_fp16_io; g_fp16_io = 1;
    NSData *wb = mil_build_qkv_gqa_weight_blob(wq, q_dim, dim, wk, wv, kv_dim);
    NSString *mil = mil_gen_qkv_gqa(dim, q_dim, kv_dim, 1);
    size_t inBytes = (size_t)dim * sizeof(_Float16);
    size_t outSizes[3] = {
        (size_t)q_dim * sizeof(_Float16),
        (size_t)kv_dim * sizeof(_Float16),
        (size_t)kv_dim * sizeof(_Float16)
    };
    ANEKernel *k = ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb,
                               1, &inBytes, 3, outSizes);
    g_fp16_io = saved;
    return k;
}

// Compile fused FFN up kernel (Gate + Up, both [hidden_dim, dim])
static ANEKernel *compile_ffn_up_kernel(const float *w_gate, const float *w_up,
                                        int dim, int hidden_dim) {
    int saved = g_fp16_io; g_fp16_io = 1;
    NSData *wb = mil_build_ffn_up_weight_blob(w_gate, w_up, hidden_dim, dim);
    NSString *mil = mil_gen_ffn_up(dim, hidden_dim, 1);
    size_t inBytes = (size_t)dim * sizeof(_Float16);
    size_t outSizes[2] = {
        (size_t)hidden_dim * sizeof(_Float16),
        (size_t)hidden_dim * sizeof(_Float16)
    };
    ANEKernel *k = ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb,
                               1, &inBytes, 2, outSizes);
    g_fp16_io = saved;
    return k;
}

// CPU matmul via Accelerate BLAS: y = W @ x, W[out_dim, in_dim]
static void cpu_project(const float *W, const float *x, float *y, int in_dim, int out_dim) {
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                out_dim, in_dim,
                1.0f, W, in_dim,
                x, 1,
                0.0f, y, 1);
}

// Bulk F16→F32 conversion using NEON vcvt
static void convert_f16_to_f32(const _Float16 *src, float *dst, size_t n) {
    size_t i = 0;
    // 8 halves per iteration: widen low/high 4-lane groups separately.
    for (; i + 7 < n; i += 8) {
        float16x8_t h = vld1q_f16((const __fp16*)(src + i));
        vst1q_f32(dst + i, vcvt_f32_f16(vget_low_f16(h)));
        vst1q_f32(dst + i + 4, vcvt_f32_f16(vget_high_f16(h)));
    }
    for (; i < n; i++)
        dst[i] = (float)src[i];
}

// ── Q8_0 quantization support ────────────────────────────────────────
// Block format: 2 bytes F16 scale + 32 bytes int8 values = 34 bytes per block
#define Q8_BLOCK_SIZE 32
#define Q8_BLOCK_BYTES (2 + Q8_BLOCK_SIZE)  // 34

// Q8 matmul: y = W_q8 @ x, dequantize-and-dot using NEON int8
// W is stored as blocks of [f16_scale, 32*int8], row-major.
// Assumes in_dim is a multiple of Q8_BLOCK_SIZE.
static void cpu_project_q8(const uint8_t *W, const float *x, float *y,
                           int in_dim, int out_dim) {
    int n_blocks = in_dim / Q8_BLOCK_SIZE;
    size_t row_bytes = (size_t)n_blocks * Q8_BLOCK_BYTES;

    for (int r = 0; r < out_dim; r++) {
        const uint8_t *row = W + (size_t)r * row_bytes;
        float sum = 0.0f;

        for (int b = 0; b < n_blocks; b++) {
            const uint8_t *block = row + (size_t)b * Q8_BLOCK_BYTES;
            // memcpy avoids a misaligned _Float16 load from the packed block.
            _Float16 scale_f16;
            memcpy(&scale_f16, block, 2);
            float scale = (float)scale_f16;
            const int8_t *qvals = (const int8_t*)(block + 2);
            const float *xb = x + b * Q8_BLOCK_SIZE;

            // NEON: load 32 int8 values, widen to int16, convert to f32, FMA
            int8x16_t q0 = vld1q_s8(qvals);
            int8x16_t q1 = vld1q_s8(qvals + 16);

            // Widen int8 -> int16 -> int32 -> float32, then FMA with x
            int16x8_t w0 = vmovl_s8(vget_low_s8(q0));
            int16x8_t w1 = vmovl_s8(vget_high_s8(q0));
            int16x8_t w2 = vmovl_s8(vget_low_s8(q1));
            int16x8_t w3 = vmovl_s8(vget_high_s8(q1));

            float32x4_t a0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(w0))), vld1q_f32(xb));
            float32x4_t a1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(w0))), vld1q_f32(xb + 4));
            float32x4_t a2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(w1))), vld1q_f32(xb + 8));
            float32x4_t a3 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(w1))), vld1q_f32(xb + 12));
            float32x4_t a4 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(w2))), vld1q_f32(xb + 16));
            float32x4_t a5 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(w2))), vld1q_f32(xb + 20));
            float32x4_t a6 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(w3))), vld1q_f32(xb + 24));
            float32x4_t a7 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(w3))), vld1q_f32(xb + 28));

            // Pairwise tree sum of the eight partial products.
            float32x4_t s01 = vaddq_f32(a0, a1);
            float32x4_t s23 = vaddq_f32(a2, a3);
            float32x4_t s45 = vaddq_f32(a4, a5);
            float32x4_t s67 = vaddq_f32(a6, a7);
            float32x4_t stot = vaddq_f32(vaddq_f32(s01, s23), vaddq_f32(s45, s67));
            sum += scale * vaddvq_f32(stot);
        }
        y[r] = sum;
    }
}

// ── Q4_0 block constants ─────────────────────────────────────────────
#define Q4_BLOCK_SIZE 32
#define Q4_BLOCK_BYTES 20  // 2(scale) + 2(zero) + 16(packed)

// ── Q4_0 dequantization helper: Q4 blocks to F32 ──
// Dequantizes one weight matrix from Q4 blocks into a caller-provided F32 buffer.
// Low nibble decodes to the even index, high nibble to the odd index.
static void dequant_q4_to_f32(const uint8_t *W_q4, float *W_f32,
                              int in_dim, int out_dim) {
    int n_blocks = in_dim / Q4_BLOCK_SIZE;
    size_t row_bytes = (size_t)n_blocks * Q4_BLOCK_BYTES;

    for (int r = 0; r < out_dim; r++) {
        const uint8_t *row = W_q4 + (size_t)r * row_bytes;
        float *out_row = W_f32 + (size_t)r * in_dim;

        for (int b = 0; b < n_blocks; b++) {
            const uint8_t *block = row + (size_t)b * Q4_BLOCK_BYTES;
            _Float16 scale_f16, zero_f16;
            memcpy(&scale_f16, block, 2);
            memcpy(&zero_f16, block + 2, 2);
            float scale = (float)scale_f16;
            float zero = (float)zero_f16;
            const uint8_t *packed = block + 4;
            float *out = out_row + b * Q4_BLOCK_SIZE;

            for (int i = 0; i < 16; i++) {
                uint8_t byte = packed[i];
                out[i * 2] = (float)(byte & 0xF) * scale + zero;
                out[i * 2 + 1] = (float)(byte >> 4) * scale + zero;
            }
        }
    }
}

// Q4 fused NEON dequant-and-dot: reads Q4 from memory, avoids F32 intermediate
// Each block: 2B F16 scale + 2B F16 zero + 16B packed uint8 (32 values)
// Uses NEON to extract
// nibbles, convert to float, FMA with input vector
static void cpu_project_q4_amx(const uint8_t *W_q4, const float *x, float *y,
                               int in_dim, int out_dim) {
    int n_blocks = in_dim / Q4_BLOCK_SIZE;
    size_t row_bytes = (size_t)n_blocks * Q4_BLOCK_BYTES;

    for (int r = 0; r < out_dim; r++) {
        const uint8_t *row = W_q4 + (size_t)r * row_bytes;
        // Separate accumulators for even- and odd-index lanes.
        float32x4_t acc0 = vdupq_n_f32(0.0f);
        float32x4_t acc1 = vdupq_n_f32(0.0f);

        for (int b = 0; b < n_blocks; b++) {
            const uint8_t *block = row + (size_t)b * Q4_BLOCK_BYTES;
            _Float16 scale_f16, zero_f16;
            memcpy(&scale_f16, block, 2);
            memcpy(&zero_f16, block + 2, 2);
            float scale = (float)scale_f16;
            float zero = (float)zero_f16;
            const uint8_t *packed = block + 4;
            const float *xb = x + b * Q4_BLOCK_SIZE;
            float32x4_t vscale = vdupq_n_f32(scale);
            float32x4_t vzero = vdupq_n_f32(zero);

            // Process 16 packed bytes = 32 values, 8 values at a time
            for (int i = 0; i < 16; i += 4) {
                // FIX(review): removed dead `vld1_u8(packed + i)` — the result
                // was never used, and the 8-byte load overran the 16-byte
                // packed area (and the 20-byte block) for i >= 12, an
                // out-of-bounds read on the final block of the matrix.

                uint8_t b0 = packed[i], b1 = packed[i+1], b2 = packed[i+2], b3 = packed[i+3];

                // Even indices (low nibbles): b0&0xF, b1&0xF, b2&0xF, b3&0xF
                float32x4_t wlo = vmlaq_f32(vzero, vcvtq_f32_u32((uint32x4_t){
                    b0 & 0xF, b1 & 0xF, b2 & 0xF, b3 & 0xF}), vscale);
                // Odd indices (high nibbles): b0>>4, b1>>4, b2>>4, b3>>4
                float32x4_t whi = vmlaq_f32(vzero, vcvtq_f32_u32((uint32x4_t){
                    b0 >> 4, b1 >> 4, b2 >> 4, b3 >> 4}), vscale);

                // Interleaved dot: x[0]*w[0] + x[1]*w[1] + ... (even/odd pairs)
                int xi = i * 2;
                float32x4_t x_even = {xb[xi], xb[xi+2], xb[xi+4], xb[xi+6]};
                float32x4_t x_odd  = {xb[xi+1], xb[xi+3], xb[xi+5], xb[xi+7]};

                acc0 = vmlaq_f32(acc0, wlo, x_even);
                acc1 = vmlaq_f32(acc1, whi, x_odd);
            }
        }
        y[r] = vaddvq_f32(vaddq_f32(acc0, acc1));
    }
}

// Q4 batched projection: dequant full matrix to F32, then cblas_sgemm.
// FIX(review): the malloc result was used unchecked; route the scratch
// allocation through qwen_calloc, which aborts with a diagnostic on OOM
// (consistent with the rest of this file).
static void cpu_project_batch_q4_amx(const uint8_t *W_q4, const float *X, float *Y,
                                     int in_dim, int out_dim, int n_tokens) {
    float *W_f32 = qwen_calloc((size_t)out_dim, (size_t)in_dim * sizeof(float),
                               "Q4 dequant scratch");
    dequant_q4_to_f32(W_q4, W_f32, in_dim, out_dim);
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                n_tokens, out_dim, in_dim,
                1.0f, X, in_dim,
                W_f32, in_dim,
                0.0f, Y, out_dim);
    free(W_f32);
}

// Toggle: 1 = use ANE for projections, 0 = CPU fallback
#define USE_ANE_PROJECTIONS 0

// ── Metal GPU matmul ─────────────────────────────────────────────────
#ifdef __OBJC__

static int metal_init(void) {
    if (g_metal.initialized) return 0;

    id dev = MTLCreateSystemDefaultDevice();
    if (!dev) { fprintf(stderr, "Metal: no GPU device\n"); return -1; }

    NSString *shaderPath = [[NSBundle mainBundle] pathForResource:@"matmul" ofType:@"metallib"];
    NSError *error = nil;
    id lib = nil;

    // Try loading from compiled metallib next to binary
    NSString *execDir = [[[NSProcessInfo processInfo] arguments][0] stringByDeletingLastPathComponent];
    NSString *libPath = [execDir stringByAppendingPathComponent:@"matmul.metallib"];
    if ([[NSFileManager defaultManager] fileExistsAtPath:libPath]) {
        lib = [dev newLibraryWithURL:[NSURL fileURLWithPath:libPath] error:&error];
    }

    // Fall back to compiling from source
    if (!lib) {
        NSString *srcPath = [execDir stringByAppendingPathComponent:@"matmul.metal"];
        NSString *src = [NSString stringWithContentsOfFile:srcPath
                                                  encoding:NSUTF8StringEncoding error:&error];
        if (!src) {
            fprintf(stderr, "Metal: cannot read shader source: 
%s\n", + [[error description] UTF8String]); + return -1; + } + MTLCompileOptions *opts = [[MTLCompileOptions alloc] init]; + if (@available(macOS 15.0, *)) { + opts.mathMode = MTLMathModeFast; + } else { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + opts.fastMathEnabled = YES; +#pragma clang diagnostic pop + } + lib = [dev newLibraryWithSource:src options:opts error:&error]; + if (!lib) { + fprintf(stderr, "Metal: shader compile failed: %s\n", + [[error description] UTF8String]); + return -1; + } + } + + // Build all pipeline states + NSArray *names = @[ + @"sgemv_f16", @"sgemv_f32", @"sgemv_q4", + @"rms_norm", @"rope_apply", @"silu_mul", + @"vec_add", @"bias_add", @"embed_lookup", + @"attn_score", @"softmax_inplace", @"attn_weighted_sum", + @"argmax_kernel", @"vec_copy", @"vec_zero", + @"sgemv_q4_fast", @"sgemv_q4_fused_ffn", + @"attn_score_batched", @"softmax_batched", @"attn_wsum_batched", + @"sgemm_q4", @"sgemm_q4_fused_ffn", + @"rms_norm_batched", @"embed_lookup_batched", + @"rope_apply_batched", @"vec_add_batched" + ]; + void **pipelines[] = { + &g_metal.pipeline_f16, &g_metal.pipeline_f32, &g_metal.pipeline_q4, + &g_metal.pipeline_rms, &g_metal.pipeline_rope, &g_metal.pipeline_silu, + &g_metal.pipeline_add, &g_metal.pipeline_bias, &g_metal.pipeline_embed, + &g_metal.pipeline_attn_score, &g_metal.pipeline_softmax, &g_metal.pipeline_attn_wsum, + &g_metal.pipeline_argmax, &g_metal.pipeline_copy, &g_metal.pipeline_zero, + &g_metal.pipeline_q4_fast, &g_metal.pipeline_q4_fused_ffn, + &g_metal.pipeline_attn_score_b, &g_metal.pipeline_softmax_b, &g_metal.pipeline_attn_wsum_b, + &g_metal.pipeline_sgemm_q4, &g_metal.pipeline_sgemm_q4_fused_ffn, + &g_metal.pipeline_rms_batched, &g_metal.pipeline_embed_batched, + &g_metal.pipeline_rope_batched, &g_metal.pipeline_add_batched + }; + + for (int i = 0; i < (int)[names count]; i++) { + id fn = [lib newFunctionWithName:names[i]]; + if (!fn) { + fprintf(stderr, "Metal: missing 
shader function '%s'\n", [names[i] UTF8String]); + return -1; + } + id pso = [dev newComputePipelineStateWithFunction:fn error:&error]; + if (!pso) { + fprintf(stderr, "Metal: pipeline for '%s' failed: %s\n", + [names[i] UTF8String], [[error description] UTF8String]); + return -1; + } + *pipelines[i] = (__bridge_retained void*)pso; + } + + g_metal.device = (__bridge_retained void*)dev; + g_metal.queue = (__bridge_retained void*)[dev newCommandQueue]; + + g_metal.initialized = 1; + printf("Metal GPU initialized (%s)\n", [[dev name] UTF8String]); + return 0; +} + +// GPU projection for F16 weights: dispatches Metal compute shader +// Uses per-call output buffers to allow batching multiple projections +static void gpu_project_f16(id w_buf, const float *x, float *y, + int in_dim, int out_dim) { + id dev = (__bridge id)g_metal.device; + id queue = (__bridge id)g_metal.queue; + id pipeline = (__bridge id)g_metal.pipeline_f16; + + // Shared input/output buffers + id x_buf = [dev newBufferWithBytes:x + length:in_dim * sizeof(float) + options:MTLResourceStorageModeShared]; + id y_buf = [dev newBufferWithLength:out_dim * sizeof(float) + options:MTLResourceStorageModeShared]; + + id cmd = [queue commandBuffer]; + id enc = [cmd computeCommandEncoder]; + [enc setComputePipelineState:pipeline]; + [enc setBuffer:w_buf offset:0 atIndex:0]; + [enc setBuffer:x_buf offset:0 atIndex:1]; + [enc setBuffer:y_buf offset:0 atIndex:2]; + uint32_t dims[2] = {(uint32_t)in_dim, (uint32_t)out_dim}; + [enc setBytes:&dims[0] length:sizeof(uint32_t) atIndex:3]; + [enc setBytes:&dims[1] length:sizeof(uint32_t) atIndex:4]; + + NSUInteger tpg = pipeline.maxTotalThreadsPerThreadgroup; + if (tpg > (NSUInteger)out_dim) tpg = (NSUInteger)out_dim; + [enc dispatchThreads:MTLSizeMake(out_dim, 1, 1) + threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)]; + [enc endEncoding]; + [cmd commit]; + [cmd waitUntilCompleted]; + + memcpy(y, [y_buf contents], out_dim * sizeof(float)); +} + +#endif // __OBJC__ + +// ── 
// Forward one token ──────────────────────────────────────────────────

// Runs one full transformer step for `token` at sequence position m->pos,
// entirely on CPU (Accelerate/AMX, vDSP, and the quantized NEON kernels).
// Dispatches each projection on m->weight_fmt (3 = Q4, 2 = Q8, else F32),
// appends this position's K/V to the cache, advances m->pos, and returns
// the argmax of the logits (greedy decoding).
static int qwen_forward(QwenModel *m, int token) {
    int D = QWEN_DIM, HD = QWEN_HIDDEN;
    int pos = m->pos;

    // Token embedding
    memcpy(m->x, m->embed + token * D, D * sizeof(float));

    for (int l = 0; l < QWEN_LAYERS; l++) {
        // Attention RMSNorm (x -> xb; xb is the scratch buffer for this layer)
        qwen_rmsnorm(m->xb, m->x, m->rms_att[l], D);

#if QWEN_DEBUG
        if (l == 0 && pos == 0) {
            float xnorm = 0;
            for (int i = 0; i < D; i++) xnorm += m->xb[i] * m->xb[i];
            printf("  L0 RMSNorm out norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n",
                   sqrtf(xnorm), m->xb[0], m->xb[1], m->xb[2], m->xb[3]);
        }
#endif

        // QKV projections + bias (CPU path -- GPU overhead too high for small matmuls)
        #if USE_ANE_PROJECTIONS
        ane_project(m->k_q[l], m->xb, m->q, D, QWEN_Q_DIM);
        ane_project(m->k_k[l], m->xb, m->k, D, QWEN_KV_DIM);
        ane_project(m->k_v[l], m->xb, m->v, D, QWEN_KV_DIM);
        #else
        if (m->weight_fmt == 3) {
            // weight_fmt 3: Q4 blocks (note: stored in the *_q8 pointers)
            cpu_project_q4_amx(m->wq_q8[l], m->xb, m->q, D, QWEN_Q_DIM);
            cpu_project_q4_amx(m->wk_q8[l], m->xb, m->k, D, QWEN_KV_DIM);
            cpu_project_q4_amx(m->wv_q8[l], m->xb, m->v, D, QWEN_KV_DIM);
        } else if (m->weight_fmt == 2) {
            cpu_project_q8(m->wq_q8[l], m->xb, m->q, D, QWEN_Q_DIM);
            cpu_project_q8(m->wk_q8[l], m->xb, m->k, D, QWEN_KV_DIM);
            cpu_project_q8(m->wv_q8[l], m->xb, m->v, D, QWEN_KV_DIM);
        } else {
            cpu_project(m->wq[l], m->xb, m->q, D, QWEN_Q_DIM);
            cpu_project(m->wk[l], m->xb, m->k, D, QWEN_KV_DIM);
            cpu_project(m->wv[l], m->xb, m->v, D, QWEN_KV_DIM);
        }
        #endif
        // Apply Q/K/V biases (vectorized)
        if (m->q_bias[l])
            vDSP_vadd(m->q, 1, m->q_bias[l], 1, m->q, 1, (vDSP_Length)QWEN_Q_DIM);
        if (m->k_bias[l])
            vDSP_vadd(m->k, 1, m->k_bias[l], 1, m->k, 1, (vDSP_Length)QWEN_KV_DIM);
        if (m->v_bias[l])
            vDSP_vadd(m->v, 1, m->v_bias[l], 1, m->v, 1, (vDSP_Length)QWEN_KV_DIM);

#if QWEN_DEBUG
        if (l == 0 && pos == 0) {
            float qn = 0;
            for (int i = 0; i < QWEN_Q_DIM; i++) qn += m->q[i] * m->q[i];
            printf("  L0 ANE Q norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n",
                   sqrtf(qn), m->q[0], m->q[1], m->q[2], m->q[3]);
            // Reference check: recompute first 4 Q outputs with plain C.
            // NOTE(review): reads m->wq[0], which is NULL for quantized
            // formats — only valid under weight_fmt==1 debug runs.
            float cpu_q[4] = {0};
            for (int i = 0; i < 4; i++) {
                for (int j = 0; j < D; j++)
                    cpu_q[i] += m->wq[0][i * D + j] * m->xb[j];
                cpu_q[i] += m->q_bias[0][i];
            }
            printf("  L0 CPU Q first 4: %.4f %.4f %.4f %.4f\n",
                   cpu_q[0], cpu_q[1], cpu_q[2], cpu_q[3]);
        }
#endif

        // RoPE: rotate Q and K in place for this position
        qwen_rope(m->q, m->k, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM);

        // Store K, V in cache
        memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM,
               m->k, QWEN_KV_DIM * sizeof(float));
        memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM,
               m->v, QWEN_KV_DIM * sizeof(float));

        // GQA attention (CPU — element-wise ops).  Each group of
        // QWEN_GQA_FACTOR query heads shares one KV head (kv_h below).
        float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM);
        float *attn_out = m->xb; // reuse buffer (xb's normed value is consumed above)
        memset(attn_out, 0, QWEN_Q_DIM * sizeof(float));

        for (int h = 0; h < QWEN_HEADS; h++) {
            int kv_h = h / QWEN_GQA_FACTOR;
            float *qh = m->q + h * QWEN_HEAD_DIM;
            float *att_h = m->att + h * QWEN_MAX_SEQ;
            int seq_len = pos + 1;

            // Attention scores: Q @ K^T, tracking the max for stable softmax
            float max_score = -1e9f;
            for (int t = 0; t <= pos; t++) {
                float *kt = m->kv_cache_k[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kt, 1);
                att_h[t] = score * scale;
                if (att_h[t] > max_score) max_score = att_h[t];
            }
            // Softmax: subtract max, exp, normalize (vDSP)
            float neg_max = -max_score;
            vDSP_vsadd(att_h, 1, &neg_max, att_h, 1, (vDSP_Length)seq_len);
            int n_exp = seq_len;
            vvexpf(att_h, att_h, &n_exp);
            float sum;
            vDSP_sve(att_h, 1, &sum, (vDSP_Length)seq_len);
            float inv_sum = 1.0f / sum;
            vDSP_vsmul(att_h, 1, &inv_sum, att_h, 1, (vDSP_Length)seq_len);

            // Weighted sum of V
            for (int t = 0; t <= pos; t++) {
                float a = att_h[t];
                float *vt = m->kv_cache_v[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                cblas_saxpy(QWEN_HEAD_DIM, a, vt, 1,
                            attn_out + h * QWEN_HEAD_DIM, 1);
            }
        }

        // Output projection (same format dispatch as QKV)
        float o_out[QWEN_DIM];
        #if USE_ANE_PROJECTIONS
        ane_project(m->k_o[l], attn_out, o_out, QWEN_Q_DIM, D);
        #else
        if (m->weight_fmt == 3)
            cpu_project_q4_amx(m->wo_q8[l], attn_out, o_out, QWEN_Q_DIM, D);
        else if (m->weight_fmt == 2)
            cpu_project_q8(m->wo_q8[l], attn_out, o_out, QWEN_Q_DIM, D);
        else
            cpu_project(m->wo[l], attn_out, o_out, QWEN_Q_DIM, D);
        #endif

        // Residual (vectorized)
        vDSP_vadd(m->x, 1, o_out, 1, m->x, 1, (vDSP_Length)D);

#if QWEN_DEBUG
        if (l == 0 && pos == 0) {
            float pan = 0;
            for (int i = 0; i < D; i++) pan += m->x[i] * m->x[i];
            printf("  L0 post-attn norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n",
                   sqrtf(pan), m->x[0], m->x[1], m->x[2], m->x[3]);
            float on = 0;
            for (int i = 0; i < D; i++) on += o_out[i] * o_out[i];
            printf("  L0 o_proj out norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n",
                   sqrtf(on), o_out[0], o_out[1], o_out[2], o_out[3]);
        }
#endif

        // FFN RMSNorm
        qwen_rmsnorm(m->xb, m->x, m->rms_ffn[l], D);

        // SwiGLU FFN: down( SiLU(gate(x)) * up(x) )
        #if USE_ANE_PROJECTIONS
        ane_project(m->k_gate[l], m->xb, m->hb, D, HD);
        ane_project(m->k_up[l], m->xb, m->hb2, D, HD);
        #else
        if (m->weight_fmt == 3) {
            cpu_project_q4_amx(m->wgate_q8[l], m->xb, m->hb, D, HD);
            cpu_project_q4_amx(m->wup_q8[l], m->xb, m->hb2, D, HD);
        } else if (m->weight_fmt == 2) {
            cpu_project_q8(m->wgate_q8[l], m->xb, m->hb, D, HD);
            cpu_project_q8(m->wup_q8[l], m->xb, m->hb2, D, HD);
        } else {
            cpu_project(m->w_gate[l], m->xb, m->hb, D, HD);
            cpu_project(m->w_up[l], m->xb, m->hb2, D, HD);
        }
        #endif

#if QWEN_DEBUG
        if (l == 0 && pos == 0) {
            float gn = 0, un = 0;
            for (int i = 0; i < HD; i++) { gn += m->hb[i]*m->hb[i]; un += m->hb2[i]*m->hb2[i]; }
            printf("  L0 gate norm=%.4f up norm=%.4f\n", sqrtf(gn), sqrtf(un));
            printf("  L0 gate first4=[%.6f, %.6f, %.6f, %.6f]\n",
                   m->hb[0], m->hb[1], m->hb[2], m->hb[3]);
        }
#endif

        qwen_silu(m->hb, HD);
        // SiLU(gate) * up (vectorized element-wise multiply)
        vDSP_vmul(m->hb, 1, m->hb2, 1, m->hb, 1, (vDSP_Length)HD);

        // Down projection back to model dim
        float ffn_out[QWEN_DIM];
        #if USE_ANE_PROJECTIONS
        ane_project(m->k_down[l], m->hb, ffn_out, HD, D);
        #else
        if (m->weight_fmt == 3)
            cpu_project_q4_amx(m->wdown_q8[l], m->hb, ffn_out, HD, D);
        else if (m->weight_fmt == 2)
            cpu_project_q8(m->wdown_q8[l], m->hb, ffn_out, HD, D);
        else
            cpu_project(m->w_down[l], m->hb, ffn_out, HD, D);
        #endif

        // Residual (vectorized)
        vDSP_vadd(m->x, 1, ffn_out, 1, m->x, 1, (vDSP_Length)D);

#if QWEN_DEBUG
        if (l < 3 && pos == 0) {
            float hn = 0;
            for (int i = 0; i < D; i++) hn += m->x[i] * m->x[i];
            printf("  C hidden[%d] norm=%.4f first4=[%.4f, %.4f, %.4f, %.4f]\n",
                   l+1, sqrtf(hn), m->x[0], m->x[1], m->x[2], m->x[3]);
        }
#endif
    }

    // Final RMSNorm
    qwen_rmsnorm(m->xb, m->x, m->rms_final, D);

#if QWEN_DEBUG
    if (m->pos < 2) {
        float fn = 0;
        for (int i = 0; i < D; i++) fn += m->xb[i] * m->xb[i];
        printf("  Final hidden norm=%.4f (first 4: %.6f %.6f %.6f %.6f)\n",
               sqrtf(fn), m->xb[0], m->xb[1], m->xb[2], m->xb[3]);
    }
#endif

    // LM head via Accelerate BLAS (AMX, fastest for dim<=896).
    // Weights are tied to the embedding matrix (m->embed).
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                QWEN_VOCAB, D,
                1.0f, m->embed, D,
                m->xb, 1,
                0.0f, m->logits, 1);

#if QWEN_DEBUG
    if (m->pos < 2) {
        float lmax = m->logits[0], lmin = m->logits[0];
        int nonzero = 0;
        for (int i = 0; i < QWEN_VOCAB; i++) {
            if (m->logits[i] > lmax) lmax = m->logits[i];
            if (m->logits[i] < lmin) lmin = m->logits[i];
            if (m->logits[i] != 0.0f) nonzero++;
        }
        printf("  Logits: min=%.4f max=%.4f nonzero=%d/%d\n", lmin, lmax, nonzero, QWEN_VOCAB);
    }
#endif

    m->pos++;

    // Argmax (vDSP, single call over 151936 elements)
    float max_val;
    vDSP_Length max_idx_vdsp;
    vDSP_maxvi(m->logits, 1, &max_val, &max_idx_vdsp, (vDSP_Length)QWEN_VOCAB);
    return (int)max_idx_vdsp;
}

// ── ANE fused forward pass: ANE for matmuls, CPU for element-wise ops ──
// Uses fused QKV and Gate+Up kernels (112 total, under 119 ANE limit).
// O-proj and Down-proj remain as single conv kernels.

// ANE-offloaded variant of qwen_forward: the per-layer projections run on
// the Apple Neural Engine (fused QKV kernel, fused Gate+Up kernel, plus
// single kernels for O-proj and Down-proj) while RoPE, attention, softmax,
// SiLU, and residuals stay on CPU.  Identical contract to qwen_forward:
// advances m->pos, updates the KV cache, returns the greedy argmax token.
static int qwen_forward_ane(QwenModel *m, int token) {
    int D = QWEN_DIM, HD = QWEN_HIDDEN;
    int pos = m->pos;

    // Token embedding
    memcpy(m->x, m->embed + token * D, D * sizeof(float));

    for (int l = 0; l < QWEN_LAYERS; l++) {
        qwen_rmsnorm(m->xb, m->x, m->rms_att[l], D);

        // Fused QKV projection (1 ANE eval → Q, K, V)
        ane_project_qkv(m->k_qkv[l], m->xb, m->q, m->k, m->v,
                        D, QWEN_Q_DIM, QWEN_KV_DIM);

        // Biases (CPU, vectorized)
        if (m->q_bias[l])
            vDSP_vadd(m->q, 1, m->q_bias[l], 1, m->q, 1, (vDSP_Length)QWEN_Q_DIM);
        if (m->k_bias[l])
            vDSP_vadd(m->k, 1, m->k_bias[l], 1, m->k, 1, (vDSP_Length)QWEN_KV_DIM);
        if (m->v_bias[l])
            vDSP_vadd(m->v, 1, m->v_bias[l], 1, m->v, 1, (vDSP_Length)QWEN_KV_DIM);

        qwen_rope(m->q, m->k, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM);

        // Append this position's K/V to the cache
        memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM, m->k, QWEN_KV_DIM * sizeof(float));
        memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM, m->v, QWEN_KV_DIM * sizeof(float));

        // GQA attention (CPU) — same algorithm as qwen_forward
        float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM);
        float *attn_out = m->xb;   // reuse scratch; normed value already consumed
        memset(attn_out, 0, QWEN_Q_DIM * sizeof(float));

        for (int h = 0; h < QWEN_HEADS; h++) {
            int kv_h = h / QWEN_GQA_FACTOR;   // KV head shared by this Q head
            float *qh = m->q + h * QWEN_HEAD_DIM;
            float *att_h = m->att + h * QWEN_MAX_SEQ;
            int seq_len = pos + 1;

            // Scores with running max for numerically stable softmax
            float max_score = -1e9f;
            for (int t = 0; t <= pos; t++) {
                float *kt = m->kv_cache_k[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kt, 1);
                att_h[t] = score * scale;
                if (att_h[t] > max_score) max_score = att_h[t];
            }
            float neg_max = -max_score;
            vDSP_vsadd(att_h, 1, &neg_max, att_h, 1, (vDSP_Length)seq_len);
            int n_exp = seq_len;
            vvexpf(att_h, att_h, &n_exp);
            float sum;
            vDSP_sve(att_h, 1, &sum, (vDSP_Length)seq_len);
            float inv_sum = 1.0f / sum;
            vDSP_vsmul(att_h, 1, &inv_sum, att_h, 1, (vDSP_Length)seq_len);

            // Weighted sum of cached V
            for (int t = 0; t <= pos; t++) {
                float a = att_h[t];
                float *vt = m->kv_cache_v[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                cblas_saxpy(QWEN_HEAD_DIM, a, vt, 1, attn_out + h * QWEN_HEAD_DIM, 1);
            }
        }

        // O projection (single ANE kernel)
        float o_out[QWEN_DIM];
        ane_project(m->k_o[l], attn_out, o_out, QWEN_Q_DIM, D);

        vDSP_vadd(m->x, 1, o_out, 1, m->x, 1, (vDSP_Length)D);

        qwen_rmsnorm(m->xb, m->x, m->rms_ffn[l], D);

        // Fused Gate+Up projection (1 ANE eval → gate, up)
        ane_project_ffn_up(m->k_ffn_up[l], m->xb, m->hb, m->hb2, D, HD);

        // SwiGLU: SiLU(gate) * up
        qwen_silu(m->hb, HD);
        vDSP_vmul(m->hb, 1, m->hb2, 1, m->hb, 1, (vDSP_Length)HD);

        // Down projection (single ANE kernel)
        float ffn_out[QWEN_DIM];
        ane_project(m->k_down[l], m->hb, ffn_out, HD, D);

        vDSP_vadd(m->x, 1, ffn_out, 1, m->x, 1, (vDSP_Length)D);
    }

    qwen_rmsnorm(m->xb, m->x, m->rms_final, D);

    // LM head: CPU AMX (too large for ANE, 151936 outputs)
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                QWEN_VOCAB, D,
                1.0f, m->embed, D,
                m->xb, 1,
                0.0f, m->logits, 1);

    m->pos++;

    // Greedy decode: argmax over the full vocabulary
    float max_val;
    vDSP_Length max_idx_vdsp;
    vDSP_maxvi(m->logits, 1, &max_val, &max_idx_vdsp, (vDSP_Length)QWEN_VOCAB);
    return (int)max_idx_vdsp;
}

// ── Batched prefill: process all prompt tokens at once ────────────────
// Uses cblas_sgemm (matrix-matrix) instead of sequential sgemv calls.
// Returns the argmax token from the last position's logits.

// Batched F32 projection: Y[n_tokens, out_dim] = X[n_tokens, in_dim] @ W^T.
// W is stored row-major as [out_dim, in_dim], hence CblasTrans on B.
static void cpu_project_batch(const float *W, const float *X, float *Y,
                              int in_dim, int out_dim, int n_tokens) {
    // X[n_tokens, in_dim], W[out_dim, in_dim], Y[n_tokens, out_dim]
    // Y = X @ W^T  =>  Y(n,out) = sum_k X(n,k) * W(out,k)
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                n_tokens, out_dim, in_dim,
                1.0f, X, in_dim,
                W, in_dim,
                0.0f, Y, out_dim);
}

// F32 batched prefill: runs all n_tokens prompt positions through the model
// in one pass.  Projections use sgemm over the whole batch; bias/RoPE/
// attention remain per-token (attention at position t may only see the
// cache up to t, so it is inherently sequential here).  Fills the KV cache
// for positions [m->pos, m->pos + n_tokens), advances m->pos by n_tokens,
// and returns the greedy argmax from the LAST position's logits only.
static int qwen_prefill(QwenModel *m, const int *tokens, int n_tokens) {
    int D = QWEN_DIM, HD = QWEN_HIDDEN, N = n_tokens;

    // Per-batch scratch (qwen_calloc is the checked allocator; tag = owner)
    float *xs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_xs");
    float *xbs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_xbs");
    float *qs = (float*)qwen_calloc(N * QWEN_Q_DIM, sizeof(float), "prefill_qs");
    float *ks = (float*)qwen_calloc(N * QWEN_KV_DIM, sizeof(float), "prefill_ks");
    float *vs = (float*)qwen_calloc(N * QWEN_KV_DIM, sizeof(float), "prefill_vs");
    float *hbs = (float*)qwen_calloc(N * HD, sizeof(float), "prefill_hbs");
    float *hb2s = (float*)qwen_calloc(N * HD, sizeof(float), "prefill_hb2s");
    float *o_outs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_o_outs");
    float *ffn_outs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_ffn_outs");

    // Load all embeddings
    for (int t = 0; t < N; t++)
        memcpy(xs + t * D, m->embed + tokens[t] * D, D * sizeof(float));

    for (int l = 0; l < QWEN_LAYERS; l++) {
        // Batch RMSNorm
        for (int t = 0; t < N; t++)
            qwen_rmsnorm(xbs + t * D, xs + t * D, m->rms_att[l], D);

        // Batch QKV projections: sgemm
        cpu_project_batch(m->wq[l], xbs, qs, D, QWEN_Q_DIM, N);
        cpu_project_batch(m->wk[l], xbs, ks, D, QWEN_KV_DIM, N);
        cpu_project_batch(m->wv[l], xbs, vs, D, QWEN_KV_DIM, N);

        // Per-token: bias + RoPE + cache + attention
        for (int t = 0; t < N; t++) {
            float *qt = qs + t * QWEN_Q_DIM;
            float *kt = ks + t * QWEN_KV_DIM;
            float *vt = vs + t * QWEN_KV_DIM;
            int pos = m->pos + t;

            // Biases
            if (m->q_bias[l])
                vDSP_vadd(qt, 1, m->q_bias[l], 1, qt, 1, (vDSP_Length)QWEN_Q_DIM);
            if (m->k_bias[l])
                vDSP_vadd(kt, 1, m->k_bias[l], 1, kt, 1, (vDSP_Length)QWEN_KV_DIM);
            if (m->v_bias[l])
                vDSP_vadd(vt, 1, m->v_bias[l], 1, vt, 1, (vDSP_Length)QWEN_KV_DIM);

            // RoPE
            qwen_rope(qt, kt, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM);

            // Store K, V in cache (must precede this token's attention below)
            memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM, kt, QWEN_KV_DIM * sizeof(float));
            memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM, vt, QWEN_KV_DIM * sizeof(float));

            // GQA attention over cache positions [0, pos] (causal)
            float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM);
            float *attn_out = xbs + t * D;   // reuse normed-input slot as output
            memset(attn_out, 0, QWEN_Q_DIM * sizeof(float));

            for (int h = 0; h < QWEN_HEADS; h++) {
                int kv_h = h / QWEN_GQA_FACTOR;
                float *qh = qt + h * QWEN_HEAD_DIM;
                float *att_h = m->att + h * QWEN_MAX_SEQ;
                int seq_len = pos + 1;

                float max_score = -1e9f;
                for (int p = 0; p <= pos; p++) {
                    float *kp = m->kv_cache_k[l] + p * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                    float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kp, 1);
                    att_h[p] = score * scale;
                    if (att_h[p] > max_score) max_score = att_h[p];
                }
                // Stable softmax via vDSP/vvexpf
                float neg_max = -max_score;
                vDSP_vsadd(att_h, 1, &neg_max, att_h, 1, (vDSP_Length)seq_len);
                int n_exp = seq_len;
                vvexpf(att_h, att_h, &n_exp);
                float sum;
                vDSP_sve(att_h, 1, &sum, (vDSP_Length)seq_len);
                float inv_sum = 1.0f / sum;
                vDSP_vsmul(att_h, 1, &inv_sum, att_h, 1, (vDSP_Length)seq_len);

                for (int p = 0; p <= pos; p++) {
                    float a = att_h[p];
                    float *vp = m->kv_cache_v[l] + p * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                    cblas_saxpy(QWEN_HEAD_DIM, a, vp, 1,
                                attn_out + h * QWEN_HEAD_DIM, 1);
                }
            }
        }
        // xbs now has [N, Q_DIM] attention outputs

        // Batch O projection (reuses pre-allocated o_outs)
        cpu_project_batch(m->wo[l], xbs, o_outs, QWEN_Q_DIM, D, N);

        // Residual
        for (int t = 0; t < N; t++)
            vDSP_vadd(xs + t * D, 1, o_outs + t * D, 1, xs + t * D, 1, (vDSP_Length)D);

        // Batch FFN RMSNorm
        for (int t = 0; t < N; t++)
            qwen_rmsnorm(xbs + t * D, xs + t * D, m->rms_ffn[l], D);

        // Batch FFN projections
        cpu_project_batch(m->w_gate[l], xbs, hbs, D, HD, N);
        cpu_project_batch(m->w_up[l], xbs, hb2s, D, HD, N);

        // SwiGLU per token
        for (int t = 0; t < N; t++) {
            qwen_silu(hbs + t * HD, HD);
            vDSP_vmul(hbs + t * HD, 1, hb2s + t * HD, 1, hbs + t * HD, 1, (vDSP_Length)HD);
        }

        // Batch down projection (reuses pre-allocated ffn_outs)
        cpu_project_batch(m->w_down[l], hbs, ffn_outs, HD, D, N);

        // Residual
        for (int t = 0; t < N; t++)
            vDSP_vadd(xs + t * D, 1, ffn_outs + t * D, 1, xs + t * D, 1, (vDSP_Length)D);
    }

    // Only need logits for the last token
    float *last_x = xs + (N - 1) * D;
    qwen_rmsnorm(m->xb, last_x, m->rms_final, D);

    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                QWEN_VOCAB, D,
                1.0f, m->embed, D,
                m->xb, 1,
                0.0f, m->logits, 1);

    m->pos += N;

    float max_val;
    vDSP_Length max_idx_vdsp;
    vDSP_maxvi(m->logits, 1, &max_val, &max_idx_vdsp, (vDSP_Length)QWEN_VOCAB);

    free(xs); free(xbs); free(qs); free(ks); free(vs); free(hbs); free(hb2s);
    free(o_outs); free(ffn_outs);
    return (int)max_idx_vdsp;
}

// Q4 AMX batched prefill: identical algorithm to qwen_prefill, but every
// weight projection goes through cpu_project_batch_q4_amx (dequantize the
// Q4 matrix once, then sgemm).  Kept as a separate function mirroring the
// F32 path rather than adding format branches to the hot prefill loop.
static int qwen_prefill_q4(QwenModel *m, const int *tokens, int n_tokens) {
    int D = QWEN_DIM, HD = QWEN_HIDDEN, N = n_tokens;

    float *xs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_q4_xs");
    float *xbs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_q4_xbs");
    float *qs = (float*)qwen_calloc(N * QWEN_Q_DIM, sizeof(float), "prefill_q4_qs");
    float *ks = (float*)qwen_calloc(N * QWEN_KV_DIM, sizeof(float), "prefill_q4_ks");
    float *vs = (float*)qwen_calloc(N * QWEN_KV_DIM, sizeof(float), "prefill_q4_vs");
    float *hbs = (float*)qwen_calloc(N * HD, sizeof(float), "prefill_q4_hbs");
    float *hb2s = (float*)qwen_calloc(N * HD, sizeof(float), "prefill_q4_hb2s");
    float *o_outs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_q4_o_outs");
    float *ffn_outs = (float*)qwen_calloc(N * D, sizeof(float), "prefill_q4_ffn_outs");

    // Load all embeddings
    for (int t = 0; t < N; t++)
        memcpy(xs + t * D, m->embed + tokens[t] * D, D * sizeof(float));

    for (int l = 0; l < QWEN_LAYERS; l++) {
        for (int t = 0; t < N; t++)
            qwen_rmsnorm(xbs + t * D, xs + t * D, m->rms_att[l], D);

        // Batched Q4 QKV projections (note: Q4 weights live in *_q8 fields)
        cpu_project_batch_q4_amx(m->wq_q8[l], xbs, qs, D, QWEN_Q_DIM, N);
        cpu_project_batch_q4_amx(m->wk_q8[l], xbs, ks, D, QWEN_KV_DIM, N);
        cpu_project_batch_q4_amx(m->wv_q8[l], xbs, vs, D, QWEN_KV_DIM, N);

        // Per-token: bias + RoPE + cache + causal GQA attention
        for (int t = 0; t < N; t++) {
            float *qt = qs + t * QWEN_Q_DIM;
            float *kt = ks + t * QWEN_KV_DIM;
            float *vt = vs + t * QWEN_KV_DIM;
            int pos = m->pos + t;

            if (m->q_bias[l])
                vDSP_vadd(qt, 1, m->q_bias[l], 1, qt, 1, (vDSP_Length)QWEN_Q_DIM);
            if (m->k_bias[l])
                vDSP_vadd(kt, 1, m->k_bias[l], 1, kt, 1, (vDSP_Length)QWEN_KV_DIM);
            if (m->v_bias[l])
                vDSP_vadd(vt, 1, m->v_bias[l], 1, vt, 1, (vDSP_Length)QWEN_KV_DIM);

            qwen_rope(qt, kt, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM);

            memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM, kt, QWEN_KV_DIM * sizeof(float));
            memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM, vt, QWEN_KV_DIM * sizeof(float));

            float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM);
            float *attn_out = xbs + t * D;
            memset(attn_out, 0, QWEN_Q_DIM * sizeof(float));

            for (int h = 0; h < QWEN_HEADS; h++) {
                int kv_h = h / QWEN_GQA_FACTOR;
                float *qh = qt + h * QWEN_HEAD_DIM;
                float *att_h = m->att + h * QWEN_MAX_SEQ;
                int seq_len = pos + 1;

                float max_score = -1e9f;
                for (int p = 0; p <= pos; p++) {
                    float *kp = m->kv_cache_k[l] + p * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                    float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kp, 1);
                    att_h[p] = score * scale;
                    if (att_h[p] > max_score) max_score = att_h[p];
                }
                float neg_max = -max_score;
                vDSP_vsadd(att_h, 1, &neg_max, att_h, 1, (vDSP_Length)seq_len);
                int n_exp = seq_len;
                vvexpf(att_h, att_h, &n_exp);
                float sum;
                vDSP_sve(att_h, 1, &sum, (vDSP_Length)seq_len);
                float inv_sum = 1.0f / sum;
                vDSP_vsmul(att_h, 1, &inv_sum, att_h, 1, (vDSP_Length)seq_len);

                for (int p = 0; p <= pos; p++) {
                    float a = att_h[p];
                    float *vp = m->kv_cache_v[l] + p * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM;
                    cblas_saxpy(QWEN_HEAD_DIM, a, vp, 1,
                                attn_out + h * QWEN_HEAD_DIM, 1);
                }
            }
        }

        cpu_project_batch_q4_amx(m->wo_q8[l], xbs, o_outs, QWEN_Q_DIM, D, N);

        for (int t = 0; t < N; t++)
            vDSP_vadd(xs + t * D, 1, o_outs + t * D, 1, xs + t * D, 1, (vDSP_Length)D);

        for (int t = 0; t < N; t++)
            qwen_rmsnorm(xbs + t * D, xs + t * D, m->rms_ffn[l], D);

        cpu_project_batch_q4_amx(m->wgate_q8[l], xbs, hbs, D, HD, N);
        cpu_project_batch_q4_amx(m->wup_q8[l], xbs, hb2s, D, HD, N);

        for (int t = 0; t < N; t++) {
            qwen_silu(hbs + t * HD, HD);
            vDSP_vmul(hbs + t * HD, 1, hb2s + t * HD, 1, hbs + t * HD, 1, (vDSP_Length)HD);
        }

        cpu_project_batch_q4_amx(m->wdown_q8[l], hbs, ffn_outs, HD, D, N);

        for (int t = 0; t < N; t++)
            vDSP_vadd(xs + t * D, 1, ffn_outs + t * D, 1, xs + t * D, 1, (vDSP_Length)D);
    }

    // Logits for the last position only
    float *last_x = xs + (N - 1) * D;
    qwen_rmsnorm(m->xb, last_x, m->rms_final, D);

    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                QWEN_VOCAB, D,
                1.0f, m->embed, D,
                m->xb, 1,
                0.0f, m->logits, 1);

    m->pos += N;

    float max_val;
    vDSP_Length max_idx_vdsp;
    vDSP_maxvi(m->logits, 1, &max_val, &max_idx_vdsp, (vDSP_Length)QWEN_VOCAB);

    free(xs); free(xbs); free(qs); free(ks); free(vs); free(hbs); free(hb2s);
    free(o_outs); free(ffn_outs);
    return (int)max_idx_vdsp;
}

// ── Full-GPU forward pass (Metal, single command buffer per layer) ────
// Runs entire transformer on GPU using Q4 quantized weights.
// KV cache stays on GPU between calls. Attention runs per-head on GPU.
#ifdef __OBJC__

// SIMD-optimized Q4 matvec with optional bias fusion.
// 2 SIMD groups x 4 rows each = 8 rows/threadgroup, simd_sum reduction.

// Encodes one y = W*x dispatch of the fast Q4 matvec shader (sgemv_q4_fast)
// onto an open compute encoder, optionally fusing a bias add.
// NOTE(review): the `id` parameter/variable types throughout this Metal
// section appear to have lost their protocol qualifiers (id<MTLBuffer>,
// id<MTLComputePipelineState>, ...) in extraction — confirm against the
// original file.  The `m` parameter is unused here.
static void gpu_encode_sgemv_q4_bias(id enc, QwenModel *m,
                                     id w_buf, id x_buf, id y_buf,
                                     uint32_t in_dim, uint32_t out_dim,
                                     id bias_buf) {
    id pso = (__bridge id)g_metal.pipeline_q4_fast;
    [enc setComputePipelineState:pso];
    [enc setBuffer:w_buf offset:0 atIndex:0];
    [enc setBuffer:x_buf offset:0 atIndex:1];
    [enc setBuffer:y_buf offset:0 atIndex:2];
    [enc setBytes:&in_dim length:4 atIndex:3];
    [enc setBytes:&out_dim length:4 atIndex:4];

    // Buffer slot 5 must always be bound; when there is no bias we bind
    // y_buf as a dummy and tell the kernel to ignore it via use_bias=0.
    uint32_t use_bias = (bias_buf != nil) ? 1 : 0;
    if (bias_buf) {
        [enc setBuffer:bias_buf offset:0 atIndex:5];
    } else {
        [enc setBuffer:y_buf offset:0 atIndex:5];
    }
    [enc setBytes:&use_bias length:4 atIndex:6];

    // 8 output rows per threadgroup (2 SIMD groups x 4 rows), 64 threads/tg.
    uint32_t rows_per_tg = 8;
    uint32_t n_tg = (out_dim + rows_per_tg - 1) / rows_per_tg;
    [enc dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1)
        threadsPerThreadgroup:MTLSizeMake(64, 1, 1)];
}

// Bias-less convenience wrapper over gpu_encode_sgemv_q4_bias.
static void gpu_encode_sgemv_q4(id enc, QwenModel *m,
                                id w_buf, id x_buf, id y_buf,
                                uint32_t in_dim, uint32_t out_dim) {
    gpu_encode_sgemv_q4_bias(enc, m, w_buf, x_buf, y_buf, in_dim, out_dim, nil);
}

// Encodes a plain F32 matvec (sgemv_f32 shader): one thread per output row.
static void gpu_encode_sgemv_f32(id enc,
                                 id w_buf, id x_buf, id y_buf,
                                 uint32_t in_dim, uint32_t out_dim) {
    id pso = (__bridge id)g_metal.pipeline_f32;
    [enc setComputePipelineState:pso];
    [enc setBuffer:w_buf offset:0 atIndex:0];
    [enc setBuffer:x_buf offset:0 atIndex:1];
    [enc setBuffer:y_buf offset:0 atIndex:2];
    [enc setBytes:&in_dim length:4 atIndex:3];
    [enc setBytes:&out_dim length:4 atIndex:4];
    NSUInteger tpg = pso.maxTotalThreadsPerThreadgroup;
    if (tpg > out_dim) tpg = out_dim;
    [enc dispatchThreads:MTLSizeMake(out_dim, 1, 1) threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)];
}

// Fused gate+up+silu: reads x once, computes silu(Wg*x)*Wu*x in one kernel.
static void gpu_encode_fused_ffn(id enc,
                                 id wgate_buf, id wup_buf,
                                 id x_buf, id out_buf,
                                 uint32_t in_dim, uint32_t out_dim) {
    id pso = (__bridge id)g_metal.pipeline_q4_fused_ffn;
    [enc setComputePipelineState:pso];
    [enc setBuffer:wgate_buf offset:0 atIndex:0];
    [enc setBuffer:wup_buf offset:0 atIndex:1];
    [enc setBuffer:x_buf offset:0 atIndex:2];
    [enc setBuffer:out_buf offset:0 atIndex:3];
    [enc setBytes:&in_dim length:4 atIndex:4];
    [enc setBytes:&out_dim length:4 atIndex:5];

    uint32_t rows_per_tg = 4; // FUSED_ROWS_PER_SIMD(2) * FUSED_SIMD_GROUPS(2)
    uint32_t n_tg = (out_dim + rows_per_tg - 1) / rows_per_tg;
    [enc dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1)
        threadsPerThreadgroup:MTLSizeMake(64, 1, 1)];
}

// Full-GPU single-token forward pass: encodes all layers plus the final
// norm into one command buffer; activations and KV cache live in GPU
// buffers between calls.  (Function continues past this chunk.)
static int qwen_forward_gpu(QwenModel *m, int token) {
    int D = QWEN_DIM, HD = QWEN_HIDDEN;
    int pos = m->pos;
    uint32_t uD = (uint32_t)D, uHD = (uint32_t)HD;
    uint32_t uQD = (uint32_t)QWEN_Q_DIM, uKVD = (uint32_t)QWEN_KV_DIM;

    id dev = (__bridge id)g_metal.device;
    id queue = (__bridge id)g_metal.queue;

    // Lazily-created, process-lifetime activation buffers (shared storage
    // so the CPU can read logits/results back without a blit).
    static id gpu_x = nil, gpu_xb = nil;
    static id gpu_q = nil, gpu_k = nil, gpu_v = nil;
    static id gpu_hb = nil, gpu_hb2 = nil;
    static id gpu_attn_out = nil;
    static id gpu_o_out = nil, gpu_ffn_out = nil;
    static id gpu_logits = nil;
    static id gpu_att = nil;
    static id gpu_result = nil;
    static id gpu_rope_cos = nil, gpu_rope_sin = nil;

    if (!gpu_x) {
        gpu_x = [dev newBufferWithLength:D * 4 options:MTLResourceStorageModeShared];
        gpu_xb = [dev newBufferWithLength:D * 4 options:MTLResourceStorageModeShared];
        gpu_q = [dev newBufferWithLength:QWEN_Q_DIM * 4 options:MTLResourceStorageModeShared];
        gpu_k = [dev newBufferWithLength:QWEN_KV_DIM * 4 options:MTLResourceStorageModeShared];
        gpu_v = [dev newBufferWithLength:QWEN_KV_DIM * 4 options:MTLResourceStorageModeShared];
        gpu_hb = [dev newBufferWithLength:HD * 4 options:MTLResourceStorageModeShared];
        gpu_hb2 = [dev newBufferWithLength:HD * 4 options:MTLResourceStorageModeShared];
        gpu_attn_out = [dev newBufferWithLength:QWEN_Q_DIM * 4 options:MTLResourceStorageModeShared];
        gpu_o_out = [dev newBufferWithLength:D * 4 options:MTLResourceStorageModeShared];
        gpu_ffn_out = [dev newBufferWithLength:D * 4 options:MTLResourceStorageModeShared];
        gpu_logits = [dev newBufferWithLength:QWEN_VOCAB * 4 options:MTLResourceStorageModeShared];
        gpu_att = [dev newBufferWithLength:QWEN_HEADS * QWEN_MAX_SEQ * 4 options:MTLResourceStorageModeShared];
        gpu_result = [dev newBufferWithLength:4 options:MTLResourceStorageModeShared];

        // Upload the precomputed RoPE cos/sin tables once.
        qwen_rope_init();
        gpu_rope_cos = [dev newBufferWithLength:sizeof(g_rope_cos) options:MTLResourceStorageModeShared];
        gpu_rope_sin = [dev newBufferWithLength:sizeof(g_rope_sin) options:MTLResourceStorageModeShared];
        memcpy([gpu_rope_cos contents], g_rope_cos, sizeof(g_rope_cos));
        memcpy([gpu_rope_sin contents], g_rope_sin, sizeof(g_rope_sin));
    }

    id pso_rms = (__bridge id)g_metal.pipeline_rms;
    id pso_rope = (__bridge id)g_metal.pipeline_rope;
    id pso_silu = (__bridge id)g_metal.pipeline_silu;
    id pso_add = (__bridge id)g_metal.pipeline_add;
    id pso_bias = (__bridge id)g_metal.pipeline_bias;
    id pso_embed = (__bridge id)g_metal.pipeline_embed;
    id pso_attn_score = (__bridge id)g_metal.pipeline_attn_score;
    id pso_softmax = (__bridge id)g_metal.pipeline_softmax;
    id pso_attn_wsum = (__bridge id)g_metal.pipeline_attn_wsum;
    id pso_argmax = (__bridge id)g_metal.pipeline_argmax;
    id pso_zero = (__bridge id)g_metal.pipeline_zero;
    id pso_copy = (__bridge id)g_metal.pipeline_copy;

    float rms_eps = QWEN_RMS_EPS;
    uint32_t utoken = (uint32_t)token;
    uint32_t seq_len = (uint32_t)(pos + 1);
    float attn_scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM);
    uint32_t un_q = QWEN_HEADS, un_kv = QWEN_KV_HEADS, uhd = QWEN_HEAD_DIM;

    // Encode ALL 24 layers + final into ONE command buffer.
    // Metal guarantees sequential execution of dispatches within a command encoder,
    // so data dependencies (KV cache reads after writes) are satisfied by dispatch order.
    id cmd = [queue commandBuffer];
    id enc = [cmd computeCommandEncoder];

    // Embedding
    [enc setComputePipelineState:pso_embed];
    [enc setBuffer:(__bridge id)m->gpu_embed offset:0 atIndex:0];
    [enc setBuffer:gpu_x offset:0 atIndex:1];
    [enc setBytes:&utoken length:4 atIndex:2];
    [enc setBytes:&uD length:4 atIndex:3];
    [enc dispatchThreads:MTLSizeMake(D, 1, 1) threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)D, pso_embed.maxTotalThreadsPerThreadgroup), 1, 1)];

    for (int l = 0; l < QWEN_LAYERS; l++) {
        // RMSNorm attention (single threadgroup; size = next pow2 of D, max 1024)
        [enc setComputePipelineState:pso_rms];
        [enc setBuffer:gpu_x offset:0 atIndex:0];
        [enc setBuffer:(__bridge id)m->gpu_rms_att[l] offset:0 atIndex:1];
        [enc setBuffer:gpu_xb offset:0 atIndex:2];
        [enc setBytes:&uD length:4 atIndex:3];
        [enc setBytes:&rms_eps length:4 atIndex:4];
        { NSUInteger p = 1; while (p < (NSUInteger)D) p <<= 1; if (p > 1024) p = 1024;
          [enc dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; }

        // QKV with fused bias (saves 3 bias_add dispatches per layer)
        gpu_encode_sgemv_q4_bias(enc, m, (__bridge id)m->gpu_wq[l], gpu_xb, gpu_q, uD, uQD,
                                 (__bridge id)m->gpu_q_bias[l]);
        gpu_encode_sgemv_q4_bias(enc, m, (__bridge id)m->gpu_wk[l], gpu_xb, gpu_k, uD, uKVD,
                                 (__bridge id)m->gpu_k_bias[l]);
        gpu_encode_sgemv_q4_bias(enc, m, (__bridge id)m->gpu_wv[l], gpu_xb, gpu_v, uD, uKVD,
                                 (__bridge id)m->gpu_v_bias[l]);

        // RoPE (cos/sin tables indexed by byte offset for this position)
        uint32_t rope_offset = (uint32_t)pos * (QWEN_HEAD_DIM / 2);
        [enc setComputePipelineState:pso_rope];
        [enc setBuffer:gpu_q offset:0 atIndex:0];
        [enc setBuffer:gpu_k offset:0 atIndex:1];
        [enc setBuffer:gpu_rope_cos offset:rope_offset * 4 atIndex:2];
        [enc setBuffer:gpu_rope_sin offset:rope_offset * 4 atIndex:3];
        [enc setBytes:&un_q length:4 atIndex:4];
        [enc setBytes:&un_kv length:4 atIndex:5];
        [enc setBytes:&uhd length:4 atIndex:6];
        { uint32_t total = (QWEN_HEADS + QWEN_KV_HEADS) * (QWEN_HEAD_DIM / 2);
          [enc dispatchThreads:MTLSizeMake(total, 1, 1)
             threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)total, pso_rope.maxTotalThreadsPerThreadgroup), 1, 1)]; }

        // Store K, V into KV cache (vec_copy into position-offset slice)
        [enc setComputePipelineState:pso_copy];
        [enc setBuffer:gpu_k offset:0 atIndex:0];
        [enc setBuffer:(__bridge id)m->gpu_kv_cache_k[l] offset:(NSUInteger)pos * QWEN_KV_DIM * 4 atIndex:1];
        [enc setBytes:&uKVD length:4 atIndex:2];
        [enc dispatchThreads:MTLSizeMake(QWEN_KV_DIM, 1, 1)
            threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)QWEN_KV_DIM, pso_copy.maxTotalThreadsPerThreadgroup), 1, 1)];

        [enc setBuffer:gpu_v offset:0 atIndex:0];
        [enc setBuffer:(__bridge id)m->gpu_kv_cache_v[l] offset:(NSUInteger)pos * QWEN_KV_DIM * 4 atIndex:1];
        [enc setBytes:&uKVD length:4 atIndex:2];
        [enc dispatchThreads:MTLSizeMake(QWEN_KV_DIM, 1, 1)
            threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)QWEN_KV_DIM, pso_copy.maxTotalThreadsPerThreadgroup), 1, 1)];

        // Batched attention: all 14 Q heads in 3 dispatches (was 42)
        {
            uint32_t un_q_heads = QWEN_HEADS;
            uint32_t u_gqa = QWEN_GQA_FACTOR;
            uint32_t u_max_seq = QWEN_MAX_SEQ;

            id pso_score_b = (__bridge id)g_metal.pipeline_attn_score_b;
            id pso_soft_b = (__bridge id)g_metal.pipeline_softmax_b;
            id pso_wsum_b = (__bridge id)g_metal.pipeline_attn_wsum_b;

            // 1. 
Batched attn score: grid (seq_len, n_q_heads) + [enc setComputePipelineState:pso_score_b]; + [enc setBuffer:gpu_q offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_kv_cache_k[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_att offset:0 atIndex:2]; + [enc setBytes:&uhd length:4 atIndex:3]; + [enc setBytes:&uKVD length:4 atIndex:4]; + [enc setBytes:&un_q_heads length:4 atIndex:5]; + [enc setBytes:&u_gqa length:4 atIndex:6]; + [enc setBytes:&attn_scale length:4 atIndex:7]; + [enc setBytes:&seq_len length:4 atIndex:8]; + [enc setBytes:&u_max_seq length:4 atIndex:9]; + { NSUInteger tpg_x = MIN((NSUInteger)seq_len, (NSUInteger)256); + NSUInteger tpg_y = MIN((NSUInteger)QWEN_HEADS, (NSUInteger)(pso_score_b.maxTotalThreadsPerThreadgroup / tpg_x)); + if (tpg_y < 1) tpg_y = 1; + [enc dispatchThreads:MTLSizeMake(seq_len, QWEN_HEADS, 1) + threadsPerThreadgroup:MTLSizeMake(tpg_x, tpg_y, 1)]; } + + // 2. Batched softmax: one threadgroup per head + [enc setComputePipelineState:pso_soft_b]; + [enc setBuffer:gpu_att offset:0 atIndex:0]; + [enc setBytes:&seq_len length:4 atIndex:1]; + [enc setBytes:&u_max_seq length:4 atIndex:2]; + [enc setBytes:&un_q_heads length:4 atIndex:3]; + { NSUInteger p = 1; while (p < (NSUInteger)seq_len && p < 1024) p <<= 1; + [enc dispatchThreadgroups:MTLSizeMake(QWEN_HEADS, 1, 1) + threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + // 3. 
Batched weighted sum: grid (head_dim, n_q_heads) + [enc setComputePipelineState:pso_wsum_b]; + [enc setBuffer:gpu_att offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_kv_cache_v[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_attn_out offset:0 atIndex:2]; + [enc setBytes:&uhd length:4 atIndex:3]; + [enc setBytes:&uKVD length:4 atIndex:4]; + [enc setBytes:&un_q_heads length:4 atIndex:5]; + [enc setBytes:&u_gqa length:4 atIndex:6]; + [enc setBytes:&seq_len length:4 atIndex:7]; + [enc setBytes:&u_max_seq length:4 atIndex:8]; + { NSUInteger tpg_x = MIN((NSUInteger)QWEN_HEAD_DIM, (NSUInteger)64); + NSUInteger tpg_y = MIN((NSUInteger)QWEN_HEADS, (NSUInteger)(pso_wsum_b.maxTotalThreadsPerThreadgroup / tpg_x)); + if (tpg_y < 1) tpg_y = 1; + [enc dispatchThreads:MTLSizeMake(QWEN_HEAD_DIM, QWEN_HEADS, 1) + threadsPerThreadgroup:MTLSizeMake(tpg_x, tpg_y, 1)]; } + } + + // O projection + residual + gpu_encode_sgemv_q4(enc, m, (__bridge id)m->gpu_wo[l], gpu_attn_out, gpu_o_out, uQD, uD); + + [enc setComputePipelineState:pso_add]; + [enc setBuffer:gpu_x offset:0 atIndex:0]; + [enc setBuffer:gpu_o_out offset:0 atIndex:1]; + [enc setBuffer:gpu_x offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc dispatchThreads:MTLSizeMake(D, 1, 1) + threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)D, pso_add.maxTotalThreadsPerThreadgroup), 1, 1)]; + + // FFN + [enc setComputePipelineState:pso_rms]; + [enc setBuffer:gpu_x offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_rms_ffn[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_xb offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc setBytes:&rms_eps length:4 atIndex:4]; + { NSUInteger p = 1; while (p < (NSUInteger)D) p <<= 1; if (p > 1024) p = 1024; + [enc dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + // Fused gate+up+silu: one kernel reads xb, computes silu(Wg*xb)*Wu*xb + gpu_encode_fused_ffn(enc, + (__bridge id)m->gpu_wgate[l], + (__bridge 
id)m->gpu_wup[l], + gpu_xb, gpu_hb, uD, uHD); + + gpu_encode_sgemv_q4(enc, m, (__bridge id)m->gpu_wdown[l], gpu_hb, gpu_ffn_out, uHD, uD); + + [enc setComputePipelineState:pso_add]; + [enc setBuffer:gpu_x offset:0 atIndex:0]; + [enc setBuffer:gpu_ffn_out offset:0 atIndex:1]; + [enc setBuffer:gpu_x offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc dispatchThreads:MTLSizeMake(D, 1, 1) + threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)D, pso_add.maxTotalThreadsPerThreadgroup), 1, 1)]; + } + + // Final RMSNorm + LM Head + argmax (still in the SAME command buffer) + [enc setComputePipelineState:pso_rms]; + [enc setBuffer:gpu_x offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_rms_final offset:0 atIndex:1]; + [enc setBuffer:gpu_xb offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc setBytes:&rms_eps length:4 atIndex:4]; + { NSUInteger p = 1; while (p < (NSUInteger)D) p <<= 1; if (p > 1024) p = 1024; + [enc dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + uint32_t uVocab = QWEN_VOCAB; + gpu_encode_sgemv_f32(enc, (__bridge id)m->gpu_embed, gpu_xb, gpu_logits, uD, uVocab); + + [enc setComputePipelineState:pso_argmax]; + [enc setBuffer:gpu_logits offset:0 atIndex:0]; + [enc setBuffer:gpu_result offset:0 atIndex:1]; + [enc setBytes:&uVocab length:4 atIndex:2]; + { NSUInteger tpg = MIN((NSUInteger)1024, pso_argmax.maxTotalThreadsPerThreadgroup); + [enc dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)]; } + + [enc endEncoding]; + [cmd commit]; + [cmd waitUntilCompleted]; + + m->pos++; + + int *result_ptr = (int*)[gpu_result contents]; + return result_ptr[0]; +} + +// ── GPU batched prefill: all N prompt tokens in one command buffer ──── +// Uses sgemm_q4 (matrix-matrix) instead of sequential sgemv calls. +// Reads each weight matrix once for all N tokens instead of N times. 
+static int qwen_prefill_gpu(QwenModel *m, const int *tokens, int n_tokens) { + int D = QWEN_DIM, HD = QWEN_HIDDEN, N = n_tokens; + uint32_t uD = (uint32_t)D, uHD = (uint32_t)HD; + uint32_t uQD = (uint32_t)QWEN_Q_DIM, uKVD = (uint32_t)QWEN_KV_DIM; + uint32_t uN = (uint32_t)N; + float rms_eps = QWEN_RMS_EPS; + float attn_scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM); + uint32_t uhd = QWEN_HEAD_DIM; + uint32_t un_q = QWEN_HEADS, un_kv = QWEN_KV_HEADS; + uint32_t u_gqa = QWEN_GQA_FACTOR; + uint32_t u_max_seq = QWEN_MAX_SEQ; + + id dev = (__bridge id)g_metal.device; + id queue = (__bridge id)g_metal.queue; + + // Static batch buffers: allocated once at QWEN_MAX_SEQ size, reused across calls + static id gpu_xs = nil, gpu_xbs = nil; + static id gpu_qs = nil, gpu_ks = nil, gpu_vs = nil; + static id gpu_hbs = nil; + static id gpu_attn_outs = nil, gpu_o_outs = nil, gpu_ffn_outs = nil; + static id gpu_att = nil, gpu_logits = nil, gpu_result = nil; + static id gpu_token_ids = nil; + static id gpu_rope_cos = nil, gpu_rope_sin = nil; + static id gpu_xb_last = nil; + + if (!gpu_xs) { + NSUInteger maxN = QWEN_MAX_SEQ; + gpu_xs = [dev newBufferWithLength:maxN * D * 4 options:MTLResourceStorageModeShared]; + gpu_xbs = [dev newBufferWithLength:maxN * D * 4 options:MTLResourceStorageModeShared]; + gpu_qs = [dev newBufferWithLength:maxN * QWEN_Q_DIM * 4 options:MTLResourceStorageModeShared]; + gpu_ks = [dev newBufferWithLength:maxN * QWEN_KV_DIM * 4 options:MTLResourceStorageModeShared]; + gpu_vs = [dev newBufferWithLength:maxN * QWEN_KV_DIM * 4 options:MTLResourceStorageModeShared]; + gpu_hbs = [dev newBufferWithLength:maxN * HD * 4 options:MTLResourceStorageModeShared]; + gpu_attn_outs = [dev newBufferWithLength:maxN * QWEN_Q_DIM * 4 options:MTLResourceStorageModeShared]; + gpu_o_outs = [dev newBufferWithLength:maxN * D * 4 options:MTLResourceStorageModeShared]; + gpu_ffn_outs = [dev newBufferWithLength:maxN * D * 4 options:MTLResourceStorageModeShared]; + gpu_att = [dev 
newBufferWithLength:QWEN_HEADS * QWEN_MAX_SEQ * 4 options:MTLResourceStorageModeShared]; + gpu_logits = [dev newBufferWithLength:QWEN_VOCAB * 4 options:MTLResourceStorageModeShared]; + gpu_result = [dev newBufferWithLength:4 options:MTLResourceStorageModeShared]; + gpu_token_ids = [dev newBufferWithLength:maxN * sizeof(int) options:MTLResourceStorageModeShared]; + gpu_xb_last = [dev newBufferWithLength:D * 4 options:MTLResourceStorageModeShared]; + + qwen_rope_init(); + gpu_rope_cos = [dev newBufferWithLength:sizeof(g_rope_cos) options:MTLResourceStorageModeShared]; + gpu_rope_sin = [dev newBufferWithLength:sizeof(g_rope_sin) options:MTLResourceStorageModeShared]; + memcpy([gpu_rope_cos contents], g_rope_cos, sizeof(g_rope_cos)); + memcpy([gpu_rope_sin contents], g_rope_sin, sizeof(g_rope_sin)); + } + + memcpy([gpu_token_ids contents], tokens, (NSUInteger)N * sizeof(int)); + + // Pipeline states + id pso_sgemm_q4 = (__bridge id)g_metal.pipeline_sgemm_q4; + id pso_sgemm_ffn = (__bridge id)g_metal.pipeline_sgemm_q4_fused_ffn; + id pso_rms_b = (__bridge id)g_metal.pipeline_rms_batched; + id pso_embed_b = (__bridge id)g_metal.pipeline_embed_batched; + id pso_rope_b = (__bridge id)g_metal.pipeline_rope_batched; + id pso_add_b = (__bridge id)g_metal.pipeline_add_batched; + id pso_copy = (__bridge id)g_metal.pipeline_copy; + id pso_rms = (__bridge id)g_metal.pipeline_rms; + id pso_argmax = (__bridge id)g_metal.pipeline_argmax; + id pso_score_b = (__bridge id)g_metal.pipeline_attn_score_b; + id pso_soft_b = (__bridge id)g_metal.pipeline_softmax_b; + id pso_wsum_b = (__bridge id)g_metal.pipeline_attn_wsum_b; + + // Helper: encode sgemm_q4 dispatch + #define ENCODE_SGEMM_Q4(enc, w_buf, x_buf, y_buf, in_d, out_d, bias_buf, n_tok) do { \ + [enc setComputePipelineState:pso_sgemm_q4]; \ + [enc setBuffer:w_buf offset:0 atIndex:0]; \ + [enc setBuffer:x_buf offset:0 atIndex:1]; \ + [enc setBuffer:y_buf offset:0 atIndex:2]; \ + uint32_t _id = (in_d), _od = (out_d), _ub = ((bias_buf) 
!= nil) ? 1 : 0, _nt = (n_tok); \ + [enc setBytes:&_id length:4 atIndex:3]; \ + [enc setBytes:&_od length:4 atIndex:4]; \ + if (bias_buf) [enc setBuffer:bias_buf offset:0 atIndex:5]; \ + else [enc setBuffer:y_buf offset:0 atIndex:5]; \ + [enc setBytes:&_ub length:4 atIndex:6]; \ + [enc setBytes:&_nt length:4 atIndex:7]; \ + uint32_t _tg_x = (_od + 7) / 8; \ + [enc dispatchThreadgroups:MTLSizeMake(_tg_x, _nt, 1) threadsPerThreadgroup:MTLSizeMake(64, 1, 1)]; \ + } while(0) + + // Single command buffer for entire prefill + id cmd = [queue commandBuffer]; + id enc = [cmd computeCommandEncoder]; + + // 1. Batched embedding: load all N token embeddings + [enc setComputePipelineState:pso_embed_b]; + [enc setBuffer:(__bridge id)m->gpu_embed offset:0 atIndex:0]; + [enc setBuffer:gpu_xs offset:0 atIndex:1]; + [enc setBuffer:gpu_token_ids offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + { NSUInteger tpg_x = MIN((NSUInteger)D, pso_embed_b.maxTotalThreadsPerThreadgroup); + [enc dispatchThreads:MTLSizeMake(D, N, 1) threadsPerThreadgroup:MTLSizeMake(tpg_x, 1, 1)]; } + + for (int l = 0; l < QWEN_LAYERS; l++) { + // 2. Batched RMSNorm (attention): N threadgroups, one per token + [enc setComputePipelineState:pso_rms_b]; + [enc setBuffer:gpu_xs offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_rms_att[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_xbs offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc setBytes:&rms_eps length:4 atIndex:4]; + [enc setBytes:&uN length:4 atIndex:5]; + { NSUInteger p = 1; while (p < (NSUInteger)D) p <<= 1; if (p > 1024) p = 1024; + [enc dispatchThreadgroups:MTLSizeMake(N, 1, 1) threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + // 3. 
Batched QKV projections with fused bias (3 sgemm_q4 dispatches) + ENCODE_SGEMM_Q4(enc, (__bridge id)m->gpu_wq[l], gpu_xbs, gpu_qs, + uD, uQD, (__bridge id)m->gpu_q_bias[l], uN); + ENCODE_SGEMM_Q4(enc, (__bridge id)m->gpu_wk[l], gpu_xbs, gpu_ks, + uD, uKVD, (__bridge id)m->gpu_k_bias[l], uN); + ENCODE_SGEMM_Q4(enc, (__bridge id)m->gpu_wv[l], gpu_xbs, gpu_vs, + uD, uKVD, (__bridge id)m->gpu_v_bias[l], uN); + + // 4. Batched RoPE: apply to all N tokens' Q and K + uint32_t base_pos = (uint32_t)m->pos; + uint32_t q_stride_val = QWEN_Q_DIM; + uint32_t k_stride_val = QWEN_KV_DIM; + [enc setComputePipelineState:pso_rope_b]; + [enc setBuffer:gpu_qs offset:0 atIndex:0]; + [enc setBuffer:gpu_ks offset:0 atIndex:1]; + [enc setBuffer:gpu_rope_cos offset:0 atIndex:2]; + [enc setBuffer:gpu_rope_sin offset:0 atIndex:3]; + [enc setBytes:&un_q length:4 atIndex:4]; + [enc setBytes:&un_kv length:4 atIndex:5]; + [enc setBytes:&uhd length:4 atIndex:6]; + [enc setBytes:&base_pos length:4 atIndex:7]; + [enc setBytes:&q_stride_val length:4 atIndex:8]; + [enc setBytes:&k_stride_val length:4 atIndex:9]; + { uint32_t total_pairs = (QWEN_HEADS + QWEN_KV_HEADS) * (QWEN_HEAD_DIM / 2); + NSUInteger tpg = MIN((NSUInteger)total_pairs, pso_rope_b.maxTotalThreadsPerThreadgroup); + [enc dispatchThreads:MTLSizeMake(total_pairs, N, 1) + threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)]; } + + // 5. 
Store K, V into cache for all N tokens (copy from batched buffers) + for (int t = 0; t < N; t++) { + int pos = m->pos + t; + [enc setComputePipelineState:pso_copy]; + [enc setBuffer:gpu_ks offset:(NSUInteger)t * QWEN_KV_DIM * 4 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_kv_cache_k[l] offset:(NSUInteger)pos * QWEN_KV_DIM * 4 atIndex:1]; + [enc setBytes:&uKVD length:4 atIndex:2]; + [enc dispatchThreads:MTLSizeMake(QWEN_KV_DIM, 1, 1) + threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)QWEN_KV_DIM, pso_copy.maxTotalThreadsPerThreadgroup), 1, 1)]; + + [enc setBuffer:gpu_vs offset:(NSUInteger)t * QWEN_KV_DIM * 4 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_kv_cache_v[l] offset:(NSUInteger)pos * QWEN_KV_DIM * 4 atIndex:1]; + [enc setBytes:&uKVD length:4 atIndex:2]; + [enc dispatchThreads:MTLSizeMake(QWEN_KV_DIM, 1, 1) + threadsPerThreadgroup:MTLSizeMake(MIN((NSUInteger)QWEN_KV_DIM, pso_copy.maxTotalThreadsPerThreadgroup), 1, 1)]; + } + + // 6. Per-token causal attention on GPU (each token sees only preceding tokens) + for (int t = 0; t < N; t++) { + uint32_t seq_len = (uint32_t)(m->pos + t + 1); + + // Attn score: all heads + [enc setComputePipelineState:pso_score_b]; + [enc setBuffer:gpu_qs offset:(NSUInteger)t * QWEN_Q_DIM * 4 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_kv_cache_k[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_att offset:0 atIndex:2]; + [enc setBytes:&uhd length:4 atIndex:3]; + [enc setBytes:&uKVD length:4 atIndex:4]; + [enc setBytes:&un_q length:4 atIndex:5]; + [enc setBytes:&u_gqa length:4 atIndex:6]; + [enc setBytes:&attn_scale length:4 atIndex:7]; + [enc setBytes:&seq_len length:4 atIndex:8]; + [enc setBytes:&u_max_seq length:4 atIndex:9]; + { NSUInteger tpg_x = MIN((NSUInteger)seq_len, (NSUInteger)256); + NSUInteger tpg_y = MIN((NSUInteger)QWEN_HEADS, pso_score_b.maxTotalThreadsPerThreadgroup / tpg_x); + if (tpg_y < 1) tpg_y = 1; + [enc dispatchThreads:MTLSizeMake(seq_len, QWEN_HEADS, 1) + threadsPerThreadgroup:MTLSizeMake(tpg_x, 
tpg_y, 1)]; } + + // Softmax: one threadgroup per head + [enc setComputePipelineState:pso_soft_b]; + [enc setBuffer:gpu_att offset:0 atIndex:0]; + [enc setBytes:&seq_len length:4 atIndex:1]; + [enc setBytes:&u_max_seq length:4 atIndex:2]; + [enc setBytes:&un_q length:4 atIndex:3]; + { NSUInteger p = 1; while (p < (NSUInteger)seq_len && p < 1024) p <<= 1; + [enc dispatchThreadgroups:MTLSizeMake(QWEN_HEADS, 1, 1) + threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + // Weighted sum: all heads + [enc setComputePipelineState:pso_wsum_b]; + [enc setBuffer:gpu_att offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_kv_cache_v[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_attn_outs offset:(NSUInteger)t * QWEN_Q_DIM * 4 atIndex:2]; + [enc setBytes:&uhd length:4 atIndex:3]; + [enc setBytes:&uKVD length:4 atIndex:4]; + [enc setBytes:&un_q length:4 atIndex:5]; + [enc setBytes:&u_gqa length:4 atIndex:6]; + [enc setBytes:&seq_len length:4 atIndex:7]; + [enc setBytes:&u_max_seq length:4 atIndex:8]; + { NSUInteger tpg_x = MIN((NSUInteger)QWEN_HEAD_DIM, (NSUInteger)64); + NSUInteger tpg_y = MIN((NSUInteger)QWEN_HEADS, pso_wsum_b.maxTotalThreadsPerThreadgroup / tpg_x); + if (tpg_y < 1) tpg_y = 1; + [enc dispatchThreads:MTLSizeMake(QWEN_HEAD_DIM, QWEN_HEADS, 1) + threadsPerThreadgroup:MTLSizeMake(tpg_x, tpg_y, 1)]; } + } + + // 7. Batched O projection + ENCODE_SGEMM_Q4(enc, (__bridge id)m->gpu_wo[l], gpu_attn_outs, gpu_o_outs, + uQD, uD, nil, uN); + + // 8. Batched residual: xs += o_outs + uint32_t total_add = uN * uD; + [enc setComputePipelineState:pso_add_b]; + [enc setBuffer:gpu_xs offset:0 atIndex:0]; + [enc setBuffer:gpu_o_outs offset:0 atIndex:1]; + [enc setBuffer:gpu_xs offset:0 atIndex:2]; + [enc setBytes:&total_add length:4 atIndex:3]; + { NSUInteger tpg = MIN((NSUInteger)total_add, pso_add_b.maxTotalThreadsPerThreadgroup); + [enc dispatchThreads:MTLSizeMake(total_add, 1, 1) threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)]; } + + // 9. 
Batched RMSNorm (FFN) + [enc setComputePipelineState:pso_rms_b]; + [enc setBuffer:gpu_xs offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_rms_ffn[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_xbs offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc setBytes:&rms_eps length:4 atIndex:4]; + [enc setBytes:&uN length:4 atIndex:5]; + { NSUInteger p = 1; while (p < (NSUInteger)D) p <<= 1; if (p > 1024) p = 1024; + [enc dispatchThreadgroups:MTLSizeMake(N, 1, 1) threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + // 10. Batched fused Gate+Up+SiLU + [enc setComputePipelineState:pso_sgemm_ffn]; + [enc setBuffer:(__bridge id)m->gpu_wgate[l] offset:0 atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_wup[l] offset:0 atIndex:1]; + [enc setBuffer:gpu_xbs offset:0 atIndex:2]; + [enc setBuffer:gpu_hbs offset:0 atIndex:3]; + [enc setBytes:&uD length:4 atIndex:4]; + [enc setBytes:&uHD length:4 atIndex:5]; + [enc setBytes:&uN length:4 atIndex:6]; + { uint32_t ffn_tg_x = (uHD + 3) / 4; + [enc dispatchThreadgroups:MTLSizeMake(ffn_tg_x, N, 1) + threadsPerThreadgroup:MTLSizeMake(64, 1, 1)]; } + + // 11. Batched down projection + ENCODE_SGEMM_Q4(enc, (__bridge id)m->gpu_wdown[l], gpu_hbs, gpu_ffn_outs, + uHD, uD, nil, uN); + + // 12. 
Batched FFN residual: xs += ffn_outs + [enc setComputePipelineState:pso_add_b]; + [enc setBuffer:gpu_xs offset:0 atIndex:0]; + [enc setBuffer:gpu_ffn_outs offset:0 atIndex:1]; + [enc setBuffer:gpu_xs offset:0 atIndex:2]; + [enc setBytes:&total_add length:4 atIndex:3]; + { NSUInteger tpg = MIN((NSUInteger)total_add, pso_add_b.maxTotalThreadsPerThreadgroup); + [enc dispatchThreads:MTLSizeMake(total_add, 1, 1) threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)]; } + } + + // Final: RMSNorm + LM head + argmax on LAST token only + NSUInteger last_off = (NSUInteger)(N - 1) * D * 4; + + [enc setComputePipelineState:pso_rms]; + [enc setBuffer:gpu_xs offset:last_off atIndex:0]; + [enc setBuffer:(__bridge id)m->gpu_rms_final offset:0 atIndex:1]; + [enc setBuffer:gpu_xb_last offset:0 atIndex:2]; + [enc setBytes:&uD length:4 atIndex:3]; + [enc setBytes:&rms_eps length:4 atIndex:4]; + { NSUInteger p = 1; while (p < (NSUInteger)D) p <<= 1; if (p > 1024) p = 1024; + [enc dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(p, 1, 1)]; } + + uint32_t uVocab = QWEN_VOCAB; + gpu_encode_sgemv_f32(enc, (__bridge id)m->gpu_embed, gpu_xb_last, gpu_logits, uD, uVocab); + + [enc setComputePipelineState:pso_argmax]; + [enc setBuffer:gpu_logits offset:0 atIndex:0]; + [enc setBuffer:gpu_result offset:0 atIndex:1]; + [enc setBytes:&uVocab length:4 atIndex:2]; + { NSUInteger tpg = MIN((NSUInteger)1024, pso_argmax.maxTotalThreadsPerThreadgroup); + [enc dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(tpg, 1, 1)]; } + + [enc endEncoding]; + [cmd commit]; + [cmd waitUntilCompleted]; + + #undef ENCODE_SGEMM_Q4 + + m->pos += N; + int *result_ptr = (int*)[gpu_result contents]; + return result_ptr[0]; +} + +#endif // __OBJC__ + +// ── Compile all ANE kernels ────────────────────────────────────────── + +static void qwen_compile_kernels(QwenModel *m) { +#if USE_ANE_PROJECTIONS + int D = QWEN_DIM, HD = QWEN_HIDDEN; + printf("Compiling %d ANE kernels...\n", 
QWEN_LAYERS * 7 + 1); + for (int l = 0; l < QWEN_LAYERS; l++) { + m->k_q[l] = compile_conv_kernel(m->wq[l], D, QWEN_Q_DIM, 1); + m->k_k[l] = compile_conv_kernel(m->wk[l], D, QWEN_KV_DIM, 1); + m->k_v[l] = compile_conv_kernel(m->wv[l], D, QWEN_KV_DIM, 1); + m->k_o[l] = compile_conv_kernel(m->wo[l], QWEN_Q_DIM, D, 1); + m->k_gate[l] = compile_conv_kernel(m->w_gate[l], D, HD, 1); + m->k_up[l] = compile_conv_kernel(m->w_up[l], D, HD, 1); + m->k_down[l] = compile_conv_kernel(m->w_down[l], HD, D, 1); + printf(" Layer %d/%d compiled\r", l+1, QWEN_LAYERS); + fflush(stdout); + } + for (int c = 0; c < QWEN_LM_CHUNKS; c++) { + float *chunk_weights = m->embed + c * QWEN_LM_CHUNK_SIZE * D; + m->k_lmhead[c] = compile_conv_kernel(chunk_weights, D, QWEN_LM_CHUNK_SIZE, 1); + if (!m->k_lmhead[c]) { + printf(" LM head chunk %d FAILED to compile\n", c); + } + } + printf("\nAll kernels compiled.\n"); +#else + printf("CPU-only mode (ANE kernel compilation skipped).\n"); + (void)m; +#endif +} + +// Fused ANE compilation: QKV fused + Gate/Up fused + separate O, Down +// Total: 24*(1 QKV + 1 O + 1 FFN_up + 1 Down) = 96 kernels + 16 LM head = 112 (< 119) +static void qwen_compile_kernels_fused(QwenModel *m) { + int D = QWEN_DIM, HD = QWEN_HIDDEN; + int total = QWEN_LAYERS * 4 + QWEN_LM_CHUNKS; + int compiled = 0, failed = 0; + printf("Compiling %d fused ANE kernels (QKV+FFN_up fused)...\n", total); + + for (int l = 0; l < QWEN_LAYERS; l++) { + m->k_qkv[l] = compile_qkv_gqa_kernel( + m->wq[l], m->wk[l], m->wv[l], + D, QWEN_Q_DIM, QWEN_KV_DIM); + if (m->k_qkv[l]) compiled++; else { failed++; printf(" Layer %d QKV FAILED\n", l); } + + m->k_o[l] = compile_conv_kernel_fp16io(m->wo[l], QWEN_Q_DIM, D, 1); + if (m->k_o[l]) compiled++; else { failed++; printf(" Layer %d O FAILED\n", l); } + + m->k_ffn_up[l] = compile_ffn_up_kernel(m->w_gate[l], m->w_up[l], D, HD); + if (m->k_ffn_up[l]) compiled++; else { failed++; printf(" Layer %d FFN_up FAILED\n", l); } + + m->k_down[l] = 
compile_conv_kernel_fp16io(m->w_down[l], HD, D, 1); + if (m->k_down[l]) compiled++; else { failed++; printf(" Layer %d Down FAILED\n", l); } + + printf(" Layer %d/%d compiled (%d/%d ok)\r", l+1, QWEN_LAYERS, compiled, compiled+failed); + fflush(stdout); + } + + for (int c = 0; c < QWEN_LM_CHUNKS; c++) { + float *chunk_w = m->embed + c * QWEN_LM_CHUNK_SIZE * D; + m->k_lmhead[c] = compile_conv_kernel_fp16io(chunk_w, D, QWEN_LM_CHUNK_SIZE, 1); + if (m->k_lmhead[c]) compiled++; else { failed++; printf(" LM head chunk %d FAILED\n", c); } + } + + printf("\nFused ANE: %d/%d compiled, %d failed\n", compiled, total, failed); + if (failed > 0) + printf("WARNING: some kernels failed — ANE inference will fall back to CPU for those projections\n"); +} + +// ── Allocate buffers ───────────────────────────────────────────────── + +static void qwen_alloc(QwenModel *m) { + m->x = (float*)qwen_calloc(QWEN_DIM, sizeof(float), "x"); + m->xb = (float*)qwen_calloc(QWEN_DIM, sizeof(float), "xb"); + m->q = (float*)qwen_calloc(QWEN_Q_DIM, sizeof(float), "q"); + m->k = (float*)qwen_calloc(QWEN_KV_DIM, sizeof(float), "k"); + m->v = (float*)qwen_calloc(QWEN_KV_DIM, sizeof(float), "v"); + m->att = (float*)qwen_calloc(QWEN_HEADS * QWEN_MAX_SEQ, sizeof(float), "att"); + m->hb = (float*)qwen_calloc(QWEN_HIDDEN, sizeof(float), "hb"); + m->hb2 = (float*)qwen_calloc(QWEN_HIDDEN, sizeof(float), "hb2"); + m->logits = (float*)qwen_calloc(QWEN_VOCAB, sizeof(float), "logits"); + for (int l = 0; l < QWEN_LAYERS; l++) { + m->kv_cache_k[l] = (float*)qwen_calloc(QWEN_MAX_SEQ * QWEN_KV_DIM, sizeof(float), "kv_cache_k"); + m->kv_cache_v[l] = (float*)qwen_calloc(QWEN_MAX_SEQ * QWEN_KV_DIM, sizeof(float), "kv_cache_v"); + } + m->pos = 0; +} + +static void qwen_reset(QwenModel *m) { + for (int l = 0; l < QWEN_LAYERS; l++) { + memset(m->kv_cache_k[l], 0, QWEN_MAX_SEQ * QWEN_KV_DIM * sizeof(float)); + memset(m->kv_cache_v[l], 0, QWEN_MAX_SEQ * QWEN_KV_DIM * sizeof(float)); + } + m->pos = 0; +} diff --git 
a/inference/run.py b/inference/run.py new file mode 100644 index 0000000..a822d49 --- /dev/null +++ b/inference/run.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +"""Run Qwen2.5-0.5B on ANE with proper tokenization. + +Auto-connects to a running socket server for instant responses (~0ms startup). +Falls back to subprocess mode if no server is running (~6s startup per call). + +Usage: + python3 run.py "Your prompt here" [--max-tokens 50] + +Server mode (start server first in another terminal): + ./qwen_ane qwen05b.bin --server /tmp/qwen_ane.sock + python3 run.py "Your prompt here" +""" +import argparse +import json +import os +import socket +import subprocess +import sys +import time +from pathlib import Path + +INFERENCE_DIR = Path(__file__).parent +WEIGHTS_PATH = INFERENCE_DIR / "qwen05b.bin" +MODEL_DIR = Path.home() / "models" / "Qwen2.5-0.5B-Instruct" +DEFAULT_SOCK = "/tmp/qwen_ane.sock" + + +def query_socket(token_ids: list[int], max_tokens: int, sock_path: str = DEFAULT_SOCK) -> dict | None: + """Send a request to the socket server. 
Returns parsed JSON or None on failure.""" + try: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.settimeout(120) + s.connect(sock_path) + req = json.dumps({"tokens": token_ids, "max_tokens": max_tokens}) + "\n" + s.sendall(req.encode()) + + data = b"" + while True: + chunk = s.recv(131072) + if not chunk: + break + data += chunk + if b"\n" in data: + break + s.close() + return json.loads(data.decode().strip()) + except (ConnectionRefusedError, FileNotFoundError, OSError): + return None + + +def query_subprocess(token_ids: list[int], max_tokens: int) -> dict | None: + """Fall back to spawning the binary as a subprocess.""" + binary = str(INFERENCE_DIR / "qwen_ane") + if not os.path.exists(binary): + print(f"Binary not found: {binary}", file=sys.stderr) + return None + + result = subprocess.run( + [binary, str(WEIGHTS_PATH), + " ".join(str(t) for t in token_ids), + str(max_tokens)], + capture_output=True, text=True, timeout=120, + ) + print(result.stdout) + if result.stderr: + print(result.stderr[:500], file=sys.stderr) + + output_ids = [] + for line in result.stdout.split("\n"): + if line.startswith("OUT:"): + ids = [int(x) for x in line[4:].split() if x.lstrip("-").isdigit()] + output_ids.extend(ids) + + return {"output": output_ids} if output_ids else None + + +def main(): + parser = argparse.ArgumentParser(description="Qwen2.5-0.5B ANE inference") + parser.add_argument("prompt", type=str) + parser.add_argument("--max-tokens", type=int, default=50) + parser.add_argument("--no-server", action="store_true", + help="Force subprocess mode even if server is running") + parser.add_argument("--sock", type=str, default=DEFAULT_SOCK, + help="Socket path for server mode") + args = parser.parse_args() + + from transformers import AutoTokenizer + + print("Loading tokenizer...") + tok = AutoTokenizer.from_pretrained(str(MODEL_DIR), trust_remote_code=True) + + messages = [ + {"role": "system", "content": "You are a helpful assistant. 
Be concise."}, + {"role": "user", "content": args.prompt}, + ] + text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + input_ids = tok.encode(text) + print(f"Prompt: {len(input_ids)} tokens") + + # Try socket server first (instant response) + result = None + if not args.no_server and os.path.exists(args.sock): + print(f"Connecting to server at {args.sock}...") + t0 = time.time() + result = query_socket(input_ids, args.max_tokens, args.sock) + elapsed = time.time() - t0 + if result: + print(f"Server responded in {elapsed:.3f}s") + else: + print("Server not responding, falling back to subprocess...") + + # Fall back to subprocess + if result is None: + print("Running inference (subprocess mode, ~6s startup)...") + result = query_subprocess(input_ids, args.max_tokens) + + if not result or "output" not in result: + print("(No output received)", file=sys.stderr) + return + + output_ids = result["output"] + if output_ids: + decoded = tok.decode(output_ids, skip_special_tokens=True) + print(f"\n=== Response ===\n{decoded}") + + if "prefill_tps" in result: + print(f"\nPrefill: {result['prefill_tps']:.1f} t/s | " + f"Decode: {result['decode_tps']:.1f} t/s") + + +if __name__ == "__main__": + main() diff --git a/inference/setup.sh b/inference/setup.sh new file mode 100755 index 0000000..cd4e127 --- /dev/null +++ b/inference/setup.sh @@ -0,0 +1,161 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct" +MODEL_DIR="$HOME/models/Qwen2.5-0.5B-Instruct" +WEIGHTS_BIN="$SCRIPT_DIR/qwen05b.bin" +BINARY="$SCRIPT_DIR/qwen_ane" +VENV_DIR="$SCRIPT_DIR/.venv" +EXPECTED_WEIGHT_SIZE_F32=1976131100 +EXPECTED_WEIGHT_SIZE_F16=988082236 + +info() { printf "\033[1;34m==> %s\033[0m\n" "$1"; } +ok() { printf "\033[1;32m ✓ %s\033[0m\n" "$1"; } +warn() { printf "\033[1;33m ! 
%s\033[0m\n" "$1"; } +fail() { printf "\033[1;31m ✗ %s\033[0m\n" "$1"; exit 1; } + +info "ANE Inference Setup" +echo "Model: $MODEL_ID" +echo "Target: $SCRIPT_DIR" +echo "" + +# --- Step 1: Prerequisites --- +info "Checking prerequisites..." + +if ! command -v xcrun &>/dev/null; then + fail "Xcode Command Line Tools not found. Install with: xcode-select --install" +fi +ok "xcrun clang available" + +if ! command -v python3 &>/dev/null; then + fail "Python 3 not found" +fi + +PY_VER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +PY_MAJOR=$(echo "$PY_VER" | cut -d. -f1) +PY_MINOR=$(echo "$PY_VER" | cut -d. -f2) +if [ "$PY_MAJOR" -lt 3 ] || ([ "$PY_MAJOR" -eq 3 ] && [ "$PY_MINOR" -lt 11 ]); then + fail "Python 3.11+ required (found $PY_VER). coremltools needs 3.11-3.13." +fi +ok "Python $PY_VER" + +# --- Step 2: Virtual environment --- +info "Setting up Python environment..." + +if [ ! -d "$VENV_DIR" ]; then + python3 -m venv "$VENV_DIR" + ok "Created venv at $VENV_DIR" +else + ok "Venv already exists" +fi + +source "$VENV_DIR/bin/activate" + +pip install --quiet --upgrade pip +pip install --quiet safetensors torch transformers huggingface-hub +ok "Python dependencies installed" + +# --- Step 3: Download model --- +info "Downloading model from HuggingFace..." + +if [ -f "$MODEL_DIR/model.safetensors" ] && [ -f "$MODEL_DIR/tokenizer.json" ]; then + ok "Model already downloaded at $MODEL_DIR" +else + mkdir -p "$MODEL_DIR" + if command -v huggingface-cli &>/dev/null; then + huggingface-cli download "$MODEL_ID" --local-dir "$MODEL_DIR" + else + python3 -c " +from huggingface_hub import snapshot_download +snapshot_download('$MODEL_ID', local_dir='$MODEL_DIR') +" + fi + ok "Model downloaded to $MODEL_DIR" +fi + +# Verify key files exist +for f in model.safetensors tokenizer.json vocab.json merges.txt config.json; do + if [ ! 
-f "$MODEL_DIR/$f" ]; then + fail "Missing $f in $MODEL_DIR" + fi +done +ok "All model files present" + +# --- Step 4: Convert weights --- +info "Converting weights to binary format..." + +if [ -f "$WEIGHTS_BIN" ]; then + ACTUAL_SIZE=$(stat -f%z "$WEIGHTS_BIN" 2>/dev/null || stat -c%s "$WEIGHTS_BIN" 2>/dev/null) + if [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F16" ] || [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F32" ]; then + ok "Weights already converted ($((ACTUAL_SIZE / 1024 / 1024)) MB)" + else + warn "Weight file exists but unexpected size ($ACTUAL_SIZE), reconverting as F16" + python3 "$SCRIPT_DIR/convert_weights.py" "$MODEL_DIR" "$WEIGHTS_BIN" --f16 + ok "Weights converted (F16)" + fi +else + python3 "$SCRIPT_DIR/convert_weights.py" "$MODEL_DIR" "$WEIGHTS_BIN" --f16 + ok "Weights converted (F16)" +fi + +# --- Step 5: Build binary --- +info "Building qwen_ane binary..." + +NEEDS_BUILD=0 +if [ ! -f "$BINARY" ]; then + NEEDS_BUILD=1 +elif [ "$SCRIPT_DIR/main.m" -nt "$BINARY" ] || \ + [ "$SCRIPT_DIR/qwen_ane_infer.h" -nt "$BINARY" ] || \ + [ "$SCRIPT_DIR/tokenizer.h" -nt "$BINARY" ] 2>/dev/null || \ + [ "$SCRIPT_DIR/http_server.h" -nt "$BINARY" ] 2>/dev/null; then + NEEDS_BUILD=1 + warn "Source files newer than binary, rebuilding" +fi + +if [ "$NEEDS_BUILD" -eq 1 ]; then + xcrun clang -O3 -ffast-math -mcpu=apple-m4 -flto \ + -framework Foundation -framework IOSurface \ + -framework CoreML -framework Accelerate -framework Metal \ + -ldl -lobjc -fobjc-arc \ + -o "$BINARY" "$SCRIPT_DIR/main.m" + ok "Binary built: $BINARY" +else + ok "Binary up to date" +fi + +# --- Step 6: Smoke test --- +info "Running smoke test..." + +# Quick single-shot test with known token IDs for "system\nYou are a helpful assistant." 
+TEST_OUTPUT=$("$BINARY" "$WEIGHTS_BIN" "151644 8948 198" 3 2>&1 || true)
+
+if echo "$TEST_OUTPUT" | grep -q "OUT:"; then
+    ok "Smoke test passed (model generates output)"
+else
+    warn "Smoke test: no output tokens detected (this may be OK on first run)"
+    echo " Output was: $(echo "$TEST_OUTPUT" | tail -3)"
+fi
+
+# --- Done ---
+echo ""
+info "Setup complete!"
+echo ""
+echo " Binary: $BINARY"
+echo " Weights: $WEIGHTS_BIN ($(du -h "$WEIGHTS_BIN" | cut -f1) )"
+echo " Model: $MODEL_DIR"
+echo ""
+echo "Quick start:"
+echo " # Single prompt (slow, compiles every time)"
+echo " python3 $SCRIPT_DIR/run.py \"What is 2+2?\""
+echo ""
+echo " # Server mode (fast, compile once)"
+echo " $BINARY $WEIGHTS_BIN --server /tmp/qwen_ane.sock &"
+echo " python3 $SCRIPT_DIR/run.py \"What is 2+2?\""
+echo ""
+echo " # HTTP API (fast, no Python needed for queries)"
+echo " $BINARY $WEIGHTS_BIN --http 8000 --model-dir $MODEL_DIR"
+echo " curl http://localhost:8000/v1/completions -d '{\"prompt\":\"Hi\",\"max_tokens\":20}'"
+echo ""
+echo " # Run throughput benchmark"
+echo " $SCRIPT_DIR/benchmark.sh" diff --git a/inference/tokenizer.h b/inference/tokenizer.h new file mode 100644 index 0000000..1633fc2 --- /dev/null +++ b/inference/tokenizer.h @@ -0,0 +1,657 @@ +// tokenizer.h -- Byte-level BPE tokenizer for Qwen2.5 in pure C
+// Loads vocab.json + merges.txt from HuggingFace model directory.
+// Implements GPT-style byte-level BPE (same algorithm as tiktoken/llama.cpp).
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#define TOK_MAX_VOCAB 152000
+#define TOK_MAX_MERGES 152000
+#define TOK_MAX_TOKEN_LEN 256
+#define TOK_HASH_SIZE (1 << 20) // ~1M buckets
+
+// Special token IDs for Qwen2.5
+#define TOK_IM_START 151644
+#define TOK_IM_END 151645
+#define TOK_ENDOFTEXT 151643
+
+// --- Byte-to-unicode mapping (GPT-2 standard) ---
+// Maps byte values 0-255 to unicode codepoints used in the BPE vocab.
+// Printable ASCII stays the same; non-printable bytes map to U+0100..U+0143. + +static int g_byte_to_unicode[256]; +static int g_unicode_to_byte[65536]; + +static void tok_init_byte_mapping(void) { + int n = 0; + for (int b = 0; b < 256; b++) { + if ((b >= 0x21 && b <= 0x7E) || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF)) { + g_byte_to_unicode[b] = b; + } else { + g_byte_to_unicode[b] = 256 + n; + n++; + } + } + memset(g_unicode_to_byte, 0xFF, sizeof(g_unicode_to_byte)); + for (int b = 0; b < 256; b++) + g_unicode_to_byte[g_byte_to_unicode[b]] = b; +} + +// --- UTF-8 helpers --- + +static int utf8_encode(int codepoint, char *out) { + if (codepoint < 0x80) { + out[0] = (char)codepoint; + return 1; + } else if (codepoint < 0x800) { + out[0] = (char)(0xC0 | (codepoint >> 6)); + out[1] = (char)(0x80 | (codepoint & 0x3F)); + return 2; + } else if (codepoint < 0x10000) { + out[0] = (char)(0xE0 | (codepoint >> 12)); + out[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + out[2] = (char)(0x80 | (codepoint & 0x3F)); + return 3; + } + out[0] = (char)(0xF0 | (codepoint >> 18)); + out[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F)); + out[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + out[3] = (char)(0x80 | (codepoint & 0x3F)); + return 4; +} + +static int utf8_decode(const char *s, int *codepoint) { + unsigned char c = (unsigned char)s[0]; + if (c < 0x80) { *codepoint = c; return 1; } + if ((c & 0xE0) == 0xC0) { + *codepoint = ((c & 0x1F) << 6) | (s[1] & 0x3F); + return 2; + } + if ((c & 0xF0) == 0xE0) { + *codepoint = ((c & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F); + return 3; + } + *codepoint = ((c & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F); + return 4; +} + +// --- Hash map: string -> int --- + +typedef struct { + char *key; + int value; +} TokHashEntry; + +typedef struct { + TokHashEntry *entries; + int capacity; +} TokHashMap; + +static unsigned int tok_hash(const char *s) { + unsigned int h = 5381; + while (*s) h 
= ((h << 5) + h) ^ (unsigned char)*s++; + return h; +} + +static void tok_hashmap_init(TokHashMap *m, int capacity) { + m->capacity = capacity; + m->entries = (TokHashEntry*)calloc(capacity, sizeof(TokHashEntry)); +} + +static void tok_hashmap_set(TokHashMap *m, const char *key, int value) { + unsigned int idx = tok_hash(key) % m->capacity; + while (m->entries[idx].key) { + if (strcmp(m->entries[idx].key, key) == 0) { + m->entries[idx].value = value; + return; + } + idx = (idx + 1) % m->capacity; + } + m->entries[idx].key = strdup(key); + m->entries[idx].value = value; +} + +static int tok_hashmap_get(TokHashMap *m, const char *key, int default_val) { + unsigned int idx = tok_hash(key) % m->capacity; + while (m->entries[idx].key) { + if (strcmp(m->entries[idx].key, key) == 0) + return m->entries[idx].value; + idx = (idx + 1) % m->capacity; + } + return default_val; +} + +static void tok_hashmap_free(TokHashMap *m) { + for (int i = 0; i < m->capacity; i++) + if (m->entries[i].key) free(m->entries[i].key); + free(m->entries); + m->entries = NULL; + m->capacity = 0; +} + +// --- Merge pair --- + +typedef struct { + char *a; + char *b; +} TokMerge; + +// --- Tokenizer state --- + +typedef struct { + TokHashMap vocab; // token string -> id + char **id_to_token; // id -> token string (for decoding) + int vocab_size; + TokMerge *merges; + int n_merges; + TokHashMap merge_rank; // "a b" -> rank (lower = higher priority) + + // Special tokens + int im_start; + int im_end; + int eos; +} Tokenizer; + +// --- JSON string parsing (minimal, handles unicode escapes) --- + +static int tok_parse_json_string(const char *s, char *out, int max_out) { + if (*s != '"') return -1; + s++; + int n = 0; + while (*s && *s != '"' && n < max_out - 1) { + if (*s == '\\') { + s++; + switch (*s) { + case '"': out[n++] = '"'; break; + case '\\': out[n++] = '\\'; break; + case '/': out[n++] = '/'; break; + case 'n': out[n++] = '\n'; break; + case 'r': out[n++] = '\r'; break; + case 't': out[n++] = 
'\t'; break; + case 'u': { + char hex[5] = {s[1], s[2], s[3], s[4], 0}; + int cp = (int)strtol(hex, NULL, 16); + n += utf8_encode(cp, out + n); + s += 4; + break; + } + default: out[n++] = *s; + } + } else { + out[n++] = *s; + } + s++; + } + out[n] = '\0'; + return n; +} + +// --- Load vocab.json --- +// Format: {"token_string": id, ...} + +static int tok_load_vocab(Tokenizer *t, const char *path) { + FILE *f = fopen(path, "r"); + if (!f) { fprintf(stderr, "Cannot open vocab: %s\n", path); return -1; } + + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); + char *data = (char*)malloc(fsize + 1); + fread(data, 1, fsize, f); + data[fsize] = '\0'; + fclose(f); + + tok_hashmap_init(&t->vocab, TOK_HASH_SIZE); + t->id_to_token = (char**)calloc(TOK_MAX_VOCAB, sizeof(char*)); + t->vocab_size = 0; + + char *p = data; + // Skip opening { + while (*p && *p != '{') p++; + if (*p) p++; + + char key_buf[TOK_MAX_TOKEN_LEN]; + while (*p) { + while (*p && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t' || *p == ',')) p++; + if (*p == '}' || !*p) break; + + int klen = tok_parse_json_string(p, key_buf, sizeof(key_buf)); + if (klen < 0) break; + + // Skip past closing quote + p++; // opening " + while (*p) { + if (*p == '\\') { p += 2; continue; } + if (*p == '"') { p++; break; } + p++; + } + + // Skip colon and whitespace + while (*p && (*p == ' ' || *p == ':')) p++; + + int id = (int)strtol(p, &p, 10); + + if (id >= 0 && id < TOK_MAX_VOCAB) { + tok_hashmap_set(&t->vocab, key_buf, id); + t->id_to_token[id] = strdup(key_buf); + if (id >= t->vocab_size) t->vocab_size = id + 1; + } + } + + free(data); + printf(" Vocab: %d tokens\n", t->vocab_size); + return 0; +} + +// --- Load merges.txt --- +// Format: one merge per line, "tokenA tokenB" (space-separated) +// First line may be a header starting with # + +static int tok_load_merges(Tokenizer *t, const char *path) { + FILE *f = fopen(path, "r"); + if (!f) { fprintf(stderr, "Cannot open merges: %s\n", path); 
return -1; } + + t->merges = (TokMerge*)malloc(TOK_MAX_MERGES * sizeof(TokMerge)); + tok_hashmap_init(&t->merge_rank, TOK_HASH_SIZE); + t->n_merges = 0; + + char line[4096]; + while (fgets(line, sizeof(line), f)) { + // Strip newline + int len = (int)strlen(line); + while (len > 0 && (line[len-1] == '\n' || line[len-1] == '\r')) line[--len] = '\0'; + if (len == 0) continue; + if (line[0] == '#') continue; // skip header + + // Split on first space + char *space = strchr(line, ' '); + if (!space) continue; + *space = '\0'; + + t->merges[t->n_merges].a = strdup(line); + t->merges[t->n_merges].b = strdup(space + 1); + + // Store merge rank: "a b" -> rank + *space = ' '; // restore + tok_hashmap_set(&t->merge_rank, line, t->n_merges); + + t->n_merges++; + if (t->n_merges >= TOK_MAX_MERGES) break; + } + + fclose(f); + printf(" Merges: %d rules\n", t->n_merges); + return 0; +} + +// --- Add special tokens --- + +static void tok_add_special_tokens(Tokenizer *t) { + struct { const char *text; int id; } specials[] = { + {"<|endoftext|>", 151643}, + {"<|im_start|>", 151644}, + {"<|im_end|>", 151645}, + }; + for (int i = 0; i < 3; i++) { + tok_hashmap_set(&t->vocab, specials[i].text, specials[i].id); + if (specials[i].id < TOK_MAX_VOCAB) { + if (t->id_to_token[specials[i].id]) free(t->id_to_token[specials[i].id]); + t->id_to_token[specials[i].id] = strdup(specials[i].text); + } + if (specials[i].id >= t->vocab_size) t->vocab_size = specials[i].id + 1; + } + t->im_start = 151644; + t->im_end = 151645; + t->eos = 151643; +} + +// --- Initialize tokenizer --- + +static int tok_init(Tokenizer *t, const char *model_dir) { + char path[4096]; + + tok_init_byte_mapping(); + + snprintf(path, sizeof(path), "%s/vocab.json", model_dir); + if (tok_load_vocab(t, path) != 0) return -1; + + snprintf(path, sizeof(path), "%s/merges.txt", model_dir); + if (tok_load_merges(t, path) != 0) return -1; + + tok_add_special_tokens(t); + return 0; +} + +static void tok_free(Tokenizer *t) { + 
tok_hashmap_free(&t->vocab); + tok_hashmap_free(&t->merge_rank); + if (t->id_to_token) { + for (int i = 0; i < t->vocab_size; i++) + if (t->id_to_token[i]) free(t->id_to_token[i]); + free(t->id_to_token); + } + if (t->merges) { + for (int i = 0; i < t->n_merges; i++) { + free(t->merges[i].a); + free(t->merges[i].b); + } + free(t->merges); + } +} + +// --- BPE encoding --- + +// Convert a raw byte string to its byte-level unicode representation (UTF-8). +// Each input byte is mapped through g_byte_to_unicode, then encoded as UTF-8. +static int tok_bytes_to_unicode_str(const char *input, int input_len, char *out, int max_out) { + int n = 0; + for (int i = 0; i < input_len && n < max_out - 4; i++) { + unsigned char b = (unsigned char)input[i]; + int cp = g_byte_to_unicode[b]; + n += utf8_encode(cp, out + n); + } + out[n] = '\0'; + return n; +} + +// A BPE word is a list of token strings (initially one per byte-level char). +typedef struct { + char **tokens; + int count; + int capacity; +} BPEWord; + +static void bpe_word_init(BPEWord *w) { + w->capacity = 64; + w->tokens = (char**)malloc(w->capacity * sizeof(char*)); + w->count = 0; +} + +static void bpe_word_push(BPEWord *w, const char *s) { + if (w->count >= w->capacity) { + w->capacity *= 2; + w->tokens = (char**)realloc(w->tokens, w->capacity * sizeof(char*)); + } + w->tokens[w->count++] = strdup(s); +} + +static void bpe_word_free(BPEWord *w) { + for (int i = 0; i < w->count; i++) free(w->tokens[i]); + free(w->tokens); +} + +// Apply BPE merges to a word (list of token strings). 
+static void bpe_merge(BPEWord *w, Tokenizer *t) { + while (w->count > 1) { + // Find the pair with lowest merge rank + int best_rank = t->n_merges + 1; + int best_idx = -1; + char pair_key[TOK_MAX_TOKEN_LEN * 2 + 2]; + + for (int i = 0; i < w->count - 1; i++) { + snprintf(pair_key, sizeof(pair_key), "%s %s", w->tokens[i], w->tokens[i+1]); + int rank = tok_hashmap_get(&t->merge_rank, pair_key, t->n_merges + 1); + if (rank < best_rank) { + best_rank = rank; + best_idx = i; + } + } + + if (best_idx < 0) break; // no more merges + + // Merge tokens[best_idx] and tokens[best_idx+1] + char merged[TOK_MAX_TOKEN_LEN * 2 + 1]; + snprintf(merged, sizeof(merged), "%s%s", w->tokens[best_idx], w->tokens[best_idx+1]); + free(w->tokens[best_idx]); + free(w->tokens[best_idx+1]); + w->tokens[best_idx] = strdup(merged); + + // Shift remaining tokens left + for (int i = best_idx + 1; i < w->count - 1; i++) + w->tokens[i] = w->tokens[i+1]; + w->count--; + } +} + +// Pre-tokenize: split on word boundaries (simplified GPT-style). +// Splits on transitions between: letters, digits, spaces, punctuation. +// Each "word" includes leading space if present (byte-level BPE convention). 
+typedef struct { + char **words; + int count; + int capacity; +} WordList; + +static void wordlist_init(WordList *wl) { + wl->capacity = 256; + wl->words = (char**)malloc(wl->capacity * sizeof(char*)); + wl->count = 0; +} + +static void wordlist_push(WordList *wl, const char *s, int len) { + if (wl->count >= wl->capacity) { + wl->capacity *= 2; + wl->words = (char**)realloc(wl->words, wl->capacity * sizeof(char*)); + } + char *copy = (char*)malloc(len + 1); + memcpy(copy, s, len); + copy[len] = '\0'; + wl->words[wl->count++] = copy; +} + +static void wordlist_free(WordList *wl) { + for (int i = 0; i < wl->count; i++) free(wl->words[i]); + free(wl->words); +} + +static int is_letter(unsigned char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80; +} + +static int is_digit(unsigned char c) { + return c >= '0' && c <= '9'; +} + +static void tok_pre_tokenize(const char *text, WordList *out) { + wordlist_init(out); + int len = (int)strlen(text); + int i = 0; + + while (i < len) { + int start = i; + + if (text[i] == ' ') { + // Space + following word/punct + i++; + if (i < len && is_letter((unsigned char)text[i])) { + while (i < len && is_letter((unsigned char)text[i])) i++; + } else if (i < len && is_digit((unsigned char)text[i])) { + while (i < len && is_digit((unsigned char)text[i])) i++; + } else if (i < len && text[i] != ' ') { + i++; // single punct after space + } + wordlist_push(out, text + start, i - start); + } else if (is_letter((unsigned char)text[i])) { + while (i < len && is_letter((unsigned char)text[i])) i++; + wordlist_push(out, text + start, i - start); + } else if (is_digit((unsigned char)text[i])) { + while (i < len && is_digit((unsigned char)text[i])) i++; + wordlist_push(out, text + start, i - start); + } else if (text[i] == '\n' || text[i] == '\r') { + while (i < len && (text[i] == '\n' || text[i] == '\r')) i++; + wordlist_push(out, text + start, i - start); + } else { + i++; + wordlist_push(out, text + start, 1); + } + } 
+} + +// --- Main encode function --- +// Returns number of token IDs written. Caller provides output buffer. + +static int tok_encode(Tokenizer *t, const char *text, int *ids, int max_ids) { + int n_ids = 0; + + // Pre-tokenize into words + WordList words; + tok_pre_tokenize(text, &words); + + for (int w = 0; w < words.count && n_ids < max_ids; w++) { + // Convert word bytes to byte-level unicode string + char unicode_str[TOK_MAX_TOKEN_LEN * 4]; + int wlen = (int)strlen(words.words[w]); + tok_bytes_to_unicode_str(words.words[w], wlen, unicode_str, sizeof(unicode_str)); + + // Split unicode string into individual unicode chars + BPEWord bpe; + bpe_word_init(&bpe); + + const char *p = unicode_str; + while (*p) { + int cp; + int cplen = utf8_decode(p, &cp); + char single[8]; + int slen = utf8_encode(cp, single); + single[slen] = '\0'; + bpe_word_push(&bpe, single); + p += cplen; + } + + // Apply BPE merges + bpe_merge(&bpe, t); + + // Look up each resulting token in vocab + for (int i = 0; i < bpe.count && n_ids < max_ids; i++) { + int id = tok_hashmap_get(&t->vocab, bpe.tokens[i], -1); + if (id >= 0) { + ids[n_ids++] = id; + } else { + // Unknown token -- encode each byte-level char as individual token + const char *bp = bpe.tokens[i]; + while (*bp && n_ids < max_ids) { + int bcp; + int bcplen = utf8_decode(bp, &bcp); + char single[8]; + int slen = utf8_encode(bcp, single); + single[slen] = '\0'; + int byte_id = tok_hashmap_get(&t->vocab, single, -1); + if (byte_id >= 0) ids[n_ids++] = byte_id; + bp += bcplen; + } + } + } + + bpe_word_free(&bpe); + } + + wordlist_free(&words); + return n_ids; +} + +// --- Encode with special tokens --- +// Splits text on special token patterns, encodes non-special parts with BPE. 
+ +static int tok_encode_with_special(Tokenizer *t, const char *text, int *ids, int max_ids) { + struct { const char *text; int id; } specials[] = { + {"<|im_start|>", TOK_IM_START}, + {"<|im_end|>", TOK_IM_END}, + {"<|endoftext|>", TOK_ENDOFTEXT}, + }; + int n_specials = 3; + int n_ids = 0; + const char *p = text; + + while (*p && n_ids < max_ids) { + // Check if current position matches a special token + int matched = 0; + for (int s = 0; s < n_specials; s++) { + int slen = (int)strlen(specials[s].text); + if (strncmp(p, specials[s].text, slen) == 0) { + ids[n_ids++] = specials[s].id; + p += slen; + matched = 1; + break; + } + } + if (matched) continue; + + // Find next special token + const char *next_special = NULL; + for (int s = 0; s < n_specials; s++) { + const char *found = strstr(p, specials[s].text); + if (found && (!next_special || found < next_special)) + next_special = found; + } + + // Encode the text up to the next special (or end) + int chunk_len = next_special ? (int)(next_special - p) : (int)strlen(p); + if (chunk_len > 0) { + char *chunk = (char*)malloc(chunk_len + 1); + memcpy(chunk, p, chunk_len); + chunk[chunk_len] = '\0'; + n_ids += tok_encode(t, chunk, ids + n_ids, max_ids - n_ids); + free(chunk); + } + p += chunk_len; + } + + return n_ids; +} + +// --- Decode token IDs to text --- + +static int tok_decode(Tokenizer *t, const int *ids, int n_ids, char *out, int max_out) { + int n = 0; + for (int i = 0; i < n_ids; i++) { + int id = ids[i]; + // Skip special tokens in output + if (id == TOK_IM_START || id == TOK_IM_END || id == TOK_ENDOFTEXT) + continue; + if (id < 0 || id >= t->vocab_size || !t->id_to_token[id]) + continue; + + const char *tok_str = t->id_to_token[id]; + + // Convert byte-level unicode token back to raw bytes + const char *p = tok_str; + while (*p && n < max_out - 1) { + int cp; + int cplen = utf8_decode(p, &cp); + int byte_val = g_unicode_to_byte[cp < 65536 ? 
cp : 0]; + if (byte_val >= 0 && byte_val < 256) { + out[n++] = (char)byte_val; + } else { + // Not a byte-mapped char, copy UTF-8 directly + for (int j = 0; j < cplen && n < max_out - 1; j++) + out[n++] = p[j]; + } + p += cplen; + } + } + out[n] = '\0'; + return n; +} + +// --- Chat template --- +// Formats: <|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n + +static int tok_apply_chat_template(const char *system_prompt, const char *user_prompt, + char *out, int max_out) { + if (!system_prompt) system_prompt = "You are a helpful assistant."; + return snprintf(out, max_out, + "<|im_start|>system\n%s<|im_end|>\n<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n", + system_prompt, user_prompt); +} + +// --- Convenience: encode a chat prompt --- + +static int tok_encode_chat(Tokenizer *t, const char *system_prompt, const char *user_prompt, + int *ids, int max_ids) { + char templated[65536]; + tok_apply_chat_template(system_prompt, user_prompt, templated, sizeof(templated)); + return tok_encode_with_special(t, templated, ids, max_ids); +} diff --git a/inmem_peak.m b/inmem_peak.m index 87b8163..3334d01 100644 --- a/inmem_peak.m +++ b/inmem_peak.m @@ -8,6 +8,7 @@ static mach_timebase_info_data_t g_tb; static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly NSData *buildWeightBlob(int ch, int depth) { NSUInteger wsize = ch * ch * 2; @@ -27,28 +28,45 @@ NSString *genMIL(int ch, int sp, int depth) { NSMutableString *m = [NSMutableString string]; - [m appendString:@"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"]; - [m appendFormat:@" func main(tensor x) {\n", ch, sp]; - [m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), 
val = string(\"valid\")];\n" - @" tensor c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor([1, 1])];\n" - @" tensor c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" - @" tensor c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor([1, 1])];\n" - @" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n" - @" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"]; - [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp]; + [m appendString:@"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"]; + if (g_fp16_io) { + // fp16 I/O path — no cast ops (M1/M2 compatible) + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + } else { + // fp32 I/O path — cast to/from fp16 internally (M4+ native) + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + } + [m appendString: + @" tensor c_pad_type_0 = const()[name = tensor(\"c_pad_type_0\"), val = tensor(\"valid\")];\n" + @" tensor c_strides_0 = const()[name = tensor(\"c_strides_0\"), val = tensor([1, 1])];\n" + @" tensor c_pad_0 = const()[name = tensor(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" + @" tensor c_dilations_0 = const()[name = tensor(\"c_dilations_0\"), val = tensor([1, 1])];\n" + @" tensor c_groups_0 = const()[name = tensor(\"c_groups_0\"), val = tensor(1)];\n"]; + NSString *prev; + if (g_fp16_io) { + prev = @"x"; + } else { + [m appendString:@" tensor x_to_fp16_dtype_0 = const()[name = tensor(\"x_to_fp16_dtype_0\"), val = tensor(\"fp16\")];\n"]; + [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor(\"cast_in\")];\n", ch, sp]; + prev = @"x_to_fp16"; + } NSUInteger cs = 64 + ch*ch*2; - NSString *prev = @"x_to_fp16"; for (int i = 0; i < depth; i++) { - [m appendFormat:@" tensor W%d = const()[name = string(\"W%d\"), val = tensor(BLOBFILE(path = 
string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n", + [m appendFormat:@" tensor W%d = const()[name = tensor(\"W%d\"), val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n", ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)]; NSString *out = [NSString stringWithFormat:@"c%d", i]; - [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n", + [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor(\"%@\")];\n", ch, sp, out, i, prev, out]; prev = out; } - [m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"]; - [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev]; - [m appendString:@" } -> (c);\n}\n"]; + if (g_fp16_io) { + [m appendFormat:@" tensor c = identity(x = %@)[name = tensor(\"out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + } else { + [m appendString:@" tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n"]; + [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = tensor(\"cast_out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + } return m; } @@ -68,9 +86,18 @@ [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;} + 
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){ + [fm removeItemAtPath:td error:nil]; + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return bench(ch, sp, depth); + } + return -3; + } if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;} - NSUInteger bytes=ch*sp*4; + size_t bpe = g_fp16_io ? 2 : 4; + NSUInteger bytes=ch*sp*bpe; IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI); diff --git a/scripts/aggregate_benchmarks.py b/scripts/aggregate_benchmarks.py new file mode 100644 index 0000000..7908bf4 --- /dev/null +++ b/scripts/aggregate_benchmarks.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""Aggregate community benchmark JSON files into summary tables. + +Usage: + python3 scripts/aggregate_benchmarks.py [community_benchmarks/] + +Reads all .json files from the given directory (default: community_benchmarks/) +and produces: + 1. A markdown summary table to stdout + 2. 
A combined JSON file at community_benchmarks/SUMMARY.json +""" + +import json +import os +import sys +from pathlib import Path + +def load_submissions(directory): + submissions = [] + for f in sorted(Path(directory).glob("*.json")): + if f.name == "SUMMARY.json": + continue + try: + with open(f) as fh: + data = json.load(fh) + if data.get("schema_version") != 1: + print(f" SKIP {f.name}: unknown schema_version", file=sys.stderr) + continue + data["_filename"] = f.name + submissions.append(data) + except (json.JSONDecodeError, KeyError) as e: + print(f" SKIP {f.name}: {e}", file=sys.stderr) + return submissions + +def format_table(submissions): + lines = [] + lines.append("# ANE Community Benchmark Results\n") + lines.append(f"Total submissions: {len(submissions)}\n") + + header = ( + "| Chip | Machine | macOS | Memory | " + "Peak TFLOPS | SRAM Spill (ch) | " + "Train ms/step (CPU) | Train ms/step (ANE) | " + "ANE TFLOPS | ANE Util % | Date |" + ) + sep = "|" + "|".join(["---"] * 11) + "|" + lines.append(header) + lines.append(sep) + + for s in submissions: + sys_info = s.get("system", {}) + summary = s.get("summary", {}) + + def fmt(v, suffix=""): + if v is None: + return "-" + if isinstance(v, float): + return f"{v:.2f}{suffix}" + return str(v) + + row = "| {} | {} | {} | {} GB | {} | {} | {} | {} | {} | {} | {} |".format( + sys_info.get("chip", "?"), + sys_info.get("machine", "?"), + sys_info.get("macos_version", "?"), + sys_info.get("memory_gb", "?"), + fmt(summary.get("peak_tflops")), + summary.get("sram_spill_start_channels") or "-", + fmt(summary.get("training_ms_per_step_cpu")), + fmt(summary.get("training_ms_per_step_ane")), + fmt(summary.get("training_ane_tflops")), + fmt(summary.get("training_ane_util_pct"), "%"), + s.get("timestamp", "?")[:10], + ) + lines.append(row) + + lines.append("") + + if submissions: + lines.append("## SRAM Probe Comparison\n") + all_channels = set() + for s in submissions: + for probe in s.get("benchmarks", {}).get("sram_probe", 
[]): + all_channels.add(probe["channels"]) + all_channels = sorted(all_channels) + + if all_channels: + header_cols = ["Channels (W MB)"] + [ + s.get("system", {}).get("chip", "?").replace("Apple ", "") + for s in submissions + ] + lines.append("| " + " | ".join(header_cols) + " |") + lines.append("|" + "|".join(["---"] * len(header_cols)) + "|") + + for ch in all_channels: + row_parts = [] + weight_mb = None + for s in submissions: + probe_data = {p["channels"]: p for p in s.get("benchmarks", {}).get("sram_probe", [])} + if ch in probe_data: + p = probe_data[ch] + if weight_mb is None: + weight_mb = p["weight_mb"] + row_parts.append(f"{p['tflops']:.2f} TFLOPS ({p['ms_per_eval']:.3f} ms)") + else: + row_parts.append("-") + + ch_label = f"{ch} ({weight_mb:.1f} MB)" if weight_mb else str(ch) + lines.append("| " + ch_label + " | " + " | ".join(row_parts) + " |") + lines.append("") + + return "\n".join(lines) + +def main(): + directory = sys.argv[1] if len(sys.argv) > 1 else "community_benchmarks" + + if not os.path.isdir(directory): + print(f"Directory not found: {directory}", file=sys.stderr) + print("Run the community benchmark first:", file=sys.stderr) + print(" bash scripts/run_community_benchmark.sh", file=sys.stderr) + sys.exit(1) + + submissions = load_submissions(directory) + if not submissions: + print("No valid benchmark submissions found.", file=sys.stderr) + sys.exit(1) + + table = format_table(submissions) + print(table) + + summary_path = os.path.join(directory, "SUMMARY.json") + combined = { + "generated": submissions[0].get("timestamp", ""), + "count": len(submissions), + "submissions": [ + { + "chip": s.get("system", {}).get("chip"), + "machine": s.get("system", {}).get("machine"), + "macos_version": s.get("system", {}).get("macos_version"), + "memory_gb": s.get("system", {}).get("memory_gb"), + "summary": s.get("summary", {}), + "timestamp": s.get("timestamp"), + "filename": s.get("_filename"), + } + for s in submissions + ], + } + with 
open(summary_path, "w") as f: + json.dump(combined, f, indent=2) + f.write("\n") + print(f"\nSummary JSON written to: {summary_path}", file=sys.stderr) + +if __name__ == "__main__": + main() diff --git a/scripts/gen_mlpackages.py b/scripts/gen_mlpackages.py new file mode 100644 index 0000000..95cd1d8 --- /dev/null +++ b/scripts/gen_mlpackages.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +""" +Generate /tmp/ane_sram_{ch}ch_{sp}sp.mlpackage models for ANE benchmarks. + +Each model is a single 1x1 conv: fp32_in -> cast_fp16 -> conv -> cast_fp32 -> out +Covers all configs needed by inmem_basic, inmem_bench, sram_bench, sram_probe. +""" + +import numpy as np +import os +import sys + +try: + import coremltools as ct + from coremltools.converters.mil import Builder as mb + from coremltools.converters.mil.mil import types +except ImportError: + print("ERROR: coremltools not installed. Install with: pip install coremltools", file=sys.stderr) + sys.exit(1) + +CONFIGS = [ + (256, 64), (512, 64), (1024, 64), (1536, 64), + (2048, 64), (2560, 64), (3072, 64), (3584, 64), + (4096, 64), (4608, 64), (5120, 64), (6144, 64), + (8192, 32), +] + + +def gen_model(ch, sp): + """Build a coremltools MIL model with a single 1x1 conv.""" + + @mb.program( + input_specs=[mb.TensorSpec(shape=(1, ch, 1, sp), dtype=types.fp32)], + opset_version=ct.target.iOS18, + ) + def prog(x): + x_fp16 = mb.cast(x=x, dtype="fp16", name="cast_in") + w = np.random.randn(ch, ch, 1, 1).astype(np.float16) * 0.01 + c = mb.conv( + x=x_fp16, + weight=w, + pad_type="valid", + strides=[1, 1], + dilations=[1, 1], + groups=1, + name="c0", + ) + out = mb.cast(x=c, dtype="fp32", name="cast_out") + return out + + model = ct.convert( + prog, + minimum_deployment_target=ct.target.iOS18, + compute_precision=ct.precision.FLOAT16, + ) + return model + + +def main(): + created = 0 + skipped = 0 + + print(f"Generating {len(CONFIGS)} mlpackage models in /tmp/...") + + for ch, sp in CONFIGS: + path = 
f"/tmp/ane_sram_{ch}ch_{sp}sp.mlpackage" + w_mb = ch * ch * 2 / 1024 / 1024 + + if os.path.exists(path): + print(f" [skip] {ch}ch x {sp}sp (exists)") + skipped += 1 + continue + + print(f" [gen] {ch}ch x {sp}sp (weights: {w_mb:.1f} MB)...", end="", flush=True) + try: + model = gen_model(ch, sp) + model.save(path) + print(" OK") + created += 1 + except Exception as e: + print(f" FAILED: {e}") + + print(f"\nDone: {created} created, {skipped} skipped (already existed).") + return 0 if created + skipped == len(CONFIGS) else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh new file mode 100755 index 0000000..52d0986 --- /dev/null +++ b/scripts/run_benchmarks.sh @@ -0,0 +1,279 @@ +#!/bin/bash +# run_benchmarks.sh -- ANE Training Benchmark Runner +# Builds and runs benchmarks, collects results into a timestamped report. +# +# Usage: +# bash scripts/run_benchmarks.sh [OPTIONS] +# +# Options: +# --all Run everything (default) +# --training-only Run only training benchmarks +# --probes-only Run only probe/test suite +# --benchmarks-only Run only root-level benchmarks (inmem_peak) +# --steps N Training steps (default: 100) +# --help Show this help + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TRAINING_DIR="$ROOT_DIR/training" +TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +RESULTS_FILE="$ROOT_DIR/benchmark_results_${TIMESTAMP}.txt" + +# Defaults +RUN_TRAINING=true +RUN_PROBES=true +RUN_BENCHMARKS=true +STEPS=100 + +# Color output helpers +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +log_info() { echo -e "${CYAN}[INFO]${NC} $*"; } +log_success() { echo -e "${GREEN}[PASS]${NC} $*"; } +log_fail() { echo -e "${RED}[FAIL]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_header() { echo -e "\n${CYAN}========================================${NC}"; echo -e "${CYAN} $*${NC}"; echo -e "${CYAN}========================================${NC}"; } + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --all) + RUN_TRAINING=true; RUN_PROBES=true; RUN_BENCHMARKS=true; shift ;; + --training-only) + RUN_TRAINING=true; RUN_PROBES=false; RUN_BENCHMARKS=false; shift ;; + --probes-only) + RUN_TRAINING=false; RUN_PROBES=true; RUN_BENCHMARKS=false; shift ;; + --benchmarks-only) + RUN_TRAINING=false; RUN_PROBES=false; RUN_BENCHMARKS=true; shift ;; + --steps) + STEPS="$2"; shift 2 ;; + --help|-h) + head -14 "$0" | tail -13 + exit 0 ;; + *) + echo "Unknown option: $1"; exit 1 ;; + esac +done + +# Initialize results file +{ + echo "ANE Training Benchmark Results" + echo "==============================" + echo "Date: $(date)" + echo "Machine: $(sysctl -n hw.model 2>/dev/null || echo 'unknown')" + echo "macOS: $(sw_vers -productVersion 2>/dev/null || echo 'unknown')" + echo "Chip: $(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'unknown')" + echo "Steps: $STEPS" + echo "" +} > "$RESULTS_FILE" + +log_info "Results will be saved to: $RESULTS_FILE" + +# ────────────────────────────────────────────── +# Prerequisite checks +# ────────────────────────────────────────────── + +log_header "Prerequisite Checks" + +if [[ "$(uname)" != "Darwin" ]]; then + log_fail "This benchmark requires macOS. 
Detected: $(uname)" + exit 1 +fi +log_success "macOS detected" + +if ! sysctl -n hw.optional.arm64 2>/dev/null | grep -q 1; then + log_fail "Apple Silicon required. This appears to be an Intel Mac." + exit 1 +fi +log_success "Apple Silicon detected" + +if ! xcrun --find clang >/dev/null 2>&1; then + log_fail "Xcode command line tools required. Run: xcode-select --install" + exit 1 +fi +log_success "Xcode CLI tools available" + +PASS_COUNT=0 +FAIL_COUNT=0 +SKIP_COUNT=0 + +run_build_and_test() { + local name="$1" + local build_cmd="$2" + local run_cmd="$3" + local workdir="${4:-$ROOT_DIR}" + + log_info "Building $name..." + local build_output + if ! build_output=$(cd "$workdir" && bash -c "$build_cmd" 2>&1); then + log_fail "$name -- build failed" + echo "[$name] BUILD FAILED" >> "$RESULTS_FILE" + echo "$build_output" >> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + FAIL_COUNT=$((FAIL_COUNT + 1)) + return 1 + fi + + log_info "Running $name..." + echo "--- $name ---" >> "$RESULTS_FILE" + + local output + if output=$(cd "$workdir" && bash -c "$run_cmd" 2>&1); then + echo "$output" >> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + log_success "$name completed" + PASS_COUNT=$((PASS_COUNT + 1)) + else + echo "$output" >> "$RESULTS_FILE" + echo "EXIT CODE: $?" 
>> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + log_fail "$name -- run failed (output captured in results file)" + FAIL_COUNT=$((FAIL_COUNT + 1)) + return 1 + fi +} + +# ────────────────────────────────────────────── +# Training Benchmarks +# ────────────────────────────────────────────── + +if $RUN_TRAINING; then + log_header "Training Benchmarks ($STEPS steps)" + + echo "" >> "$RESULTS_FILE" + echo "=== TRAINING BENCHMARKS ===" >> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + + run_build_and_test \ + "train_large (CPU classifier)" \ + "make train_large" \ + "./train_large --steps $STEPS" \ + "$TRAINING_DIR" || true + + run_build_and_test \ + "train_large_ane (ANE classifier)" \ + "make train_large_ane" \ + "./train_large_ane --steps $STEPS" \ + "$TRAINING_DIR" || true +fi + +# ────────────────────────────────────────────── +# Probe Tests +# ────────────────────────────────────────────── + +if $RUN_PROBES; then + log_header "Probe Tests" + + echo "" >> "$RESULTS_FILE" + echo "=== PROBE TESTS ===" >> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + + PROBE_TESTS=("test_rmsnorm_bwd" "test_classifier" "test_weight_reload" "test_perf_stats" "test_qos_sweep" "test_ane_advanced") + + for test_name in "${PROBE_TESTS[@]}"; do + run_build_and_test \ + "$test_name" \ + "make $test_name" \ + "./$test_name" \ + "$TRAINING_DIR" || true + done +fi + +# ────────────────────────────────────────────── +# Root-Level Benchmarks +# ────────────────────────────────────────────── + +if $RUN_BENCHMARKS; then + log_header "Root-Level Benchmarks" + + echo "" >> "$RESULTS_FILE" + echo "=== ROOT-LEVEL BENCHMARKS ===" >> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + + CC="xcrun clang" + CFLAGS="-O2 -fobjc-arc -framework Foundation -framework CoreML -framework IOSurface -ldl" + + # Generate mlpackage models needed by sram/inmem benchmarks + if ! ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then + log_info "Generating mlpackage models for benchmarks..." 
+ VENV_PYTHON="" + if [[ -x /tmp/ane_venv/bin/python3 ]]; then + VENV_PYTHON="/tmp/ane_venv/bin/python3" + else + for pyver in 3.12 3.13 3.11; do + PY="/opt/homebrew/opt/python@${pyver}/bin/python${pyver}" + if [[ -x "$PY" ]]; then + log_info "Creating venv with Python $pyver for coremltools..." + "$PY" -m venv /tmp/ane_venv && /tmp/ane_venv/bin/pip install -q coremltools numpy 2>/dev/null + VENV_PYTHON="/tmp/ane_venv/bin/python3" + break + fi + done + fi + if [[ -n "$VENV_PYTHON" ]] && "$VENV_PYTHON" "$SCRIPT_DIR/gen_mlpackages.py" 2>/dev/null; then + log_success "mlpackage models generated" + else + log_warn "Failed to generate mlpackage models (need Python 3.11-3.13 + coremltools)" + fi + else + log_info "mlpackage models already exist in /tmp/" + fi + + run_build_and_test \ + "inmem_peak (Peak TFLOPS)" \ + "$CC $CFLAGS -o inmem_peak inmem_peak.m" \ + "./inmem_peak" \ + "$ROOT_DIR" || true + + for bench in inmem_basic inmem_bench sram_bench sram_probe; do + if ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then + run_build_and_test \ + "$bench" \ + "$CC $CFLAGS -o $bench ${bench}.m" \ + "./$bench" \ + "$ROOT_DIR" || true + else + log_warn "$bench -- SKIPPED (mlpackage generation failed)" + echo "[$bench] SKIPPED -- mlpackage generation failed" >> "$RESULTS_FILE" + echo "" >> "$RESULTS_FILE" + SKIP_COUNT=$((SKIP_COUNT + 1)) + fi + done +fi + +# ────────────────────────────────────────────── +# Summary +# ────────────────────────────────────────────── + +log_header "Summary" + +TOTAL=$((PASS_COUNT + FAIL_COUNT + SKIP_COUNT)) + +{ + echo "" + echo "=== SUMMARY ===" + echo "Total: $TOTAL" + echo "Passed: $PASS_COUNT" + echo "Failed: $FAIL_COUNT" + echo "Skipped: $SKIP_COUNT" +} >> "$RESULTS_FILE" + +echo "" +log_info "Total: $TOTAL" +log_success "Passed: $PASS_COUNT" +if [[ $FAIL_COUNT -gt 0 ]]; then + log_fail "Failed: $FAIL_COUNT" +else + log_info "Failed: 0" +fi +if [[ $SKIP_COUNT -gt 0 ]]; then + log_warn "Skipped: $SKIP_COUNT" +fi +echo "" +log_info "Full 
results saved to: $RESULTS_FILE" diff --git a/scripts/run_community_benchmark.sh b/scripts/run_community_benchmark.sh new file mode 100755 index 0000000..3b01e02 --- /dev/null +++ b/scripts/run_community_benchmark.sh @@ -0,0 +1,375 @@ +#!/bin/bash +# run_community_benchmark.sh -- Standardized ANE benchmark for community submissions +# +# Runs a focused set of benchmarks and outputs a single JSON file that can be +# submitted to the community_benchmarks/ directory via PR or GitHub issue. +# +# Usage: +# bash scripts/run_community_benchmark.sh [--steps N] [--skip-training] +# +# Output: +# community_benchmarks/_.json + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +TRAINING_DIR="$ROOT_DIR/training" + +STEPS=20 +SKIP_TRAINING=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --steps) STEPS="$2"; shift 2 ;; + --skip-training) SKIP_TRAINING=true; shift ;; + --help|-h) + echo "Usage: bash scripts/run_community_benchmark.sh [--steps N] [--skip-training]" + echo " --steps N Training steps (default: 20)" + echo " --skip-training Skip training benchmarks (useful if no training data)" + exit 0 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +# ── Collect system info ── + +CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown") +MACHINE=$(sysctl -n hw.model 2>/dev/null || echo "unknown") +MACOS_VER=$(sw_vers -productVersion 2>/dev/null || echo "unknown") +MACOS_BUILD=$(sw_vers -buildVersion 2>/dev/null || echo "unknown") +NCPU=$(sysctl -n hw.ncpu 2>/dev/null || echo "0") +MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || echo "0") +MEM_GB=$(echo "scale=0; $MEM_BYTES / 1073741824" | bc 2>/dev/null || echo "0") +NEURAL_CORES=$(sysctl -n hw.optional.ane.num_cores 2>/dev/null || echo "unknown") +DATE_ISO=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +DATE_SHORT=$(date +"%Y%m%d") + +CHIP_SLUG=$(echo "$CHIP" | tr ' ' '_' | tr -d '()' | tr '[:upper:]' '[:lower:]') + +echo "=== ANE Community Benchmark ===" 
+echo "Chip: $CHIP" +echo "Machine: $MACHINE" +echo "macOS: $MACOS_VER ($MACOS_BUILD)" +echo "Memory: ${MEM_GB} GB" +echo "CPUs: $NCPU" +echo "ANE cores: $NEURAL_CORES" +echo "" + +# ── Prerequisites ── + +if [[ "$(uname)" != "Darwin" ]]; then + echo "ERROR: macOS required"; exit 1 +fi +if ! sysctl -n hw.optional.arm64 2>/dev/null | grep -q 1; then + echo "ERROR: Apple Silicon required"; exit 1 +fi +if ! xcrun --find clang >/dev/null 2>&1; then + echo "ERROR: Xcode CLI tools required. Run: xcode-select --install"; exit 1 +fi + +CC="xcrun clang" +CFLAGS="-O2 -fobjc-arc -fstack-protector-strong -framework Foundation -framework CoreML -framework IOSurface -ldl" + +# ── Ask for GitHub username (optional) ── + +echo "Enter your GitHub username (optional, press Enter to skip):" +read -r GH_USERNAME +GH_USERNAME=$(echo "$GH_USERNAME" | tr -d '[:space:]' | sed 's/[^a-zA-Z0-9_-]//g' | cut -c1-39) + +if [[ -n "$GH_USERNAME" ]]; then + echo "Username: $GH_USERNAME" +else + echo "Submitting anonymously" +fi +echo "" + +# ── Temp file for collecting JSON fragments ── + +TMPJSON=$(mktemp /tmp/ane_bench_XXXXXX.json) +trap "rm -f $TMPJSON" EXIT + +# Start building the JSON result +USERNAME_LINE="" +if [[ -n "$GH_USERNAME" ]]; then + USERNAME_LINE="\"username\": \"$GH_USERNAME\"," +fi + +cat > "$TMPJSON" << HEADER +{ + "schema_version": 1, + $USERNAME_LINE + "timestamp": "$DATE_ISO", + "system": { + "chip": "$CHIP", + "machine": "$MACHINE", + "macos_version": "$MACOS_VER", + "macos_build": "$MACOS_BUILD", + "cpu_cores": $NCPU, + "memory_gb": $MEM_GB, + "neural_engine_cores": "$NEURAL_CORES" + }, +HEADER + +# ── 1. SRAM Probe ── + +echo "--- Running sram_probe ---" +SRAM_JSON="[]" + +# Generate mlpackage models if needed +if ! ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then + echo " Generating mlpackage models..." 
+ VENV_PYTHON="" + if [[ -x /tmp/ane_venv/bin/python3 ]]; then + VENV_PYTHON="/tmp/ane_venv/bin/python3" + else + for pyver in 3.12 3.13 3.11; do + PY="/opt/homebrew/opt/python@${pyver}/bin/python${pyver}" + if [[ -x "$PY" ]]; then + "$PY" -m venv /tmp/ane_venv && /tmp/ane_venv/bin/pip install -q coremltools numpy 2>/dev/null + VENV_PYTHON="/tmp/ane_venv/bin/python3" + break + fi + done + fi + if [[ -n "$VENV_PYTHON" ]]; then + "$VENV_PYTHON" "$SCRIPT_DIR/gen_mlpackages.py" 2>/dev/null && echo " mlpackage models generated" || echo " WARNING: mlpackage generation failed" + fi +fi + +if ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then + cd "$ROOT_DIR" + $CC $CFLAGS -o sram_probe sram_probe.m 2>/dev/null + + SRAM_OUTPUT=$(./sram_probe 2>&1) || true + echo " sram_probe complete" + + SRAM_JSON=$(echo "$SRAM_OUTPUT" | python3 -c " +import sys, json, re +results = [] +for line in sys.stdin: + line = line.strip() + m = re.match(r'\s*(\d+)\s+ch\s+([\d.]+)\s+([\d.]+)\s+ms\s+([\d.]+)\s+([\d.]+)', line) + if m: + results.append({ + 'channels': int(m.group(1)), + 'weight_mb': float(m.group(2)), + 'ms_per_eval': float(m.group(3)), + 'tflops': float(m.group(4)), + 'gflops_per_mb': float(m.group(5)) + }) +print(json.dumps(results)) +" 2>/dev/null || echo "[]") +else + echo " SKIPPED: no mlpackage models" +fi + +# ── 2. 
InMem Peak ── + +echo "--- Running inmem_peak ---" +PEAK_JSON="[]" + +cd "$ROOT_DIR" +$CC $CFLAGS -o inmem_peak inmem_peak.m 2>/dev/null + +PEAK_OUTPUT=$(./inmem_peak 2>&1) || true +echo " inmem_peak complete" + +PEAK_JSON=$(echo "$PEAK_OUTPUT" | python3 -c " +import sys, json, re +results = [] +for line in sys.stdin: + line = line.strip() + m = re.match(r'(\d+)x\s+conv\s+(\d+)ch\s+sp(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+ms\s+([\d.]+)', line) + if m: + results.append({ + 'depth': int(m.group(1)), + 'channels': int(m.group(2)), + 'spatial': int(m.group(3)), + 'weight_mb': float(m.group(4)), + 'gflops': float(m.group(5)), + 'ms_per_eval': float(m.group(6)), + 'tflops': float(m.group(7)) + }) +print(json.dumps(results)) +" 2>/dev/null || echo "[]") + +# ── 3. Training (optional) ── + +echo "--- Running training benchmark ($STEPS steps) ---" +TRAIN_CPU_JSON="{}" +TRAIN_ANE_JSON="{}" + +if ! $SKIP_TRAINING; then + cd "$TRAINING_DIR" + + # Build training binaries + make train_large train_large_ane 2>/dev/null || true + + if [[ -x ./train_large ]]; then + TRAIN_OUTPUT=$(./train_large --steps "$STEPS" 2>&1) || true + echo " train_large complete" + + TRAIN_CPU_JSON=$(echo "$TRAIN_OUTPUT" | python3 -c " +import sys, json, re +result = {} +for line in sys.stdin: + line = line.strip() + if line.startswith('{\"type\":\"perf\"'): + d = json.loads(line) + result['ane_tflops'] = d.get('ane_tflops') + result['ane_util_pct'] = d.get('ane_util_pct') + m = re.match(r'Avg train:\s+([\d.]+)\s+ms/step', line) + if m: result['ms_per_step'] = float(m.group(1)) + m = re.match(r'ANE TFLOPS:\s+([\d.]+)', line) + if m: result['ane_tflops_sustained'] = float(m.group(1)) + m = re.match(r'Total TFLOPS:\s+([\d.]+)', line) + if m: result['total_tflops'] = float(m.group(1)) + m = re.match(r'ANE utilization:\s+([\d.]+)%', line) + if m: result['ane_util_pct'] = float(m.group(1)) + m = re.match(r'Compile time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line) + if m: result['compile_pct'] = float(m.group(1)) + m = 
re.match(r'Train time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line) + if m: result['train_pct'] = float(m.group(1)) +print(json.dumps(result)) +" 2>/dev/null || echo "{}") + fi + + if [[ -x ./train_large_ane ]]; then + TRAIN_ANE_OUTPUT=$(./train_large_ane --steps "$STEPS" 2>&1) || true + echo " train_large_ane complete" + + TRAIN_ANE_JSON=$(echo "$TRAIN_ANE_OUTPUT" | python3 -c " +import sys, json, re +result = {} +for line in sys.stdin: + line = line.strip() + m = re.match(r'Avg train:\s+([\d.]+)\s+ms/step', line) + if m: result['ms_per_step'] = float(m.group(1)) + m = re.match(r'ANE TFLOPS:\s+([\d.]+)', line) + if m: result['ane_tflops_sustained'] = float(m.group(1)) + m = re.match(r'Total TFLOPS:\s+([\d.]+)', line) + if m: result['total_tflops'] = float(m.group(1)) + m = re.match(r'ANE utilization:\s+([\d.]+)%', line) + if m: result['ane_util_pct'] = float(m.group(1)) + m = re.match(r'Compile time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line) + if m: result['compile_pct'] = float(m.group(1)) + m = re.match(r'Train time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line) + if m: result['train_pct'] = float(m.group(1)) +print(json.dumps(result)) +" 2>/dev/null || echo "{}") + fi +else + echo " SKIPPED (--skip-training)" +fi + +# ── Assemble final JSON ── + +OUTDIR="$ROOT_DIR/community_benchmarks" +mkdir -p "$OUTDIR" +OUTFILE="$OUTDIR/${CHIP_SLUG}_${DATE_SHORT}.json" +if [[ -f "$OUTFILE" ]]; then + i=2 + while [[ -f "${OUTFILE%.json}_${i}.json" ]]; do i=$((i+1)); done + OUTFILE="${OUTFILE%.json}_${i}.json" +fi + +python3 -c " +import json, sys + +with open('$TMPJSON') as f: + partial = f.read() + +sram = json.loads('''$SRAM_JSON''') +peak = json.loads('''$PEAK_JSON''') +train_cpu = json.loads('''$TRAIN_CPU_JSON''') +train_ane = json.loads('''$TRAIN_ANE_JSON''') + +peak_tflops = max((r['tflops'] for r in peak), default=0) +sram_peak_eff = max((r['gflops_per_mb'] for r in sram), default=0) +sram_spill_ch = 0 +prev_tflops = 0 +for r in sorted(sram, key=lambda x: x['channels']): + if prev_tflops > 0 and 
r['tflops'] < prev_tflops * 0.6: + sram_spill_ch = r['channels'] + break + prev_tflops = max(prev_tflops, r['tflops']) + +result = json.loads(partial + '\"_\": 0}') +del result['_'] + +result['benchmarks'] = { + 'sram_probe': sram, + 'inmem_peak': peak, + 'training_cpu_classifier': train_cpu, + 'training_ane_classifier': train_ane +} + +result['summary'] = { + 'peak_tflops': round(peak_tflops, 2), + 'sram_peak_efficiency_gflops_per_mb': round(sram_peak_eff, 1), + 'sram_spill_start_channels': sram_spill_ch, + 'training_ms_per_step_cpu': train_cpu.get('ms_per_step'), + 'training_ms_per_step_ane': train_ane.get('ms_per_step'), + 'training_ane_tflops': train_ane.get('ane_tflops_sustained') or train_cpu.get('ane_tflops_sustained'), + 'training_ane_util_pct': train_ane.get('ane_util_pct') or train_cpu.get('ane_util_pct') +} + +with open('$OUTFILE', 'w') as f: + json.dump(result, f, indent=2) + f.write('\n') + +print(json.dumps(result['summary'], indent=2)) +" + +echo "" +echo "=== Benchmark complete ===" +echo "Results saved to: $OUTFILE" +echo "" + +# ── Optional: submit to community database ── + +DASHBOARD_URL="${ANE_DASHBOARD_URL:-https://web-lac-sigma-61.vercel.app}" +SUBMIT_URL="$DASHBOARD_URL/api/submit" + +echo "Would you like to submit your results to the ANE community benchmark database? (y/N)" +read -r SUBMIT_ANSWER + +if [[ "$SUBMIT_ANSWER" =~ ^[Yy]$ ]]; then + echo "Submitting to $SUBMIT_URL ..." + + HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" \ + -X POST "$SUBMIT_URL" \ + -H "Content-Type: application/json" \ + -d @"$OUTFILE" 2>/dev/null) || true + + HTTP_BODY=$(echo "$HTTP_RESPONSE" | sed '$d') + HTTP_CODE=$(echo "$HTTP_RESPONSE" | tail -1) + + case "$HTTP_CODE" in + 201) + SUBMIT_ID=$(echo "$HTTP_BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "") + echo "Submitted successfully! 
(ID: $SUBMIT_ID)" + echo "View results at: $DASHBOARD_URL" + ;; + 409) + echo "Already submitted (duplicate detected within the last hour)." + echo "View results at: $DASHBOARD_URL" + ;; + 429) + echo "Rate limited -- too many submissions. Try again later." + echo "You can also submit via GitHub PR instead (see below)." + ;; + *) + echo "Submission failed (HTTP $HTTP_CODE). You can submit manually instead." + ;; + esac + echo "" +fi + +echo "Alternative submission methods:" +echo " 1. Fork https://github.com/maderix/ANE" +echo " 2. Add $OUTFILE to your fork" +echo " 3. Open a Pull Request" +echo "" +echo "Or paste the contents of $OUTFILE in a GitHub issue." diff --git a/training/Makefile b/training/Makefile index 7f16c1a..405c770 100644 --- a/training/Makefile +++ b/training/Makefile @@ -1,5 +1,10 @@ CC = xcrun clang -CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc + +ANE_COMPAT = -Wno-deprecated-declarations +SEC_FLAGS = -fstack-protector-strong -Wformat-security + +CFLAGS = -O2 -Wall $(ANE_COMPAT) -fobjc-arc $(SEC_FLAGS) +CFLAGS_DEBUG = -O0 -g -Wall $(ANE_COMPAT) -fobjc-arc -fsanitize=address,undefined FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface LDFLAGS = $(FRAMEWORKS) -ldl @@ -16,6 +21,14 @@ train_large: train_large.m $(HEADERS_LARGE) train_large_ane: train_large_ane.m $(HEADERS_ANE) $(CC) $(CFLAGS) -o $@ train_large_ane.m $(LDFLAGS) -framework Accelerate +HEADERS_OPT = $(HEADERS_LARGE) stories_cpu_ops_opt.h + +train_opt: train_opt.m $(HEADERS_OPT) + $(CC) $(CFLAGS) -o $@ train_opt.m $(LDFLAGS) -framework Accelerate -framework Metal -framework MetalPerformanceShaders + +train_double_buffer: train_double_buffer.m $(HEADERS_LARGE) + $(CC) $(CFLAGS) -o $@ train_double_buffer.m $(LDFLAGS) -framework Accelerate + PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced test_rmsnorm_bwd: test_rmsnorm_bwd.m $(HEADERS_ANE) @@ -36,13 +49,31 @@ test_qos_sweep: test_qos_sweep.m test_ane_advanced: 
test_ane_advanced.m $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) +test_chaining: test_chaining.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + probes: $(PROBES) +data: tokenize + @bash download_data.sh + tokenize: python3 tokenize.py +setup: data + @echo "=== Setup complete ===" + @echo "Data: tinystories_data00.bin" + @echo "To train: make train_large && ./train_large" + @echo "Override paths: ANE_MODEL_PATH=... ANE_DATA_PATH=... ./train_large" + +verify-flags: + @echo "=== Active CFLAGS ===" + @echo "$(CFLAGS)" + @echo "=== Compiler version ===" + @xcrun clang --version + clean: - rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier + rm -f train train_large train_large_ane train_opt train_double_buffer $(PROBES) test_rmsnorm_bwd test_classifier -.PHONY: clean tokenize probes +.PHONY: clean tokenize probes verify-flags data setup diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h index 97fc451..05a8c95 100644 --- a/training/ane_mil_gen.h +++ b/training/ane_mil_gen.h @@ -5,6 +5,9 @@ #include #include +// Set by caller: 1 = fp16 I/O (M1/M2 fallback, no cast ops), 0 = fp32 I/O with cast (M4+) +extern int g_fp16_io; + // Build an FP16 weight blob with the required header structure. 
// weights_f32: source weights in row-major [out_ch, in_ch] // Returns NSData with header + FP16 weights @@ -30,21 +33,32 @@ static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int i // Input W: [1, out_ch, in_ch] fp32 // Output: [1, out_ch, spatial] fp32 static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x, tensor W) {\n" + " tensor tx = const()[name = tensor(\"tx\"), val = tensor(false)];\n" + " tensor ty = const()[name = tensor(\"ty\"), val = tensor(false)];\n" + " tensor y = matmul(transpose_x = tx, transpose_y = ty, x = W, y = x)[name = tensor(\"mm\")];\n" + " } -> (y);\n" + "}\n", + in_ch, spatial, out_ch, in_ch, out_ch, spatial]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x, tensor W) {\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n" - " tensor W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n" - " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" - " bool ty = const()[name = string(\"ty\"), val = bool(false)];\n" - " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + " func main(tensor x, tensor W) {\n" + " tensor to_fp16 = const()[name = 
tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_x\")];\n" + " tensor W16 = cast(dtype = to_fp16, x = W)[name = tensor(\"cast_W\")];\n" + " tensor tx = const()[name = tensor(\"tx\"), val = tensor(false)];\n" + " tensor ty = const()[name = tensor(\"ty\"), val = tensor(false)];\n" + " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = tensor(\"mm\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = tensor(\"cast_out\")];\n" " } -> (y);\n" "}\n", in_ch, spatial, out_ch, in_ch, @@ -54,26 +68,45 @@ static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { // Keep the baked-weight version for reference (used in inference-only scenarios) static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor y = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x)[name = tensor(\"conv\")];\n" + " } -> (y);\n" + "}\n", + in_ch, spatial, + out_ch, in_ch, out_ch, in_ch, + out_ch, spatial]; + } return [NSString stringWithFormat: - 
@"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_in\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" " tensor y16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, 
weight = W, x = x16)[name = string(\"conv\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = tensor(\"conv\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = tensor(\"cast_out\")];\n" " } -> (y);\n" "}\n", in_ch, spatial, in_ch, spatial, @@ -88,36 +121,65 @@ static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { // where cs = 64 + dim*dim*2 static NSString *mil_gen_qkv(int dim, int spatial) { NSUInteger cs = 64 + (NSUInteger)dim * dim * 2; + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor q = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x)[name = 
tensor(\"conv_q\")];\n" + " tensor k = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x)[name = tensor(\"conv_k\")];\n" + " tensor v = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x)[name = tensor(\"conv_v\")];\n" + " } -> (q, k, v);\n" + "}\n", + dim, spatial, + dim, dim, dim, dim, + dim, dim, dim, dim, (unsigned long)(64 + cs), + dim, dim, dim, dim, (unsigned long)(64 + 2*cs), + dim, spatial, dim, spatial, dim, spatial]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor Wq = const()[name = string(\"Wq\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path = 
string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_in\")];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" " tensor q16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = tensor(\"conv_q\")];\n" " tensor k16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = tensor(\"conv_k\")];\n" " tensor v16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = 
string(\"fp32\")];\n" - " tensor q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n" - " tensor k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n" - " tensor v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = tensor(\"conv_v\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor q = cast(dtype = to_fp32, x = q16)[name = tensor(\"cast_q\")];\n" + " tensor k = cast(dtype = to_fp32, x = k16)[name = tensor(\"cast_k\")];\n" + " tensor v = cast(dtype = to_fp32, x = v16)[name = tensor(\"cast_v\")];\n" " } -> (q, k, v);\n" "}\n", dim, spatial, dim, spatial, @@ -170,34 +232,176 @@ static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, in return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } +// Generate MIL for fused GQA QKV: Q, K, V have different output dimensions +// Qwen2.5-0.5B: Q=[q_dim, dim], K=[kv_dim, dim], V=[kv_dim, dim] +// Weight blob: Wq[q_dim,dim] @ chunk0, Wk[kv_dim,dim] @ chunk1, Wv[kv_dim,dim] @ chunk2 +static NSString *mil_gen_qkv_gqa(int dim, int q_dim, int kv_dim, int spatial) { + NSUInteger cs_q = 64 + (NSUInteger)q_dim * dim * 2; + NSUInteger cs_kv = 64 + (NSUInteger)kv_dim * dim * 2; + NSUInteger off_k = 64 + cs_q; + NSUInteger off_v = off_k + cs_kv; + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = 
tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor q = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x)[name = tensor(\"conv_q\")];\n" + " tensor k = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x)[name = tensor(\"conv_k\")];\n" + " tensor v = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x)[name = tensor(\"conv_v\")];\n" + " } -> (q, k, v);\n" + "}\n", + dim, spatial, + q_dim, dim, q_dim, dim, + kv_dim, dim, kv_dim, dim, (unsigned long)off_k, + kv_dim, dim, kv_dim, dim, (unsigned long)off_v, + q_dim, spatial, kv_dim, spatial, kv_dim, spatial]; + } + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = 
x)[name = tensor(\"cast_in\")];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor q16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = tensor(\"conv_q\")];\n" + " tensor k16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = tensor(\"conv_k\")];\n" + " tensor v16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = tensor(\"conv_v\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor q = cast(dtype = to_fp32, x = q16)[name = tensor(\"cast_q\")];\n" + " tensor k = cast(dtype = to_fp32, x = k16)[name = tensor(\"cast_k\")];\n" + " tensor v = cast(dtype = to_fp32, x = v16)[name = tensor(\"cast_v\")];\n" + " } -> (q, k, v);\n" + "}\n", + dim, spatial, dim, spatial, + q_dim, dim, q_dim, dim, + kv_dim, dim, kv_dim, dim, (unsigned long)off_k, + kv_dim, dim, kv_dim, dim, (unsigned long)off_v, + q_dim, spatial, kv_dim, spatial, kv_dim, spatial, + q_dim, spatial, kv_dim, spatial, kv_dim, spatial]; +} + +// Build weight blob for GQA QKV (3 weight matrices with different shapes) +static NSData *mil_build_qkv_gqa_weight_blob(const float *wq, int q_dim, int dim, + const float *wk, const float *wv, int kv_dim) { + NSUInteger wsize_q = (NSUInteger)q_dim * dim * 2; + NSUInteger wsize_kv = (NSUInteger)kv_dim * dim * 2; + NSUInteger cs_q = 64 + wsize_q; 
+ NSUInteger cs_kv = 64 + wsize_kv; + NSUInteger total = 64 + cs_q + 2 * cs_kv; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + + // Chunk 0: Wq + { + uint8_t *chunk = buf + 64; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize_q; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + 64); + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)q_dim * dim; i++) fp16[i] = (_Float16)wq[i]; + } + // Chunk 1: Wk + { + uint8_t *chunk = buf + 64 + cs_q; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize_kv; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + cs_q + 64); + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)kv_dim * dim; i++) fp16[i] = (_Float16)wk[i]; + } + // Chunk 2: Wv + { + uint8_t *chunk = buf + 64 + cs_q + cs_kv; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize_kv; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + cs_q + cs_kv + 64); + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)kv_dim * dim; i++) fp16[i] = (_Float16)wv[i]; + } + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + // Generate MIL for fused FFN up: w1 + w3 parallel convs static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) { NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2; + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " 
tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor W1 = const()[name = tensor(\"W1\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor W3 = const()[name = tensor(\"W3\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor out1 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x)[name = tensor(\"conv_w1\")];\n" + " tensor out3 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x)[name = tensor(\"conv_w3\")];\n" + " } -> (out1, out3);\n" + "}\n", + dim, spatial, + hidden_dim, dim, hidden_dim, dim, + hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs), + hidden_dim, spatial, hidden_dim, spatial]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = 
string(\"cast_in\")];\n" - " tensor W1 = const()[name = string(\"W1\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor W3 = const()[name = string(\"W3\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_in\")];\n" + " tensor W1 = const()[name = tensor(\"W1\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor W3 = const()[name = tensor(\"W3\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" " tensor h1 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = tensor(\"conv_w1\")];\n" " tensor h3 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n" - " tensor out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n" + "pad = c_pad, 
pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = tensor(\"conv_w3\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor out1 = cast(dtype = to_fp32, x = h1)[name = tensor(\"cast_h1\")];\n" + " tensor out3 = cast(dtype = to_fp32, x = h3)[name = tensor(\"cast_h3\")];\n" " } -> (out1, out3);\n" "}\n", dim, spatial, dim, spatial, diff --git a/training/ane_runtime.h b/training/ane_runtime.h index 585d0f0..a5fa873 100644 --- a/training/ane_runtime.h +++ b/training/ane_runtime.h @@ -20,15 +20,27 @@ typedef struct { static Class g_ANEDesc, g_ANEInMem, g_ANEReq, g_ANEIO; static bool g_ane_loaded = false; +static bool g_ane_ok = false; // true only when all private classes loaded successfully static void ane_init(void) { if (g_ane_loaded) return; - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_ane_loaded = true; // Set first to prevent re-entry (ref: CRIT-01) + void *handle = dlopen( + "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", + RTLD_NOW); + if (!handle) { + fprintf(stderr, "ANE: dlopen failed: %s\n", dlerror()); + return; + } g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel"); g_ANEReq = NSClassFromString(@"_ANERequest"); g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject"); - g_ane_loaded = true; + if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) { + fprintf(stderr, "ANE: Private classes not found (macOS version mismatch?)\n"); + return; + } + g_ane_ok = true; } static IOSurfaceRef ane_create_surface(size_t bytes) { @@ -50,6 +62,7 @@ static ANEKernel *ane_compile(NSData *milText, NSData *weightData, int nInputs, size_t *inputSizes, int nOutputs, size_t *outputSizes) { ane_init(); + if (!g_ane_ok) { fprintf(stderr, "ANE: not available\n"); return NULL; } // CRIT-01/02 NSError *e = nil; NSDictionary *wdict = nil; @@ -63,6 +76,7 @@ 
static ANEKernel *ane_compile(NSData *milText, NSData *weightData, id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc); + if (!mdl) { fprintf(stderr, "ANE: inMemoryModel allocation failed\n"); return NULL; } // CRIT-02 // Pre-populate temp dir with MIL + weights id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); diff --git a/training/dashboard.py b/training/dashboard.py index 5926a8f..18203d7 100644 --- a/training/dashboard.py +++ b/training/dashboard.py @@ -147,7 +147,7 @@ def softmax(x): e = np.exp(x) return e / np.sum(e) -def generate_text(W, tok, max_tokens=64, temperature=0.8): +def generate_text(W, max_tokens=64, temperature=0.8): tokenizer = get_tokenizer() if tokenizer is None: return '[no tokenizer]' @@ -249,7 +249,7 @@ def generation_thread(): with S.gen_lock: S.gen_status = 'idle' continue - text = generate_text(W, get_tokenizer(), max_tokens=64, temperature=0.8) + text = generate_text(W, max_tokens=64, temperature=0.8) with S.gen_lock: S.gen_text = text S.gen_step = S.step @@ -790,6 +790,8 @@ def spawn_training(resume=False, steps=10000, dynamic=False, ane=False, scratch= return proc def spawn_powermetrics(): + if not sys.stdin.isatty(): + return None try: proc = subprocess.Popen( ['sudo', 'powermetrics', '--samplers', 'cpu_power,gpu_power,ane_power', '-i', '1000'], @@ -982,7 +984,7 @@ def force_gen(): try: W = load_weights_from_ckpt(CKPT_PATH) if W: - text = generate_text(W, get_tokenizer(), max_tokens=64, temperature=0.8) + text = generate_text(W, max_tokens=64, temperature=0.8) with S.gen_lock: S.gen_text = text S.gen_step = S.step diff --git a/training/model.h b/training/model.h index 6cee52f..7a07e12 100644 --- a/training/model.h +++ b/training/model.h @@ -78,7 +78,14 @@ typedef struct { static int model_load_weights(Model *m, const char *path) { FILE *f = fopen(path, "rb"); if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; } - fread(&m->cfg, sizeof(Config), 
1, f); + // Validate config read — gatekeeper for all subsequent malloc() sizes (CRIT-03) + if (fread(&m->cfg, sizeof(Config), 1, f) != 1) { + fprintf(stderr, "model: config read failed (truncated file?)\n"); + fclose(f); return -1; + } + // Note: Subsequent fread() calls for weight tensors are not individually checked. + // In this research context, a truncated weight file causes incorrect model behavior + // (detectable via training loss divergence). The config read above is the gatekeeper. bool shared = m->cfg.vocab_size > 0; if (m->cfg.vocab_size < 0) m->cfg.vocab_size = -m->cfg.vocab_size; @@ -88,18 +95,18 @@ static int model_load_weights(Model *m, const char *path) { int d = m->cfg.dim, hd = m->cfg.hidden_dim, nl = m->cfg.n_layers, vs = m->cfg.vocab_size; - m->token_embedding = (float*)malloc(vs * d * sizeof(float)); + m->token_embedding = (float*)malloc((size_t)vs * d * sizeof(float)); // (size_t) prevents int overflow (CRIT-04) fread(m->token_embedding, sizeof(float), vs * d, f); - float *rms_att_all = (float*)malloc(nl * d * sizeof(float)); - float *wq_all = (float*)malloc(nl * d * d * sizeof(float)); - float *wk_all = (float*)malloc(nl * d * d * sizeof(float)); - float *wv_all = (float*)malloc(nl * d * d * sizeof(float)); - float *wo_all = (float*)malloc(nl * d * d * sizeof(float)); - float *rms_ffn_all = (float*)malloc(nl * d * sizeof(float)); - float *w1_all = (float*)malloc(nl * hd * d * sizeof(float)); - float *w2_all = (float*)malloc(nl * d * hd * sizeof(float)); - float *w3_all = (float*)malloc(nl * hd * d * sizeof(float)); + float *rms_att_all = (float*)malloc((size_t)nl * d * sizeof(float)); + float *wq_all = (float*)malloc((size_t)nl * d * d * sizeof(float)); + float *wk_all = (float*)malloc((size_t)nl * d * d * sizeof(float)); + float *wv_all = (float*)malloc((size_t)nl * d * d * sizeof(float)); + float *wo_all = (float*)malloc((size_t)nl * d * d * sizeof(float)); + float *rms_ffn_all = (float*)malloc((size_t)nl * d * sizeof(float)); + float 
*w1_all = (float*)malloc((size_t)nl * hd * d * sizeof(float)); + float *w2_all = (float*)malloc((size_t)nl * d * hd * sizeof(float)); + float *w3_all = (float*)malloc((size_t)nl * hd * d * sizeof(float)); fread(rms_att_all, sizeof(float), nl * d, f); fread(wq_all, sizeof(float), nl * d * d, f); @@ -140,7 +147,7 @@ static int model_load_weights(Model *m, const char *path) { if (shared) { m->wcls = m->token_embedding; } else { - m->wcls = (float*)malloc(vs * d * sizeof(float)); + m->wcls = (float*)malloc((size_t)vs * d * sizeof(float)); // (size_t) prevents int overflow (CRIT-04) fread(m->wcls, sizeof(float), vs * d, f); } fclose(f); diff --git a/training/stories_config.h b/training/stories_config.h index f967974..f4c0996 100644 --- a/training/stories_config.h +++ b/training/stories_config.h @@ -22,8 +22,19 @@ #define SEQ 256 #define NLAYERS 12 #define VOCAB 32000 -#define ACCUM_STEPS 10 +#define DEFAULT_ACCUM_STEPS 10 #define MAX_COMPILES 100 +static int g_accum_steps = DEFAULT_ACCUM_STEPS; + +static void init_accum_steps(void) { + const char *env = getenv("ANE_ACCUM_STEPS"); + if (env && env[0]) { + int v = atoi(env); + if (v > 0 && v <= 10000) g_accum_steps = v; + } +} + +#define ACCUM_STEPS g_accum_steps // Per compile: 5 weight-bearing kernels per layer + 1 classifier = 5*12+1 = 61 // Plus 1 static (sdpaBwd2 per layer, no weights) = 12 more but those are weight-free @@ -111,15 +122,30 @@ typedef struct { // Globals static Class g_D, g_I, g_AR, g_AIO; +static bool g_ane_init_done = false; // Re-entry guard (ref: CRIT-01) +static bool g_ane_ok_large = false; // true only when all private classes loaded successfully static mach_timebase_info_data_t g_tb; static int g_compile_count = 0; static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + if (g_ane_init_done) return; + g_ane_init_done = true; // Set first to prevent re-entry (ref: CRIT-01) + void *handle = dlopen( + 
"/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", + RTLD_NOW); + if (!handle) { + fprintf(stderr, "ANE: dlopen failed: %s\n", dlerror()); + return; + } g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_I = NSClassFromString(@"_ANEInMemoryModel"); g_AR = NSClassFromString(@"_ANERequest"); g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + if (!g_D || !g_I || !g_AR || !g_AIO) { + fprintf(stderr, "ANE: Private classes not found (macOS version mismatch?)\n"); + return; + } + g_ane_ok_large = true; } static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } diff --git a/training/stories_cpu_ops_opt.h b/training/stories_cpu_ops_opt.h new file mode 100644 index 0000000..1b843c8 --- /dev/null +++ b/training/stories_cpu_ops_opt.h @@ -0,0 +1,110 @@ +// stories_cpu_ops_opt.h — Optimized CPU operations: NEON Adam, vectorized embedding +#pragma once +#include "stories_cpu_ops.h" +#include + +// ===== NEON-vectorized Adam optimizer ===== +// ~3-3.5x faster than scalar version for large param counts +// Uses vrsqrteq_f32 + one Newton-Raphson step for fast reciprocal sqrt +static void adam_update_opt(float *w, const float *g, AdamState *s, int t, + float lr, float b1, float b2, float eps) { + float bc1 = 1.0f - powf(b1, t); + float bc2 = 1.0f - powf(b2, t); + float inv_bc1 = 1.0f / bc1; + float inv_bc2 = 1.0f / bc2; + float one_minus_b1 = 1.0f - b1; + float one_minus_b2 = 1.0f - b2; + + float32x4_t vb1 = vdupq_n_f32(b1); + float32x4_t vb2 = vdupq_n_f32(b2); + float32x4_t v1mb1 = vdupq_n_f32(one_minus_b1); + float32x4_t v1mb2 = vdupq_n_f32(one_minus_b2); + float32x4_t vinv_bc1 = vdupq_n_f32(inv_bc1); + float32x4_t vinv_bc2 = vdupq_n_f32(inv_bc2); + float32x4_t vneg_lr = vdupq_n_f32(-lr); + float32x4_t veps = vdupq_n_f32(eps); + + size_t n = s->n; + size_t i = 0; + + // Process 4 elements at a time + for (; i + 3 < n; i += 4) { + // Load + float32x4_t vm = vld1q_f32(s->m + i); + float32x4_t vv = vld1q_f32(s->v + 
i); + float32x4_t vg = vld1q_f32(g + i); + float32x4_t vw = vld1q_f32(w + i); + + // m = b1*m + (1-b1)*g + vm = vmlaq_f32(vmulq_f32(vb1, vm), v1mb1, vg); + // v = b2*v + (1-b2)*g*g + float32x4_t g2 = vmulq_f32(vg, vg); + vv = vmlaq_f32(vmulq_f32(vb2, vv), v1mb2, g2); + + // Store updated m, v + vst1q_f32(s->m + i, vm); + vst1q_f32(s->v + i, vv); + + // mhat = m / bc1, vhat = v / bc2 + float32x4_t mhat = vmulq_f32(vm, vinv_bc1); + float32x4_t vhat = vmulq_f32(vv, vinv_bc2); + + // Fast reciprocal sqrt: vrsqrteq + one Newton-Raphson iteration + // rsqrt_est ≈ 1/sqrt(vhat) + float32x4_t rsqrt_est = vrsqrteq_f32(vhat); + // Newton-Raphson: rsqrt *= (3 - vhat * rsqrt^2) / 2 + float32x4_t rsqrt_sq = vmulq_f32(rsqrt_est, rsqrt_est); + float32x4_t nr_step = vrsqrtsq_f32(vhat, rsqrt_sq); + rsqrt_est = vmulq_f32(rsqrt_est, nr_step); + + // w -= lr * mhat / (sqrt(vhat) + eps) + // = w + (-lr) * mhat * (1/(sqrt(vhat) + eps)) + // Compute sqrt(vhat) from rsqrt: sqrt = vhat * rsqrt(vhat) (avoids division) + float32x4_t sqrt_vhat = vmulq_f32(vhat, rsqrt_est); + float32x4_t denom = vaddq_f32(sqrt_vhat, veps); + + // Use vdivq_f32 for the final division (accurate, eps-adjusted) + float32x4_t update = vmulq_f32(vneg_lr, vdivq_f32(mhat, denom)); + vw = vaddq_f32(vw, update); + + vst1q_f32(w + i, vw); + } + + // Scalar tail + for (; i < n; i++) { + s->m[i] = b1 * s->m[i] + one_minus_b1 * g[i]; + s->v[i] = b2 * s->v[i] + one_minus_b2 * g[i] * g[i]; + float mh = s->m[i] * inv_bc1; + float vh = s->v[i] * inv_bc2; + w[i] -= lr * mh / (sqrtf(vh) + eps); + } +} + +// ===== Vectorized embedding lookup ===== +// Gather rows from [VOCAB, DIM] row-major embed table → x [DIM, SEQ] channel-first +// Strategy: gather token rows into temp buffer [SEQ, DIM], then transpose via vDSP_mtrans +static void embed_lookup_opt(float *x, const float *embed, const uint16_t *tokens, + int dim, int seq, float *tmp) { + // Gather: tmp[t*dim + d] = embed[tokens[t]*dim + d] + for (int t = 0; t < seq; t++) { + 
memcpy(tmp + t * dim, embed + tokens[t] * dim, dim * sizeof(float)); + } + // Transpose [SEQ, DIM] → [DIM, SEQ]: x[d*seq + t] = tmp[t*dim + d] + vDSP_mtrans(tmp, 1, x, 1, (vDSP_Length)dim, (vDSP_Length)seq); +} + +// ===== Vectorized embedding backward ===== +// Accumulate dE[tok] += dx[:,t] for each position +// Strategy: transpose dx [DIM, SEQ] → tmp [SEQ, DIM], then accumulate rows +static void embed_backward_opt(float *d_embed, const float *dx, const uint16_t *tokens, + int dim, int seq, float *tmp) { + // Transpose [DIM, SEQ] → [SEQ, DIM]: tmp[t*dim + d] = dx[d*seq + t] + vDSP_mtrans(dx, 1, tmp, 1, (vDSP_Length)seq, (vDSP_Length)dim); + // Scatter-add: d_embed[tok*dim .. (tok+1)*dim] += tmp[t*dim .. (t+1)*dim] + for (int t = 0; t < seq; t++) { + vDSP_vadd(tmp + t * dim, 1, + d_embed + tokens[t] * dim, 1, + d_embed + tokens[t] * dim, 1, + (vDSP_Length)dim); + } +} diff --git a/training/stories_io.h b/training/stories_io.h index 017d8a8..efc88db 100644 --- a/training/stories_io.h +++ b/training/stories_io.h @@ -11,28 +11,31 @@ static IOSurfaceRef make_surface(size_t bytes) { } static NSData *build_blob(const float *w, int rows, int cols) { - int ws=rows*cols*2, tot=128+ws; + size_t ws=(size_t)rows*cols*2, tot=128+ws; // size_t prevents int overflow (CRIT-04) uint8_t *b=(uint8_t*)calloc(tot,1); + if (!b) { fprintf(stderr, "build_blob: calloc(%zu) failed\n", tot); return nil; } b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1; - *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128; + *(uint32_t*)(b+72)=(uint32_t)ws;*(uint32_t*)(b+80)=128; _Float16 *fp16=(_Float16*)(b+128); - for(int i=0;i({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ - "{\"coremltools-version\", \"9.0\"}})]\n{\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" #define CONV_CONST \ - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \ - 
" tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" \ - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" \ - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" \ - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" \ + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" \ + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" \ + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" \ + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" // SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) static NSString *gen_sdpa_fwd_taps(void) { @@ -20,53 +18,53 @@ static NSString *gen_sdpa_fwd_taps(void) { float invd = 1.0f/(float)DIM; NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; - [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; - [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; - [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; - [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; - [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; - [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; - [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; - [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; - [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rw = 
const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM]; - [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=tensor(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=tensor(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" tensor kd = const()[name=tensor(\"kd\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor(\"ss\")];\n", SEQ]; + [m appendFormat:@" tensor invd = const()[name=tensor(\"invd\"), val=tensor(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=tensor(\"ss2\")];\n", SEQ]; + [m appendFormat:@" tensor eps = const()[name=tensor(\"eps\"), val=tensor(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=tensor(\"ss3\")];\n", SEQ]; + [m appendFormat:@" tensor nhalf = const()[name=tensor(\"nhalf\"), val=tensor(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=tensor(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=tensor(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=tensor(\"rw\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/rms1.bin\"), offset=tensor(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=tensor(\"xn\")];\n", DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor Wq = const()[name=string(\"Wq\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wk = const()[name=string(\"Wk\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wv = const()[name=string(\"Wv\"), 
val=tensor(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wo = const()[name=string(\"Wo\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor qsh = const()[name=string(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; - [m appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"]; - [m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor cm = 
const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; - [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; - [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor os = const()[name=string(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ]; + [m appendFormat:@" tensor Wq = const()[name=tensor(\"Wq\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wq.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wk = const()[name=tensor(\"Wk\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wk.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wv = const()[name=tensor(\"Wv\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wv.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wo = const()[name=tensor(\"Wo\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wo.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor qf = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=tensor(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=tensor(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=tensor(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor qsh = const()[name=tensor(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=tensor(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=tensor(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=tensor(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=tensor(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=tensor(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=tensor(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=tensor(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendString:@" tensor tx = const()[name=tensor(\"tx\"), val=tensor(false)];\n"]; + [m appendString:@" tensor ty = const()[name=tensor(\"ty\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=tensor(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor scv = const()[name=tensor(\"scv\"), val=tensor(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=tensor(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=tensor(\"cm\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/mask.bin\"), offset=tensor(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=tensor(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor sax = const()[name=tensor(\"sax\"), 
val=tensor(-1)];\n"]; + [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=tensor(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=tensor(\"mm2\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor at = transpose(perm=pm,x=a4)[name=tensor(\"ta\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor os = const()[name=tensor(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=tensor(\"ra\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=tensor(\"co\")];\n", DIM,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=tensor(\"cat\")];\n", 6*DIM,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -76,33 +74,33 @@ static NSString *gen_ffn_fwd_taps(void) { float invd = 1.0f/(float)DIM; NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; - [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; - [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; - [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; - [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; - [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; - [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; - [m appendFormat:@" fp16 nhalf = 
const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; - [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; - [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM]; - [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=tensor(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=tensor(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" tensor kd = const()[name=tensor(\"kd\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor(\"ss\")];\n", SEQ]; + [m appendFormat:@" tensor invd = const()[name=tensor(\"invd\"), val=tensor(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=tensor(\"ss2\")];\n", SEQ]; + [m appendFormat:@" tensor eps = const()[name=tensor(\"eps\"), val=tensor(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=tensor(\"ss3\")];\n", SEQ]; + [m appendFormat:@" tensor nhalf = const()[name=tensor(\"nhalf\"), val=tensor(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=tensor(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=tensor(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=tensor(\"rw\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/rms2.bin\"), offset=tensor(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=tensor(\"xn\")];\n", DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor W1 = const()[name=string(\"W1\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; - [m appendFormat:@" 
tensor W3 = const()[name=string(\"W3\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; - [m appendFormat:@" tensor W2 = const()[name=string(\"W2\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; - [m appendFormat:@" tensor h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; + [m appendFormat:@" tensor W1 = const()[name=tensor(\"W1\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w1.bin\"), offset=tensor(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W3 = const()[name=tensor(\"W3\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w3.bin\"), offset=tensor(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W2 = const()[name=tensor(\"W2\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w2.bin\"), offset=tensor(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; + [m appendFormat:@" tensor h1 = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=tensor(\"c1\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=tensor(\"c3\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=tensor(\"sg\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=tensor(\"si\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=tensor(\"gt\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=tensor(\"c2\")];\n", DIM,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=tensor(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -111,36 +109,36 @@ static NSString *gen_ffn_fwd_taps(void) { static NSString *gen_ffn_bwd(void) { NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM+2*HIDDEN, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", DIM+2*HIDDEN, SEQ]; [m appendString:@CONV_CONST]; - [m appendString:@" tensor bd = const()[name=string(\"bd\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor sd = const()[name=string(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendFormat:@" tensor dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor s1 = const()[name=string(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ]; - [m 
appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; - [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor W2t = const()[name=string(\"W2t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; - [m appendFormat:@" tensor dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ]; - [m appendString:@" fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"]; - [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor W1t = const()[name=string(\"W1t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; - [m appendFormat:@" tensor W3t = const()[name=string(\"W3t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; - [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor dx3 
= conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; + [m appendString:@" tensor bd = const()[name=tensor(\"bd\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor sd = const()[name=tensor(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor dffn = slice_by_size(x=x,begin=bd,size=sd)[name=tensor(\"s0\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor s1 = const()[name=tensor(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=tensor(\"s1x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor b3 = const()[name=tensor(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; + [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=tensor(\"s3x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W2t = const()[name=tensor(\"W2t\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w2t.bin\"), offset=tensor(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; + [m appendFormat:@" tensor dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=tensor(\"cw2\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=tensor(\"sg\")];\n", HIDDEN, SEQ]; + [m appendString:@" tensor one = const()[name=tensor(\"one\"), val=tensor(1.0)];\n"]; + [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=tensor(\"oms\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor homs = 
mul(x=h1,y=oms)[name=tensor(\"homs\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=tensor(\"brk\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=tensor(\"dsd\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=tensor(\"t1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=tensor(\"dh1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=tensor(\"slh\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=tensor(\"dh3\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W1t = const()[name=tensor(\"W1t\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w1t.bin\"), offset=tensor(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor W3t = const()[name=tensor(\"W3t\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w3t.bin\"), offset=tensor(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=tensor(\"cw1\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=tensor(\"cw3\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=tensor(\"adx\")];\n", DIM, SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=tensor(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -149,23 +147,23 @@ static NSString *gen_ffn_bwd(void) { static NSString *gen_qkvb(void) { NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", 3*DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", 3*DIM, 
SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; - [m appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor Wqt = const()[name=string(\"Wqt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wkt = const()[name=string(\"Wkt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wvt = const()[name=string(\"Wvt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor sz = const()[name=tensor(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" 
tensor b0 = const()[name=tensor(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=tensor(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=tensor(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=tensor(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=tensor(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wqt = const()[name=tensor(\"Wqt\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wqt.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wkt = const()[name=tensor(\"Wkt\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wkt.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wvt = const()[name=tensor(\"Wvt\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wvt.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=tensor(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=tensor(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=tensor(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=tensor(\"aqk\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=tensor(\"out\")];\n", DIM,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -175,49 +173,49 @@ static NSString *gen_sdpa_bwd1(void) { float sc = 1.0f/sqrtf((float)HD); NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", 4*DIM, SEQ]; + [m 
appendFormat:@" func main(tensor x) {\n", 4*DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; - [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; - [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor Wot = const()[name=string(\"Wot\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", 
HEADS,HD,SEQ]; - [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD]; - [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; - [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; - [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; - [m appendFormat:@" tensor probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor dvs = const()[name=string(\"dvs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor scs = const()[name=string(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ]; - [m 
appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; + [m appendFormat:@" tensor sz = const()[name=tensor(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=tensor(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=tensor(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=tensor(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=tensor(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=tensor(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=tensor(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; + [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=tensor(\"s3\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wot = const()[name=tensor(\"Wot\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wot.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=tensor(\"cwo\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=tensor(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=tensor(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=tensor(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = 
transpose(perm=pm,x=qr)[name=tensor(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=tensor(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=tensor(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=tensor(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=tensor(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=tensor(\"rd\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=tensor(\"td\")];\n", HEADS,SEQ,HD]; + [m appendString:@" tensor bF = const()[name=tensor(\"bF\"), val=tensor(false)];\n"]; + [m appendString:@" tensor bT = const()[name=tensor(\"bT\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=tensor(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor scv = const()[name=tensor(\"scv\"), val=tensor(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=tensor(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=tensor(\"cm\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/mask.bin\"), offset=tensor(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=tensor(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor sax = const()[name=tensor(\"sax\"), val=tensor(-1)];\n"]; + [m appendFormat:@" tensor probs = softmax(axis=sax,x=ms)[name=tensor(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=tensor(\"dv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=tensor(\"dp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dvt = transpose(perm=pm,x=dv4)[name=tensor(\"dvt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dvs = const()[name=tensor(\"dvs\"), 
val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=tensor(\"dvf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor scs = const()[name=tensor(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=tensor(\"pf\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=tensor(\"dpf\")];\n", SCORE_CH,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=tensor(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -228,46 +226,46 @@ static NSString *gen_sdpa_bwd2(void) { int bwd2_in = 2*SCORE_CH + 2*DIM; NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", bwd2_in, SEQ]; - [m appendFormat:@" tensor sz_sc = const()[name=string(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; - [m appendFormat:@" tensor dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor sz_d = const()[name=string(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; - [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH+DIM]; - [m 
appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor ssh = const()[name=string(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" tensor rax = const()[name=string(\"rax\"), val=tensor([-1])];\n"]; - [m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ]; - [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; - [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor dq4 = 
matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor fs = const()[name=string(\"fs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ]; + [m appendFormat:@" func main(tensor x) {\n", bwd2_in, SEQ]; + [m appendFormat:@" tensor sz_sc = const()[name=tensor(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; + [m appendString:@" tensor b0 = const()[name=tensor(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=tensor(\"s0\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; + [m appendFormat:@" tensor dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=tensor(\"s1\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor sz_d = const()[name=tensor(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor b2 = const()[name=tensor(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=tensor(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=tensor(\"b3\"), val=tensor([0,%d,0,0])];\n", 
2*SCORE_CH+DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=tensor(\"s3\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor ssh = const()[name=tensor(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=tensor(\"rp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=tensor(\"rdp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=tensor(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=tensor(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=tensor(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=tensor(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=tensor(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=tensor(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=tensor(\"pdp\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor rax = const()[name=tensor(\"rax\"), val=tensor([-1])];\n"]; + [m appendString:@" tensor kd = const()[name=tensor(\"kd\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=tensor(\"rs\")];\n", HEADS,SEQ]; + [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=tensor(\"dps\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=tensor(\"ds0\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor scv = const()[name=tensor(\"scv\"), val=tensor(%f)];\n", sc]; + [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=tensor(\"ds\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor bF = const()[name=tensor(\"bF\"), val=tensor(false)];\n"]; + [m appendString:@" tensor bT = const()[name=tensor(\"bT\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor dq4 = 
matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=tensor(\"dq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=tensor(\"dk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=tensor(\"dqt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=tensor(\"dkt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor fs = const()[name=tensor(\"fs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=tensor(\"dqf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=tensor(\"dkf\")];\n", DIM,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=tensor(\"cat\")];\n", 2*DIM,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } diff --git a/training/test_ane_advanced.m b/training/test_ane_advanced.m index 07e9038..06c18e3 100644 --- a/training/test_ane_advanced.m +++ b/training/test_ane_advanced.m @@ -50,6 +50,8 @@ static IOSurfaceRef make_surface(size_t bytes) { (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + int main() { @autoreleasepool { setbuf(stdout, NULL); @@ -106,28 +108,43 @@ int main() { memcpy(blob+128, w, ws); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), 
val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + NSFileManager *fm = [NSFileManager defaultManager]; + + retry_compile:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor 
x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + } NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), @@ -135,23 +152,33 @@ int main() { id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + BOOL compiled = ((BOOL(*)(id,SEL,unsigned 
int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!compiled && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + goto retry_compile; + } ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - int ioBytes = CH * SP * 4; + int ioBytes = CH * SP * (g_fp16_io ? 2 : 4); IOSurfaceRef ioIn = make_surface(ioBytes); IOSurfaceRef ioOut = make_surface(ioBytes); IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)((float)(s+1) * 0.1f); + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f; + } IOSurfaceUnlock(ioIn, 0, NULL); // Baseline eval @@ -165,9 +192,16 @@ int main() { printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? 
"OK" : "FAIL"); IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); - float baseline_0 = out0[0], baseline_1 = out0[1]; - printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); + float baseline_0, baseline_1; + if (g_fp16_io) { + _Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + baseline_0 = (float)out0[0]; baseline_1 = (float)out0[1]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]); + } else { + float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); + baseline_0 = out0[0]; baseline_1 = out0[1]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); + } IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); // Test weightsBuffer: IOSurface with 3x identity weights @@ -194,10 +228,18 @@ int main() { printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL"); if (ok) { IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); - bool changed = fabsf(outW[0] - baseline_0) > 0.001f; - bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f; + float outW_0; + if (g_fp16_io) { + _Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + outW_0 = (float)outW[0]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]); + } else { + float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); + outW_0 = outW[0]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); + } + bool changed = fabsf(outW_0 - baseline_0) > 0.001f; + bool is_3x = fabsf(outW_0 - baseline_0 * 3.0f) < 0.1f; printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged"); if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" 
: "but not 3x as expected"); printf("\n"); diff --git a/training/test_ane_causal_attn.m b/training/test_ane_causal_attn.m index cb9b761..d279f96 100644 --- a/training/test_ane_causal_attn.m +++ b/training/test_ane_causal_attn.m @@ -81,13 +81,11 @@ int main() { // === Approach 1: Non-causal SDPA (baseline) === printf("=== Non-causal SDPA (baseline) ===\n"); NSString *sdpa_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; Kern kSDPA = compile_mil(sdpa_mil); @@ -100,13 +98,11 @@ int main() { // scores = Q @ K^T → [1, HEADS, SEQ, SEQ] printf("\n=== Decomposed causal attention ===\n"); NSString *qkt_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k) {\n" " tensor scores = matmul(" - "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n" + "x = q, y = k, transpose_y = true)[name = tensor(\"qkt\")];\n" " } -> (scores);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ]; Kern kQKT = compile_mil(qkt_mil); @@ -114,13 +110,11 @@ int main() { // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, 
HD] NSString *sv_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor s, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor s, " "tensor v) {\n" " tensor out = matmul(" - "x = s, y = v)[name = string(\"sv\")];\n" + "x = s, y = v)[name = tensor(\"sv\")];\n" " } -> (out);\n}\n", HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD]; Kern kSV = compile_mil(sv_mil); diff --git a/training/test_ane_sdpa5.m b/training/test_ane_sdpa5.m index 0ddce84..b348fa4 100644 --- a/training/test_ane_sdpa5.m +++ b/training/test_ane_sdpa5.m @@ -187,13 +187,11 @@ int main() { printf("Test 1: no mask\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; Model m = compile_model(mil, nil); @@ -209,14 +207,12 @@ int main() { { NSString *maskStr = build_inline_causal_mask(SEQ); NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo 
= dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" - " %@ mask = const()[name = string(\"mask\"), val = %@];\n" + " %@ mask = const()[name = tensor(\"mask\"), val = %@];\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v, attn_mask = mask)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, [NSString stringWithFormat:@"tensor", SEQ, SEQ], maskStr, @@ -233,15 +229,13 @@ int main() { printf("\nTest 3: BLOBFILE causal mask\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" - " tensor mask = const()[name = string(\"mask\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" + " tensor mask = const()[name = tensor(\"mask\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/mask.bin\"), offset = tensor(64)))];\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v, attn_mask = mask)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD]; @@ -258,14 +252,12 @@ int main() { printf("\nTest 4: mask as runtime input\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, 
" - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v, " "tensor mask) {\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v, attn_mask = mask)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, SEQ, SEQ, HEADS, SEQ, HD]; diff --git a/training/test_chaining.m b/training/test_chaining.m new file mode 100644 index 0000000..0b2b3cc --- /dev/null +++ b/training/test_chaining.m @@ -0,0 +1,367 @@ +// test_chaining.m -- Prototype _ANEChainingRequest for multi-kernel pipelining +// Goal: chain two conv kernels so the ANE runs them back-to-back without CPU roundtrip +#import +#import +#import +#import +#import +#import +#include + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +typedef struct { id model; IOSurfaceRef ioIn, ioOut; NSString *tmpDir; } CompiledKernel; + +static NSString *gen_conv_mil(int ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), 
val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; + } + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; +} + +static CompiledKernel compile_kernel(Class gD, Class gI, int ch, int sp, NSData *wdata) { + CompiledKernel k = {0}; + NSFileManager *fm = [NSFileManager defaultManager]; + + NSString *mil = gen_conv_mil(ch, sp); + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(gD, + @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id 
mdl = ((id(*)(Class,SEL,id))objc_msgSend)(gI, @selector(inMemoryModelWithDescriptor:), desc); + + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok) { + if (!g_fp16_io) { + printf(" fp32 compile failed, retrying with fp16 I/O\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + return compile_kernel(gD, gI, ch, sp, wdata); + } + printf(" Compile failed: %s\n", [[e description] UTF8String]); + return k; + } + + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + int bpe = g_fp16_io ? 
2 : 4; + k.model = mdl; + k.ioIn = make_surface(ch * sp * bpe); + k.ioOut = make_surface(ch * sp * bpe); + k.tmpDir = td; + return k; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + printf("=== ANE ChainingRequest Prototype ===\n\n"); + + Class gD = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class gI = NSClassFromString(@"_ANEInMemoryModel"); + Class gAR = NSClassFromString(@"_ANERequest"); + Class gAIO = NSClassFromString(@"_ANEIOSurfaceObject"); + Class gClient = NSClassFromString(@"_ANEClient"); + Class gChain = NSClassFromString(@"_ANEChainingRequest"); + + if (!gD || !gI || !gAR || !gAIO) { + printf("ERROR: ANE private classes not found\n"); + return 1; + } + if (!gClient) { + printf("ERROR: _ANEClient not found\n"); + return 1; + } + if (!gChain) { + printf("ERROR: _ANEChainingRequest not found\n"); + return 1; + } + + printf("All required classes found.\n"); + + int CH = 64, SP = 32; + + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)0.5f; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, w, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + free(w); + + // -- Phase 1: Compile two kernels -- + printf("\n--- Phase 1: Compile two identical conv kernels ---\n"); + CompiledKernel k1 = compile_kernel(gD, gI, CH, SP, wdata); + CompiledKernel k2 = compile_kernel(gD, gI, CH, SP, wdata); + + if (!k1.model || !k2.model) { + printf("ERROR: Failed to compile kernels\n"); + return 1; + } + printf(" Kernel 1: compiled and loaded\n"); + printf(" Kernel 2: compiled and loaded\n"); + + int bpe = g_fp16_io ? 
2 : 4; + int ioBytes = CH * SP * bpe; + + IOSurfaceLock(k1.ioIn, 0, NULL); + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(k1.ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(k1.ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + } + IOSurfaceUnlock(k1.ioIn, 0, NULL); + + // -- Phase 2: Baseline -- two sequential evals -- + printf("\n--- Phase 2: Baseline (sequential eval) ---\n"); + + id wI1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id wO1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id wI2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k2.ioIn); + id wO2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k2.ioOut); + + id req1 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI1], @[@0], @[wO1], @[@0], nil, nil, @0); + id req2 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI2], @[@0], @[wO2], @[@0], nil, nil, @0); + + NSError *e = nil; + + int WARMUP = 5, ITERS = 50; + for (int i = 0; i < WARMUP; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + IOSurfaceLock(k1.ioOut, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k2.ioIn), IOSurfaceGetBaseAddress(k1.ioOut), ioBytes); + IOSurfaceUnlock(k1.ioOut, 0, NULL); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k2.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req2, &e); + } + + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) { + 
((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + IOSurfaceLock(k1.ioOut, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k2.ioIn), IOSurfaceGetBaseAddress(k1.ioOut), ioBytes); + IOSurfaceUnlock(k1.ioOut, 0, NULL); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k2.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req2, &e); + } + double seq_ms = tb_ms(mach_absolute_time() - t0); + printf(" Sequential: %.3f ms total (%.3f ms/pair)\n", seq_ms, seq_ms / ITERS); + + IOSurfaceLock(k2.ioOut, 0, NULL); + if (g_fp16_io) { + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(k2.ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", + (float)out[0], (float)out[1], (float)out[2], (float)out[3]); + } else { + float *out = (float*)IOSurfaceGetBaseAddress(k2.ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out[0], out[1], out[2], out[3]); + } + IOSurfaceUnlock(k2.ioOut, 0, NULL); + + // -- Phase 3: Try ChainingRequest -- + printf("\n--- Phase 3: _ANEChainingRequest exploration ---\n"); + + id client = [gClient performSelector:@selector(sharedConnection)]; + if (!client) { + printf(" WARNING: _ANEClient sharedConnection returned nil\n"); + } + printf(" _ANEClient: %s\n", client ? 
"obtained" : "FAILED"); + + IOSurfaceRef ioMid = make_surface(ioBytes); + (void)((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), ioMid); + + @try { + id chainReq = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], + @[@[wO1]], + @[@0], + @[@0], + @0, + @[], + @0, + @0, + @0); + + if (chainReq) { + printf(" ChainingRequest created: %s\n", [[chainReq description] UTF8String]); + + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainReq, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + printf(" inputBuffer: %s\n", + [[[chainReq valueForKey:@"inputBuffer"] description] UTF8String]); + printf(" outputSets: %s\n", + [[[chainReq valueForKey:@"outputSets"] description] UTF8String]); + printf(" loopbackInputSymbolIndex: %s\n", + [[[chainReq valueForKey:@"loopbackInputSymbolIndex"] description] UTF8String]); + printf(" loopbackOutputSymbolIndex: %s\n", + [[[chainReq valueForKey:@"loopbackOutputSymbolIndex"] description] UTF8String]); + printf(" procedureIndex: %s\n", + [[[chainReq valueForKey:@"procedureIndex"] description] UTF8String]); + + @try { + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainReq, 21, &e); + printf(" prepareChainingWithModel: %s\n", ok ? 
"YES" : "NO"); + if (!ok && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" prepareChainingWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" ChainingRequest: nil (creation failed)\n"); + } + } @catch (NSException *ex) { + printf(" ChainingRequest creation EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // -- Phase 4: Try with loopback (output feeds back as input) -- + printf("\n--- Phase 4: ChainingRequest with loopback ---\n"); + @try { + id chainLoop = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], + @[@[wO1], @[wO2]], + @[@0], + @[@0], + @0, + @[], + @0, + @0, + @0); + + if (chainLoop) { + printf(" Loopback ChainingRequest: %s\n", [[chainLoop description] UTF8String]); + + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainLoop, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + @try { + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainLoop, 21, &e); + printf(" prepareChainingWithModel (loopback): %s\n", ok ? "YES" : "NO"); + if (!ok && e) printf(" error: %s\n", [[e description] UTF8String]); + + if (ok) { + @try { + BOOL enqOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(enqueueSetsWithModel:outputSet:options:qos:error:), + k1.model, @[wO1], @{}, 21, &e); + printf(" enqueueSets: %s\n", enqOk ? 
"YES" : "NO"); + if (!enqOk && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" enqueueSets EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + @try { + BOOL bufOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(buffersReadyWithModel:inputBuffers:options:qos:error:), + k1.model, @[wI1], @{}, 21, &e); + printf(" buffersReady: %s\n", bufOk ? "YES" : "NO"); + if (!bufOk && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" buffersReady EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } @catch (NSException *ex) { + printf(" Loopback test EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" Loopback ChainingRequest: nil\n"); + } + } @catch (NSException *ex) { + printf(" Loopback creation EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // -- Cleanup -- + NSFileManager *fm = [NSFileManager defaultManager]; + [fm removeItemAtPath:k1.tmpDir error:nil]; + [fm removeItemAtPath:k2.tmpDir error:nil]; + if (k1.ioIn) CFRelease(k1.ioIn); + if (k1.ioOut) CFRelease(k1.ioOut); + if (k2.ioIn) CFRelease(k2.ioIn); + if (k2.ioOut) CFRelease(k2.ioOut); + if (ioMid) CFRelease(ioMid); + + // -- Summary -- + printf("\n--- Summary ---\n"); + printf("Sequential baseline: %.3f ms/pair (two conv evals + memcpy)\n", seq_ms / ITERS); + printf("ChainingRequest creation: SUCCESS\n"); + printf("ChainingRequest validate: FAILS -- _ANEIOSurfaceObject needs symbolIndex\n"); + printf(" The ANE chaining API expects IOSurface objects with symbolIndex metadata.\n"); + printf(" This may require using _ANEBuffer or _ANEProgramIOSurfacesMapper\n"); + printf(" to map compiled model I/O symbols to IOSurface objects.\n"); + printf(" Next steps: explore _ANEModel.inputSymbolNames / outputSymbolNames\n"); + printf(" and _ANEProgramIOSurfacesMapper to create properly indexed buffers.\n"); + + printf("\n=== ChainingRequest prototype complete ===\n"); 
+ } + return 0; +} diff --git a/training/test_conv_attn3.m b/training/test_conv_attn3.m index a396b4d..301280a 100644 --- a/training/test_conv_attn3.m +++ b/training/test_conv_attn3.m @@ -82,19 +82,17 @@ static void cleanup_kern(Kern *k) { static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) { return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(%d)];\n" " tensor y = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n" + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" " } -> (y);\n}\n", ic, sp, oc, icg, 
oc, icg, groups, oc, sp]; } diff --git a/training/test_full_fused.m b/training/test_full_fused.m index 8449ddb..e112d48 100644 --- a/training/test_full_fused.m +++ b/training/test_full_fused.m @@ -130,64 +130,62 @@ int main() { float scale_val = 1.0f / sqrtf((float)HD); NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" // Conv boilerplate - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr1 = const()[name = tensor(\"g1\"), val = tensor(1)];\n" // QKV weights - " tensor Wq = const()[name = string(\"Wq\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" - " tensor Wout = const()[name = string(\"Wo\"), " - "val = tensor(BLOBFILE(path = 
string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wq.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wk.bin\"), offset = tensor(64)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wv.bin\"), offset = tensor(64)))];\n" + " tensor Wout = const()[name = tensor(\"Wo\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wo.bin\"), offset = tensor(64)))];\n" // QKV projections " tensor q_flat = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n" + "pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor(\"cq\")];\n" " tensor k_flat = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n" + "pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor(\"ck\")];\n" " tensor v_flat = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n" + "pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor(\"cv\")];\n" // Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD] - " tensor qsh = const()[name = string(\"qsh\"), val = tensor([1, %d, %d, %d])];\n" - " tensor q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n" - " tensor perm = const()[name = string(\"pm\"), val = tensor([0, 1, 3, 2])];\n" - " tensor q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n" - " tensor k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n" - " tensor k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n" - " tensor v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n" - " tensor v = 
transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n" + " tensor qsh = const()[name = tensor(\"qsh\"), val = tensor([1, %d, %d, %d])];\n" + " tensor q_4d = reshape(shape = qsh, x = q_flat)[name = tensor(\"rq\")];\n" + " tensor perm = const()[name = tensor(\"pm\"), val = tensor([0, 1, 3, 2])];\n" + " tensor q = transpose(perm = perm, x = q_4d)[name = tensor(\"tq\")];\n" + " tensor k_4d = reshape(shape = qsh, x = k_flat)[name = tensor(\"rk\")];\n" + " tensor k = transpose(perm = perm, x = k_4d)[name = tensor(\"tk\")];\n" + " tensor v_4d = reshape(shape = qsh, x = v_flat)[name = tensor(\"rv\")];\n" + " tensor v = transpose(perm = perm, x = v_4d)[name = tensor(\"tv\")];\n" // Q @ K^T - " bool ty = const()[name = string(\"ty\"), val = bool(true)];\n" - " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" - " tensor scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n" + " tensor ty = const()[name = tensor(\"ty\"), val = tensor(true)];\n" + " tensor tx = const()[name = tensor(\"tx\"), val = tensor(false)];\n" + " tensor scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = tensor(\"mm1\")];\n" // Scale - " fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n" - " tensor scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n" + " tensor sc = const()[name = tensor(\"sc\"), val = fp16(%f)];\n" + " tensor scaled = mul(x = scores, y = sc)[name = tensor(\"scl\")];\n" // Causal mask - " tensor cmask = const()[name = string(\"cm\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" - " tensor masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n" + " tensor cmask = const()[name = tensor(\"cm\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/mask.bin\"), offset = tensor(64)))];\n" + " tensor masked = add(x = scaled, y = cmask)[name = tensor(\"msk\")];\n" // Softmax - " int32 sax = const()[name = string(\"sax\"), 
val = int32(-1)];\n" - " tensor attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n" + " tensor sax = const()[name = tensor(\"sax\"), val = tensor(-1)];\n" + " tensor attn_w = softmax(axis = sax, x = masked)[name = tensor(\"sm\")];\n" // scores @ V - " tensor attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n" + " tensor attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = tensor(\"mm2\")];\n" // Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ] - " tensor attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n" - " tensor osh = const()[name = string(\"osh\"), val = tensor([1, %d, 1, %d])];\n" - " tensor attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n" + " tensor attn_t = transpose(perm = perm, x = attn_4d)[name = tensor(\"ta\")];\n" + " tensor osh = const()[name = tensor(\"osh\"), val = tensor([1, %d, 1, %d])];\n" + " tensor attn_flat = reshape(shape = osh, x = attn_t)[name = tensor(\"ra\")];\n" // Wo projection " tensor out = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = tensor(\"co\")];\n" " } -> (out);\n}\n", DIM, SEQ, // input DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk @@ -317,30 +315,28 @@ int main() { printf("\n=== Test 2: Fused FFN benchmark ===\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), 
val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - " tensor W1 = const()[name = string(\"W1\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n" - " tensor W3 = const()[name = string(\"W3\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n" - " tensor W2 = const()[name = string(\"W2\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor W1 = const()[name = tensor(\"W1\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w1.bin\"), offset = tensor(64)))];\n" + " tensor W3 = const()[name = tensor(\"W3\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w3.bin\"), offset = tensor(64)))];\n" + " tensor W2 = const()[name = tensor(\"W2\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w2.bin\"), offset = tensor(64)))];\n" " tensor h1 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n" + "pad_type = pt, strides = st, weight = W1, x = x)[name = tensor(\"c1\")];\n" " tensor h3 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n" - " tensor sig = sigmoid(x = h1)[name = string(\"sg\")];\n" - " tensor silu = mul(x = h1, y = 
sig)[name = string(\"si\")];\n" - " tensor gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n" + "pad_type = pt, strides = st, weight = W3, x = x)[name = tensor(\"c3\")];\n" + " tensor sig = sigmoid(x = h1)[name = tensor(\"sg\")];\n" + " tensor silu = mul(x = h1, y = sig)[name = tensor(\"si\")];\n" + " tensor gate = mul(x = silu, y = h3)[name = tensor(\"gt\")];\n" " tensor out = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n" + "pad_type = pt, strides = st, weight = W2, x = gate)[name = tensor(\"c2\")];\n" " } -> (out);\n}\n", DIM, SEQ, HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN, diff --git a/training/test_fused_bwd.m b/training/test_fused_bwd.m index b91d7b6..831f784 100644 --- a/training/test_fused_bwd.m +++ b/training/test_fused_bwd.m @@ -15,6 +15,8 @@ #define HIDDEN 2048 #define SEQ 64 +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static Class g_D, g_I, g_AR, g_AIO; static void ane_init(void) { dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); @@ -58,47 +60,77 @@ int main() { // MIL: slice input → 2 convs → add printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n"); - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" // [1, HIDDEN*2, 1, SEQ] - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :] - " tensor b1 = const()[name = string(\"b1\"), val = tensor([0, 0, 0, 0])];\n" - " tensor s1 = const()[name = string(\"s1\"), val = tensor([1, %d, 1, %d])];\n" - " tensor 
dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n" - " tensor b3 = const()[name = string(\"b3\"), val = tensor([0, %d, 0, 0])];\n" - " tensor s3 = const()[name = string(\"s3\"), val = tensor([1, %d, 1, %d])];\n" - " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n" - // Conv: W1^T @ dh1, W3^T @ dh3 - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - // W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM]) - " tensor W1t = const()[name = string(\"W1t\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n" - " tensor W3t = const()[name = string(\"W3t\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n" - " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n" - " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n" - // Add - " tensor sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n" - " } -> (y);\n}\n", - HIDDEN*2, SEQ, HIDDEN*2, SEQ, - HIDDEN, SEQ, HIDDEN, SEQ, // slice1 - HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3 - DIM, HIDDEN, DIM, HIDDEN, // W1t - DIM, HIDDEN, DIM, HIDDEN, // W3t - DIM, SEQ, DIM, SEQ, // dx1, dx3 - DIM, SEQ, DIM, SEQ]; // sum, y + retry_compile:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + 
@"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor b1 = const()[name = tensor(\"b1\"), val = tensor([0, 0, 0, 0])];\n" + " tensor s1 = const()[name = tensor(\"s1\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh1 = slice_by_size(x = x, begin = b1, size = s1)[name = tensor(\"sl1\")];\n" + " tensor b3 = const()[name = tensor(\"b3\"), val = tensor([0, %d, 0, 0])];\n" + " tensor s3 = const()[name = tensor(\"s3\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh3 = slice_by_size(x = x, begin = b3, size = s3)[name = tensor(\"sl3\")];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor W1t = const()[name = tensor(\"W1t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w1t.bin\"), offset = tensor(64)))];\n" + " tensor W3t = const()[name = tensor(\"W3t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w3t.bin\"), offset = tensor(64)))];\n" + " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor(\"cv1\")];\n" + " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor(\"cv3\")];\n" + " tensor y = add(x = dx1, y = dx3)[name = tensor(\"ad\")];\n" + " } -> (y);\n}\n", + HIDDEN*2, SEQ, + HIDDEN, SEQ, HIDDEN, SEQ, + HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, + DIM, HIDDEN, DIM, HIDDEN, + DIM, HIDDEN, DIM, HIDDEN, + DIM, SEQ, DIM, SEQ, + DIM, SEQ]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) 
{\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor b1 = const()[name = tensor(\"b1\"), val = tensor([0, 0, 0, 0])];\n" + " tensor s1 = const()[name = tensor(\"s1\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = tensor(\"sl1\")];\n" + " tensor b3 = const()[name = tensor(\"b3\"), val = tensor([0, %d, 0, 0])];\n" + " tensor s3 = const()[name = tensor(\"s3\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = tensor(\"sl3\")];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor W1t = const()[name = tensor(\"W1t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w1t.bin\"), offset = tensor(64)))];\n" + " tensor W3t = const()[name = tensor(\"W3t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w3t.bin\"), offset = tensor(64)))];\n" + " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor(\"cv1\")];\n" + " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor(\"cv3\")];\n" + " tensor sum = add(x = dx1, y = dx3)[name = tensor(\"ad\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = sum)[name = tensor(\"co\")];\n" + " } -> (y);\n}\n", + HIDDEN*2, SEQ, HIDDEN*2, SEQ, + HIDDEN, SEQ, HIDDEN, SEQ, + HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, + DIM, HIDDEN, DIM, HIDDEN, + DIM, HIDDEN, DIM, HIDDEN, + 
DIM, SEQ, DIM, SEQ, + DIM, SEQ, DIM, SEQ]; + } NSDictionary *wd = @{ @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)}, @@ -119,6 +151,12 @@ int main() { NSError *e = nil; BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + goto retry_compile; + } printf("Compile: %s\n", ok?"OK":"FAIL"); if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; } ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); @@ -130,13 +168,21 @@ int main() { float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float)); for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); } - IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4); + size_t bpe = g_fp16_io ? 
2 : 4; + IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*bpe), ioO = make_surface(DIM*SEQ*bpe); IOSurfaceLock(ioI, 0, NULL); - float *dst = (float*)IOSurfaceGetBaseAddress(ioI); - // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3 - for (int t = 0; t < SEQ; t++) { - for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c]; - for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c]; + if (g_fp16_io) { + _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioI); + for (int t = 0; t < SEQ; t++) { + for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = (_Float16)dh1[t*HIDDEN+c]; + for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = (_Float16)dh3[t*HIDDEN+c]; + } + } else { + float *dst = (float*)IOSurfaceGetBaseAddress(ioI); + for (int t = 0; t < SEQ; t++) { + for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c]; + for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c]; + } } IOSurfaceUnlock(ioI, 0, NULL); @@ -164,13 +210,22 @@ int main() { } IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL); - float *src = (float*)IOSurfaceGetBaseAddress(ioO); float maxd = 0; - for (int t = 0; t < SEQ; t++) - for (int c = 0; c < DIM; c++) { - float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]); - if (d > maxd) maxd = d; - } + if (g_fp16_io) { + _Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(ioO); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) { + float d = fabsf((float)src[c*SEQ+t] - ref[t*DIM+c]); + if (d > maxd) maxd = d; + } + } else { + float *src = (float*)IOSurfaceGetBaseAddress(ioO); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) { + float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]); + if (d > maxd) maxd = d; + } + } IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL); printf("dx max diff: %.6f\n", maxd); diff --git a/training/test_fused_qkv.m b/training/test_fused_qkv.m index 69f41d6..f5758c0 100644 --- a/training/test_fused_qkv.m +++ b/training/test_fused_qkv.m @@ -12,6 +12,8 @@ 
#define DIM 768 #define SEQ 64 +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static Class g_D, g_I, g_AR, g_AIO; static mach_timebase_info_data_t g_tb; static void ane_init(void) { @@ -56,7 +58,10 @@ static Kern compile_mil(NSString *mil, NSDictionary *wd) { } NSError *e = nil; if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { - printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k; + printf("compile %s: %s\n", g_fp16_io ? "FAIL" : "failed (will retry)", + e ? [[e localizedDescription] UTF8String] : ""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + return k; } ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); k.model = mdl; k.td = td; @@ -85,67 +90,108 @@ static void cleanup_kern(Kern *k) { // Fused QKV: 3 convs + concat in one MIL static NSString *gen_fused_qkv_mil(void) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wq.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wk.bin\"), offset = tensor(64)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wv.bin\"), offset = 
tensor(64)))];\n" + " tensor q = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor(\"cq\")];\n" + " tensor k = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor(\"ck\")];\n" + " tensor v = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor(\"cv\")];\n" + " tensor ax = const()[name = tensor(\"ax\"), val = tensor(1)];\n" + " tensor inter = const()[name = tensor(\"il\"), val = tensor(false)];\n" + " tensor y = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor(\"cat\")];\n" + " } -> (y);\n}\n", + DIM, SEQ, + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, SEQ, DIM, SEQ, DIM, SEQ, + DIM*3, SEQ]; + } return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - " tensor Wq = const()[name = string(\"Wq\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path 
= string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wq.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wk.bin\"), offset = tensor(64)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wv.bin\"), offset = tensor(64)))];\n" " tensor q = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n" + "pad_type = pt, strides = st, weight = Wq, x = x16)[name = tensor(\"cq\")];\n" " tensor k = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n" + "pad_type = pt, strides = st, weight = Wk, x = x16)[name = tensor(\"ck\")];\n" " tensor v = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n" - " int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n" - " bool inter = const()[name = string(\"il\"), val = bool(false)];\n" - " tensor qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n" - " string d2 = const()[name = 
string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = Wv, x = x16)[name = tensor(\"cv\")];\n" + " tensor ax = const()[name = tensor(\"ax\"), val = tensor(1)];\n" + " tensor inter = const()[name = tensor(\"il\"), val = tensor(false)];\n" + " tensor qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor(\"cat\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = qkv)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", DIM, SEQ, DIM, SEQ, - DIM, DIM, DIM, DIM, // Wq - DIM, DIM, DIM, DIM, // Wk - DIM, DIM, DIM, DIM, // Wv - DIM, SEQ, // q - DIM, SEQ, // k - DIM, SEQ, // v - DIM*3, SEQ, // concat - DIM*3, SEQ]; // output + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, SEQ, DIM, SEQ, DIM, SEQ, + DIM*3, SEQ, DIM*3, SEQ]; } // Single conv MIL for comparison static NSString *gen_single_mil(void) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" + " } -> (y);\n}\n", + DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ]; + } return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = 
dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = W, x = x16)[name = 
tensor(\"cv\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ]; } @@ -170,12 +216,18 @@ int main() { for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1); // === Compile fused QKV === + retry_compile:; NSDictionary *fused_wd = @{ @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)}, @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)}, @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)}, }; Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd); + if (!kFused.model && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + goto retry_compile; + } printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL"); // === Compile 3 separate === @@ -187,16 +239,24 @@ int main() { if (!kFused.model || !kQ.model) goto done; // IOSurfaces - size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4; + size_t bpe = g_fp16_io ? 
2 : 4; + size_t in_bytes = DIM*SEQ*bpe, out1_bytes = DIM*SEQ*bpe, out3_bytes = DIM*3*SEQ*bpe; IOSurfaceRef ioIn = make_surface(in_bytes); IOSurfaceRef ioFused = make_surface(out3_bytes); IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes); IOSurfaceLock(ioIn, 0, NULL); - float *dst = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int t = 0; t < SEQ; t++) - for (int c = 0; c < DIM; c++) - dst[c*SEQ+t] = x[t*DIM+c]; + if (g_fp16_io) { + _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + dst[c*SEQ+t] = (_Float16)x[t*DIM+c]; + } else { + float *dst = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + dst[c*SEQ+t] = x[t*DIM+c]; + } IOSurfaceUnlock(ioIn, 0, NULL); // Eval fused @@ -212,17 +272,30 @@ int main() { IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL); IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL); IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL); - float *fo = (float*)IOSurfaceGetBaseAddress(ioFused); - float *qo = (float*)IOSurfaceGetBaseAddress(ioQ); - float *ko = (float*)IOSurfaceGetBaseAddress(ioK); - float *vo = (float*)IOSurfaceGetBaseAddress(ioV); float dq=0, dk=0, dv=0; - for (int c = 0; c < DIM; c++) - for (int t = 0; t < SEQ; t++) { - float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1; - float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2; - float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3; - } + if (g_fp16_io) { + _Float16 *fo = (_Float16*)IOSurfaceGetBaseAddress(ioFused); + _Float16 *qo = (_Float16*)IOSurfaceGetBaseAddress(ioQ); + _Float16 *ko = (_Float16*)IOSurfaceGetBaseAddress(ioK); + _Float16 *vo = (_Float16*)IOSurfaceGetBaseAddress(ioV); + for (int c = 0; c < DIM; c++) + for (int t = 0; t < SEQ; t++) { + float d1 = fabsf((float)fo[c*SEQ+t] - (float)qo[c*SEQ+t]); if(d1>dq) dq=d1; + float d2 = 
fabsf((float)fo[(DIM+c)*SEQ+t] - (float)ko[c*SEQ+t]); if(d2>dk) dk=d2; + float d3 = fabsf((float)fo[(DIM*2+c)*SEQ+t] - (float)vo[c*SEQ+t]); if(d3>dv) dv=d3; + } + } else { + float *fo = (float*)IOSurfaceGetBaseAddress(ioFused); + float *qo = (float*)IOSurfaceGetBaseAddress(ioQ); + float *ko = (float*)IOSurfaceGetBaseAddress(ioK); + float *vo = (float*)IOSurfaceGetBaseAddress(ioV); + for (int c = 0; c < DIM; c++) + for (int t = 0; t < SEQ; t++) { + float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1; + float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2; + float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3; + } + } IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL); IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL); IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL); diff --git a/training/test_perf_stats.m b/training/test_perf_stats.m index cf7b073..b1f903a 100644 --- a/training/test_perf_stats.m +++ b/training/test_perf_stats.m @@ -10,6 +10,8 @@ static mach_timebase_info_data_t g_tb; static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static void dump_class(const char *name) { Class cls = NSClassFromString([NSString stringWithUTF8String:name]); if (!cls) { printf(" %s: NOT FOUND\n", name); return; } @@ -118,28 +120,43 @@ int main() { NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; free(w); - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), 
val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + retry_compile:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = 
const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + } NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), @@ -153,10 +170,15 @@ int main() { [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!compiled && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + goto retry_compile; + } ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - int ioBytes = CH * SP * 4; // fp32 + int ioBytes = CH * SP * (g_fp16_io ? 
2 : 4); IOSurfaceRef ioIn = make_surface(ioBytes); IOSurfaceRef ioOut = make_surface(ioBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -174,8 +196,13 @@ int main() { if (req) { IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + } IOSurfaceUnlock(ioIn, 0, NULL); BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( diff --git a/training/test_qos_sweep.m b/training/test_qos_sweep.m index 2802c6b..9afe1c3 100644 --- a/training/test_qos_sweep.m +++ b/training/test_qos_sweep.m @@ -10,6 +10,8 @@ static mach_timebase_info_data_t g_tb; static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static IOSurfaceRef make_surface(size_t bytes) { return IOSurfaceCreate((__bridge CFDictionaryRef)@{ (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, @@ -38,37 +40,49 @@ int main() { for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = 
const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; - - NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}; - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; NSFileManager *fm = [NSFileManager defaultManager]; printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n"); printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6); printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status"); + retry_mil:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + 
"[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + } + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63}; int n_qos = sizeof(qos_values)/sizeof(qos_values[0]); @@ -98,6 +112,12 @@ int main() { double cms = tb_ms(mach_absolute_time() - t0); if (!cok) { + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + goto retry_mil; + } printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-"); [fm removeItemAtPath:td error:nil]; continue; @@ -115,7 +135,7 @@ int main() { continue; } - int ioBytes = CH * SP * 4; + int ioBytes = CH * SP * (g_fp16_io ? 
2 : 4); IOSurfaceRef ioIn = make_surface(ioBytes); IOSurfaceRef ioOut = make_surface(ioBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -125,8 +145,13 @@ int main() { @[wI], @[@0], @[wO], @[@0], nil, nil, @0); IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)0.5f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; + } IOSurfaceUnlock(ioIn, 0, NULL); t0 = mach_absolute_time(); diff --git a/training/test_weight_reload.m b/training/test_weight_reload.m index a248005..b3161bd 100644 --- a/training/test_weight_reload.m +++ b/training/test_weight_reload.m @@ -34,30 +34,42 @@ static IOSurfaceRef make_surface(size_t bytes) { return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES]; } -// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + +// Generate MIL for a simple conv (fp16 I/O when g_fp16_io, else fp32 with casts) static NSString *gen_mil(int ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), 
val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; } int main() { @@ -88,6 +100,9 @@ int main() { for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f; NSData *wdataA = build_weight_blob(weightsA, CH, CH); + NSFileManager *fm = [NSFileManager defaultManager]; + + retry_compile:; NSString *mil = gen_mil(CH, SP); NSDictionary *weights = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA} @@ -103,13 +118,18 @@ int main() { id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + goto retry_compile; + } if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; } ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); if (!ok) { printf("FAIL: load: 
%s\n", [[e description] UTF8String]); return 1; } @@ -117,9 +137,10 @@ int main() { printf(" Compile+load: %.1fms\n", compile_ms); printf(" tmpDir: %s\n", [td UTF8String]); - // Build request and IOSurfaces (fp32 I/O) - int inBytes = CH * SP * 4; // fp32 - int outBytes = CH * SP * 4; + // Build request and IOSurfaces + size_t bpe = g_fp16_io ? 2 : 4; + int inBytes = CH * SP * bpe; + int outBytes = CH * SP * bpe; IOSurfaceRef ioIn = make_surface(inBytes); IOSurfaceRef ioOut = make_surface(outBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -130,10 +151,17 @@ int main() { // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01 IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) - for (int s = 0; s < SP; s++) - inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f); + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + } IOSurfaceUnlock(ioIn, 0, NULL); // Eval with weights A @@ -142,13 +170,17 @@ int main() { if (!ok) { printf("FAIL: eval: %s\n", e ? 
[[e description] UTF8String] : "?"); return 1; } IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outA = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]); + float *outA_copy = (float*)malloc(CH * SP * sizeof(float)); + if (g_fp16_io) { + _Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + for (int i = 0; i < CH*SP; i++) outA_copy[i] = (float)outA[i]; + } else { + float *outA = (float*)IOSurfaceGetBaseAddress(ioOut); + memcpy(outA_copy, outA, CH * SP * sizeof(float)); + } + printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA_copy[0], outA_copy[1], outA_copy[2], outA_copy[3]); printf(" Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, - outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]); - // Save copy - float *outA_copy = (float*)malloc(outBytes); - memcpy(outA_copy, outA, outBytes); + outA_copy[CH*SP-4], outA_copy[CH*SP-3], outA_copy[CH*SP-2], outA_copy[CH*SP-1]); IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); // === Step 3: Overwrite weight file with B, unload+load === @@ -189,10 +221,17 @@ int main() { // Re-write same input IOSurfaceLock(ioIn, 0, NULL); - inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) - for (int s = 0; s < SP; s++) - inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + if (g_fp16_io) { + _Float16 *inp2 = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp2[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f); + } else { + float *inp2 = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp2[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + } IOSurfaceUnlock(ioIn, 0, NULL); // Eval with (possibly reloaded) weights B @@ -201,16 +240,23 @@ int main() { if (!ok) { printf("FAIL: eval after reload: %s\n", e ? 
[[e description] UTF8String] : "?"); return 1; } IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outB = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]); + float *outB_f = (float*)malloc(CH * SP * sizeof(float)); + if (g_fp16_io) { + _Float16 *outB = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + for (int i = 0; i < CH*SP; i++) outB_f[i] = (float)outB[i]; + } else { + float *outB = (float*)IOSurfaceGetBaseAddress(ioOut); + memcpy(outB_f, outB, CH * SP * sizeof(float)); + } + printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB_f[0], outB_f[1], outB_f[2], outB_f[3]); printf(" Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, - outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]); + outB_f[CH*SP-4], outB_f[CH*SP-3], outB_f[CH*SP-2], outB_f[CH*SP-1]); // Check: did the output change? bool changed = false; float max_diff = 0; for (int i = 0; i < CH*SP; i++) { - float d = fabsf(outB[i] - outA_copy[i]); + float d = fabsf(outB_f[i] - outA_copy[i]); if (d > max_diff) max_diff = d; if (d > 0.001f) changed = true; } @@ -219,11 +265,12 @@ int main() { float max_3x_err = 0; for (int i = 0; i < CH*SP; i++) { float expected = outA_copy[i] * 3.0f; - float err = fabsf(outB[i] - expected); + float err = fabsf(outB_f[i] - expected); if (err > max_3x_err) max_3x_err = err; if (err > 0.1f) correct_3x = false; } IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + free(outB_f); printf("\n=== RESULT ===\n"); printf(" Max A-B diff: %.6f\n", max_diff); diff --git a/training/tiny_train.m b/training/tiny_train.m index e1e9d7d..7aab4cd 100644 --- a/training/tiny_train.m +++ b/training/tiny_train.m @@ -59,25 +59,43 @@ static IOSurfaceRef make_surface(size_t bytes) { return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static NSString *gen_conv_mil(int in_ch, int out_ch, 
int sp) { + if (g_fp16_io) { + // fp16 I/O path — no cast ops (M1/M2 compatible) + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" + " } -> (y);\n}\n", + in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp]; + } + // fp32 I/O path — cast to/from fp16 internally (M4+ native) return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + 
@"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor(\"cv\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; } @@ -106,10 +124,19 @@ static IOSurfaceRef make_surface(size_t bytes) { [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + if (!g_fp16_io) { + // M1/M2 ANE doesn't support cast op — retry with fp16 
I/O + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return compile_kern_with_blob(blob, in_ch, out_ch, sp); + } + return NULL; + } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; __sync_fetch_and_add(&g_compile_count, 1); - size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + size_t bpe = g_fp16_io ? 2 : 4; + size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe; IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); @@ -140,27 +167,43 @@ static void free_kern(Kern *k) { } static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { - float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); - for (int t = 0; t < sp; t++) - for (int c = 0; c < in_ch; c++) - tmp[c*sp + t] = in[t*in_ch + c]; + // Transpose [S,C] -> [C,S] and write to IOSurface IOSurfaceLock(k->ioIn, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + void *base_in = IOSurfaceGetBaseAddress(k->ioIn); + if (g_fp16_io) { + _Float16 *dst = (_Float16*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = (_Float16)in[t*in_ch + c]; + } else { + float *dst = (float*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = in[t*in_ch + c]; + } IOSurfaceUnlock(k->ioIn, 0, NULL); - free(tmp); + NSError *e = nil; id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + + // Read output, transpose [C,S] -> [S,C] 
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + void *base_out = IOSurfaceGetBaseAddress(k->ioOut); + if (g_fp16_io) { + _Float16 *src = (_Float16*)base_out; + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = (float)src[c*sp + t]; + } else { + float *src = (float*)base_out; + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = src[c*sp + t]; + } IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - for (int t = 0; t < sp; t++) - for (int c = 0; c < out_ch; c++) - out[t*out_ch + c] = tmp2[c*sp + t]; - free(tmp2); } // === Checkpoint: save/restore training state for exec() restart === @@ -173,6 +216,7 @@ static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ float lr; double cum_compile_ms, cum_train_ms, cum_wall_ms; int cum_steps, cum_batches; + int fp16_io; // persisted: 1 if ANE needs fp16 I/O (M1/M2) } CkptHeader; static void save_checkpoint(const char *path, int step, float loss, @@ -180,7 +224,7 @@ static void save_checkpoint(const char *path, int step, float loss, const float *W1, const float *W2, double cc, double ct, double cw, int cs, int cb) { FILE *f = fopen(path, "wb"); - CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb}; + CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb, g_fp16_io}; fwrite(&hdr, sizeof(hdr), 1, f); fwrite(W1, sizeof(float), H * D, f); fwrite(W2, sizeof(float), D * H, f); @@ -241,8 +285,9 @@ int main(int argc, char *argv[]) { start_step = hdr.step; total_steps = hdr.total_steps; lr = hdr.lr; + g_fp16_io = hdr.fp16_io; resuming = true; - printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss); + printf("[RESUMED at step %d, loss=%.6f, fp16_io=%d, compiles reset]\n", start_step, hdr.loss, g_fp16_io); } } diff --git a/training/tiny_train_old.m b/training/tiny_train_old.m 
index c22a90c..0eea1f4 100644 --- a/training/tiny_train_old.m +++ b/training/tiny_train_old.m @@ -59,34 +59,50 @@ static IOSurfaceRef make_surface(size_t bytes) { return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" + " } -> (y);\n}\n", + in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp]; + } return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = 
tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor(\"cv\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; } typedef struct { - id model; + void *model; // CFBridgingRetain'd _ANEInMemoryModel IOSurfaceRef ioIn, ioOut; - id request; - NSString *tmpDir; + void *request; // CFBridgingRetain'd _ANERequest + void *tmpDir; // CFBridgingRetain'd NSString } Kern; static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { @@ -103,9 +119,17 @@ static IOSurfaceRef 
make_surface(size_t bytes) { [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return compile_kern_with_blob(blob, in_ch, out_ch, sp); + } + return NULL; + } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; - size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + size_t bpe = g_fp16_io ? 2 : 4; + size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe; IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); @@ -113,40 +137,60 @@ static IOSurfaceRef make_surface(size_t bytes) { @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), @[wI], @[@0], @[wO], @[@0], nil, nil, @0); Kern *k = calloc(1, sizeof(Kern)); - k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td; + k->model = (void*)CFBridgingRetain(mdl); + k->ioIn = ioI; k->ioOut = ioO; + k->request = (void*)CFBridgingRetain(req); + k->tmpDir = (void*)CFBridgingRetain(td); return k; } static void free_kern(Kern *k) { if (!k) return; + id mdl = (__bridge id)k->model; NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + ((BOOL(*)(id,SEL,unsigned 
int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); CFRelease(k->ioIn); CFRelease(k->ioOut); - [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil]; + NSString *td = (__bridge id)k->tmpDir; + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir); free(k); } // ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { - float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); - for (int t = 0; t < sp; t++) - for (int c = 0; c < in_ch; c++) - tmp[c*sp + t] = in[t*in_ch + c]; IOSurfaceLock(k->ioIn, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + void *base_in = IOSurfaceGetBaseAddress(k->ioIn); + if (g_fp16_io) { + _Float16 *dst = (_Float16*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = (_Float16)in[t*in_ch + c]; + } else { + float *dst = (float*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = in[t*in_ch + c]; + } IOSurfaceUnlock(k->ioIn, 0, NULL); - free(tmp); NSError *e = nil; + id mdl = (__bridge id)k->model; + id req = (__bridge id)k->request; ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e); - float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + void *base_out = IOSurfaceGetBaseAddress(k->ioOut); + if (g_fp16_io) { + _Float16 *src = (_Float16*)base_out; + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = (float)src[c*sp + t]; + } else { + float *src = (float*)base_out; + 
for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = src[c*sp + t]; + } IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - for (int t = 0; t < sp; t++) - for (int c = 0; c < out_ch; c++) - out[t*out_ch + c] = tmp2[c*sp + t]; - free(tmp2); } int main(int argc, char *argv[]) { diff --git a/training/tokenize.py b/training/tokenize.py index 219cb21..815d740 100644 --- a/training/tokenize.py +++ b/training/tokenize.py @@ -3,11 +3,13 @@ Data format: flat uint16 token IDs (llama2.c BPE, 32K vocab). Source: ~/tiny_stories_data_pretokenized.zip""" -import os, struct, zipfile +import os, sys, struct, zipfile from pathlib import Path ZIP_PATH = os.path.expanduser('~/tiny_stories_data_pretokenized.zip') OUTPUT_PATH = str(Path(__file__).resolve().parent / 'tinystories_data00.bin') +VOCAB_SIZE = 32000 +MAX_ZIP_SIZE = int(os.environ.get('MAX_ZIP_BYTES', str(10 * 1024 * 1024 * 1024))) def main(): if os.path.exists(OUTPUT_PATH): @@ -15,8 +17,24 @@ def main(): print(f"{OUTPUT_PATH} already exists ({n} tokens, {os.path.getsize(OUTPUT_PATH)/1e6:.1f} MB)") return + if not os.path.exists(ZIP_PATH): + print(f"ERROR: ZIP file not found: {ZIP_PATH}", file=sys.stderr) + print(f" Expected: ~/tiny_stories_data_pretokenized.zip", file=sys.stderr) + sys.exit(1) + + zip_size = os.path.getsize(ZIP_PATH) + if zip_size > MAX_ZIP_SIZE: + print(f"ERROR: ZIP file too large ({zip_size/1e9:.1f} GB > {MAX_ZIP_SIZE/1e9:.0f} GB limit).", + file=sys.stderr) + sys.exit(1) + print(f"Extracting data00.bin from {ZIP_PATH}...") with zipfile.ZipFile(ZIP_PATH, 'r') as z: + names = z.namelist() + if 'data00.bin' not in names: + print(f"ERROR: data00.bin not found in ZIP. 
Contents: {names[:10]}", file=sys.stderr) + sys.exit(1) + with z.open('data00.bin') as src, open(OUTPUT_PATH, 'wb') as dst: while True: chunk = src.read(1 << 20) @@ -27,10 +45,13 @@ def main(): n = os.path.getsize(OUTPUT_PATH) // 2 print(f"Written {OUTPUT_PATH} ({n} tokens, {os.path.getsize(OUTPUT_PATH)/1e6:.1f} MB)") - # Sanity check with open(OUTPUT_PATH, 'rb') as f: tokens = struct.unpack('<10H', f.read(20)) print(f"First 10 tokens: {tokens}") + oob = [t for t in tokens if t >= VOCAB_SIZE] + if oob: + print(f"WARNING: out-of-vocab tokens found: {oob} (vocab_size={VOCAB_SIZE})", + file=sys.stderr) if __name__ == '__main__': main() diff --git a/training/train_double_buffer.m b/training/train_double_buffer.m new file mode 100644 index 0000000..bfb8236 --- /dev/null +++ b/training/train_double_buffer.m @@ -0,0 +1,791 @@ +// train_double_buffer.m — Double-buffered async ANE training for stories110M +// Based on train_large.m with the key innovation: compile and eval overlap via GCD +// Discovery: probe_v2.m proved ANE compile and eval can run in parallel +// Architecture: two kernel sets (A/B), background compile while active set runs +// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch +#include +#include "stories_io.h" +#include "stories_mil.h" +#include "stories_cpu_ops.h" + +// Double-buffer needs more compile budget than single-buffer +// The original MAX_COMPILES=100 only allows 1 batch per exec() restart +// We push higher to allow initial compile + at least 1 background compile +// If ANE rejects at ~119, the exec() restart will handle it gracefully +#define DB_MAX_COMPILES 250 + +#define CKPT_PATH_DEFAULT "ane_db_ckpt.bin" +#define MODEL_PATH_DEFAULT "../../assets/models/stories110M.bin" +#define DATA_PATH_DEFAULT "tinystories_data00.bin" + +static const char *get_path(const char *env_var, const char *default_val) { + const char *v = getenv(env_var); + return (v && v[0]) ? 
v : default_val; +} + +// ===== Weight loading from llama2.c format ===== +static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { printf("Cannot open %s\n", path); return false; } + Llama2Config cfg; + fread(&cfg, sizeof(cfg), 1, f); + printf(" Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n", + cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len); + if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS) { + printf(" ERROR: Config mismatch! Expected dim=%d hidden=%d layers=%d\n", DIM, HIDDEN, NLAYERS); + fclose(f); return false; + } + int V = abs(cfg.vocab_size); + bool shared = cfg.vocab_size > 0; + + // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all], + // rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls] + fread(embed, 4, V * DIM, f); + + // rms_att weights for all layers (contiguous) + for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_att, 4, DIM, f); + // wq for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wq, 4, WQ_SZ, f); + // wk for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wk, 4, WQ_SZ, f); + // wv for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wv, 4, WQ_SZ, f); + // wo for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wo, 4, WO_SZ, f); + // rms_ffn weights for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_ffn, 4, DIM, f); + // w1 for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].W1, 4, W1_SZ, f); + // w2 for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].W2, 4, W2_SZ, f); + // w3 for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].W3, 4, W3_SZ, f); + // rms_final + fread(rms_final, 4, DIM, f); + // wcls = embed if shared (we just use embed pointer) + + fclose(f); + printf(" Loaded pretrained weights (%s)\n", shared ? 
"shared embed/cls" : "separate cls"); + return true; +} + +// ===== Compile one layer's kernels ===== +static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) { + lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{ + @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)}, + @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)}, + @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)}, + @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)}, + @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)}, + @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()}, + }), DIM*SEQ*2, 6*DIM*SEQ*2); + + lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{ + @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)}, + @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)}, + @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)}, + @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)}, + }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2); + + lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{ + @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)}, + @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)}, + @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)}, + }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2); + + lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{ + @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()}, + @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)}, + }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2); + + lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{ + @"@model_path/weights/wqt.bin": @{@"offset":@0, 
@"data":build_blob_t(w->Wq,DIM,DIM)}, + @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)}, + @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)}, + }), 3*DIM*SEQ*2, DIM*SEQ*2); + + return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd; +} + +// Compile weight-free sdpaBwd2 (only needs once, no weights) +static Kern *compile_sdpa_bwd2(void) { + return compile_kern_mil_w(gen_sdpa_bwd2(), @{}, + (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2); +} + +static void free_layer_kernels(LayerKernels *lk) { + free_kern(lk->fwdAttn); free_kern(lk->fwdFFN); free_kern(lk->ffnBwd); + free_kern(lk->sdpaBwd1); free_kern(lk->qkvBwd); + // sdpaBwd2 is shared, freed separately + lk->fwdAttn = lk->fwdFFN = lk->ffnBwd = lk->sdpaBwd1 = lk->qkvBwd = NULL; +} + +// ===== Checkpoint save/load ===== +static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss, + double cc, double ct, double cw, int cs, int cb, int adam_t, + LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final, + float *embed, AdamState *aembed) { + FILE *f = fopen(path, "wb"); + CkptHdr h = {0}; + h.magic = 0x424C5A54; h.version = 2; + h.step = step; h.total_steps = total_steps; + h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM; + h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ; + h.lr = lr; h.loss = loss; + h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw; + h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t; + fwrite(&h, sizeof(h), 1, f); + // Per-layer weights + adam + for (int L = 0; L < NLAYERS; L++) { + fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f); + fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f); + fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f); + fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f); + // Adam state + fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f); + 
fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f); + fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f); + fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f); + fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f); + fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f); + fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f); + fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f); + fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f); + } + fwrite(rms_final,4,DIM,f); + fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f); + fwrite(embed,4,VOCAB*DIM,f); + fwrite(aembed->m,4,VOCAB*DIM,f); fwrite(aembed->v,4,VOCAB*DIM,f); + fclose(f); +} + +static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss, + double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t, + LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final, + float *embed, AdamState *aembed) { + FILE *f = fopen(path, "rb"); + if (!f) return false; + CkptHdr h; + fread(&h, sizeof(h), 1, f); + if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; } + *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss; + *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall; + *cs = h.cum_steps; *cb = h.cum_batches; *adam_t = h.adam_t; + for (int L = 0; L < NLAYERS; L++) { + fread(lw[L].Wq,4,WQ_SZ,f); fread(lw[L].Wk,4,WQ_SZ,f); + fread(lw[L].Wv,4,WQ_SZ,f); fread(lw[L].Wo,4,WO_SZ,f); + fread(lw[L].W1,4,W1_SZ,f); fread(lw[L].W2,4,W2_SZ,f); fread(lw[L].W3,4,W3_SZ,f); + fread(lw[L].rms_att,4,DIM,f); fread(lw[L].rms_ffn,4,DIM,f); + fread(la[L].Wq.m,4,WQ_SZ,f); fread(la[L].Wq.v,4,WQ_SZ,f); + fread(la[L].Wk.m,4,WQ_SZ,f); fread(la[L].Wk.v,4,WQ_SZ,f); + fread(la[L].Wv.m,4,WQ_SZ,f); fread(la[L].Wv.v,4,WQ_SZ,f); + fread(la[L].Wo.m,4,WO_SZ,f); fread(la[L].Wo.v,4,WO_SZ,f); + fread(la[L].W1.m,4,W1_SZ,f); fread(la[L].W1.v,4,W1_SZ,f); + fread(la[L].W2.m,4,W2_SZ,f); 
fread(la[L].W2.v,4,W2_SZ,f); + fread(la[L].W3.m,4,W3_SZ,f); fread(la[L].W3.v,4,W3_SZ,f); + fread(la[L].rms_att.m,4,DIM,f); fread(la[L].rms_att.v,4,DIM,f); + fread(la[L].rms_ffn.m,4,DIM,f); fread(la[L].rms_ffn.v,4,DIM,f); + } + fread(rms_final,4,DIM,f); + fread(arms_final->m,4,DIM,f); fread(arms_final->v,4,DIM,f); + fread(embed,4,VOCAB*DIM,f); + fread(aembed->m,4,VOCAB*DIM,f); fread(aembed->v,4,VOCAB*DIM,f); + fclose(f); + return true; +} + +// ===== Main ===== +int main(int argc, char *argv[]) { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + int total_steps = 10000; + float lr = 3e-4f; + float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; + int adam_t = 0, start_step = 0; + + const char *model_path = get_path("ANE_MODEL_PATH", MODEL_PATH_DEFAULT); + const char *ckpt_path = get_path("ANE_CKPT_PATH", CKPT_PATH_DEFAULT); + const char *data_path = get_path("ANE_DATA_PATH", DATA_PATH_DEFAULT); + + bool do_resume = false; + for (int i=1; i