diff --git a/.gitignore b/.gitignore
index e116b6cc..d57e68b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,5 @@ test-results/
 # PM agent persistent memory
 .pm/
 /companion/target/
+# ONNX models (too large for git, download via script)
+public/models/*.onnx
diff --git a/.llm/research/bpm-chord-detection-research.md b/.llm/research/bpm-chord-detection-research.md
new file mode 100644
index 00000000..b38b4ab2
--- /dev/null
+++ b/.llm/research/bpm-chord-detection-research.md
@@ -0,0 +1,218 @@
+# BPM Detection & Chord Recognition — Research Report
+
+> Date: 2026-03-27 | For: ACE-Step-DAW web-local inference
+
+---
+
+## Executive Summary
+
+**Best deployment strategy: ONNX Runtime Web (WASM + WebGPU) in a Web Worker**
+
+- BPM: **Beat This! (small, 8MB ONNX)** — current SOTA, C++ port exists with ONNX export
+- Chord: **consonance-ACE (ISMIR 2025)** — decomposed conformer, SOTA chord estimation
+- Feature extraction: **Essentia.js** (WASM) or **Rust (rustfft + mel-spec → WASM)**
+- Fallback/lightweight: **Essentia.js** has built-in BPM + chord detection (DSP-based, lower accuracy)
+
+**Not recommended:** a standalone C++ or Rust implementation — the models are defined in Python/PyTorch, so going native means reimplementing inference by hand. ONNX is the universal bridge.
+
+---
+
+## Part 1: BPM Detection
+
+### Model Comparison
+
+| Model | Accuracy | Size | Real-time | Web-ready | License |
+|-------|----------|------|-----------|-----------|---------|
+| **Beat This! (CPJKU, 2024)** | SOTA (best F1 across 16 datasets) | 8MB (small) / 97MB (full) | No (offline only) | ONNX export exists via beat_this_cpp | MIT |
+| Madmom (CPJKU) | Very high | Large (multiple RNNs) | No | No WASM path | BSD |
+| BeatNet+ | High (best online method) | Medium | Yes (<50ms) | Needs ONNX export | MIT |
+| Essentia RhythmExtractor | Good | ~2.5MB WASM | Yes | **essentia.js ready** | AGPL-3.0 |
+| Aubio | Moderate | <1MB | Yes | **aubiojs ready** | GPL-3.0 |
+
+### Recommendation: Beat This! (small variant)
+
+- **Why**: SOTA accuracy without needing DBN post-processing; 8MB small model is web-friendly
+- **How**: C++ port already exists at [mosynthkey/beat_this_cpp](https://github.com/mosynthkey/beat_this_cpp)
+  - Uses ONNX Runtime for inference
+  - 97MB ONNX model (full) — small variant is ~8MB
+  - Pipeline: audio → mel spectrogram → transformer → beat/downbeat positions
+- **Web path**: Export small model to ONNX → run via `onnxruntime-web` (WASM or WebGPU)
+
+### Fallback: Essentia.js
+
+- Already works in browser, zero additional work
+- `RhythmExtractor2013` gives BPM + beat positions
+- Lower accuracy but production-ready today
+
+---
+
+## Part 2: Chord Recognition
+
+### Model Comparison
+
+| Model | Accuracy | Vocabulary | Architecture | Web Path | License |
+|-------|----------|------------|--------------|----------|---------|
+| **consonance-ACE (ISMIR 2025)** | SOTA | 170 classes | Conformer (decomposed) | PyTorch → ONNX → ort-web | MIT |
+| BTC | ~80-86% majmin | 25 classes | Transformer | PyTorch → ONNX → ort-web | — |
+| CREMA | ~75-80% | 602 classes | CNN+RNN | TF → ONNX → ort-web | — |
+| Chordino | ~70-75% | maj/min/7th | NNLS+HMM | C++ → WASM | GPL |
+| Essentia ChordsDetection | ~65-70% | maj/min | HPCP+template | **essentia.js ready** | AGPL |
+
+### Recommendation: consonance-ACE
+
+- **Why**: ISMIR 2025 SOTA, decomposed output (root + bass + pitch activations), 170 chord vocabulary, MIT license
+- **Repo**: [andreamust/consonance-ACE](https://github.com/andreamust/consonance-ACE)
+- **Paper**: [arxiv.org/abs/2509.01588](https://arxiv.org/abs/2509.01588)
+- **Architecture**: Conformer with decomposed heads — separately estimates root, bass, and note activations, then reconstructs chord labels
+- **Key innovation**: Consonance-based label smoothing handles annotator subjectivity and class imbalance
+- **Input**: Audio (WAV) → 20s chunks
+- **Output**: `.lab` format (start_time, end_time, chord_label e.g. `E:maj`)
+- **Pipeline**: audio → conformer → decomposed heads (root/bass/notes) → chord label
+- **Web path**: PyTorch checkpoint → `torch.onnx.export()` → ONNX → `onnxruntime-web`
+- **Training data**: Isophonics, McGill Billboard (via ChoCo corpus)
+
+### Beat-synchronous chord detection (DAW integration)
+
+1. Run BPM/beat detection first (Beat This!)
+2. Segment audio at beat boundaries
+3. Run consonance-ACE per segment (or on full audio, then snap to beats)
+4. Post-processing: merge short segments, snap chord changes to nearest beat/bar
+5. Output: chord track aligned to DAW grid
+
+### Fallback: Essentia.js ChordsDetectionBeats
+
+- Already works in browser
+- Beat-synchronous chord detection built-in
+- Lower accuracy (~65-70%) but zero integration work
+
+---
+
+## Part 3: Web Deployment Architecture
+
+### Recommended Stack
+
+```
+┌─────────────────────────────────────────────────┐
+│                  Main Thread (React)              │
+│  - UI rendering                                   │
+│  - Receives results via postMessage               │
+└─────────────┬───────────────────────────────────┘
+              │ postMessage(audioBuffer)
+              ▼
+┌─────────────────────────────────────────────────┐
+│               Web Worker                          │
+│                                                   │
+│  ┌─────────────────────────────────────────────┐ │
+│  │  Feature Extraction (WASM)                   │ │
+│  │  Option A: essentia.js (C++ → WASM)          │ │
+│  │  Option B: Rust (rustfft + mel-spec → WASM)  │ │
+│  │  - Mel spectrogram for BPM model             │ │
+│  │  - CQT / chromagram for chord model          │ │
+│  └──────────────┬──────────────────────────────┘ │
+│                 │ Float32Array                     │
+│                 ▼                                  │
+│  ┌─────────────────────────────────────────────┐ │
+│  │  Model Inference                             │ │
+│  │  onnxruntime-web (WASM CPU or WebGPU)        │ │
+│  │  - Beat This! small (8MB ONNX) → beats/BPM  │ │
+│  │  - consonance-ACE (ONNX) → chord labels     │ │
+│  └──────────────┬──────────────────────────────┘ │
+│                 │ results                          │
+│                 ▼                                  │
+│  Post-processing: Viterbi smoothing, beat snap    │
+└─────────────────────────────────────────────────┘
+```
+
+### Runtime Options Comparison
+
+| Runtime | Pros | Cons | Best For |
+|---------|------|------|----------|
+| **onnxruntime-web** | Best operator coverage, INT8 quant, WebGPU support | Larger WASM binary (~5MB) | Production deployment |
+| Tract (Rust → WASM) | Pure Rust, single binary, lightweight | Less operator coverage | Simple models |
+| Candle (HF Rust) | Self-contained WASM, proven with Whisper | Need to reimplement model in Candle | Custom models |
+| TensorFlow.js WASM | Mature ecosystem | Heavier, ecosystem moving to ONNX | Legacy TF models |
+
+### Performance Expectations
+
+| Operation | Latency (M2 MacBook; WASM unless noted) |
+|-----------|---------------------------|
+| Mel spectrogram (5s clip) | 5-15ms |
+| ONNX model inference (small CNN) | 8-12ms |
+| ONNX model inference (transformer, 8MB) | 50-200ms |
+| WebGPU inference (same transformer) | 5-20ms |
+| Total pipeline (5s clip → BPM + chords) | ~100-500ms (WASM) / ~30-100ms (WebGPU) |
+
+---
+
+## Part 4: Implementation Roadmap
+
+### Phase 1: Quick Win (essentia.js)
+- Install `essentia.js` npm package
+- Use `RhythmExtractor2013` for BPM + beats
+- Use `ChordsDetectionBeats` for beat-synced chords
+- Run in Web Worker
+- Accuracy: ~70% for both — usable but not great
+- **Effort: 1-2 days**
+
+### Phase 2: High-Accuracy BPM (Beat This! ONNX)
+- Clone [beat_this_cpp](https://github.com/mosynthkey/beat_this_cpp), get ONNX model
+- Use small variant (8MB) or quantize full model to INT8
+- Implement mel spectrogram in WASM (essentia.js or Rust `mel-spec`)
+- Run ONNX inference via `onnxruntime-web`
+- Beat-synced output → snap to DAW grid
+- **Effort: 3-5 days**
+
+### Phase 3: High-Accuracy Chords (consonance-ACE ONNX)
+- Export consonance-ACE conformer_decomposed model to ONNX from PyTorch
+- Model outputs decomposed root/bass/note activations → reconstruct chord labels
+- Run ONNX inference via `onnxruntime-web`
+- Use beat positions from Phase 2 for beat-synchronous snapping
+- 170 chord vocabulary — rich enough for DAW display
+
+### Phase 4: Optimization
+- INT8 quantization of both models (2-3x faster WASM)
+- WebGPU acceleration for devices that support it
+- Streaming/chunked analysis for long files
+- Cache results in IndexedDB
+
+---
+
+## Part 5: Key Repos & Links
+
+### BPM
+- Beat This!: https://github.com/CPJKU/beat_this (Python) | https://github.com/mosynthkey/beat_this_cpp (C++ ONNX)
+- BeatNet: https://github.com/mjhydri/BeatNet
+- Essentia.js: https://github.com/mtg/essentia.js/
+
+### Chords
+- consonance-ACE: https://github.com/andreamust/consonance-ACE (ISMIR 2025, MIT)
+- BTC: https://github.com/jayg996/BTC-ISMIR19
+- CREMA: https://github.com/bmcfee/crema
+
+### Inference Runtimes
+- onnxruntime-web: https://www.npmjs.com/package/onnxruntime-web
+- Tract (Rust ONNX): https://github.com/sonos/tract
+- Candle (Rust ML): https://github.com/huggingface/candle
+
+### Audio Preprocessing
+- essentia.js: https://github.com/mtg/essentia.js/
+- rust-melspec-wasm: https://github.com/nicolvisser/rust-melspec-wasm
+- mel_spec crate: https://crates.io/crates/mel_spec
+- spectrograms crate: https://docs.rs/spectrograms/latest/spectrograms/
+
+### Reference Implementations
+- basicpitch.cpp (ONNX + WASM): https://github.com/sevagh/basicpitch.cpp
+- Candle Whisper WASM: https://huggingface.co/spaces/lmz/candle-whisper
+
+---
+
+## Decision: Why NOT Pure C++ or Rust?
+
+| Approach | Problem |
+|----------|---------|
+| Rewrite model in C++ | Models are defined in PyTorch; reimplementing transformer/RNN in C++ is months of work |
+| Rewrite model in Rust (candle/burn) | Same problem — must port architecture + load weights |
+| Compile Python + PyTorch to WASM | Not feasible |
+| **Export to ONNX + run via ort-web** | **Universal bridge: any PyTorch model → ONNX → browser. This is the answer.** |
+
+C++ and Rust are excellent for the **preprocessing** pipeline (FFT, mel spectrogram, CQT), but for **model inference**, ONNX is the standard interchange format and ort-web is the best runtime for browsers.
diff --git a/package-lock.json b/package-lock.json
index 34e976df..3c709b52 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -32,6 +32,7 @@
         "idb-keyval": "^6.2.0",
         "mp4-muxer": "^5.2.2",
         "node-pty": "^1.2.0-beta.12",
+        "onnxruntime-web": "^1.24.3",
         "react": "^19.0.0",
         "react-dom": "^19.0.0",
         "tone": "^15.1.22",
@@ -1680,6 +1681,70 @@
         "node": ">=18"
       }
     },
+    "node_modules/@protobufjs/aspromise": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
+      "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/base64": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
+      "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/codegen": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz",
+      "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/eventemitter": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
+      "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/fetch": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
+      "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.1",
+        "@protobufjs/inquire": "^1.1.0"
+      }
+    },
+    "node_modules/@protobufjs/float": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
+      "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/inquire": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz",
+      "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/path": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
+      "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/pool": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
+      "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/utf8": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
+      "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==",
+      "license": "BSD-3-Clause"
+    },
     "node_modules/@replit/codemirror-emacs": {
       "version": "6.1.0",
       "resolved": "https://registry.npmjs.org/@replit/codemirror-emacs/-/codemirror-emacs-6.1.0.tgz",
@@ -3350,7 +3415,6 @@
       "version": "25.5.0",
       "resolved": "https://registry.npmjs.org/@types/node/-/node-25.5.0.tgz",
       "integrity": "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw==",
-      "dev": true,
       "license": "MIT",
       "dependencies": {
         "undici-types": "~7.18.0"
@@ -4546,6 +4610,12 @@
         "babel-plugin-add-module-exports": "^0.2.1"
       }
     },
+    "node_modules/flatbuffers": {
+      "version": "25.9.23",
+      "resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-25.9.23.tgz",
+      "integrity": "sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==",
+      "license": "Apache-2.0"
+    },
     "node_modules/focus-trap": {
       "version": "7.8.0",
       "resolved": "https://registry.npmjs.org/focus-trap/-/focus-trap-7.8.0.tgz",
@@ -4610,6 +4680,12 @@
       "dev": true,
       "license": "ISC"
     },
+    "node_modules/guid-typescript": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz",
+      "integrity": "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==",
+      "license": "ISC"
+    },
     "node_modules/hasown": {
       "version": "2.0.2",
       "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
@@ -5146,6 +5222,12 @@
         "url": "https://opencollective.com/parcel"
       }
     },
+    "node_modules/long": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
+      "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==",
+      "license": "Apache-2.0"
+    },
     "node_modules/loose-envify": {
       "version": "1.4.0",
       "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
@@ -5472,6 +5554,26 @@
         "regex-recursion": "^6.0.2"
       }
     },
+    "node_modules/onnxruntime-common": {
+      "version": "1.24.3",
+      "resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.24.3.tgz",
+      "integrity": "sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA==",
+      "license": "MIT"
+    },
+    "node_modules/onnxruntime-web": {
+      "version": "1.24.3",
+      "resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.24.3.tgz",
+      "integrity": "sha512-41dDq7fxtTm0XzGE7N0d6m8FcOY8EWtUA65GkOixJPB/G7DGzBmiDAnVVXHznRw9bgUZpb+4/1lQK/PNxGpbrQ==",
+      "license": "MIT",
+      "dependencies": {
+        "flatbuffers": "^25.1.24",
+        "guid-typescript": "^1.0.9",
+        "long": "^5.2.3",
+        "onnxruntime-common": "1.24.3",
+        "platform": "^1.3.6",
+        "protobufjs": "^7.2.4"
+      }
+    },
     "node_modules/parse5": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.0.tgz",
@@ -5543,6 +5645,12 @@
         "url": "https://github.com/sponsors/jonschlinkert"
       }
     },
+    "node_modules/platform": {
+      "version": "1.3.6",
+      "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
+      "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
+      "license": "MIT"
+    },
     "node_modules/playwright": {
       "version": "1.58.2",
       "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.2.tgz",
@@ -5657,6 +5765,30 @@
         "url": "https://github.com/sponsors/wooorm"
       }
     },
+    "node_modules/protobufjs": {
+      "version": "7.5.4",
+      "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz",
+      "integrity": "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg==",
+      "hasInstallScript": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.2",
+        "@protobufjs/base64": "^1.1.2",
+        "@protobufjs/codegen": "^2.0.4",
+        "@protobufjs/eventemitter": "^1.1.0",
+        "@protobufjs/fetch": "^1.1.0",
+        "@protobufjs/float": "^1.0.2",
+        "@protobufjs/inquire": "^1.1.0",
+        "@protobufjs/path": "^1.1.2",
+        "@protobufjs/pool": "^1.1.0",
+        "@protobufjs/utf8": "^1.1.0",
+        "@types/node": ">=13.7.0",
+        "long": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=12.0.0"
+      }
+    },
     "node_modules/punycode": {
       "version": "2.3.1",
       "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
@@ -6310,7 +6442,6 @@
       "version": "7.18.2",
       "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz",
       "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==",
-      "dev": true,
       "license": "MIT"
     },
     "node_modules/unist-util-is": {
diff --git a/package.json b/package.json
index 4c075d85..72347620 100644
--- a/package.json
+++ b/package.json
@@ -42,6 +42,7 @@
     "idb-keyval": "^6.2.0",
     "mp4-muxer": "^5.2.2",
     "node-pty": "^1.2.0-beta.12",
+    "onnxruntime-web": "^1.24.3",
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
     "tone": "^15.1.22",
diff --git a/scripts/download-models.sh b/scripts/download-models.sh
new file mode 100755
index 00000000..38cbb48e
--- /dev/null
+++ b/scripts/download-models.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Download ONNX models for BPM detection and chord recognition.
+# Models are too large for git (~100MB total), so they're downloaded on demand.
+#
+# Usage: ./scripts/download-models.sh
+
+set -euo pipefail
+
+MODELS_DIR="$(cd "$(dirname "$0")/.." && pwd)/public/models"
+mkdir -p "$MODELS_DIR"
+
+echo "Downloading ONNX models to $MODELS_DIR..."
+
+# Beat This! (79MB) — CPJKU ISMIR 2024 SOTA beat/BPM detection (source: mosynthkey/beat_this_cpp)
+BEAT_THIS_URL="https://github.com/mosynthkey/beat_this_cpp/raw/main/onnx/beat_this.onnx"
+if [ ! -f "$MODELS_DIR/beat-this.onnx" ]; then
+  echo "Downloading Beat This! model (79MB)..."
+  # --fail + .part file: an HTTP error or interrupted download must not leave a bogus beat-this.onnx
+  curl --fail -L -o "$MODELS_DIR/beat-this.onnx.part" "$BEAT_THIS_URL"
+  mv "$MODELS_DIR/beat-this.onnx.part" "$MODELS_DIR/beat-this.onnx"
+  echo "  Done: beat-this.onnx ($(du -h "$MODELS_DIR/beat-this.onnx" | cut -f1))"
+else
+  echo "  beat-this.onnx already exists, skipping"
+fi
+
+# consonance-ACE is not downloaded here; it is exported locally from the PyTorch checkpoint.
+if [ ! -f "$MODELS_DIR/consonance-ace.onnx" ]; then
+  echo ""
+  echo "consonance-ace.onnx not found."
+  echo "To export it, run:"
+  echo "  python scripts/export-consonance-ace.py"
+  echo ""
+  echo "Or download from the project's release assets (if available)."
+else
+  echo "  consonance-ace.onnx already exists, skipping"
+fi
+
+echo ""
+echo "Model files:"
+ls -lh "$MODELS_DIR"/*.onnx 2>/dev/null || echo "  (none found)"
diff --git a/scripts/export-consonance-ace.py b/scripts/export-consonance-ace.py
new file mode 100644
index 00000000..7df26b7e
--- /dev/null
+++ b/scripts/export-consonance-ace.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""
+Export consonance-ACE conformer_decomposed model to ONNX.
+
+Prerequisites:
+  pip install torch torchaudio librosa lightning gin-config torchmetrics
+
+  git clone https://github.com/andreamust/consonance-ACE /tmp/consonance-ACE
+  cd /tmp/consonance-ACE && pip install -r requirements.txt
+
+Usage:
+  python scripts/export-consonance-ace.py
+
+Output:
+  public/models/consonance-ace.onnx (~20MB)
+"""
+
+import sys
+import os
+
+# Add consonance-ACE to path
+ACE_REPO = "/tmp/consonance-ACE"
+if not os.path.isdir(ACE_REPO):
+    print(f"ERROR: Clone consonance-ACE first:")
+    print(f"  git clone https://github.com/andreamust/consonance-ACE {ACE_REPO}")
+    sys.exit(1)
+
+sys.path.insert(0, ACE_REPO)
+
+import torch
+import numpy as np
+
+from ACE.models.conformer_decomposed import ConformerDecomposedModel
+
+
+class ACEWrapper(torch.nn.Module):
+    """Wraps ConformerDecomposedModel to return tuple (ONNX-compatible)."""
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, x):
+        out = self.model(x)
+        return out["root"], out["bass"], out["onehot"]
+
+
+def main():
+    ckpt = os.path.join(ACE_REPO, "ACE/checkpoints/conformer_decomposed_smooth.ckpt")
+    if not os.path.exists(ckpt):
+        print(f"ERROR: Checkpoint not found: {ckpt}")
+        sys.exit(1)
+
+    print(f"Loading model from {ckpt}...")
+    model = ConformerDecomposedModel.load_from_checkpoint(
+        ckpt,
+        vocabularies={"root": 13, "bass": 13, "onehot": 12},
+        map_location="cpu",
+        loss="consonance_decomposed",
+        vocab_path=os.path.join(ACE_REPO, "ACE/chords_vocab.joblib"),
+        strict=False,
+    )
+    model.eval()
+    print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+    wrapper = ACEWrapper(model)
+    wrapper.eval()
+
+    # Dummy input: [batch=1, channels=1, freq=144, time=862]
+    # 862 frames = 20s at sr=22050, hop=512
+    dummy = torch.randn(1, 1, 144, 862)
+
+    output_dir = os.path.join(os.path.dirname(__file__), "..", "public", "models")
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, "consonance-ace.onnx")
+
+    print(f"Exporting to {output_path}...")
+    torch.onnx.export(
+        wrapper, dummy, output_path,
+        input_names=["cqt_features"],
+        output_names=["root_logits", "bass_logits", "chord_logits"],
+        dynamic_axes={
+            "cqt_features": {3: "n_frames"},
+            "root_logits": {1: "n_frames"},
+            "bass_logits": {1: "n_frames"},
+            "chord_logits": {1: "n_frames"},
+        },
+        opset_version=17,
+        do_constant_folding=True,
+    )
+
+    size_mb = os.path.getsize(output_path) / 1024 / 1024
+    print(f"Exported: {output_path} ({size_mb:.1f} MB)")
+
+    # Verify
+    import onnxruntime as ort
+    sess = ort.InferenceSession(output_path)
+    result = sess.run(None, {"cqt_features": dummy.numpy()})
+
+    with torch.no_grad():
+        pt_r, pt_b, pt_c = wrapper(dummy)
+
+    diff = max(
+        np.abs(result[0] - pt_r.numpy()).max(),
+        np.abs(result[1] - pt_b.numpy()).max(),
+        np.abs(result[2] - pt_c.numpy()).max(),
+    )
+    print(f"Max PyTorch vs ONNX diff: {diff:.6f}")
+    print("PASS" if diff < 0.001 else "WARN: large diff")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/verify-onnx-models.py b/scripts/verify-onnx-models.py
new file mode 100644
index 00000000..9aee967e
--- /dev/null
+++ b/scripts/verify-onnx-models.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""
+Verify ONNX model inference for Beat This! (BPM) and consonance-ACE (chords).
+
+Usage:
+    python scripts/verify-onnx-models.py [path/to/audio.wav]
+
+Validates:
+  1. Models load correctly in ONNX Runtime
+  2. Input/output shapes match expectations
+  3. Beat detection produces reasonable BPM (30-300 range)
+  4. Chord detection produces valid chord labels
+  5. Processing time is within acceptable range for web deployment
+"""
+
+import sys
+import time
+import os
+
+import numpy as np
+import onnxruntime as ort
+import librosa
+
+MODELS_DIR = os.path.join(os.path.dirname(__file__), "..", "public", "models")
+BEAT_THIS_PATH = os.path.join(MODELS_DIR, "beat-this.onnx")
+CONSONANCE_ACE_PATH = os.path.join(MODELS_DIR, "consonance-ace.onnx")
+
+# Constants matching the model training configs
+BEAT_THIS_SR = 22050
+BEAT_THIS_N_FFT = 2048
+BEAT_THIS_HOP = 441  # 20ms hop @ 22050
+BEAT_THIS_N_MELS = 128
+
+ACE_SR = 22050
+ACE_HOP = 512
+ACE_N_BINS = 144  # CQT bins
+
+# Chord label maps
+ROOT_LABELS = ["N", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
+PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
+
+
+def compute_mel_spectrogram(audio: np.ndarray, sr: int) -> np.ndarray:
+    """Compute log-mel spectrogram using Beat This! official preprocessing.
+
+    Uses torchaudio MelSpectrogram with exact config:
+      n_fft=1024, hop=441, f_min=30, f_max=11000, n_mels=128,
+      mel_scale=slaney, normalized=frame_length, power=1,
+      output = log1p(1000 * mel).T  -> [time, freq]
+    """
+    try:
+        # Use official Beat This! preprocessing if available
+        import torch
+        from beat_this.preprocessing import LogMelSpect
+        audio_t = torch.from_numpy(audio).float()
+        spect = LogMelSpect(sample_rate=sr, device="cpu")
+        with torch.no_grad():
+            mel = spect(audio_t)  # [T, 128]
+        return mel.numpy()[np.newaxis, :, :]  # [1, T, 128]
+    except ImportError:
+        # Fallback: approximate with librosa
+        import warnings
+        warnings.warn("beat_this not installed, using librosa approximation for mel spectrogram")
+        mel = librosa.feature.melspectrogram(
+            y=audio, sr=sr,
+            n_fft=1024, hop_length=441, n_mels=128,
+            fmin=30, fmax=11000, power=1,
+            norm="slaney", htk=False,
+        )
+        log_mel = np.log1p(1000.0 * mel)
+        return log_mel.T[np.newaxis, :, :]  # [1, T, 128]
+
+
+def compute_cqt(audio: np.ndarray, sr: int) -> np.ndarray:
+    """Compute CQT features matching consonance-ACE CQTransform exactly.
+
+    Config from ACE/preprocess/transforms.py:
+      sr=22050, hop=512, bins_per_octave=24, num_octaves=6, start_note=C1
+      Output = abs(cqt)  (raw magnitude, NOT dB)
+    Audio is normalized to [-1, 1] before CQT.
+    """
+    # Normalize audio to [-1, 1] (matching AudioProcessor._normalize)
+    max_val = np.abs(audio).max()
+    if max_val > 0:
+        audio = audio / max_val
+
+    fmin = librosa.note_to_hz("C1")
+    cqt = librosa.cqt(
+        y=audio, sr=sr, hop_length=ACE_HOP,
+        n_bins=ACE_N_BINS, bins_per_octave=24,
+        fmin=fmin,
+    )
+    cqt_mag = np.abs(cqt)
+    # Model expects [batch, 1, freq, time] — raw magnitude
+    return cqt_mag[np.newaxis, np.newaxis, :, :].astype(np.float32)  # [1, 1, 144, T]
+
+
+def verify_beat_this(audio: np.ndarray, sr: int):
+    """Verify Beat This! ONNX model."""
+    print("\n" + "=" * 60)
+    print("BEAT THIS! — BPM Detection Verification")
+    print("=" * 60)
+
+    if not os.path.exists(BEAT_THIS_PATH):
+        print(f"  SKIP: {BEAT_THIS_PATH} not found")
+        return False
+
+    sess = ort.InferenceSession(BEAT_THIS_PATH)
+    inp = sess.get_inputs()[0]
+    print(f"  Model input: {inp.name}, shape={inp.shape}, type={inp.type}")
+    for o in sess.get_outputs():
+        print(f"  Model output: {o.name}, shape={o.shape}")
+
+    # Compute mel spectrogram
+    mel = compute_mel_spectrogram(audio, sr)
+    print(f"  Mel spectrogram: shape={mel.shape}, range=[{mel.min():.2f}, {mel.max():.2f}]")
+
+    # Run inference
+    t0 = time.time()
+    beat_logits, downbeat_logits = sess.run(None, {inp.name: mel.astype(np.float32)})
+    elapsed = time.time() - t0
+    print(f"  Inference time: {elapsed * 1000:.0f}ms")
+
+    print(f"  Beat logits: shape={beat_logits.shape}, range=[{beat_logits.min():.3f}, {beat_logits.max():.3f}]")
+    print(f"  Downbeat logits: shape={downbeat_logits.shape}")
+
+    # Post-processing: local max-pool peak picking (matching Beat This! minimal postprocessor)
+    # 1. max_pool1d with kernel=7 (±70ms at 50fps) to find local maxima
+    # 2. Keep peaks where logit > 0 (probability > 0.5)
+    def peak_pick(logits_1d: np.ndarray, kernel: int = 7) -> np.ndarray:
+        """Pick local maxima from logits, matching Beat This! postprocessor."""
+        from scipy.ndimage import maximum_filter1d
+        maxpool = maximum_filter1d(logits_1d, size=kernel, mode='constant', cval=-1000)
+        peaks = (logits_1d == maxpool) & (logits_1d > 0)
+        return np.where(peaks)[0]
+
+    beat_frames = peak_pick(beat_logits[0])
+    downbeat_frames = peak_pick(downbeat_logits[0])
+
+    # Convert frames to time (hop=441 @ 22050Hz = 20ms per frame)
+    frame_duration = 441.0 / BEAT_THIS_SR  # 0.02s per frame
+    beat_times = beat_frames * frame_duration
+    downbeat_times = downbeat_frames * frame_duration
+
+    print(f"  Detected {len(beat_times)} beats, {len(downbeat_times)} downbeats")
+
+    if len(beat_times) >= 2:
+        # Compute BPM from inter-beat intervals
+        ibis = np.diff(beat_times)
+        median_ibi = np.median(ibis)
+        bpm = 60.0 / median_ibi if median_ibi > 0 else 0
+        print(f"  Estimated BPM: {bpm:.1f}")
+        print(f"  First 10 beat times (s): {beat_times[:10].round(2).tolist()}")
+        print(f"  First 5 downbeat times (s): {downbeat_times[:5].round(2).tolist()}")
+
+        # Sanity checks
+        ok = True
+        if bpm < 30 or bpm > 300:
+            print(f"  WARN: BPM {bpm:.1f} outside expected range [30, 300]")
+            ok = False
+        if len(beat_times) < 4:
+            print(f"  WARN: Too few beats detected ({len(beat_times)})")
+            ok = False
+
+        if ok:
+            print("  PASS: Beat detection looks correct")
+        return ok
+    else:
+        print("  FAIL: Fewer than 2 beats detected")
+        return False
+
+
+def verify_consonance_ace(audio: np.ndarray, sr: int):
+    """Run the consonance-ACE ONNX model on the first 20 s of `audio`, print a per-second chord timeline, and return True unless the model file is missing or more than 95% of frames have root index 0."""
+    print("\n" + "=" * 60)
+    print("CONSONANCE-ACE — Chord Recognition Verification")
+    print("=" * 60)
+
+    if not os.path.exists(CONSONANCE_ACE_PATH):
+        print(f"  SKIP: {CONSONANCE_ACE_PATH} not found")
+        return False  # a skipped model is reported to the caller as a failure
+
+    sess = ort.InferenceSession(CONSONANCE_ACE_PATH)
+    inp = sess.get_inputs()[0]  # only the first declared model input is fed below
+    print(f"  Model input: {inp.name}, shape={inp.shape}, type={inp.type}")
+    for o in sess.get_outputs():
+        print(f"  Model output: {o.name}, shape={o.shape}")
+
+    # Compute CQT on a single 20 s window: only the start of the file is verified, and shorter audio is zero-padded
+    chunk_dur = 20.0
+    n_samples = int(chunk_dur * sr)
+    audio_chunk = audio[:n_samples]
+    if len(audio_chunk) < n_samples:
+        audio_chunk = np.pad(audio_chunk, (0, n_samples - len(audio_chunk)))
+
+    cqt = compute_cqt(audio_chunk, sr)
+    print(f"  CQT features: shape={cqt.shape}, range=[{cqt.min():.2f}, {cqt.max():.2f}]")
+
+    # Run inference (wall-clock timed for the report only)
+    t0 = time.time()
+    root_logits, bass_logits, chord_logits = sess.run(None, {inp.name: cqt.astype(np.float32)})  # assumes exactly three outputs in root/bass/chord order — TODO confirm against the export
+    elapsed = time.time() - t0
+    print(f"  Inference time: {elapsed * 1000:.0f}ms")
+
+    print(f"  Root logits: shape={root_logits.shape}")
+    print(f"  Bass logits: shape={bass_logits.shape}")
+    print(f"  Chord logits: shape={chord_logits.shape}")
+
+    # Decode predictions from batch element 0
+    root_preds = np.argmax(root_logits[0], axis=-1)  # [T]
+    bass_preds = np.argmax(bass_logits[0], axis=-1)  # [T]
+    chord_probs = 1 / (1 + np.exp(-chord_logits[0]))  # sigmoid -> [T, 12]
+
+    n_frames = root_preds.shape[0]
+    frame_dur = chunk_dur / n_frames  # seconds per output frame, derived from the 20 s window
+    print(f"  {n_frames} frames, {frame_dur * 1000:.1f}ms per frame")
+
+    # Sample chord labels at 1-second intervals
+    print("\n  Chord timeline (every 1s):")
+    for sec in range(min(int(chunk_dur), 20)):  # min(..., 20) changes nothing while chunk_dur is 20.0
+        frame_idx = int(sec / frame_dur)
+        if frame_idx >= n_frames:
+            break
+        root = ROOT_LABELS[root_preds[frame_idx]]
+        bass = ROOT_LABELS[bass_preds[frame_idx]]
+        active_notes = np.where(chord_probs[frame_idx] > 0.5)[0]  # pitch classes whose probability exceeds 0.5
+        notes_str = ",".join([PITCH_CLASSES[n] for n in active_notes]) if len(active_notes) > 0 else "none"
+        chord_label = f"{root}" if root != "N" else "N"  # both branches yield the root name, so this is just `root`
+        print(f"    {sec:2d}s: root={root:>2s}  bass={bass:>2s}  notes=[{notes_str}]  -> {chord_label}")
+
+    # Sanity checks
+    ok = True
+    unique_roots = len(set(root_preds.tolist()))
+    if unique_roots < 2:  # warning only: `ok` stays True, so the PASS line can still follow this WARN
+        print(f"\n  WARN: Only {unique_roots} unique root predictions (model may not be discriminating)")
+
+    # Check that not all predictions are "N" (no chord) — assumes root index 0 is the "N" label; TODO confirm against ROOT_LABELS
+    n_ratio = np.mean(root_preds == 0)
+    if n_ratio > 0.95:
+        print(f"  WARN: {n_ratio * 100:.0f}% of frames predicted as 'N' (no chord)")
+        ok = False
+
+    if ok:
+        print("\n  PASS: Chord detection looks correct")
+    return ok
+
+
+def main():
+    """Verify both ONNX models against one audio file; exit 0 if every check passes, else 1."""
+    # NOTE(review): the fallback is an absolute path inside one user's home directory, so
+    # running without an argument only works on a machine that has that file.
+    fallback = "/Users/gongjunmin/timedomain/nanoclaw/groups/main/funk_rock_groove.mp3"
+    audio_path = sys.argv[1] if len(sys.argv) > 1 else fallback
+
+    if not os.path.exists(audio_path):
+        print(f"Audio file not found: {audio_path}")
+        print("Usage: python scripts/verify-onnx-models.py [path/to/audio.wav]")
+        sys.exit(1)
+
+    print(f"Loading audio: {audio_path}")
+    audio, sr = librosa.load(audio_path, sr=22050, mono=True)
+    duration = len(audio) / sr
+    print(f"  Duration: {duration:.1f}s, SR: {sr}, Samples: {len(audio)}")
+
+    # Dict entries are evaluated top to bottom: beat tracking runs first, then chords.
+    results = {
+        "beat_this": verify_beat_this(audio, sr),
+        "consonance_ace": verify_consonance_ace(audio, sr),
+    }
+
+    banner = "=" * 60
+    print("\n" + banner)
+    print("SUMMARY")
+    print(banner)
+    for name, ok in results.items():
+        status = "PASS" if ok else "FAIL"
+        print(f"  {name}: {status}")
+
+    all_passed = all(results.values())
+    print("\nAll models verified successfully!" if all_passed else "\nSome models failed verification.")
+    sys.exit(0 if all_passed else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/components/generation/AudioAnalysisPanel.tsx b/src/components/generation/AudioAnalysisPanel.tsx
index 60a89ad5..c0b333c9 100644
--- a/src/components/generation/AudioAnalysisPanel.tsx
+++ b/src/components/generation/AudioAnalysisPanel.tsx
@@ -2,12 +2,17 @@ import { useState, useEffect, useCallback } from 'react';
 import { useProjectStore } from '../../store/projectStore';
 import { useUIStore } from '../../store/uiStore';
 import { useGenerationStore } from '../../store/generationStore';
+import { useAnalysisStore } from '../../store/analysisStore';
 import * as api from '../../services/aceStepApi';
 import { loadAudioBlobByKey } from '../../services/audioFileManager';
+import { analyzeClipLocally } from '../../services/localAnalysisService';
 import type { TaskResultItem } from '../../types/api';
+import type { LocalAnalysisResult, ChordEvent } from '../../types/analysis';
 import { POLL_INTERVAL_MS, MAX_POLL_DURATION_MS } from '../../constants/defaults';
 
-interface AnalysisResult {
+type AnalysisMode = 'local' | 'server';
+
+interface ServerAnalysisResult {
   bpm: number | undefined;
   keyScale: string | undefined;
   timeSignature: string | undefined;
@@ -25,14 +30,22 @@ export function AudioAnalysisPanel() {
   const clip = analysisClipId ? getClipById(analysisClipId) : null;
   const track = project?.tracks.find((t) => t.clips.some((c) => c.id === analysisClipId)) ?? null;
 
+  const [mode, setMode] = useState<AnalysisMode>('local');
   const [analyzing, setAnalyzing] = useState(false);
-  const [result, setResult] = useState<AnalysisResult | null>(null);
+  const [serverResult, setServerResult] = useState<ServerAnalysisResult | null>(null);
+  const [localResult, setLocalResult] = useState<LocalAnalysisResult | null>(null);
   const [error, setError] = useState('');
   const [applied, setApplied] = useState(false);
 
+  // Local analysis progress from store
+  const analysisJob = useAnalysisStore((s) =>
+    analysisClipId ? s.getJobForClip(analysisClipId) : undefined,
+  );
+
   // Reset when clip changes
   useEffect(() => {
-    setResult(null);
+    setServerResult(null);
+    setLocalResult(null);
     setError('');
     setApplied(false);
     setAnalyzing(false);
@@ -48,14 +61,31 @@ export function AudioAnalysisPanel() {
     return () => window.removeEventListener('keydown', handleEsc);
   }, [onClose]);
 
-  const handleAnalyze = useCallback(async () => {
+  // ---------- Local analysis ----------
+  const handleLocalAnalyze = useCallback(async () => {
+    if (!clip || !analysisClipId || analyzing) return;
+    setAnalyzing(true);
+    setError('');
+    setLocalResult(null);
+
+    try {
+      const result = await analyzeClipLocally(analysisClipId);
+      setLocalResult(result);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : 'Local analysis failed');
+    } finally {
+      setAnalyzing(false);
+    }
+  }, [clip, analysisClipId, analyzing]);
+
+  // ---------- Server analysis ----------
+  const handleServerAnalyze = useCallback(async () => {
     if (!clip || analyzing || isGenerating) return;
     setAnalyzing(true);
     setError('');
-    setResult(null);
+    setServerResult(null);
 
     try {
-      // Load clip audio
       let audioBlob: Blob | null = null;
       if (clip.isolatedAudioKey) {
         audioBlob = (await loadAudioBlobByKey(clip.isolatedAudioKey)) ?? null;
@@ -68,15 +98,13 @@ export function AudioAnalysisPanel() {
         return;
       }
 
-      // Send as a cover task with minimal transformation — we just want the metas back.
-      // The cover task returns inferred BPM, key, etc. in the result metas.
       const coverParams = {
         task_type: 'cover' as const,
         caption: 'analyze audio properties',
         lyrics: '',
-        audio_cover_strength: 0.0, // No transformation — just analyze
+        audio_cover_strength: 0.0,
         audio_duration: clip.duration,
-        inference_steps: 10, // Minimal steps for fast analysis
+        inference_steps: 10,
         guidance_scale: 1.0,
         shift: 1.0,
         batch_size: 1,
@@ -99,7 +127,7 @@ export function AudioAnalysisPanel() {
           const items: TaskResultItem[] = JSON.parse(entry.result);
           const first = items?.[0];
           if (first) {
-            setResult({
+            setServerResult({
               bpm: first.metas?.bpm,
               keyScale: first.metas?.keyscale,
               timeSignature: first.metas?.timesignature,
@@ -123,30 +151,39 @@ export function AudioAnalysisPanel() {
     }
   }, [clip, analyzing, isGenerating, project]);
 
+  const handleAnalyze = mode === 'local' ? handleLocalAnalyze : handleServerAnalyze;
+
   const handleApplyToProject = useCallback(() => {
-    if (!result || !project) return;
+    if (!project) return;
     const updates: Record<string, unknown> = {};
-    if (result.bpm) updates.bpm = Math.round(result.bpm);
-    if (result.keyScale) updates.keyScale = result.keyScale;
+    if (mode === 'local' && localResult) {
+      if (localResult.bpm) updates.bpm = Math.round(localResult.bpm);
+      if (localResult.keyScale) updates.keyScale = localResult.keyScale;
+    } else if (mode === 'server' && serverResult) {
+      if (serverResult.bpm) updates.bpm = Math.round(serverResult.bpm);
+      if (serverResult.keyScale) updates.keyScale = serverResult.keyScale;
+    }
     if (Object.keys(updates).length > 0) {
       useProjectStore.getState().updateProject(updates as { bpm?: number; keyScale?: string });
       setApplied(true);
     }
-  }, [result, project]);
+  }, [mode, localResult, serverResult, project]);
 
   if (!analysisClipId || !clip || !track) return null;
 
   const hasAudio = !!(clip.isolatedAudioKey || clip.cumulativeMixKey);
-
-  // If clip already has inferred metas, show them immediately
   const existingMetas = clip.inferredMetas;
+  const hasResult = mode === 'local' ? !!localResult : !!serverResult;
+  const hasBpmOrKey = mode === 'local'
+    ? !!(localResult?.bpm || localResult?.keyScale)
+    : !!(serverResult?.bpm || serverResult?.keyScale);
 
   return (
     <div
       className="fixed inset-0 z-50 flex items-center justify-center bg-black/60 backdrop-blur-sm"
       onClick={(e) => { if (e.target === e.currentTarget) onClose(); }}
     >
-      <div className="bg-daw-surface border border-daw-border rounded-lg shadow-2xl w-[380px] max-h-[70vh] flex flex-col text-xs text-zinc-200">
+      <div className="bg-daw-surface border border-daw-border rounded-lg shadow-2xl w-[420px] max-h-[80vh] flex flex-col text-xs text-zinc-200">
         {/* Header */}
         <div className="flex items-center justify-between px-4 py-3 border-b border-daw-border">
           <div className="flex items-center gap-2">
@@ -165,6 +202,30 @@ export function AudioAnalysisPanel() {
 
         {/* Body */}
         <div className="flex-1 overflow-y-auto px-4 py-3 space-y-3 min-h-0">
+          {/* Mode selector */}
+          <div className="flex gap-1 p-0.5 bg-[#1a1a1a] rounded-md border border-[#333]">
+            <button
+              onClick={() => setMode('local')}
+              className={`flex-1 px-3 py-1.5 rounded text-[10px] font-medium transition-colors ${
+                mode === 'local'
+                  ? 'bg-cyan-700/60 text-cyan-200'
+                  : 'text-zinc-400 hover:text-zinc-200'
+              }`}
+            >
+              Local (Browser)
+            </button>
+            <button
+              onClick={() => setMode('server')}
+              className={`flex-1 px-3 py-1.5 rounded text-[10px] font-medium transition-colors ${
+                mode === 'server'
+                  ? 'bg-cyan-700/60 text-cyan-200'
+                  : 'text-zinc-400 hover:text-zinc-200'
+              }`}
+            >
+              Server
+            </button>
+          </div>
+
           {/* Source clip info */}
           <div className="bg-[#222]/60 rounded px-3 py-2.5 border border-[#3a3a3a] space-y-0.5">
             <p className="text-[10px] text-zinc-400 uppercase tracking-wide">Clip</p>
@@ -175,10 +236,33 @@ export function AudioAnalysisPanel() {
             <p className="text-[10px] text-zinc-400">{clip.duration.toFixed(1)}s</p>
           </div>
 
+          {/* Local analysis progress */}
+          {mode === 'local' && analyzing && analysisJob && (
+            <div className="bg-[#1a1c20]/60 rounded px-3 py-2.5 border border-cyan-900/40 space-y-1.5">
+              <p className="text-[10px] text-cyan-400 uppercase tracking-wide font-medium">
+                Analyzing...
+              </p>
+              <div className="w-full h-1.5 bg-[#333] rounded-full overflow-hidden">
+                <div
+                  className="h-full bg-cyan-500 rounded-full transition-all duration-300"
+                  style={{ width: `${analysisJob.progress}%` }}
+                />
+              </div>
+              <p className="text-[10px] text-zinc-400">{analysisJob.message}</p>
+            </div>
+          )}
+
           {/* Existing inferred metas */}
           {existingMetas && (
             <div className="bg-[#1a2a1a]/60 rounded px-3 py-2.5 border border-emerald-900/40 space-y-1">
-              <p className="text-[10px] text-emerald-400 uppercase tracking-wide font-medium">Previously Inferred</p>
+              <p className="text-[10px] text-emerald-400 uppercase tracking-wide font-medium">
+                Previously Inferred
+                {existingMetas.analysisSource && (
+                  <span className="ml-1 text-[9px] text-emerald-600">
+                    ({existingMetas.analysisSource})
+                  </span>
+                )}
+              </p>
               <div className="grid grid-cols-2 gap-x-4 gap-y-1">
                 {existingMetas.bpm && (
                   <div>
@@ -205,42 +289,72 @@ export function AudioAnalysisPanel() {
                   </div>
                 )}
               </div>
+
+              {/* Chord display for local analysis results */}
+              {existingMetas.chords && existingMetas.chords.length > 0 && (
+                <div className="mt-2 pt-2 border-t border-emerald-900/30">
+                  <span className="text-[10px] text-zinc-400">Chords</span>
+                  <div className="flex flex-wrap gap-1 mt-1">
+                    {existingMetas.chords.slice(0, 16).map((chord, i) => (
+                      <span
+                        key={i}
+                        className="px-1.5 py-0.5 rounded text-[10px] font-mono bg-emerald-900/40 text-emerald-300 border border-emerald-800/40"
+                        title={`${chord.startTime.toFixed(1)}s - ${chord.endTime.toFixed(1)}s`}
+                      >
+                        {chord.label}
+                      </span>
+                    ))}
+                    {existingMetas.chords.length > 16 && (
+                      <span className="text-[10px] text-zinc-500">
+                        +{existingMetas.chords.length - 16} more
+                      </span>
+                    )}
+                  </div>
+                </div>
+              )}
             </div>
           )}
 
-          {/* Analysis results */}
-          {result && (
+          {/* Local analysis results */}
+          {mode === 'local' && localResult && (
+            <LocalResultDisplay result={localResult} />
+          )}
+
+          {/* Server analysis results */}
+          {mode === 'server' && serverResult && (
             <div className="bg-[#1a1c20]/60 rounded px-3 py-2.5 border border-cyan-900/40 space-y-1">
-              <p className="text-[10px] text-cyan-400 uppercase tracking-wide font-medium">Analysis Results</p>
+              <p className="text-[10px] text-cyan-400 uppercase tracking-wide font-medium">
+                Server Results
+              </p>
               <div className="grid grid-cols-2 gap-x-4 gap-y-1">
-                {result.bpm && (
+                {serverResult.bpm && (
                   <div>
                     <span className="text-[10px] text-zinc-400">BPM</span>
-                    <p className="text-[11px] font-mono text-cyan-300">{Math.round(result.bpm)}</p>
+                    <p className="text-[11px] font-mono text-cyan-300">{Math.round(serverResult.bpm)}</p>
                   </div>
                 )}
-                {result.keyScale && (
+                {serverResult.keyScale && (
                   <div>
                     <span className="text-[10px] text-zinc-400">Key</span>
-                    <p className="text-[11px] font-mono text-cyan-300">{result.keyScale}</p>
+                    <p className="text-[11px] font-mono text-cyan-300">{serverResult.keyScale}</p>
                   </div>
                 )}
-                {result.timeSignature && (
+                {serverResult.timeSignature && (
                   <div>
                     <span className="text-[10px] text-zinc-400">Time Sig</span>
-                    <p className="text-[11px] font-mono text-cyan-300">{result.timeSignature}</p>
+                    <p className="text-[11px] font-mono text-cyan-300">{serverResult.timeSignature}</p>
                   </div>
                 )}
-                {result.genres && (
+                {serverResult.genres && (
                   <div className="col-span-2">
                     <span className="text-[10px] text-zinc-400">Genre</span>
-                    <p className="text-[11px] text-cyan-300">{result.genres}</p>
+                    <p className="text-[11px] text-cyan-300">{serverResult.genres}</p>
                   </div>
                 )}
-                {result.caption && (
+                {serverResult.caption && (
                   <div className="col-span-2">
                     <span className="text-[10px] text-zinc-400">Description</span>
-                    <p className="text-[10px] text-cyan-200 leading-relaxed">{result.caption}</p>
+                    <p className="text-[10px] text-cyan-200 leading-relaxed">{serverResult.caption}</p>
                   </div>
                 )}
               </div>
@@ -258,6 +372,13 @@ export function AudioAnalysisPanel() {
               No audio available — generate the clip first before analyzing.
             </p>
           )}
+
+          {mode === 'local' && !analyzing && !localResult && hasAudio && (
+            <p className="text-[10px] text-zinc-500">
+              Local analysis uses Beat This! for BPM detection and Consonance-ACE for chord recognition.
+              Models are downloaded on first use (~23MB total) and cached locally.
+            </p>
+          )}
         </div>
 
         {/* Footer */}
@@ -269,7 +390,7 @@ export function AudioAnalysisPanel() {
             Close
           </button>
           <div className="flex gap-2">
-            {result && (result.bpm || result.keyScale) && (
+            {hasResult && hasBpmOrKey && (
               <button
                 onClick={handleApplyToProject}
                 disabled={applied}
@@ -284,9 +405,9 @@ export function AudioAnalysisPanel() {
             )}
             <button
               onClick={handleAnalyze}
-              disabled={analyzing || !hasAudio || isGenerating}
+              disabled={analyzing || !hasAudio || (mode === 'server' && isGenerating)}
               className={`px-4 py-1.5 rounded text-xs font-medium transition-colors ${
-                analyzing || !hasAudio || isGenerating
+                analyzing || !hasAudio || (mode === 'server' && isGenerating)
                   ? 'bg-[#444] text-zinc-400 cursor-not-allowed'
                   : 'bg-cyan-600 hover:bg-cyan-500 text-white'
               }`}
@@ -299,3 +420,70 @@ export function AudioAnalysisPanel() {
     </div>
   );
 }
+
+// ---------- Local result display component ----------
+
+/**
+ * Summary card for a local analysis run: BPM, optional key/time signature, beat count, chords.
+ */
+function LocalResultDisplay({ result }: { result: LocalAnalysisResult }) {
+  const { bpm, keyScale, timeSignature, beats, chords } = result;
+  return (
+    <div className="bg-[#1a1c20]/60 rounded px-3 py-2.5 border border-cyan-900/40 space-y-1">
+      <p className="text-[10px] text-cyan-400 uppercase tracking-wide font-medium">Local Analysis Results</p>
+      <div className="grid grid-cols-2 gap-x-4 gap-y-1">
+        <div>
+          <span className="text-[10px] text-zinc-400">BPM</span>
+          <p className="text-[11px] font-mono text-cyan-300">{Math.round(bpm)}</p>
+        </div>
+        {keyScale && (
+          <div>
+            <span className="text-[10px] text-zinc-400">Key</span>
+            <p className="text-[11px] font-mono text-cyan-300">{keyScale}</p>
+          </div>
+        )}
+        {timeSignature && (
+          <div>
+            <span className="text-[10px] text-zinc-400">Time Sig</span>
+            <p className="text-[11px] font-mono text-cyan-300">{timeSignature}</p>
+          </div>
+        )}
+        <div>
+          <span className="text-[10px] text-zinc-400">Beats</span>
+          <p className="text-[11px] font-mono text-cyan-300">{beats.length} detected</p>
+        </div>
+      </div>
+
+      {/* Chord chips are only mounted when at least one chord event exists */}
+      {chords.length > 0 && <ChordTimeline chords={chords} />}
+    </div>
+  );
+}
+
+/**
+ * Chip list of detected chords. Entries labelled "N" (no chord) are dropped, at most 24 chips
+ * are drawn, and any remainder is summarised as "+k more". Renders nothing if no chord is left.
+ */
+function ChordTimeline({ chords }: { chords: ChordEvent[] }) {
+  const audible = chords.filter(({ label }) => label !== 'N');
+  if (!audible.length) return null;
+  const hiddenCount = audible.length - 24; // chips beyond the first 24 are only counted
+
+  return (
+    <div className="mt-2 pt-2 border-t border-cyan-900/30">
+      <span className="text-[10px] text-zinc-400">Chords ({audible.length})</span>
+      <div className="flex flex-wrap gap-1 mt-1">
+        {audible.slice(0, 24).map((chord, i) => (
+          <span
+            key={i}
+            className="px-1.5 py-0.5 rounded text-[10px] font-mono bg-cyan-900/40 text-cyan-300 border border-cyan-800/40"
+            title={`${chord.startTime.toFixed(1)}s - ${chord.endTime.toFixed(1)}s (${(chord.confidence * 100).toFixed(0)}%)`}
+          >
+            {chord.label}
+          </span>
+        ))}
+        {hiddenCount > 0 && <span className="text-[10px] text-zinc-500">+{hiddenCount} more</span>}
+      </div>
+    </div>
+  );
+}
diff --git a/src/main.tsx b/src/main.tsx
index 2a85098c..9a22d2af 100644
--- a/src/main.tsx
+++ b/src/main.tsx
@@ -18,6 +18,8 @@ import { createProjectShare } from './services/projectSharingService';
 import { generateProjectSummary, generateProjectStructure } from './utils/dawStateSummary';
 import { getMidiCaptureService } from './services/midiCaptureService';
 import { executeCoreDawShortcut } from './services/coreDawShortcuts';
+import { useAnalysisStore } from './store/analysisStore';
+import { analyzeClipLocally } from './services/localAnalysisService';
 
 const agentProjectStore = {
   getState: () => ({
@@ -101,6 +103,8 @@ const agentProjectStore = {
 (window as unknown as Record<string, unknown>).__transportStore = useTransportStore;
 (window as unknown as Record<string, unknown>).__collaborationStore = useCollaborationStore;
 (window as unknown as Record<string, unknown>).__generationStore = useGenerationStore;
+(window as unknown as Record<string, unknown>).__analysisStore = useAnalysisStore;
+(window as unknown as Record<string, unknown>).__analyzeClipLocally = analyzeClipLocally;
 (window as unknown as Record<string, unknown>).__sessionStore = useSessionStore;
 (window as unknown as Record<string, unknown>).__modelStore = useModelStore;
 (window as unknown as Record<string, unknown>).__getAudioEngine = () => getAudioEngine();
diff --git a/src/services/__tests__/modelManager.test.ts b/src/services/__tests__/modelManager.test.ts
new file mode 100644
index 00000000..ab4db888
--- /dev/null
+++ b/src/services/__tests__/modelManager.test.ts
@@ -0,0 +1,106 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { loadModelBytes, isModelCached, getModelMeta, MODEL_REGISTRY } from '../modelManager';
+
+// Mock idb-keyval with an in-memory Map; `_store` is an extra export that exists only on this mock so tests can seed and clear the cache
+vi.mock('idb-keyval', () => {
+  const store = new Map<string, unknown>();
+  return {
+    get: vi.fn((key: string) => Promise.resolve(store.get(key))), // resolves undefined for unknown keys
+    set: vi.fn((key: string, value: unknown) => {
+      store.set(key, value);
+      return Promise.resolve();
+    }),
+    _store: store, // NOTE(review): not part of the real module's exports — the import below may be flagged if test files are type-checked
+  };
+});
+
+// Access the mock store
+import { _store as mockStore, get as mockGet, set as mockSet } from 'idb-keyval';
+
+beforeEach(() => { // isolate tests from each other
+  (mockStore as Map<string, unknown>).clear(); // start every test with an empty mock cache
+  vi.clearAllMocks(); // drop call history recorded on the mocked get/set
+});
+
+describe('getModelMeta', () => { // registry lookups: pins the static metadata returned per model id
+  it('returns metadata for beat-this-small', () => {
+    const meta = getModelMeta('beat-this-small');
+    expect(meta.id).toBe('beat-this-small');
+    expect(meta.name).toBe('Beat This! (small)');
+    expect(meta.sizeBytes).toBeGreaterThan(0); // only positivity is pinned, not the exact size
+    expect(meta.url).toContain('.onnx');
+    expect(meta.cacheKey).toBe('onnx-model:beat-this-small');
+  });
+
+  it('returns metadata for consonance-ace', () => {
+    const meta = getModelMeta('consonance-ace');
+    expect(meta.id).toBe('consonance-ace');
+    expect(meta.name).toBe('Consonance-ACE');
+  });
+});
+
+describe('isModelCached', () => { // cache probe, answered from the mocked idb-keyval Map
+  it('returns false when model is not cached', async () => {
+    expect(await isModelCached('beat-this-small')).toBe(false); // the Map was emptied in beforeEach
+  });
+
+  it('returns true when model is cached', async () => {
+    const meta = MODEL_REGISTRY['beat-this-small'];
+    (mockStore as Map<string, unknown>).set(meta.cacheKey, new ArrayBuffer(100)); // seed the mock cache directly, bypassing `set`
+    expect(await isModelCached('beat-this-small')).toBe(true);
+  });
+});
+
+describe('loadModelBytes', () => { // covers the cache hit, the streamed download and the HTTP error path
+  it('returns cached model without fetching', async () => {
+    const meta = MODEL_REGISTRY['beat-this-small'];
+    const cachedBuffer = new ArrayBuffer(42);
+    (mockStore as Map<string, unknown>).set(meta.cacheKey, cachedBuffer);
+
+    const onProgress = vi.fn();
+    const result = await loadModelBytes('beat-this-small', onProgress);
+
+    expect(result).toBe(cachedBuffer); // same object identity: the cached buffer is returned as-is
+    expect(onProgress).toHaveBeenCalledWith(
+      expect.objectContaining({ percent: 100, bytesLoaded: 42, bytesTotal: 42 }),
+    );
+  });
+
+  it('fetches model when not cached and reports progress', async () => {
+    const fakeData = new Uint8Array([1, 2, 3, 4, 5]);
+    const mockReader = { // stream stub: one 5-byte chunk, then end of stream
+      read: vi.fn()
+        .mockResolvedValueOnce({ done: false, value: fakeData })
+        .mockResolvedValueOnce({ done: true }),
+    };
+    globalThis.fetch = vi.fn().mockResolvedValue({ // NOTE(review): replaces the global fetch and never restores it
+      ok: true,
+      headers: { get: (h: string) => (h === 'Content-Length' ? '5' : null) }, // matches the header name case-sensitively
+      body: { getReader: () => mockReader },
+    }) as unknown as typeof fetch;
+
+    const onProgress = vi.fn();
+    const result = await loadModelBytes('beat-this-small', onProgress);
+
+    expect(result.byteLength).toBe(5);
+    expect(new Uint8Array(result)).toEqual(fakeData);
+
+    // The downloaded bytes should have been written to the cache under the model's cache key
+    expect(mockSet).toHaveBeenCalledWith('onnx-model:beat-this-small', result);
+
+    // At least one progress callback reports the full 5 bytes at 100%
+    expect(onProgress).toHaveBeenCalledWith(
+      expect.objectContaining({ bytesLoaded: 5, percent: 100 }),
+    );
+  });
+
+  it('throws on fetch error', async () => {
+    globalThis.fetch = vi.fn().mockResolvedValue({ // non-OK response stub; same unrestored global as above
+      ok: false,
+      status: 404,
+      statusText: 'Not Found',
+    }) as unknown as typeof fetch;
+
+    await expect(loadModelBytes('beat-this-small')).rejects.toThrow('Failed to download model');
+  });
+});
diff --git a/src/services/localAnalysisService.ts b/src/services/localAnalysisService.ts
new file mode 100644
index 00000000..b70b2d8f
--- /dev/null
+++ b/src/services/localAnalysisService.ts
@@ -0,0 +1,136 @@
+/**
+ * Local audio analysis service — orchestrates Web Worker for BPM/chord detection.
+ */
+import { useProjectStore } from '../store/projectStore';
+import { useAnalysisStore } from '../store/analysisStore';
+import { loadAudioBlobByKey } from './audioFileManager';
+import { downsampleToMono } from '../utils/melSpectrogram';
+import type {
+  AnalysisTask,
+  AnalysisWorkerMessage,
+  AnalysisWorkerRequest,
+  LocalAnalysisResult,
+} from '../types/analysis';
+
+const TARGET_SAMPLE_RATE = 22050;
+
+// Singleton worker — created lazily by getOrCreateWorker and reused until terminateAnalysisWorker clears it
+let workerInstance: Worker | null = null;
+
+function getOrCreateWorker(): Worker { // returns the shared worker, spawning it on first use
+  if (!workerInstance) {
+    workerInstance = new Worker( // NOTE(review): keep `new URL(...)` inline here — presumably the bundler (Vite?) needs this exact form to detect and bundle the worker; confirm before refactoring
+      new URL('../workers/analysisWorker.ts', import.meta.url),
+      { type: 'module' }, // load the worker script as an ES module
+    );
+  }
+  return workerInstance;
+}
+
+/**
+ * Shut down the shared analysis worker, if one exists, and clear the cached handle.
+ * Does nothing when no worker has been created.
+ */
+export function terminateAnalysisWorker(): void {
+  if (workerInstance === null) return;
+  workerInstance.terminate();
+  workerInstance = null;
+}
+
+/**
+ * Analyze a clip locally using ONNX models in a Web Worker.
+ * Decodes the clip's stored audio, sends mono samples to the shared worker, then merges the
+ * result into the clip's `inferredMetas` (tagged `analysisSource: 'local'`).
+ * @param clipId - Id of the clip to analyze; its isolated audio is preferred over the mix.
+ * @param tasks - Analyses requested from the worker (defaults to BPM and chords).
+ * @returns The worker's analysis result.
+ * @throws Error when the clip, its audio or the worker fails; the job is marked failed first.
+ * NOTE(review): worker messages carry no request id, so overlapping calls on the shared worker
+ * would each settle on the first result posted — avoid running two analyses at once.
+ */
+export async function analyzeClipLocally(
+  clipId: string,
+  tasks: AnalysisTask[] = ['bpm', 'chords'],
+): Promise<LocalAnalysisResult> {
+  const store = useAnalysisStore.getState();
+  const jobId = store.createJob(clipId);
+
+  try {
+    // 1. Load audio blob from IndexedDB
+    const clip = useProjectStore.getState().getClipById(clipId);
+    if (!clip) throw new Error(`Clip not found: ${clipId}`);
+
+    const audioKey = clip.isolatedAudioKey ?? clip.cumulativeMixKey;
+    if (!audioKey) throw new Error('No audio available for this clip');
+
+    const blob = await loadAudioBlobByKey(audioKey);
+    if (!blob) throw new Error('Audio blob not found in storage');
+
+    // 2. Decode to AudioBuffer
+    useAnalysisStore.getState().updateJobProgress(jobId, {
+      type: 'progress', status: 'decoding-audio', percent: 5, message: 'Decoding audio...',
+    });
+
+    const arrayBuffer = await blob.arrayBuffer();
+    const audioCtx = new OfflineAudioContext(1, 1, TARGET_SAMPLE_RATE);
+    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
+
+    // 3. Downsample to mono
+    const samples = downsampleToMono(audioBuffer, TARGET_SAMPLE_RATE);
+
+    // 4. Send to worker
+    const worker = getOrCreateWorker();
+    const result = await new Promise<LocalAnalysisResult>((resolve, reject) => {
+      const detach = () => { // single teardown path so no listener outlives this request
+        worker.removeEventListener('message', handleMessage);
+        worker.removeEventListener('error', handleError);
+      };
+      const handleMessage = (e: MessageEvent<AnalysisWorkerMessage>) => {
+        const msg = e.data;
+        if (msg.type === 'progress') {
+          useAnalysisStore.getState().updateJobProgress(jobId, msg);
+        } else if (msg.type === 'result') {
+          detach();
+          useAnalysisStore.getState().completeJob(jobId, msg.result);
+          resolve(msg.result);
+        } else if (msg.type === 'error') {
+          detach();
+          reject(new Error(msg.error));
+        }
+      };
+      const handleError = (e: ErrorEvent) => {
+        detach();
+        reject(new Error(e.message || 'Worker error'));
+      };
+      worker.addEventListener('message', handleMessage);
+      worker.addEventListener('error', handleError);
+      const request: AnalysisWorkerRequest = { type: 'analyze', samples, sampleRate: TARGET_SAMPLE_RATE, tasks };
+      try {
+        // Transfer the samples buffer to the worker (zero-copy)
+        worker.postMessage(request, [samples.buffer]);
+      } catch (postErr) {
+        detach(); // a failed post would otherwise leave both listeners attached to the shared worker
+        reject(postErr);
+      }
+    });
+
+    // 5. Write results to clip.inferredMetas — merge into the clip's current metas, since `clip` predates the awaits above
+    const latestClip = useProjectStore.getState().getClipById(clipId) ?? clip;
+    useProjectStore.getState().updateClip(clipId, {
+      inferredMetas: {
+        ...latestClip.inferredMetas,
+        bpm: result.bpm,
+        keyScale: result.keyScale ?? undefined,
+        timeSignature: result.timeSignature ?? undefined,
+        beats: result.beats,
+        chords: result.chords,
+        analysisSource: 'local',
+      },
+    });
+    return result;
+  } catch (err) {
+    const errorMsg = err instanceof Error ? err.message : String(err);
+    useAnalysisStore.getState().failJob(jobId, errorMsg);
+    throw err;
+  }
+}
diff --git a/src/services/modelManager.ts b/src/services/modelManager.ts
new file mode 100644
index 00000000..1c27c5a7
--- /dev/null
+++ b/src/services/modelManager.ts
@@ -0,0 +1,107 @@
+/**
+ * ONNX model manager — lazy download, IndexedDB caching, progress reporting.
+ */
+import { get, set } from 'idb-keyval';
+import type { AnalysisModelId, AnalysisModelMeta, ModelDownloadProgress } from '../types/analysis';
+
+/**
+ * Registry of the ONNX models that can be loaded, keyed by model ID.
+ *
+ * `sizeBytes` is the fallback progress total used by `loadModelBytes` when the
+ * response carries no usable `Content-Length`; `cacheKey` is the IndexedDB key
+ * the downloaded bytes are stored under.
+ */
+export const MODEL_REGISTRY: Record<AnalysisModelId, AnalysisModelMeta> = {
+  'beat-this-small': {
+    id: 'beat-this-small',
+    name: 'Beat This! (small)',
+    sizeBytes: 8_400_000,
+    url: '/models/beat-this-small.onnx',
+    cacheKey: 'onnx-model:beat-this-small',
+  },
+  'consonance-ace': {
+    id: 'consonance-ace',
+    name: 'Consonance-ACE',
+    sizeBytes: 15_000_000,
+    url: '/models/consonance-ace.onnx',
+    cacheKey: 'onnx-model:consonance-ace',
+  },
+};
+
+/**
+ * Load model bytes from the IndexedDB cache, or fetch them from the network
+ * and cache them for next time.
+ *
+ * The cache is only an optimisation: a failed cache read is treated as a miss
+ * and a failed cache write is logged, so successfully downloaded bytes are
+ * always returned to the caller.
+ *
+ * @param modelId - Which registered model to load.
+ * @param onProgress - Optional progress callback. `percent` never exceeds 100,
+ *   and a final 100% update is sent on every successful path.
+ * @returns The complete model file contents.
+ * @throws Error when the HTTP response status is not OK.
+ */
+export async function loadModelBytes(
+  modelId: AnalysisModelId,
+  onProgress?: (p: ModelDownloadProgress) => void,
+): Promise<ArrayBuffer> {
+  const meta = MODEL_REGISTRY[modelId];
+
+  // Emit the terminal "everything loaded" progress update.
+  const reportDone = (byteLength: number): void => {
+    onProgress?.({
+      modelName: meta.name,
+      bytesLoaded: byteLength,
+      bytesTotal: byteLength,
+      percent: 100,
+    });
+  };
+
+  // Best-effort cache write: quota or availability errors must not discard the download.
+  const cacheBytes = async (bytes: ArrayBuffer): Promise<void> => {
+    try {
+      await set(meta.cacheKey, bytes);
+    } catch (err) {
+      console.warn(`Could not cache model ${meta.name}:`, err);
+    }
+  };
+
+  // Try cache first; an unreadable cache falls through to the network.
+  let cached: ArrayBuffer | undefined;
+  try {
+    cached = await get<ArrayBuffer>(meta.cacheKey);
+  } catch (err) {
+    console.warn(`Could not read cached model ${meta.name}:`, err);
+  }
+  if (cached) {
+    reportDone(cached.byteLength);
+    return cached;
+  }
+
+  // Fetch with streaming progress
+  const response = await fetch(meta.url);
+  if (!response.ok) {
+    throw new Error(`Failed to download model ${meta.name}: ${response.status} ${response.statusText}`);
+  }
+
+  const reader = response.body?.getReader();
+
+  if (!reader) {
+    // Fallback: no streaming — download entire response
+    const buffer = await response.arrayBuffer();
+    await cacheBytes(buffer);
+    reportDone(buffer.byteLength);
+    return buffer;
+  }
+
+  // Content-Length may be absent, or describe the compressed size while the
+  // reader yields decoded bytes, so it is only a hint for the progress total.
+  const expectedBytes = Number(response.headers.get('Content-Length')) || meta.sizeBytes;
+  const chunks: Uint8Array[] = [];
+  let bytesLoaded = 0;
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    chunks.push(value);
+    bytesLoaded += value.byteLength;
+    // Grow the total when the hint undercounts so loaded never exceeds total.
+    const bytesTotal = Math.max(expectedBytes, bytesLoaded);
+    onProgress?.({
+      modelName: meta.name,
+      bytesLoaded,
+      bytesTotal,
+      percent: bytesTotal > 0 ? Math.round((bytesLoaded / bytesTotal) * 100) : 0,
+    });
+  }
+
+  // Merge chunks into single ArrayBuffer
+  const buffer = new ArrayBuffer(bytesLoaded);
+  const view = new Uint8Array(buffer);
+  let offset = 0;
+  for (const chunk of chunks) {
+    view.set(chunk, offset);
+    offset += chunk.byteLength;
+  }
+
+  // Cache in IndexedDB
+  await cacheBytes(buffer);
+
+  // The size hint may have overestimated, so always finish on an explicit 100%.
+  reportDone(bytesLoaded);
+
+  return buffer;
+}
+
+/**
+ * Report whether the bytes for `modelId` are present in the IndexedDB cache.
+ */
+export async function isModelCached(modelId: AnalysisModelId): Promise<boolean> {
+  const { cacheKey } = MODEL_REGISTRY[modelId];
+  return (await get<ArrayBuffer>(cacheKey)) !== undefined;
+}
+
+/**
+ * Look up the registry entry describing `modelId`.
+ */
+export function getModelMeta(modelId: AnalysisModelId): AnalysisModelMeta {
+  const registryEntry = MODEL_REGISTRY[modelId];
+  return registryEntry;
+}
diff --git a/src/store/__tests__/analysisStore.test.ts b/src/store/__tests__/analysisStore.test.ts
new file mode 100644
index 00000000..aa2c211b
--- /dev/null
+++ b/src/store/__tests__/analysisStore.test.ts
@@ -0,0 +1,121 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import { useAnalysisStore } from '../analysisStore';
+import type { LocalAnalysisResult } from '../../types/analysis';
+
+// The store is a module-level singleton, so every test starts from an empty job map.
+beforeEach(() => {
+  // Reset store state between tests
+  useAnalysisStore.setState({ jobs: {} });
+});
+
+// Unit tests for the analysis job store: job lifecycle (create → progress →
+// complete/fail → clear) and lookup of the latest job for a clip.
+describe('analysisStore', () => {
+  describe('createJob', () => {
+    it('creates a job with initial state', () => {
+      const jobId = useAnalysisStore.getState().createJob('clip-1');
+      const job = useAnalysisStore.getState().jobs[jobId];
+
+      expect(job).toBeDefined();
+      expect(job.clipId).toBe('clip-1');
+      expect(job.status).toBe('idle');
+      expect(job.progress).toBe(0);
+      expect(job.result).toBeNull();
+      expect(job.error).toBeNull();
+      expect(job.startedAt).toBeGreaterThan(0);
+      expect(job.completedAt).toBeNull();
+    });
+
+    it('creates unique job IDs', () => {
+      // Same clip, created back-to-back: IDs must still differ.
+      const id1 = useAnalysisStore.getState().createJob('clip-1');
+      const id2 = useAnalysisStore.getState().createJob('clip-1');
+      expect(id1).not.toBe(id2);
+    });
+  });
+
+  describe('updateJobProgress', () => {
+    it('updates status, progress, and message', () => {
+      const jobId = useAnalysisStore.getState().createJob('clip-1');
+
+      useAnalysisStore.getState().updateJobProgress(jobId, {
+        type: 'progress',
+        status: 'computing-features',
+        percent: 30,
+        message: 'Computing mel spectrogram...',
+      });
+
+      const job = useAnalysisStore.getState().jobs[jobId];
+      expect(job.status).toBe('computing-features');
+      expect(job.progress).toBe(30);
+      expect(job.message).toBe('Computing mel spectrogram...');
+    });
+
+    it('ignores updates for unknown jobs', () => {
+      // Identity check (toBe): the jobs object must not even be re-created.
+      const before = useAnalysisStore.getState().jobs;
+      useAnalysisStore.getState().updateJobProgress('nonexistent', {
+        type: 'progress',
+        status: 'running-bpm',
+        percent: 50,
+        message: 'test',
+      });
+      expect(useAnalysisStore.getState().jobs).toBe(before);
+    });
+  });
+
+  describe('completeJob', () => {
+    it('sets done status and stores result', () => {
+      const jobId = useAnalysisStore.getState().createJob('clip-1');
+      const result: LocalAnalysisResult = {
+        bpm: 128,
+        beats: [{ time: 0.5, isDownbeat: true, confidence: 0.95 }],
+        chords: [{ startTime: 0, endTime: 2, label: 'C:maj', confidence: 0.9 }],
+        keyScale: 'C major',
+        timeSignature: '4/4',
+      };
+
+      useAnalysisStore.getState().completeJob(jobId, result);
+
+      const job = useAnalysisStore.getState().jobs[jobId];
+      expect(job.status).toBe('done');
+      expect(job.progress).toBe(100);
+      expect(job.result).toEqual(result);
+      expect(job.completedAt).toBeGreaterThan(0);
+    });
+  });
+
+  describe('failJob', () => {
+    it('sets error status and message', () => {
+      const jobId = useAnalysisStore.getState().createJob('clip-1');
+
+      useAnalysisStore.getState().failJob(jobId, 'Model failed to load');
+
+      const job = useAnalysisStore.getState().jobs[jobId];
+      expect(job.status).toBe('error');
+      expect(job.error).toBe('Model failed to load');
+      expect(job.completedAt).toBeGreaterThan(0);
+    });
+  });
+
+  describe('clearJob', () => {
+    it('removes a job from tracking', () => {
+      const jobId = useAnalysisStore.getState().createJob('clip-1');
+      expect(useAnalysisStore.getState().jobs[jobId]).toBeDefined();
+
+      useAnalysisStore.getState().clearJob(jobId);
+      expect(useAnalysisStore.getState().jobs[jobId]).toBeUndefined();
+    });
+  });
+
+  describe('getJobForClip', () => {
+    it('returns the most recent job for a clip', () => {
+      // Two jobs for clip-1 created back-to-back plus one for another clip:
+      // the later clip-1 job must win and the clip-2 job must be ignored.
+      const id1 = useAnalysisStore.getState().createJob('clip-1');
+      const id2 = useAnalysisStore.getState().createJob('clip-1');
+      useAnalysisStore.getState().createJob('clip-2');
+
+      const job = useAnalysisStore.getState().getJobForClip('clip-1');
+      expect(job?.id).toBe(id2); // most recent
+    });
+
+    it('returns undefined for unknown clip', () => {
+      const job = useAnalysisStore.getState().getJobForClip('nonexistent');
+      expect(job).toBeUndefined();
+    });
+  });
+});
diff --git a/src/store/analysisStore.ts b/src/store/analysisStore.ts
new file mode 100644
index 00000000..f3d22287
--- /dev/null
+++ b/src/store/analysisStore.ts
@@ -0,0 +1,139 @@
+/**
+ * Zustand store for tracking local audio analysis jobs.
+ */
+import { create } from 'zustand';
+import type {
+  LocalAnalysisResult,
+  LocalAnalysisStatus,
+  AnalysisWorkerProgress,
+} from '../types/analysis';
+
+/** Snapshot of one analysis run for a clip, as tracked by the analysis store. */
+export interface AnalysisJob {
+  /** Unique job ID returned by `createJob`. */
+  id: string;
+  /** ID of the clip this job analyses. */
+  clipId: string;
+  /** Current pipeline stage; `'done'` or `'error'` once the job has finished. */
+  status: LocalAnalysisStatus;
+  /** Progress percentage; starts at 0 and is set to 100 on completion. */
+  progress: number;
+  /** Human-readable status text (the error text after a failure). */
+  message: string;
+  /** Analysis output; null until the job completes successfully. */
+  result: LocalAnalysisResult | null;
+  /** Failure description; null unless the job failed. */
+  error: string | null;
+  /** Millisecond timestamp taken at creation; orders jobs in `getJobForClip`. */
+  startedAt: number;
+  /** Millisecond timestamp taken on completion or failure; null while in flight. */
+  completedAt: number | null;
+}
+
+/** Shape of the analysis store: the job map plus the actions that mutate it. */
+interface AnalysisState {
+  /** All tracked jobs, keyed by job ID. */
+  jobs: Record<string, AnalysisJob>;
+
+  /** Create a new job entry in the 'idle' state. Returns the job ID. */
+  createJob: (clipId: string) => string;
+
+  /** Copy status, percent and message from a worker progress message. No-op for unknown job IDs. */
+  updateJobProgress: (jobId: string, progress: AnalysisWorkerProgress) => void;
+
+  /** Mark job as completed ('done', 100%) and store its result. No-op for unknown job IDs. */
+  completeJob: (jobId: string, result: LocalAnalysisResult) => void;
+
+  /** Mark job as failed and record the error text. No-op for unknown job IDs. */
+  failJob: (jobId: string, error: string) => void;
+
+  /** Remove a job from tracking. */
+  clearJob: (jobId: string) => void;
+
+  /** Get the most recently started job for a clip, or undefined if it has none. */
+  getJobForClip: (clipId: string) => AnalysisJob | undefined;
+}
+
+/** Per-session sequence number; keeps job IDs unique within the same millisecond. */
+let jobCounter = 0;
+
+/**
+ * Return a state update that merges `patch` into the job `jobId`.
+ * Unknown job IDs return `state` itself so the store sees no change.
+ */
+function patchJob(
+  state: AnalysisState,
+  jobId: string,
+  patch: Partial<AnalysisJob>,
+): AnalysisState | Pick<AnalysisState, 'jobs'> {
+  const job = state.jobs[jobId];
+  if (!job) return state;
+  return { jobs: { ...state.jobs, [jobId]: { ...job, ...patch } } };
+}
+
+/**
+ * Zustand store tracking local audio analysis jobs, keyed by job ID.
+ *
+ * `startedAt` and `completedAt` are both epoch milliseconds from `Date.now()`,
+ * so the two can be subtracted to obtain a job's duration.
+ */
+export const useAnalysisStore = create<AnalysisState>((set, get) => ({
+  jobs: {},
+
+  createJob(clipId: string): string {
+    const seq = ++jobCounter;
+    const now = Date.now();
+    const id = `analysis-${seq}-${now}`;
+    const job: AnalysisJob = {
+      id,
+      clipId,
+      status: 'idle',
+      progress: 0,
+      message: 'Starting analysis...',
+      result: null,
+      error: null,
+      startedAt: now, // same clock as completedAt
+      completedAt: null,
+    };
+    set((state) => ({ jobs: { ...state.jobs, [id]: job } }));
+    return id;
+  },
+
+  updateJobProgress(jobId: string, progress: AnalysisWorkerProgress) {
+    set((state) =>
+      patchJob(state, jobId, {
+        status: progress.status,
+        progress: progress.percent,
+        message: progress.message,
+      }),
+    );
+  },
+
+  completeJob(jobId: string, result: LocalAnalysisResult) {
+    set((state) =>
+      patchJob(state, jobId, {
+        status: 'done',
+        progress: 100,
+        message: 'Analysis complete',
+        result,
+        completedAt: Date.now(),
+      }),
+    );
+  },
+
+  failJob(jobId: string, error: string) {
+    set((state) =>
+      patchJob(state, jobId, {
+        status: 'error',
+        message: error,
+        error,
+        completedAt: Date.now(),
+      }),
+    );
+  },
+
+  clearJob(jobId: string) {
+    set((state) => {
+      const { [jobId]: _removed, ...rest } = state.jobs;
+      return { jobs: rest };
+    });
+  },
+
+  getJobForClip(clipId: string): AnalysisJob | undefined {
+    // Job IDs are non-numeric strings, so Object.values yields jobs in creation
+    // order. Using >= lets the later-created job win when two timestamps tie.
+    let latest: AnalysisJob | undefined;
+    for (const job of Object.values(get().jobs)) {
+      if (job.clipId !== clipId) continue;
+      if (latest === undefined || job.startedAt >= latest.startedAt) {
+        latest = job;
+      }
+    }
+    return latest;
+  },
+}));
diff --git a/src/types/analysis.ts b/src/types/analysis.ts
new file mode 100644
index 00000000..67a42b8e
--- /dev/null
+++ b/src/types/analysis.ts
@@ -0,0 +1,100 @@
+/** A detected beat position in the audio. */
+export interface BeatEvent {
+  /** Time in seconds from audio start. */
+  time: number;
+  /** Whether this beat is a downbeat (start of bar). */
+  isDownbeat: boolean;
+  /** Detection confidence in the range 0-1. */
+  confidence: number;
+}
+
+/** A detected chord region in the audio. */
+export interface ChordEvent {
+  /** Start time in seconds. */
+  startTime: number;
+  /** End time in seconds. */
+  endTime: number;
+  /** Chord label, e.g. "C:maj", "E:min7", "N" for no chord. */
+  label: string;
+  /** Detection confidence in the range 0-1. */
+  confidence: number;
+}
+
+/** Full analysis result for a clip. */
+export interface LocalAnalysisResult {
+  /** Estimated tempo in beats per minute. */
+  bpm: number;
+  /** Detected beat positions. */
+  beats: BeatEvent[];
+  /** Detected chord regions. */
+  chords: ChordEvent[];
+  /** Estimated key/scale from chord distribution, e.g. "C major". Null when not available. */
+  keyScale: string | null;
+  /** Time signature inferred from downbeat spacing, e.g. "4/4". Null when not available. */
+  timeSignature: string | null;
+}
+
+/**
+ * Status of a local analysis job.
+ *
+ * Jobs start as 'idle' and end as 'done' or 'error'; the remaining values are
+ * intermediate stages carried by worker progress messages.
+ */
+export type LocalAnalysisStatus =
+  | 'idle'
+  | 'loading-model'
+  | 'decoding-audio'
+  | 'computing-features'
+  | 'running-bpm'
+  | 'running-chords'
+  | 'post-processing'
+  | 'done'
+  | 'error';
+
+/** Progress update from the analysis worker. */
+export interface AnalysisWorkerProgress {
+  type: 'progress';
+  /** Stage the worker is currently in. */
+  status: LocalAnalysisStatus;
+  /** Overall progress; stored as the job's progress value (presumably 0-100). */
+  percent: number;
+  /** Human-readable description of the current stage. */
+  message: string;
+}
+
+/** Result message from the analysis worker. */
+export interface AnalysisWorkerResult {
+  type: 'result';
+  result: LocalAnalysisResult;
+}
+
+/** Error message from the analysis worker. */
+export interface AnalysisWorkerError {
+  type: 'error';
+  /** Description of what went wrong. */
+  error: string;
+}
+
+/** Any message the worker can post back; discriminated by the `type` field. */
+export type AnalysisWorkerMessage =
+  | AnalysisWorkerProgress
+  | AnalysisWorkerResult
+  | AnalysisWorkerError;
+
+/** Message sent to the analysis worker. */
+export interface AnalysisWorkerRequest {
+  type: 'analyze';
+  /** Mono audio samples, already downsampled to target sample rate. */
+  samples: Float32Array;
+  /** Sample rate of `samples` in Hz. */
+  sampleRate: number;
+  /** Which analyses to run. */
+  tasks: AnalysisTask[];
+}
+
+/** An individual analysis that can be requested from the worker. */
+export type AnalysisTask = 'bpm' | 'chords';
+
+/** Identifier of a downloadable ONNX analysis model. */
+export type AnalysisModelId = 'beat-this-small' | 'consonance-ace';
+
+/** Static description of one downloadable model. */
+export interface AnalysisModelMeta {
+  id: AnalysisModelId;
+  /** Display name, also used in download progress and error messages. */
+  name: string;
+  /** Approximate file size in bytes; fallback progress total when Content-Length is unavailable. */
+  sizeBytes: number;
+  /** URL the model file is fetched from. */
+  url: string;
+  /** IndexedDB cache key. */
+  cacheKey: string;
+}
+
+/** Model download progress. */
+export interface ModelDownloadProgress {
+  /** Display name of the model being loaded. */
+  modelName: string;
+  /** Bytes received so far. */
+  bytesLoaded: number;
+  /** Expected total size in bytes. */
+  bytesTotal: number;
+  /** Completion percentage; 100 once the model is fully available. */
+  percent: number;
+}
diff --git a/src/types/project.ts b/src/types/project.ts
index a9dcd70d..1bd079c3 100644
--- a/src/types/project.ts
+++ b/src/types/project.ts
@@ -570,6 +570,12 @@ export interface InferredMetas {
   genres?: string;
   seed?: string;
   ditModel?: string;
+  /** Detected beat positions (local analysis). */
+  beats?: import('./analysis').BeatEvent[];
+  /** Detected chord regions (local analysis). */
+  chords?: import('./analysis').ChordEvent[];
+  /** Whether metas came from server or local ONNX analysis. */
+  analysisSource?: 'server' | 'local';
 }
 
 /** A snapshot of a clip's audio state, stored as part of version history. */
diff --git a/src/utils/__tests__/cqt.test.ts b/src/utils/__tests__/cqt.test.ts
new file mode 100644
index 00000000..5f8976d2
--- /dev/null
+++ b/src/utils/__tests__/cqt.test.ts
@@ -0,0 +1,87 @@
+import { describe, it, expect } from 'vitest';
+import { computeCQT, cqtToOnnxInput, CONSONANCE_ACE_CQT_OPTIONS } from '../cqt';
+
+// Shape, range and frequency-localisation checks for computeCQT.
+describe('computeCQT', () => {
+  it('returns correct number of bins for consonance-ACE config', () => {
+    const samples = new Float32Array(22050); // 1 second
+    const { nBins } = computeCQT(samples, CONSONANCE_ACE_CQT_OPTIONS);
+    // 24 bins/octave * 6 octaves = 144
+    expect(nBins).toBe(144);
+  });
+
+  it('returns correct number of frames for 1 second of audio', () => {
+    const sr = 22050;
+    const samples = new Float32Array(sr);
+    const { nFrames } = computeCQT(samples, CONSONANCE_ACE_CQT_OPTIONS);
+    // floor((22050 - 1) / 512) + 1 = 44 frames
+    const expected = Math.floor((sr - 1) / 512) + 1;
+    expect(nFrames).toBe(expected);
+  });
+
+  it('produces non-negative magnitudes', () => {
+    const samples = new Float32Array(22050);
+    for (let i = 0; i < samples.length; i++) {
+      samples[i] = Math.sin(2 * Math.PI * 440 * i / 22050);
+    }
+    const { data, nBins, nFrames } = computeCQT(samples, CONSONANCE_ACE_CQT_OPTIONS);
+    // data is laid out as [bin][frame]
+    expect(data.length).toBe(nBins);
+    for (let b = 0; b < nBins; b++) {
+      expect(data[b].length).toBe(nFrames);
+      for (let f = 0; f < nFrames; f++) {
+        expect(data[b][f]).toBeGreaterThanOrEqual(0);
+      }
+    }
+  });
+
+  it('440Hz tone has energy in the correct frequency range', () => {
+    // A4 = 440 Hz and C1 ≈ 32.7 Hz, so A4 sits log2(440 / 32.7) ≈ 3.75 octaves
+    // above the lowest bin. At 24 bins/octave that is bin ~90.
+    const sr = 22050;
+    const samples = new Float32Array(sr * 2);
+    for (let i = 0; i < samples.length; i++) {
+      samples[i] = Math.sin(2 * Math.PI * 440 * i / sr);
+    }
+    const { data, nBins } = computeCQT(samples, CONSONANCE_ACE_CQT_OPTIONS);
+
+    // Find the bin with maximum energy (average over frames)
+    let maxBin = 0, maxEnergy = 0;
+    for (let b = 0; b < nBins; b++) {
+      const avg = data[b].reduce((s, v) => s + v, 0) / data[b].length;
+      if (avg > maxEnergy) { maxEnergy = avg; maxBin = b; }
+    }
+
+    // A4 = 440Hz, C1 = 32.7Hz
+    // Expected bin = 24 * log2(440/32.7) ≈ 24 * 3.75 ≈ 90
+    // The ±10-bin tolerance allows for the transform's limited frequency resolution.
+    expect(maxBin).toBeGreaterThan(80);
+    expect(maxBin).toBeLessThan(100);
+  });
+
+  it('silent input produces near-zero magnitudes', () => {
+    const samples = new Float32Array(22050);
+    // No options passed: this also exercises the default configuration.
+    const { data, nBins, nFrames } = computeCQT(samples);
+    for (let b = 0; b < nBins; b++) {
+      for (let f = 0; f < nFrames; f++) {
+        expect(data[b][f]).toBeCloseTo(0, 5);
+      }
+    }
+  });
+});
+
+// Layout check for cqtToOnnxInput: output must be bin-major (all frames of
+// bin 0, then all frames of bin 1, ...).
+describe('cqtToOnnxInput', () => {
+  it('flattens to correct size', () => {
+    const nBins = 144;
+    const nFrames = 100;
+    // Every frame of bin b holds the value b, so a flat index reveals its bin.
+    const data: Float32Array[] = [];
+    for (let b = 0; b < nBins; b++) {
+      data.push(new Float32Array(nFrames).fill(b));
+    }
+    const flat = cqtToOnnxInput(data, nBins, nFrames);
+    expect(flat.length).toBe(nBins * nFrames);
+    // First nFrames values should be 0 (bin 0)
+    expect(flat[0]).toBe(0);
+    // Second nFrames values should be 1 (bin 1)
+    expect(flat[nFrames]).toBe(1);
+    // Last value should be nBins-1
+    expect(flat[(nBins - 1) * nFrames]).toBe(nBins - 1);
+  });
+});
diff --git a/src/utils/__tests__/melSpectrogram.test.ts b/src/utils/__tests__/melSpectrogram.test.ts
new file mode 100644
index 00000000..526f55d1
--- /dev/null
+++ b/src/utils/__tests__/melSpectrogram.test.ts
@@ -0,0 +1,239 @@
+import { describe, it, expect } from 'vitest';
+import {
+  fft,
+  hannWindow,
+  createMelFilterbank,
+  powerSpectrogram,
+  magnitudeSpectrogram,
+  computeMelSpectrogram,
+  downsampleToMono,
+  BEAT_THIS_MEL_OPTIONS,
+} from '../melSpectrogram';
+
+// fft operates in place on separate real/imag arrays; these tests check it
+// against two inputs with known analytic spectra.
+describe('fft', () => {
+  it('transforms a DC signal to a single bin', () => {
+    // Constant input of ones: all energy lands in bin 0 with value n.
+    const n = 8;
+    const real = new Float32Array(n).fill(1);
+    const imag = new Float32Array(n).fill(0);
+    fft(real, imag);
+    expect(real[0]).toBeCloseTo(n, 5);
+    for (let i = 1; i < n; i++) {
+      expect(Math.abs(real[i])).toBeCloseTo(0, 5);
+      expect(Math.abs(imag[i])).toBeCloseTo(0, 5);
+    }
+  });
+
+  it('transforms a known sinusoid correctly', () => {
+    // A real cosine with `freq` cycles per window splits its energy evenly
+    // between bin `freq` and its mirror `n - freq`, magnitude n/2 each.
+    const n = 64;
+    const freq = 4;
+    const real = new Float32Array(n);
+    const imag = new Float32Array(n);
+    for (let i = 0; i < n; i++) {
+      real[i] = Math.cos((2 * Math.PI * freq * i) / n);
+    }
+    fft(real, imag);
+    expect(Math.sqrt(real[freq] ** 2 + imag[freq] ** 2)).toBeCloseTo(n / 2, 2);
+    expect(Math.sqrt(real[n - freq] ** 2 + imag[n - freq] ** 2)).toBeCloseTo(n / 2, 2);
+    // Every other non-DC bin must be empty.
+    for (let i = 1; i < n; i++) {
+      if (i === freq || i === n - freq) continue;
+      expect(Math.sqrt(real[i] ** 2 + imag[i] ** 2)).toBeCloseTo(0, 2);
+    }
+  });
+});
+
+// Shape checks for the Hann window helper.
+describe('hannWindow', () => {
+  it('returns correct length', () => {
+    expect(hannWindow(256).length).toBe(256);
+  });
+
+  it('is zero at endpoints', () => {
+    // Requiring the last sample to be zero pins the symmetric window form;
+    // a periodic Hann window is non-zero at its final sample.
+    const w = hannWindow(64);
+    expect(w[0]).toBeCloseTo(0, 5);
+    expect(w[63]).toBeCloseTo(0, 5);
+  });
+
+  it('peaks at center', () => {
+    // Loose tolerance: with an even length no sample falls exactly on the peak.
+    const w = hannWindow(64);
+    expect(w[32]).toBeCloseTo(1, 2);
+  });
+});
+
+// Structural checks for the mel filterbank: one filter per mel band, each
+// spanning the nFft/2 + 1 one-sided FFT bins.
+describe('createMelFilterbank', () => {
+  it('returns correct shape', () => {
+    const filters = createMelFilterbank(2048, 128, 22050, 30, 11000);
+    expect(filters.length).toBe(128);
+    // 2048 / 2 + 1 = 1025 frequency bins per filter
+    expect(filters[0].length).toBe(1025);
+  });
+
+  it('filters are non-negative', () => {
+    const filters = createMelFilterbank(2048, 40, 22050, 0, 8000);
+    for (const f of filters) {
+      for (let i = 0; i < f.length; i++) {
+        expect(f[i]).toBeGreaterThanOrEqual(0);
+      }
+    }
+  });
+
+  it('each filter has at least one non-zero value', () => {
+    // Guards against empty filters, which would silence a mel band entirely.
+    const filters = createMelFilterbank(2048, 40, 22050, 0, 8000);
+    for (const f of filters) {
+      const max = Math.max(...f);
+      expect(max).toBeGreaterThan(0);
+    }
+  });
+});
+
+// Shape and sign checks for the power spectrogram (output is [frame][bin]).
+describe('powerSpectrogram', () => {
+  it('returns correct number of frames', () => {
+    const nFft = 512;
+    const hopLength = 256;
+    const samples = new Float32Array(4096);
+    const frames = powerSpectrogram(samples, nFft, hopLength);
+    // Expected count assumes only full windows are emitted (no padding).
+    const expected = Math.floor((4096 - nFft) / hopLength) + 1;
+    expect(frames.length).toBe(expected);
+  });
+
+  it('frame length is nFft/2 + 1', () => {
+    const nFft = 512;
+    const samples = new Float32Array(1024);
+    const frames = powerSpectrogram(samples, nFft, 256);
+    expect(frames[0].length).toBe(nFft / 2 + 1);
+  });
+
+  it('values are non-negative (power)', () => {
+    // Random input is fine here: only the sign of the output is asserted.
+    const samples = new Float32Array(2048);
+    for (let i = 0; i < samples.length; i++) samples[i] = Math.random() * 2 - 1;
+    const frames = powerSpectrogram(samples, 512, 256);
+    for (const f of frames) {
+      for (let i = 0; i < f.length; i++) {
+        expect(f[i]).toBeGreaterThanOrEqual(0);
+      }
+    }
+  });
+});
+
+// The magnitude spectrogram must have the same shape as the power
+// spectrogram and equal its element-wise square root.
+describe('magnitudeSpectrogram', () => {
+  it('returns correct shape', () => {
+    const nFft = 512;
+    const samples = new Float32Array(2048);
+    const frames = magnitudeSpectrogram(samples, nFft, 256);
+    expect(frames[0].length).toBe(nFft / 2 + 1);
+  });
+
+  it('magnitude values are sqrt of power values', () => {
+    const samples = new Float32Array(2048);
+    for (let i = 0; i < samples.length; i++) samples[i] = Math.sin(2 * Math.PI * 440 * i / 22050);
+    const power = powerSpectrogram(samples, 512, 256);
+    const mag = magnitudeSpectrogram(samples, 512, 256);
+    // Cross-check the two functions on identical input, bin by bin.
+    for (let f = 0; f < power.length; f++) {
+      for (let k = 0; k < power[f].length; k++) {
+        expect(mag[f][k]).toBeCloseTo(Math.sqrt(power[f][k]), 3);
+      }
+    }
+  });
+});
+
+// End-to-end checks of the mel pipeline: output shape ([frame][mel band]) and
+// the value ranges produced by the 'db' and 'log1p' scaling modes.
+describe('computeMelSpectrogram', () => {
+  it('returns correct shape for given input', () => {
+    const samples = new Float32Array(22050);
+    const melFrames = computeMelSpectrogram(samples, {
+      sampleRate: 22050,
+      nFft: 2048,
+      hopLength: 512,
+      nMels: 80,
+      fMin: 30,
+      fMax: 8000,
+    });
+    // Same frame-count formula as the un-padded spectrogram tests.
+    const expectedFrames = Math.floor((22050 - 2048) / 512) + 1;
+    expect(melFrames.length).toBe(expectedFrames);
+    expect(melFrames[0].length).toBe(80);
+  });
+
+  it('silent input produces very low dB values in db mode', () => {
+    const samples = new Float32Array(4096);
+    const melFrames = computeMelSpectrogram(samples, {
+      sampleRate: 22050,
+      nFft: 2048,
+      hopLength: 512,
+      nMels: 40,
+      fMin: 0,
+      fMax: 8000,
+      logScale: 'db',
+    });
+    // Silence must map to a large negative dB value, not NaN or 0.
+    for (const frame of melFrames) {
+      for (let i = 0; i < frame.length; i++) {
+        expect(frame[i]).toBeLessThan(-50);
+      }
+    }
+  });
+
+  it('log1p mode produces non-negative values', () => {
+    const samples = new Float32Array(22050);
+    for (let i = 0; i < samples.length; i++) {
+      samples[i] = Math.sin(2 * Math.PI * 440 * i / 22050);
+    }
+    const melFrames = computeMelSpectrogram(samples, {
+      ...BEAT_THIS_MEL_OPTIONS,
+      logScale: 'log1p',
+      log1pMultiplier: 1000,
+    });
+    // log1p(x) >= 0 whenever x >= 0, so no output may be negative.
+    for (const frame of melFrames) {
+      for (let i = 0; i < frame.length; i++) {
+        expect(frame[i]).toBeGreaterThanOrEqual(0);
+      }
+    }
+  });
+
+  it('Beat This! preset uses nFft=1024 and produces expected range', () => {
+    const samples = new Float32Array(22050);
+    for (let i = 0; i < samples.length; i++) {
+      samples[i] = Math.sin(2 * Math.PI * 440 * i / 22050) * 0.5;
+    }
+    const melFrames = computeMelSpectrogram(samples, BEAT_THIS_MEL_OPTIONS);
+    expect(melFrames[0].length).toBe(128);
+    // log1p(1000 * mel) should produce values roughly in [0, 10] range for normal audio
+    let maxVal = -Infinity;
+    for (const frame of melFrames) {
+      for (let i = 0; i < frame.length; i++) {
+        if (frame[i] > maxVal) maxVal = frame[i];
+      }
+    }
+    // Loose upper bound (15) catches gross scaling errors without pinning exact values.
+    expect(maxVal).toBeGreaterThan(0);
+    expect(maxVal).toBeLessThan(15);
+  });
+});
+
+// downsampleToMono is exercised with minimal AudioBuffer stand-ins that
+// provide only the members set below.
+describe('downsampleToMono', () => {
+  it('mixes stereo to mono by averaging channels', () => {
+    // Opposite-sign constant channels must cancel to zero when averaged.
+    const length = 100;
+    const ch0 = new Float32Array(length).fill(0.5);
+    const ch1 = new Float32Array(length).fill(-0.5);
+    const fakeBuffer = {
+      numberOfChannels: 2,
+      length,
+      sampleRate: 22050,
+      getChannelData: (ch: number) => (ch === 0 ? ch0 : ch1),
+    } as unknown as AudioBuffer;
+
+    const mono = downsampleToMono(fakeBuffer, 22050);
+    expect(mono.length).toBe(length);
+    for (let i = 0; i < mono.length; i++) {
+      expect(mono[i]).toBeCloseTo(0, 5);
+    }
+  });
+
+  it('resamples to lower sample rate', () => {
+    // One second at 44.1 kHz must become one second at 22.05 kHz.
+    const length = 44100;
+    const ch0 = new Float32Array(length);
+    for (let i = 0; i < length; i++) ch0[i] = Math.sin(2 * Math.PI * 440 * i / 44100);
+    const fakeBuffer = {
+      numberOfChannels: 1,
+      length,
+      sampleRate: 44100,
+      getChannelData: () => ch0,
+    } as unknown as AudioBuffer;
+
+    const mono = downsampleToMono(fakeBuffer, 22050);
+    expect(mono.length).toBe(22050);
+  });
+});
diff --git a/src/utils/cqt.ts b/src/utils/cqt.ts
new file mode 100644
index 00000000..7e95a092
--- /dev/null
+++ b/src/utils/cqt.ts
@@ -0,0 +1,136 @@
+/**
+ * Pure TypeScript Constant-Q Transform (CQT) computation.
+ * Matching consonance-ACE's CQTransform:
+ *   sr=22050, hop=512, bins_per_octave=24, num_octaves=6, start_note=C1
+ *   Output: absolute magnitude (not dB).
+ *
+ * Each CQT bin is read from a zero-padded, Hann-windowed FFT sized for that bin's window.
+ * This is slower than librosa's optimized implementation but is dependency-free.
+ */
+
+import { fft, hannWindow } from './melSpectrogram';
+
+/** Parameters controlling the constant-Q transform. */
+export interface CQTOptions {
+  /** Sample rate of the input audio in Hz. */
+  sampleRate: number;
+  /** Number of samples between consecutive output frames. */
+  hopLength: number;
+  /** Frequency resolution: number of CQT bins per octave. */
+  binsPerOctave: number;
+  /** Number of octaves covered; total bins = binsPerOctave * numOctaves. */
+  numOctaves: number;
+  /** Centre frequency of the lowest bin in Hz. C1 ≈ 32.70 Hz */
+  fMin: number;
+}
+
+/** Defaults that `computeCQT` merges caller-supplied options over. */
+export const DEFAULT_CQT_OPTIONS: CQTOptions = {
+  sampleRate: 22050,
+  hopLength: 512,
+  binsPerOctave: 24,
+  numOctaves: 6,
+  fMin: 32.7032, // C1
+};
+
+/**
+ * Consonance-ACE CQT preset (24 bins/octave × 6 octaves = 144 bins).
+ * Currently identical to DEFAULT_CQT_OPTIONS, but kept as a separate object.
+ */
+export const CONSONANCE_ACE_CQT_OPTIONS: CQTOptions = {
+  sampleRate: 22050,
+  hopLength: 512,
+  binsPerOctave: 24,
+  numOctaves: 6,
+  fMin: 32.7032, // C1 = librosa.note_to_hz("C1")
+};
+
+/**
+ * Compute a CQT-like magnitude spectrogram using the "pseudo-CQT" approach.
+ *
+ * For each CQT bin, a Hann-windowed, zero-padded FFT is taken around every
+ * frame centre and the magnitude of the FFT bin nearest the CQT centre
+ * frequency is kept, normalised by the window length. Samples outside the
+ * signal are treated as zero.
+ *
+ * The window for bin k is ceil(Q * sr / f_k) samples long, limited to the
+ * maximum FFT size so that the whole symmetric window always fits in the FFT
+ * buffer. The lowest bins therefore use a shorter window than a strict
+ * constant-Q transform would.
+ *
+ * @param samples - Mono audio samples.
+ * @param options - Overrides merged over DEFAULT_CQT_OPTIONS.
+ * @returns `data` as [nBins][nFrames] absolute magnitudes, plus both dimensions.
+ */
+export function computeCQT(
+  samples: Float32Array,
+  options: Partial<CQTOptions> = {},
+): { data: Float32Array[]; nBins: number; nFrames: number } {
+  const opts = { ...DEFAULT_CQT_OPTIONS, ...options };
+  const { sampleRate, hopLength, binsPerOctave, numOctaves, fMin } = opts;
+  const nBins = binsPerOctave * numOctaves; // 144 with the default options
+
+  // Largest FFT we are willing to run per frame; also bounds the window length.
+  const maxFftSize = 16384;
+
+  // Q = 1 / (2^(1/binsPerOctave) - 1) for constant-Q
+  const Q = 1 / (Math.pow(2, 1 / binsPerOctave) - 1);
+
+  // Number of output frames (at least one, even for empty input)
+  const nFrames = Math.max(1, Math.floor((samples.length - 1) / hopLength) + 1);
+
+  const binResults: Float32Array[] = new Array(nBins);
+
+  for (let k = 0; k < nBins; k++) {
+    // Centre frequency of CQT bin k
+    const freq = fMin * Math.pow(2, k / binsPerOctave);
+
+    // Clamp the window to the FFT limit. Without this the tail of a longer
+    // window would be silently dropped by out-of-range typed-array writes,
+    // leaving an asymmetric, off-centre window for the lowest bins.
+    const windowLen = Math.min(Math.ceil(Q * sampleRate / freq), maxFftSize);
+    const halfWindow = Math.floor(windowLen / 2);
+
+    // Round up to next power of 2 for FFT (never exceeds maxFftSize)
+    let fftSize = 1;
+    while (fftSize < windowLen) fftSize *= 2;
+
+    const window = hannWindow(windowLen);
+
+    // FFT bin closest to this CQT frequency, kept within the one-sided spectrum
+    const binIdx = Math.min(Math.round(freq * fftSize / sampleRate), Math.floor(fftSize / 2));
+
+    // Scratch buffers are reused across frames; fft() works in place.
+    const real = new Float32Array(fftSize);
+    const imag = new Float32Array(fftSize);
+    const magnitudes = new Float32Array(nFrames);
+
+    for (let frame = 0; frame < nFrames; frame++) {
+      // Window is centred on the frame position
+      const start = frame * hopLength - halfWindow;
+
+      real.fill(0);
+      imag.fill(0);
+
+      // Copy only the part of the window that overlaps the signal
+      const jStart = Math.max(0, -start);
+      const jEnd = Math.min(windowLen, samples.length - start);
+      for (let j = jStart; j < jEnd; j++) {
+        real[j] = samples[start + j] * window[j];
+      }
+
+      fft(real, imag);
+
+      // Magnitude at the target frequency bin, normalised by window length
+      const mag = Math.sqrt(real[binIdx] * real[binIdx] + imag[binIdx] * imag[binIdx]);
+      magnitudes[frame] = mag / windowLen;
+    }
+
+    binResults[k] = magnitudes;
+  }
+
+  return { data: binResults, nBins, nFrames };
+}
+
+/**
+ * Pack a [nBins][nFrames] CQT into one contiguous, bin-major Float32Array.
+ * The result is the flat data for an ONNX tensor of shape
+ * [batch=1, channels=1, freq=nBins, time=nFrames], as consonance-ACE expects.
+ */
+export function cqtToOnnxInput(
+  cqtData: Float32Array[],
+  nBins: number,
+  nFrames: number,
+): Float32Array {
+  const flat = new Float32Array(nBins * nFrames);
+  let writeIdx = 0;
+  for (let bin = 0; bin < nBins; bin++) {
+    const row = cqtData[bin];
+    for (let frame = 0; frame < nFrames; frame++) {
+      flat[writeIdx++] = row[frame];
+    }
+  }
+  return flat;
+}
diff --git a/src/utils/melSpectrogram.ts b/src/utils/melSpectrogram.ts
new file mode 100644
index 00000000..205420e2
--- /dev/null
+++ b/src/utils/melSpectrogram.ts
@@ -0,0 +1,342 @@
+/**
+ * Pure TypeScript mel spectrogram computation.
+ * No external dependencies — suitable for use in Web Workers.
+ *
+ * Supports two log-scaling modes:
+ * - 'db': 10 * log10(mel)  — standard dB scale
+ * - 'log1p': log1p(multiplier * mel) — used by Beat This!
+ */
+
+export interface MelSpectrogramOptions {
+  sampleRate: number; // audio sample rate in Hz
+  nFft: number; // FFT frame size in samples; fft() requires a power of 2
+  hopLength: number; // samples between successive frame starts
+  nMels: number; // number of mel bands per output frame
+  fMin: number; // lower edge of the mel filterbank, Hz
+  fMax: number; // upper edge of the mel filterbank, Hz
+  /** Spectrum exponent: 2 = power |X|² (default), 1 = magnitude |X|. Beat This! uses 1. */
+  power: 1 | 2;
+  /** Log scaling: 'db' (default) = 10 * log10(max(x, 1e-10)); 'log1p' = log1p(log1pMultiplier * x). */
+  logScale: 'db' | 'log1p';
+  /** Multiplier for log1p mode. Beat This! uses 1000. */
+  log1pMultiplier: number;
+  /** Normalize by nFft (torchaudio normalized="frame_length"). NOTE(review): whether Beat This! needs this is unconfirmed — its preset sets false. */
+  normalizeByNfft: boolean;
+}
+
+export const DEFAULT_MEL_OPTIONS: MelSpectrogramOptions = { // base values; computeMelSpectrogram spreads caller overrides over these
+  sampleRate: 22050,
+  nFft: 2048,
+  hopLength: 441,   // 441 / 22050 Hz = 20 ms per frame
+  nMels: 128,
+  fMin: 30,
+  fMax: 11000,
+  power: 2,         // power spectrum |X|²
+  logScale: 'db',
+  log1pMultiplier: 1000, // only read when logScale is 'log1p'
+  normalizeByNfft: false,
+};
+
+/**
+ * Beat This! mel spectrogram preset (a Partial; computeMelSpectrogram merges it over DEFAULT_MEL_OPTIONS).
+ * Target settings: n_fft=1024, hop=441, f_min=30, f_max=11000, power=1,
+ *          mel_scale=slaney, normalized=frame_length, output = log1p(1000 * mel).
+ * NOTE(review): normalizeByNfft is false below and hzToMel() uses 2595 * log10(1 + hz / 700) — confirm both against the target.
+ */
+export const BEAT_THIS_MEL_OPTIONS: Partial<MelSpectrogramOptions> = {
+  sampleRate: 22050,
+  nFft: 1024,
+  hopLength: 441,
+  nMels: 128,
+  fMin: 30,
+  fMax: 11000,
+  power: 1,
+  logScale: 'log1p',
+  log1pMultiplier: 1000,
+  normalizeByNfft: false,
+};
+
+// ---------- FFT ----------
+
+/**
+ * Radix-2 decimation-in-time FFT (Cooley–Tukey), computed in place.
+ * `real` and `imag` are overwritten with the transform; their length must be a power of 2.
+ */
+export function fft(real: Float32Array, imag: Float32Array): void {
+  const size = real.length;
+  if (size <= 1) return;
+
+  // Stage 1: move every sample to its bit-reversed index.
+  let rev = 0;
+  for (let idx = 1; idx < size; idx++) {
+    let mask = size >> 1;
+    for (; rev & mask; mask >>= 1) rev ^= mask;
+    rev ^= mask;
+    if (idx < rev) {
+      const tmpRe = real[idx];
+      const tmpIm = imag[idx];
+      real[idx] = real[rev];
+      imag[idx] = imag[rev];
+      real[rev] = tmpRe;
+      imag[rev] = tmpIm;
+    }
+  }
+  // Stage 2: butterflies over spans of 2, 4, ..., size.
+  for (let span = 2; span <= size; span *= 2) {
+    const half = span / 2;
+    const theta = (-2 * Math.PI) / span;
+    const stepRe = Math.cos(theta);
+    const stepIm = Math.sin(theta);
+    for (let base = 0; base < size; base += span) {
+      let twRe = 1;
+      let twIm = 0;
+      for (let lo = base, hi = base + half; lo < base + half; lo++, hi++) {
+        const oddRe = real[hi];
+        const oddIm = imag[hi];
+        const prodRe = twRe * oddRe - twIm * oddIm;
+        const prodIm = twRe * oddIm + twIm * oddRe;
+        real[hi] = real[lo] - prodRe;
+        imag[hi] = imag[lo] - prodIm;
+        real[lo] += prodRe;
+        imag[lo] += prodIm;
+        const nextRe = twRe * stepRe - twIm * stepIm;
+        twIm = twRe * stepIm + twIm * stepRe;
+        twRe = nextRe;
+      }
+    }
+  }
+}
+
+// ---------- Mel filterbank ----------
+
+function hzToMel(hz: number): number { // Hz -> mel using the 2595 * log10(1 + hz / 700) formula
+  return 2595 * Math.log10(1 + hz / 700);
+}
+
+function melToHz(mel: number): number { // mel -> Hz; algebraic inverse of hzToMel
+  return 700 * (Math.pow(10, mel / 2595) - 1);
+}
+
+/**
+ * Build `nMels` triangular mel filters over the `nFft / 2 + 1` one-sided FFT bins.
+ * Band edges are equally spaced on the mel axis between `fMin` and `fMax`, converted back
+ * to Hz and floored to FFT bin indices; filter m rises from edge m to edge m + 1 and falls
+ * to edge m + 2. Bands whose three edges floor to the same bin come out all-zero.
+ *
+ * @returns `nMels` arrays, each of length `nFft / 2 + 1`.
+ */
+export function createMelFilterbank(
+  nFft: number,
+  nMels: number,
+  sampleRate: number,
+  fMin: number,
+  fMax: number,
+): Float32Array[] {
+  const binCount = nFft / 2 + 1;
+  const melLo = hzToMel(fMin);
+  const melSpan = hzToMel(fMax) - melLo;
+
+  // nMels + 2 band edges. Both arrays stay Float32 so each mel edge is rounded to single
+  // precision before it is converted to Hz and then to a bin index.
+  const edgeCount = nMels + 2;
+  const melEdges = new Float32Array(edgeCount);
+  const binEdges = new Float32Array(edgeCount);
+  for (let i = 0; i < edgeCount; i++) {
+    melEdges[i] = melLo + (i * melSpan) / (nMels + 1);
+    binEdges[i] = Math.floor(((nFft + 1) * melToHz(melEdges[i])) / sampleRate);
+  }
+
+  const bank: Float32Array[] = [];
+  for (let band = 0; band < nMels; band++) {
+    const lo = binEdges[band];
+    const peak = binEdges[band + 1];
+    const hi = binEdges[band + 2];
+    const weights = new Float32Array(binCount);
+
+    // Rising slope on [lo, peak): 0 at lo, growing toward 1.
+    for (let k = lo; k < peak; k++) {
+      if (k >= 0 && k < binCount) weights[k] = (k - lo) / (peak - lo);
+    }
+    // Falling slope on [peak, hi): exactly 1 at peak, shrinking toward 0.
+    for (let k = peak; k < hi; k++) {
+      if (k >= 0 && k < binCount) weights[k] = (hi - k) / (hi - peak);
+    }
+    bank.push(weights);
+  }
+
+  return bank;
+}
+
+// ---------- Hann window ----------
+
+/** Symmetric Hann window of `size` samples; size 1 yields [1] (the formula's 0/0 would give NaN). */
+export function hannWindow(size: number): Float32Array {
+  if (size === 1) return new Float32Array([1]);
+  const window = new Float32Array(size);
+  for (let i = 0; i < size; i++) window[i] = 0.5 * (1 - Math.cos((2 * Math.PI * i) / (size - 1)));
+  return window;
+}
+
+// ---------- Spectrogram ----------
+
+/**
+ * Short-time power spectrum |X|² of `samples` using a Hann window of `nFft` samples.
+ * Only frames that start at a multiple of `hopLength` and fit are produced; each has `nFft / 2 + 1` bins.
+ */
+export function powerSpectrogram(
+  samples: Float32Array,
+  nFft: number,
+  hopLength: number,
+): Float32Array[] {
+  const taper = hannWindow(nFft);
+  const binCount = nFft / 2 + 1;
+  const frameCount = Math.max(0, Math.floor((samples.length - nFft) / hopLength) + 1);
+
+  const spectra: Float32Array[] = [];
+  for (let frameIdx = 0; frameIdx < frameCount; frameIdx++) {
+    const start = frameIdx * hopLength;
+    const re = new Float32Array(nFft);
+    const im = new Float32Array(nFft);
+    // Window the frame; a missing sample counts as 0.
+    for (let n = 0; n < nFft; n++) re[n] = (samples[start + n] ?? 0) * taper[n];
+    fft(re, im);
+
+    // Keep the first nFft / 2 + 1 bins: re² + im² per bin.
+    const frame = new Float32Array(binCount);
+    for (let k = 0; k < binCount; k++) {
+      const a = re[k];
+      const b = im[k];
+      frame[k] = a * a + b * b;
+    }
+    spectra.push(frame);
+  }
+
+  return spectra;
+}
+
+/**
+ * Short-time magnitude spectrum |X| of `samples` using a Hann window of `nFft` samples.
+ * Only frames that start at a multiple of `hopLength` and fit are produced; each has `nFft / 2 + 1` bins.
+ */
+export function magnitudeSpectrogram(
+  samples: Float32Array,
+  nFft: number,
+  hopLength: number,
+): Float32Array[] {
+  const taper = hannWindow(nFft);
+  const binCount = nFft / 2 + 1;
+  const frameCount = Math.max(0, Math.floor((samples.length - nFft) / hopLength) + 1);
+
+  const spectra: Float32Array[] = [];
+  for (let frameIdx = 0; frameIdx < frameCount; frameIdx++) {
+    const start = frameIdx * hopLength;
+    const re = new Float32Array(nFft);
+    const im = new Float32Array(nFft);
+    // Window the frame; a missing sample counts as 0.
+    for (let n = 0; n < nFft; n++) re[n] = (samples[start + n] ?? 0) * taper[n];
+    fft(re, im);
+
+    // Keep the first nFft / 2 + 1 bins: sqrt(re² + im²) per bin.
+    const frame = new Float32Array(binCount);
+    for (let k = 0; k < binCount; k++) {
+      const a = re[k];
+      const b = im[k];
+      frame[k] = Math.sqrt(a * a + b * b);
+    }
+    spectra.push(frame);
+  }
+
+  return spectra;
+}
+
+// ---------- Mel spectrogram ----------
+
+/**
+ * Compute a log-scaled mel spectrogram from raw audio samples.
+ *
+ * @param samples Mono audio samples.
+ * @param options Overrides merged over DEFAULT_MEL_OPTIONS.
+ * @returns `[nFrames][nMels]`: 'db' gives 10 * log10(max(mel, 1e-10)); 'log1p' gives
+ *   log1p(log1pMultiplier * mel). With `normalizeByNfft`, the STFT is scaled by 1/sqrt(nFft)
+ *   first (magnitudes / sqrt(nFft), powers / nFft), as torch.stft(normalized=True) does.
+ */
+export function computeMelSpectrogram(
+  samples: Float32Array,
+  options: Partial<MelSpectrogramOptions> = {},
+): Float32Array[] {
+  const opts = { ...DEFAULT_MEL_OPTIONS, ...options };
+  const { nFft, hopLength, nMels, sampleRate, fMin, fMax, power, logScale, log1pMultiplier } = opts;
+  // Scale for the band sums (the filterbank is linear); 1 leaves results bit-identical.
+  const scale = opts.normalizeByNfft ? (power === 1 ? 1 / Math.sqrt(nFft) : 1 / nFft) : 1;
+  const filters = createMelFilterbank(nFft, nMels, sampleRate, fMin, fMax);
+
+  // power=1 selects |X|, anything else |X|².
+  const specFrames = power === 1
+    ? magnitudeSpectrogram(samples, nFft, hopLength)
+    : powerSpectrogram(samples, nFft, hopLength);
+  const nBins = nFft / 2 + 1;
+
+  const melFrames: Float32Array[] = [];
+  for (const frame of specFrames) {
+    const melFrame = new Float32Array(nMels);
+    for (let m = 0; m < nMels; m++) {
+      let sum = 0;
+      const filter = filters[m];
+      for (let k = 0; k < nBins; k++) {
+        sum += filter[k] * frame[k];
+      }
+      sum *= scale;
+
+      melFrame[m] = logScale === 'log1p'
+        ? Math.log1p(log1pMultiplier * sum)
+        : 10 * Math.log10(Math.max(sum, 1e-10));
+    }
+    melFrames.push(melFrame);
+  }
+
+  return melFrames;
+}
+
+// ---------- Downsampling ----------
+
+/**
+ * Mix an AudioBuffer down to one channel and resample it to `targetSampleRate`.
+ * Channels are averaged; resampling is plain linear interpolation between the two
+ * nearest source samples, with no low-pass filtering beforehand.
+ *
+ * @param audioBuffer Source audio (any channel count, any sample rate).
+ * @param targetSampleRate Desired output rate in Hz.
+ * @returns Mono samples; the mixed buffer itself when the rates already match.
+ */
+export function downsampleToMono(
+  audioBuffer: AudioBuffer,
+  targetSampleRate: number,
+): Float32Array {
+  const { numberOfChannels: channelCount, length: frameCount, sampleRate: inputRate } = audioBuffer;
+
+  // Accumulate every channel, then scale once to get the average.
+  const mixed = new Float32Array(frameCount);
+  for (let ch = 0; ch < channelCount; ch++) {
+    const source = audioBuffer.getChannelData(ch);
+    for (let n = 0; n < frameCount; n++) mixed[n] += source[n];
+  }
+  if (channelCount > 1) {
+    for (let n = 0; n < frameCount; n++) mixed[n] /= channelCount;
+  }
+
+  // Same rate: the mixed buffer is already the answer.
+  if (inputRate === targetSampleRate) return mixed;
+
+  // Output sample i reads source position i * step and blends its two neighbours.
+  const step = inputRate / targetSampleRate;
+  const resampled = new Float32Array(Math.floor(frameCount / step));
+  for (let i = 0; i < resampled.length; i++) {
+    const pos = i * step;
+    const left = Math.floor(pos);
+    const right = Math.min(left + 1, frameCount - 1);
+    const t = pos - left;
+    resampled[i] = mixed[left] * (1 - t) + mixed[right] * t;
+  }
+
+  return resampled;
+}
diff --git a/src/workers/__tests__/peakPicking.test.ts b/src/workers/__tests__/peakPicking.test.ts
new file mode 100644
index 00000000..24b61180
--- /dev/null
+++ b/src/workers/__tests__/peakPicking.test.ts
@@ -0,0 +1,122 @@
+import { describe, it, expect } from 'vitest';
+
+// We can't import the worker module directly (it assigns self.onmessage at load time),
+// so peakPick and maxPool1d are duplicated below instead of being imported.
+// Keep these copies in sync with src/workers/analysisWorker.ts.
+
+// Sliding-window maximum over ±floor(kernelSize / 2) neighbours;
+// neighbours outside the array count as -1000.
+function maxPool1d(data: Float32Array, kernelSize: number): Float32Array {
+  const n = data.length;
+  const radius = Math.floor(kernelSize / 2);
+  const pooled = new Float32Array(n);
+  for (let i = 0; i < n; i++) {
+    const lo = i - radius;
+    const hi = i + radius;
+    // A window overhanging either end also sees the -1000 padding value.
+    let best = lo < 0 || hi >= n ? -1000 : -Infinity;
+    for (let idx = Math.max(lo, 0); idx <= Math.min(hi, n - 1); idx++) {
+      best = Math.max(best, data[idx]);
+    }
+    pooled[i] = best;
+  }
+  return pooled;
+}
+
+// Test-local peak picker built on the maxPool1d above.
+// A frame is a peak when it equals the max of its kernelSize-frame window and its logit is > 0.
+function peakPick(logits: Float32Array, kernelSize: number = 7): number[] {
+  const windowMax = maxPool1d(logits, kernelSize);
+  const picked: number[] = [];
+  logits.forEach((value, frame) => {
+    if (value > 0 && value === windowMax[frame]) picked.push(frame);
+  });
+  return picked;
+}
+
+describe('maxPool1d', () => {
+  it('returns the input unchanged for kernel=1', () => {
+    const data = new Float32Array([1, 2, 3, 2, 1]);
+    const result = maxPool1d(data, 1); // pad = floor(1 / 2) = 0, so each window is one element
+    expect(Array.from(result)).toEqual([1, 2, 3, 2, 1]);
+  });
+
+  it('correctly pools with kernel=3', () => {
+    const data = new Float32Array([1, 3, 2, 5, 1]);
+    const result = maxPool1d(data, 3);
+    // idx 0: max(pad, 1, 3) = 3   (pad = the -1000 fill for out-of-range neighbours)
+    // idx 1: max(1, 3, 2) = 3
+    // idx 2: max(3, 2, 5) = 5
+    // idx 3: max(2, 5, 1) = 5
+    // idx 4: max(5, 1, pad) = 5
+    expect(Array.from(result)).toEqual([3, 3, 5, 5, 5]);
+  });
+
+  it('kernel=7 spreads maxima across 7 positions', () => {
+    const data = new Float32Array(20).fill(-5);
+    data[10] = 5; // single peak
+    const result = maxPool1d(data, 7);
+    // Positions 7-13 (10 ± 3) have the peak inside their window, so they report 5
+    for (let i = 7; i <= 13; i++) {
+      expect(result[i]).toBe(5);
+    }
+    // Positions 6 and 14 are 4 frames away, just outside the window: still -5
+    expect(result[6]).toBe(-5);
+    expect(result[14]).toBe(-5);
+  });
+});
+
+describe('peakPick', () => {
+  it('finds isolated peaks above threshold', () => {
+    const logits = new Float32Array(20).fill(-5);
+    logits[5] = 2.0;
+    logits[15] = 3.0; // 10 frames from the first peak, so the 7-frame windows do not overlap
+    const peaks = peakPick(logits, 7);
+    expect(peaks).toEqual([5, 15]);
+  });
+
+  it('ignores negative logits (below sigmoid 0.5 threshold)', () => {
+    const logits = new Float32Array(20).fill(-5);
+    logits[10] = -0.5; // local max but logit < 0
+    const peaks = peakPick(logits, 7);
+    expect(peaks).toEqual([]);
+  });
+
+  it('picks only the local maximum when beats are clustered', () => {
+    // Simulate a cluster of high activations (common in raw model output)
+    const logits = new Float32Array(20).fill(-5);
+    logits[8] = 1.0;
+    logits[9] = 3.0;  // local max
+    logits[10] = 2.5;
+    logits[11] = 0.5;
+    const peaks = peakPick(logits, 7);
+    // Only frame 9 should be picked: frames 8, 10 and 11 all have frame 9 inside their ±3 window
+    expect(peaks).toEqual([9]);
+  });
+
+  it('picks multiple beats at expected BPM spacing', () => {
+    // Simulate 120 BPM: beats every 25 frames (25 frames / 50 fps = 0.5 s per beat)
+    const nFrames = 200;
+    const logits = new Float32Array(nFrames).fill(-5);
+    for (let i = 25; i < nFrames; i += 25) {
+      logits[i] = 5.0; // strong beat
+    }
+    const peaks = peakPick(logits, 7);
+    expect(peaks.length).toBe(7); // frames 25, 50, 75, 100, 125, 150, 175
+    // All peaks should be at multiples of 25
+    for (const p of peaks) {
+      expect(p % 25).toBe(0);
+    }
+  });
+
+  it('handles equal adjacent values by picking all of them', () => {
+    // Two adjacent frames with same value — both are local max (no tie-breaking in peakPick)
+    const logits = new Float32Array(10).fill(-5);
+    logits[4] = 2.0;
+    logits[5] = 2.0;
+    const peaks = peakPick(logits, 3);
+    // Both 4 and 5 equal the max in their windows, so both get picked
+    expect(peaks).toContain(4);
+    expect(peaks).toContain(5);
+  });
+});
diff --git a/src/workers/analysisWorker.ts b/src/workers/analysisWorker.ts
new file mode 100644
index 00000000..5e0df772
--- /dev/null
+++ b/src/workers/analysisWorker.ts
@@ -0,0 +1,437 @@
+/**
+ * Web Worker for local audio analysis (BPM/chord detection via ONNX Runtime).
+ *
+ * Beat This! pipeline:
+ *   audio → mel spectrogram (n_fft=1024, hop=441, power=1, log1p(1000*mel))
+ *   → ONNX inference → 2 outputs: beat_logits[1,T], downbeat_logits[1,T]
+ *   → peak-picking (max_pool1d kernel=7, logit > 0)
+ *   → BPM from median inter-beat interval
+ *
+ * consonance-ACE pipeline:
+ *   audio → CQT (144 bins, 24 bins/oct, 6 octaves from C1, hop=512)
+ *   → ONNX inference → 3 outputs: root[1,T,13], bass[1,T,13], chord[1,T,12]
+ *   → argmax root/bass, sigmoid chord → chord labels
+ */
+import type {
+  AnalysisWorkerRequest,
+  AnalysisWorkerProgress,
+  AnalysisWorkerResult,
+  AnalysisWorkerError,
+  BeatEvent,
+  ChordEvent,
+  LocalAnalysisResult,
+} from '../types/analysis';
+import { computeMelSpectrogram, BEAT_THIS_MEL_OPTIONS } from '../utils/melSpectrogram';
+import { computeCQT, cqtToOnnxInput, CONSONANCE_ACE_CQT_OPTIONS } from '../utils/cqt';
+
+// ONNX Runtime session handles and module — lazily initialized on first use, then reused by later messages
+let bpmSession: unknown = null; // assigned by the message handler from '/models/beat-this.onnx'
+let chordSession: unknown = null; // assigned by the message handler from '/models/consonance-ace.onnx'
+let ortModule: typeof import('onnxruntime-web') | null = null; // assigned by getOrt()
+
+function postProgress(status: AnalysisWorkerProgress['status'], percent: number, message: string) { // sends one 'progress' message via self.postMessage
+  self.postMessage({ type: 'progress', status, percent, message } satisfies AnalysisWorkerProgress);
+}
+
+/** Return the onnxruntime-web module, importing it on the first call and caching it in `ortModule`. */
+async function getOrt() {
+  if (ortModule !== null) return ortModule;
+  ortModule = await import('onnxruntime-web');
+  return ortModule;
+}
+
+async function loadOnnxSession(modelUrl: string) { // fetches the model bytes and builds an InferenceSession from them
+  const ort = await getOrt();
+  const response = await fetch(modelUrl);
+  if (!response.ok) throw new Error(`Failed to fetch model: ${response.status}`); // surface the HTTP status to the caller
+  const buffer = await response.arrayBuffer();
+  return ort.InferenceSession.create(buffer, {
+    executionProviders: ['wasm'], // only the WASM execution provider is requested
+  });
+}
+
+// ---------- Peak-picking for beat detection ----------
+
+/**
+ * Sliding-window maximum: output[i] is the largest value within ±floor(kernelSize / 2) of i.
+ * Neighbours that fall outside the array count as -1000 (the padding value Beat This! uses).
+ * Intended to match PyTorch's F.max_pool1d with padding=kernelSize//2; the window here spans
+ * 2 * floor(kernelSize / 2) + 1 samples, so that correspondence holds for odd kernel sizes.
+ */
+function maxPool1d(data: Float32Array, kernelSize: number): Float32Array {
+  const n = data.length;
+  const radius = Math.floor(kernelSize / 2);
+  const pooled = new Float32Array(n);
+  for (let i = 0; i < n; i++) {
+    const lo = i - radius;
+    const hi = i + radius;
+    // A window overhanging either end also sees the -1000 padding value;
+    // a fully interior window starts below every possible sample instead.
+    let best = lo < 0 || hi >= n ? -1000 : -Infinity;
+    for (let idx = Math.max(lo, 0); idx <= Math.min(hi, n - 1); idx++) {
+      best = Math.max(best, data[idx]);
+    }
+    pooled[i] = best;
+  }
+  return pooled;
+}
+
+/**
+ * Pick local maxima from logits, following the Beat This! minimal postprocessor:
+ * 1. max-pool with `kernelSize` (default 7 frames = 140 ms at 50 fps) to get each window's maximum
+ * 2. keep frames that equal their window maximum and whose logit is > 0 (sigmoid > 0.5)
+ * Frames tied for a window maximum are all kept.
+ * @returns Indices of the picked frames in ascending order.
+ */
+export function peakPick(logits: Float32Array, kernelSize: number = 7): number[] {
+  const windowMax = maxPool1d(logits, kernelSize);
+  const picked: number[] = [];
+  logits.forEach((value, frame) => {
+    if (value > 0 && value === windowMax[frame]) picked.push(frame);
+  });
+  return picked;
+}
+
+// ---------- BPM inference ----------
+
+/**
+ * Run Beat This! ONNX model inference and derive beats plus a tempo estimate.
+ *
+ * Model I/O (from beat_this_cpp ONNX):
+ *   Input:  "input_spectrogram" [1, time, 128]  (batch, time_frames, mel_bins)
+ *   Output: "beat" [1, time], "downbeat" [1, time]  (logits, not probabilities)
+ *
+ * @param session Loaded Beat This! inference session.
+ * @param melFrames Mel spectrogram as `[nFrames][nMels]`.
+ * @returns Detected beats and the BPM from the median inter-beat interval; with no
+ *   input frames or fewer than two beats the BPM falls back to 120.
+ */
+async function runBpmInference(
+  session: Awaited<ReturnType<typeof loadOnnxSession>>,
+  melFrames: Float32Array[],
+): Promise<{ bpm: number; beats: BeatEvent[] }> {
+  // Audio shorter than one analysis frame yields no mel frames; skip inference on an empty tensor.
+  if (melFrames.length === 0) return { bpm: 120, beats: [] };
+
+  const ort = await getOrt();
+  const nFrames = melFrames.length;
+  const nMels = melFrames[0]?.length ?? 128;
+
+  // Flatten to [1, nFrames, nMels] — Beat This! expects [batch, time, freq]
+  const inputData = new Float32Array(nFrames * nMels);
+  melFrames.forEach((frame, f) => inputData.set(frame.subarray(0, nMels), f * nMels));
+
+  const inputTensor = new ort.Tensor('float32', inputData, [1, nFrames, nMels]);
+  const results = await session.run({ [session.inputNames[0]]: inputTensor });
+
+  // Beat This! outputs two separate tensors: "beat" and "downbeat"
+  const [beatName, downbeatName] = session.outputNames;
+  if (beatName === undefined || downbeatName === undefined) {
+    throw new Error('Beat model must expose beat and downbeat outputs');
+  }
+  const beatLogits = results[beatName].data as Float32Array;
+  const downbeatLogits = results[downbeatName].data as Float32Array;
+
+  // Peak-picking: max_pool1d(kernel=7) + threshold at logit > 0
+  const beatFrames = peakPick(beatLogits, 7);
+  const downbeatFrames = new Set(peakPick(downbeatLogits, 7));
+
+  // Convert frames to time (hop=441 @ 22050Hz = 20ms per frame)
+  const frameTimeStep = 441 / 22050;
+  const beats: BeatEvent[] = beatFrames.map((frame) => ({
+    time: frame * frameTimeStep,
+    isDownbeat: downbeatFrames.has(frame),
+    confidence: 1 / (1 + Math.exp(-beatLogits[frame])), // sigmoid
+  }));
+
+  // Estimate BPM from the median inter-beat interval (upper median for an even count)
+  let bpm = 120; // fallback
+  if (beats.length >= 2) {
+    const intervals = beats.slice(1).map((beat, i) => beat.time - beats[i].time);
+    intervals.sort((a, b) => a - b);
+    const medianInterval = intervals[Math.floor(intervals.length / 2)];
+    if (medianInterval > 0) {
+      bpm = Math.round(60 / medianInterval);
+    }
+  }
+
+  return { bpm, beats };
+}
+
+// ---------- Chord inference ----------
+
+const ROOT_LABELS = ['N', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'];
+const PITCH_CLASSES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'];
+
+/**
+ * Decode a chord label from consonance-ACE decomposed outputs.
+ *
+ * @param rootIdx Argmax over the 13 root classes (0 = N, 1-12 = C..B).
+ * @param _bassIdx Argmax over the bass classes; accepted but not used in the label.
+ * @param chordProbs Sigmoid activations for pitch classes C..B; values > 0.5 count as active.
+ * @returns 'N', a bare root, or `root:quality` with quality one of
+ *   dim, aug, min7, 7, maj7, min, maj.
+ */
+function decodeChord(rootIdx: number, _bassIdx: number, chordProbs: number[]): string {
+  if (rootIdx === 0) return 'N';
+
+  const root = ROOT_LABELS[rootIdx];
+  const rootPitchIdx = rootIdx - 1; // 0-based pitch class
+
+  // Semitone distance above the root for every active pitch class.
+  const intervals = new Set<number>();
+  for (let pitch = 0; pitch < PITCH_CLASSES.length; pitch++) {
+    if (chordProbs[pitch] > 0.5) intervals.add((pitch - rootPitchIdx + 12) % 12);
+  }
+  if (intervals.size === 0) return root;
+
+  const hasMinor3 = intervals.has(3);
+  const hasMajor3 = intervals.has(4);
+  const hasDim5 = intervals.has(6);
+  const hasPerfect5 = intervals.has(7);
+  const hasAug5 = intervals.has(8);
+  const hasMinor7 = intervals.has(10);
+  const hasMajor7 = intervals.has(11);
+
+  // Qualities are tested in priority order; the first match wins.
+  if (hasDim5 && hasMinor3) return `${root}:dim`;
+  // Augmented requires the raised fifth itself: a merely missing fifth must not turn
+  // major triads or dominant sevenths into aug.
+  if (hasMajor3 && hasAug5 && !hasPerfect5) return `${root}:aug`;
+  if (hasMinor3 && hasMinor7) return `${root}:min7`;
+  if (hasMajor3 && hasMinor7) return `${root}:7`;
+  if (hasMajor3 && hasMajor7) return `${root}:maj7`;
+  if (hasMinor3) return `${root}:min`;
+  if (hasMajor3) return `${root}:maj`;
+
+  return root;
+}
+
+/**
+ * Run consonance-ACE ONNX model inference and merge per-frame labels into chord segments.
+ *
+ * Model I/O:
+ *   Input:  "cqt_features" [1, 1, 144, n_frames]
+ *   Output: "root_logits" [1, T, 13], "bass_logits" [1, T, 13], "chord_logits" [1, T, 12]
+ *
+ * Frame f is placed at time f * hopLength / sampleRate of CONSONANCE_ACE_CQT_OPTIONS.
+ *
+ * @param session Loaded consonance-ACE inference session.
+ * @param samples Mono audio samples to analyse.
+ * @returns Chord segments lasting at least 0.3 s; `confidence` is the highest root softmax
+ *   probability seen inside the segment.
+ */
+async function runChordInference(
+  session: Awaited<ReturnType<typeof loadOnnxSession>>,
+  samples: Float32Array,
+): Promise<ChordEvent[]> {
+  const ort = await getOrt();
+
+  // Compute CQT features
+  const { data: cqtData, nBins, nFrames } = computeCQT(samples, CONSONANCE_ACE_CQT_OPTIONS);
+
+  // Flatten to [1, 1, nBins, nFrames]
+  const inputData = cqtToOnnxInput(cqtData, nBins, nFrames);
+  const inputTensor = new ort.Tensor('float32', inputData, [1, 1, nBins, nFrames]);
+  const results = await session.run({ [session.inputNames[0]]: inputTensor });
+
+  // consonance-ACE outputs 3 tensors
+  const outputNames = session.outputNames;
+  const rootLogits = results[outputNames[0]].data as Float32Array;   // [1, T, 13]
+  const bassLogits = results[outputNames[1]].data as Float32Array;   // [1, T, 13]
+  const chordLogits = results[outputNames[2]].data as Float32Array;  // [1, T, 12]
+
+  const nRootClasses = 13;
+  const nBassClasses = 13;
+  const nChordClasses = 12;
+  const frameTimeStep = CONSONANCE_ACE_CQT_OPTIONS.hopLength / CONSONANCE_ACE_CQT_OPTIONS.sampleRate;
+
+  // Decode only frames present in every output tensor: reading past the end of a typed
+  // array yields undefined, which would surface as NaN confidences and spurious 'N' frames.
+  const decodableFrames = Math.min(
+    nFrames,
+    Math.floor(rootLogits.length / nRootClasses),
+    Math.floor(bassLogits.length / nBassClasses),
+    Math.floor(chordLogits.length / nChordClasses),
+  );
+
+  const frameLabels: { label: string; confidence: number }[] = [];
+  for (let f = 0; f < decodableFrames; f++) {
+    // Argmax for root
+    let rootIdx = 0, rootMax = -Infinity;
+    for (let c = 0; c < nRootClasses; c++) {
+      const val = rootLogits[f * nRootClasses + c];
+      if (val > rootMax) { rootMax = val; rootIdx = c; }
+    }
+
+    // Argmax for bass
+    let bassIdx = 0, bassMax = -Infinity;
+    for (let c = 0; c < nBassClasses; c++) {
+      const val = bassLogits[f * nBassClasses + c];
+      if (val > bassMax) { bassMax = val; bassIdx = c; }
+    }
+
+    // Sigmoid for chord note activations
+    const chordProbs: number[] = [];
+    for (let c = 0; c < nChordClasses; c++) {
+      chordProbs.push(1 / (1 + Math.exp(-chordLogits[f * nChordClasses + c])));
+    }
+
+    // Confidence: softmax probability of the winning root. Shifting by rootMax first keeps
+    // large logits from overflowing exp() to Infinity (Infinity / Infinity = NaN).
+    let expSum = 0;
+    for (let c = 0; c < nRootClasses; c++) {
+      expSum += Math.exp(rootLogits[f * nRootClasses + c] - rootMax);
+    }
+
+    frameLabels.push({ label: decodeChord(rootIdx, bassIdx, chordProbs), confidence: 1 / expSum });
+  }
+
+  // Merge consecutive identical chords
+  const chords: ChordEvent[] = [];
+  let currentLabel = '';
+  let startTime = 0;
+  let maxConf = 0;
+
+  // Close the open segment (if any) at the given frame boundary.
+  const flush = (endFrame: number) => {
+    if (!currentLabel) return;
+    chords.push({ startTime, endTime: endFrame * frameTimeStep, label: currentLabel, confidence: maxConf });
+  };
+
+  for (let i = 0; i < frameLabels.length; i++) {
+    const { label, confidence } = frameLabels[i];
+    if (label === currentLabel) {
+      maxConf = Math.max(maxConf, confidence);
+      continue;
+    }
+    flush(i);
+    currentLabel = label;
+    startTime = i * frameTimeStep;
+    maxConf = confidence;
+  }
+  flush(frameLabels.length);
+
+  // Filter out very short chords (< 0.3s)
+  return chords.filter((c) => c.endTime - c.startTime >= 0.3);
+}
+
+// ---------- Key and time signature inference ----------
+
+/**
+ * Guess a key from chord segments: the root with the greatest total duration is the tonic and
+ * the mode is whichever of its `:maj*` / `:min*` segments lasted longer (ties go to major).
+ */
+function inferKeyFromChords(chords: ChordEvent[]): string | null {
+  if (chords.length === 0) return null;
+  const span = (c: ChordEvent) => c.endTime - c.startTime;
+  const durationByRoot = new Map<string, number>();
+  for (const chord of chords) {
+    if (chord.label === 'N') continue;
+    const [root] = chord.label.split(':');
+    durationByRoot.set(root, (durationByRoot.get(root) ?? 0) + span(chord));
+  }
+  if (durationByRoot.size === 0) return null; // nothing but 'N' segments
+
+  let tonic = '';
+  let longest = 0;
+  durationByRoot.forEach((total, root) => {
+    if (total > longest) { longest = total; tonic = root; }
+  });
+
+  const timeWithPrefix = (prefix: string) =>
+    chords.reduce((sum, c) => (c.label.startsWith(prefix) ? sum + span(c) : sum), 0);
+  const majorTime = timeWithPrefix(`${tonic}:maj`);
+  const minorTime = timeWithPrefix(`${tonic}:min`);
+  return `${tonic} ${majorTime >= minorTime ? 'major' : 'minor'}`;
+}
+
+/**
+ * Estimate the meter as "<n>/4", n being the most common number of beats that fall in
+ * [downbeat, next downbeat). Returns null with fewer than two downbeats.
+ */
+function inferTimeSignature(beats: BeatEvent[]): string | null {
+  const barStarts = beats.filter((b) => b.isDownbeat).map((b) => b.time);
+  if (barStarts.length < 2) return null;
+
+  // Histogram of beats-per-bar over every pair of consecutive downbeats.
+  const histogram = new Map<number, number>();
+  for (let bar = 0; bar + 1 < barStarts.length; bar++) {
+    const from = barStarts[bar];
+    const to = barStarts[bar + 1];
+    const beatsInBar = beats.filter((b) => b.time >= from && b.time < to).length;
+    histogram.set(beatsInBar, (histogram.get(beatsInBar) ?? 0) + 1);
+  }
+  // Ties keep the count seen first: only a strictly higher frequency replaces it.
+  let mode = 4;
+  let modeHits = 0;
+  histogram.forEach((hits, beatsInBar) => {
+    if (hits > modeHits) { modeHits = hits; mode = beatsInBar; }
+  });
+  return `${mode}/4`;
+}
+
+// ---------- Worker message handler ----------
+
+self.onmessage = async (e: MessageEvent<AnalysisWorkerRequest>) => { // one request -> progress messages, then one 'result' or 'error'
+  const { samples, sampleRate, tasks } = e.data;
+
+  try {
+    let beats: BeatEvent[] = [];
+    let bpm = 120; // reported unchanged when 'bpm' is not among the requested tasks
+
+    if (tasks.includes('bpm')) {
+      // Compute mel spectrogram with Beat This! settings, using the request's sample rate
+      postProgress('computing-features', 10, 'Computing mel spectrogram...');
+      const melFrames = computeMelSpectrogram(samples, {
+        ...BEAT_THIS_MEL_OPTIONS,
+        sampleRate, // NOTE(review): runBpmInference converts frames to time with a fixed 441 / 22050 — confirm input is always 22050 Hz
+      });
+
+      postProgress('loading-model', 20, 'Loading BPM model...');
+      if (!bpmSession) { // first 'bpm' request only; the session is cached in the module-level variable
+        bpmSession = await loadOnnxSession('/models/beat-this.onnx');
+      }
+      postProgress('running-bpm', 40, 'Detecting beats...');
+      const bpmResult = await runBpmInference(
+        bpmSession as Awaited<ReturnType<typeof loadOnnxSession>>,
+        melFrames,
+      );
+      beats = bpmResult.beats;
+      bpm = bpmResult.bpm;
+    }
+
+    let chords: ChordEvent[] = []; // stays empty when 'chords' is not requested
+
+    if (tasks.includes('chords')) {
+      // Peak-normalize audio to [-1, 1] for CQT (matching consonance-ACE preprocessing); all-zero input is passed through
+      const maxVal = samples.reduce((max, v) => Math.max(max, Math.abs(v)), 0);
+      const normalizedSamples = maxVal > 0
+        ? samples.map((v) => v / maxVal)
+        : samples;
+
+      postProgress('computing-features', 50, 'Computing CQT features...'); // NOTE: the CQT itself is computed later, inside runChordInference
+      postProgress('loading-model', 55, 'Loading chord model...');
+      if (!chordSession) { // first 'chords' request only; cached like bpmSession
+        chordSession = await loadOnnxSession('/models/consonance-ace.onnx');
+      }
+      postProgress('running-chords', 70, 'Recognizing chords...');
+      chords = await runChordInference(
+        chordSession as Awaited<ReturnType<typeof loadOnnxSession>>,
+        normalizedSamples,
+      );
+    }
+
+    postProgress('post-processing', 90, 'Finalizing results...');
+    const keyScale = inferKeyFromChords(chords); // null when there are no chords
+    const timeSignature = inferTimeSignature(beats); // null with fewer than two downbeats
+
+    const result: LocalAnalysisResult = { bpm, beats, chords, keyScale, timeSignature };
+    self.postMessage({ type: 'result', result } satisfies AnalysisWorkerResult);
+  } catch (err) { // any failure above (fetch, model load, inference) is reported as one 'error' message
+    self.postMessage({
+      type: 'error',
+      error: err instanceof Error ? err.message : String(err),
+    } satisfies AnalysisWorkerError);
+  }
+};
diff --git a/vite.config.ts b/vite.config.ts
index 933b0d75..b3292033 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -17,6 +17,12 @@ export default defineConfig(async ({ command }) => {
 
   return {
     plugins,
+    optimizeDeps: {
+      exclude: ['onnxruntime-web'],
+    },
+    worker: {
+      format: 'es',
+    },
     resolve: {
       alias: {
         // Stub out @kabelsalat/web — Strudel's optional modular synth engine