From cd92f6f1b5b58abdeb48562cdf2a8e8b1bd5db68 Mon Sep 17 00:00:00 2001
From: dirtyhandz <antonrozanov@gmail.com>
Date: Sat, 28 Mar 2026 09:36:03 +0100
Subject: [PATCH 1/4] Add TurboQuant KV cache compression (3-bit, 4.6x)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements TurboQuant (arXiv 2504.19874) KV cache compression:
- PolarQuant: randomized Hadamard rotation + Lloyd-Max codebook
- Bit-packed uint32 storage (3-bit: 10 values per word)
- Fused Metal kernels for quantize and dequantize
- Incremental decode buffer for O(1) per-step cost
- Layer-adaptive mode: FP16 for first/last N layers

Usage:
  generate_step(prompt, model, turbo_kv_bits=3)

Results (Qwen2.5-32B, M4 Pro 48GB):
- 4.6x compression, 0.98x FP16 speed, identical quality
- 16K context: 4.2GB → 897MB KV cache
---
 mlx_lm/generate.py                   |  46 +++++-
 mlx_lm/models/turboquant_cache.py    | 208 ++++++++++++++++++++++++
 mlx_lm/models/turboquant_kernels.py  | 196 ++++++++++++++++++++++
 mlx_lm/models/turboquant_metal.py    | 232 +++++++++++++++++++++++++++
 mlx_lm/models/turboquant_packing.py  |  89 ++++++++++
 mlx_lm/models/turboquant_rotation.py |  80 +++++++++
 6 files changed, 847 insertions(+), 4 deletions(-)
 create mode 100644 mlx_lm/models/turboquant_cache.py
 create mode 100644 mlx_lm/models/turboquant_kernels.py
 create mode 100644 mlx_lm/models/turboquant_metal.py
 create mode 100644 mlx_lm/models/turboquant_packing.py
 create mode 100644 mlx_lm/models/turboquant_rotation.py

diff --git a/mlx_lm/generate.py b/mlx_lm/generate.py
index ef8dbf7bf..f34d2b015 100644
--- a/mlx_lm/generate.py
+++ b/mlx_lm/generate.py
@@ -300,6 +300,32 @@ def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_
             prompt_cache[e] = c.to_quantized(group_size=kv_group_size, bits=kv_bits)
 
 
+def make_turboquant_cache(model, bits=3, fp16_layers=1):
+    """Create layer-adaptive TurboQuant cache.
+
+    First and last ``fp16_layers`` layers use standard FP16 KVCache.
+    Middle layers use TurboQuantKVCache with ``bits``-bit compression.
+
+    Args:
+        model: The model to create caches for.
+        bits (int): Quantization bits (1-4). Default: ``3`` (4.6x compression).
+        fp16_layers (int): Number of first/last layers to keep in FP16. Default: ``1``.
+
+    Returns:
+        List of cache objects (one per layer).
+    """
+    from mlx_lm.models.turboquant_cache import TurboQuantKVCache
+
+    num_layers = len(model.layers)
+    caches = []
+    for i in range(num_layers):
+        if i < fp16_layers or i >= num_layers - fp16_layers:
+            caches.append(cache.KVCache())
+        else:
+            caches.append(TurboQuantKVCache(bits=bits))
+    return caches
+
+
 def generate_step(
     prompt: mx.array,
     model: nn.Module,
@@ -313,6 +339,8 @@ def generate_step(
     kv_bits: Optional[int] = None,
     kv_group_size: int = 64,
     quantized_kv_start: int = 0,
+    turbo_kv_bits: Optional[int] = None,
+    turbo_fp16_layers: int = 1,
     prompt_progress_callback: Optional[Callable[[int, int], None]] = None,
     input_embeddings: Optional[mx.array] = None,
 ) -> Generator[Tuple[mx.array, mx.array], None, None]:
@@ -339,6 +367,11 @@ def generate_step(
         kv_group_size (int): Group size for KV cache quantization. Default: ``64``.
         quantized_kv_start (int): Step to begin using a quantized KV cache.
            when ``kv_bits`` is non-None. Default: ``0``.
+        turbo_kv_bits (int, optional): TurboQuant KV cache compression bits (1-4).
+          Uses PolarQuant with Hadamard rotation. 3-bit gives 4.6x compression.
+          None implies no TurboQuant. Default: ``None``.
+        turbo_fp16_layers (int): Number of first/last layers to keep in FP16 when
+          using TurboQuant. Default: ``1``.
         prompt_progress_callback (Callable[[int, int], None]): A call-back which takes the
            prompt tokens processed so far and the total number of prompt tokens.
         input_embeddings (mx.array, optional): Input embeddings to use instead of or in
@@ -365,10 +398,15 @@ def generate_step(
 
     # Create the KV cache for generation
     if prompt_cache is None:
-        prompt_cache = cache.make_prompt_cache(
-            model,
-            max_kv_size=max_kv_size,
-        )
+        if turbo_kv_bits is not None:
+            prompt_cache = make_turboquant_cache(
+                model, bits=turbo_kv_bits, fp16_layers=turbo_fp16_layers,
+            )
+        else:
+            prompt_cache = cache.make_prompt_cache(
+                model,
+                max_kv_size=max_kv_size,
+            )
 
     prompt_progress_callback = prompt_progress_callback or (lambda *_: None)
 
diff --git a/mlx_lm/models/turboquant_cache.py b/mlx_lm/models/turboquant_cache.py
new file mode 100644
index 000000000..2f196a862
--- /dev/null
+++ b/mlx_lm/models/turboquant_cache.py
@@ -0,0 +1,208 @@
+"""TurboQuantKVCache: PolarQuant KV cache compression with fused Metal kernels.
+
+Implements TurboQuant (arXiv 2504.19874, ICLR 2026) for MLX KV cache compression.
+4.6x compression via randomized Hadamard rotation + Lloyd-Max quantization.
+Bit-packed uint32 storage with fused Metal quantize/dequantize kernels.
+"""
+
+import mlx.core as mx
+import math
+from mlx_lm.models.turboquant_rotation import random_diagonal_sign
+from mlx_lm.models.turboquant_packing import pack_indices, unpack_indices, packed_dim, VALS_PER_WORD
+from mlx_lm.models.turboquant_metal import fused_quantize, dequant_fp16
+from mlx_lm.models.turboquant_kernels import packed_dequantize
+
+
+def _compute_gaussian_codebook(bits):
+    codebooks = {
+        1: [-0.7979, 0.7979],
+        2: [-1.5104, -0.4528, 0.4528, 1.5104],
+        3: [-2.1520, -1.3440, -0.7560, -0.2451,
+             0.2451, 0.7560, 1.3440, 2.1520],
+        4: [-2.7326, -2.0690, -1.6180, -1.2562,
+            -0.9423, -0.6568, -0.3881, -0.1284,
+             0.1284, 0.3881, 0.6568, 0.9423,
+             1.2562, 1.6180, 2.0690, 2.7326],
+    }
+    return mx.array(codebooks[bits], dtype=mx.float32)
+
+
+def _compute_boundaries(centroids):
+    return (centroids[:-1] + centroids[1:]) / 2.0
+
+
+class _Quantizer:
+    def __init__(self, dim, bits, seed):
+        self.dim = dim
+        self.bits = bits
+        self.signs = random_diagonal_sign(dim, seed=seed)
+        self.centroids = _compute_gaussian_codebook(bits)
+        self.boundaries = _compute_boundaries(self.centroids)
+
+
+class TurboQuantKVCache:
+    """TurboQuant KV cache — drop-in replacement for KVCache.
+
+    Compresses KV vectors using PolarQuant (Hadamard rotation + Lloyd-Max
+    codebook quantization). Stores bit-packed indices in uint32 + float32 norms.
+
+    Uses fused Metal kernels for quantize and dequantize operations.
+    Maintains an incremental decode buffer for O(1) per-step dequantization.
+    """
+
+    step = 256
+
+    def __init__(self, bits: int = 3, seed: int = 42):
+        self.quant_bits = bits
+        self.seed = seed
+        self.offset = 0
+
+        self.k_packed = None
+        self.k_norms = None
+        self.v_packed = None
+        self.v_norms = None
+
+        self._k_deq_buf = None
+        self._v_deq_buf = None
+        self._deq_offset = 0
+        self._deq_alloc = 0
+
+        self._k_q = None
+        self._v_q = None
+        self._k_dim = None
+        self._v_dim = None
+        self._k_pdim = None
+        self._v_pdim = None
+
+    def _ensure_quantizer(self, k_dim, v_dim):
+        if self._k_q is None:
+            self._k_q = _Quantizer(k_dim, self.quant_bits, self.seed)
+            self._k_dim = k_dim
+            self._k_pdim = packed_dim(k_dim, self.quant_bits)
+        if self._v_q is None:
+            self._v_q = _Quantizer(v_dim, self.quant_bits, self.seed + 1)
+            self._v_dim = v_dim
+            self._v_pdim = packed_dim(v_dim, self.quant_bits)
+
+    def _ensure_storage(self, B, H, num_new):
+        prev = self.offset
+        needed = prev + num_new
+        if self.k_packed is None or needed > self.k_packed.shape[2]:
+            n = ((needed + self.step - 1) // self.step) * self.step
+            new_kp = mx.zeros((B, H, n, self._k_pdim), dtype=mx.uint32)
+            new_kn = mx.zeros((B, H, n), dtype=mx.float32)
+            new_vp = mx.zeros((B, H, n, self._v_pdim), dtype=mx.uint32)
+            new_vn = mx.zeros((B, H, n), dtype=mx.float32)
+            if self.k_packed is not None:
+                self.k_packed = mx.concatenate([self.k_packed[..., :prev, :], new_kp], axis=2)
+                self.k_norms = mx.concatenate([self.k_norms[..., :prev], new_kn], axis=2)
+                self.v_packed = mx.concatenate([self.v_packed[..., :prev, :], new_vp], axis=2)
+                self.v_norms = mx.concatenate([self.v_norms[..., :prev], new_vn], axis=2)
+            else:
+                self.k_packed, self.k_norms = new_kp, new_kn
+                self.v_packed, self.v_norms = new_vp, new_vn
+
+    def _full_dequant(self, packed, norms, q, dim, B, H, total, dtype):
+        flat_p = packed[..., :total, :].reshape(-1, packed.shape[-1])
+        flat_n = norms[..., :total].reshape(-1)
+        out = packed_dequantize(flat_p, flat_n, q.centroids, q.signs, dim, self.quant_bits)
+        return out.reshape(B, H, total, dim).astype(dtype)
+
+    def update_and_fetch(self, keys, values):
+        B, H, S, k_dim = keys.shape
+        v_dim = values.shape[3]
+        self._ensure_quantizer(k_dim, v_dim)
+        self._ensure_storage(B, H, S)
+        prev = self.offset
+
+        # Fused Metal quantize
+        k_pk, k_nrm = fused_quantize(keys.reshape(-1, k_dim), self._k_q.signs, self._k_q.boundaries, k_dim, self.quant_bits)
+        k_pk = k_pk.reshape(B, H, S, self._k_pdim)
+        v_pk, v_nrm = fused_quantize(values.reshape(-1, v_dim), self._v_q.signs, self._v_q.boundaries, v_dim, self.quant_bits)
+        v_pk = v_pk.reshape(B, H, S, self._v_pdim)
+
+        self.k_packed[..., prev:prev+S, :] = k_pk
+        self.k_norms[..., prev:prev+S] = k_nrm.reshape(B, H, S)
+        self.v_packed[..., prev:prev+S, :] = v_pk
+        self.v_norms[..., prev:prev+S] = v_nrm.reshape(B, H, S)
+        self.offset += S
+        total = self.offset
+
+        # Incremental decode
+        if S <= 4 and self._v_deq_buf is not None and self._deq_offset == prev:
+            if total > self._deq_alloc:
+                na = ((total + self.step - 1) // self.step) * self.step
+                self._k_deq_buf = mx.concatenate([self._k_deq_buf[..., :self._deq_offset, :],
+                    mx.zeros((B, H, na - self._deq_alloc, k_dim), dtype=keys.dtype)], axis=2)
+                self._v_deq_buf = mx.concatenate([self._v_deq_buf[..., :self._deq_offset, :],
+                    mx.zeros((B, H, na - self._deq_alloc, v_dim), dtype=values.dtype)], axis=2)
+                self._deq_alloc = na
+
+            nk = dequant_fp16(k_pk.reshape(-1, self._k_pdim), k_nrm, self._k_q.centroids, self._k_q.signs, k_dim, self.quant_bits).reshape(B, H, S, k_dim)
+            nv = dequant_fp16(v_pk.reshape(-1, self._v_pdim), v_nrm, self._v_q.centroids, self._v_q.signs, v_dim, self.quant_bits).reshape(B, H, S, v_dim)
+            self._k_deq_buf[..., prev:total, :] = nk
+            self._v_deq_buf[..., prev:total, :] = nv
+            self._deq_offset = total
+            return self._k_deq_buf[..., :total, :], self._v_deq_buf[..., :total, :]
+
+        # Full dequant (prefill)
+        all_k = self._full_dequant(self.k_packed, self.k_norms, self._k_q, k_dim, B, H, total, keys.dtype)
+        all_v = self._full_dequant(self.v_packed, self.v_norms, self._v_q, v_dim, B, H, total, values.dtype)
+        alloc = ((total + self.step - 1) // self.step) * self.step
+        self._k_deq_buf = mx.zeros((B, H, alloc, k_dim), dtype=keys.dtype)
+        self._v_deq_buf = mx.zeros((B, H, alloc, v_dim), dtype=values.dtype)
+        self._k_deq_buf[..., :total, :] = all_k
+        self._v_deq_buf[..., :total, :] = all_v
+        self._deq_offset = total
+        self._deq_alloc = alloc
+        return all_k, all_v
+
+    def empty(self):
+        return self.k_packed is None
+
+    @property
+    def nbytes(self):
+        if self.k_packed is None:
+            return 0
+        return (self.k_packed[..., :self.offset, :].nbytes + self.v_packed[..., :self.offset, :].nbytes +
+                self.k_norms[..., :self.offset].nbytes + self.v_norms[..., :self.offset].nbytes)
+
+    @property
+    def state(self):
+        if self.k_packed is None:
+            return []
+        return [self.k_packed[..., :self.offset, :], self.k_norms[..., :self.offset],
+                self.v_packed[..., :self.offset, :], self.v_norms[..., :self.offset]]
+
+    @state.setter
+    def state(self, v):
+        if not v:
+            return
+        self.k_packed, self.k_norms, self.v_packed, self.v_norms = v
+        self.offset = self.k_packed.shape[2]
+
+    @property
+    def meta_state(self):
+        return f"{self.offset},{self.quant_bits},{self.seed},{self._k_dim or 0},{self._v_dim or 0}"
+
+    @meta_state.setter
+    def meta_state(self, v):
+        parts = v.split(",")
+        self.offset, self.quant_bits, self.seed = int(parts[0]), int(parts[1]), int(parts[2])
+        self._k_dim = int(parts[3]) or None
+        self._v_dim = int(parts[4]) or None
+
+    def is_trimmable(self):
+        return True
+
+    def trim(self, n):
+        n = min(self.offset, n)
+        self.offset -= n
+        return n
+
+    def size(self):
+        return self.offset
+
+    def make_mask(self, *args, **kwargs):
+        from mlx_lm.models.cache import create_attention_mask
+        return create_attention_mask(*args, offset=self.offset, **kwargs)
diff --git a/mlx_lm/models/turboquant_kernels.py b/mlx_lm/models/turboquant_kernels.py
new file mode 100644
index 000000000..6c473b3f4
--- /dev/null
+++ b/mlx_lm/models/turboquant_kernels.py
@@ -0,0 +1,196 @@
+"""Metal kernels v3: read directly from bit-packed uint32 storage.
+
+Eliminates Python unpack step — the kernel extracts 3-bit indices
+from packed uint32 words on the fly. Zero intermediate buffers.
+
+Packing format: 10 × 3-bit values per uint32 (30/32 bits used)
+  word = val0 | (val1 << 3) | (val2 << 6) | ... | (val9 << 27)
+"""
+
+import mlx.core as mx
+import math
+
+# Parallel dequant from packed storage
+PACKED_DEQUANT_KERNEL = """
+    uint pos = threadgroup_position_in_grid.x;
+    uint elem = thread_position_in_threadgroup.x;
+    uint dim = dims[0];
+    uint bits = dims[1];
+    uint vals_per_word = dims[2];
+    uint packed_dim = dims[3];
+    uint bit_mask = (1u << bits) - 1u;
+
+    // Extract index from packed uint32
+    uint word_idx = elem / vals_per_word;
+    uint pos_in_word = elem % vals_per_word;
+    uint word = packed[pos * packed_dim + word_idx];
+    uint idx = (word >> (pos_in_word * bits)) & bit_mask;
+
+    // Codebook lookup
+    T val = centroids[idx] * scale[0];
+
+    // Parallel WHT butterfly in threadgroup memory
+    threadgroup T shared[256];
+    shared[elem] = val;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    uint h = 1;
+    while (h < dim) {
+        uint block = elem / (2 * h);
+        uint offset = elem % (2 * h);
+        if (offset < h) {
+            uint j = block * 2 * h + offset;
+            T a = shared[j];
+            T b = shared[j + h];
+            shared[j] = a + b;
+            shared[j + h] = a - b;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        h *= 2;
+    }
+
+    // Apply WHT scale, signs, and vector norm
+    T result = shared[elem] * scale[0] * signs[elem] * norms[pos];
+    out[pos * dim + elem] = result;
+"""
+
+# Fused Q@K^T from packed storage — no unpack, no intermediate dequant
+PACKED_FUSED_QK_KERNEL = """
+    uint pos = threadgroup_position_in_grid.x;
+    uint head = threadgroup_position_in_grid.y;
+    uint elem = thread_position_in_threadgroup.x;
+    uint dim = dims[0];
+    uint seq_len = dims[1];
+    uint n_heads = dims[2];
+    uint bits = dims[3];
+    uint vals_per_word = dims[4];
+    uint packed_dim = dims[5];
+    uint bit_mask = (1u << bits) - 1u;
+
+    // Extract index from packed storage
+    uint kv_base = head * seq_len * packed_dim + pos * packed_dim;
+    uint word_idx = elem / vals_per_word;
+    uint pos_in_word = elem % vals_per_word;
+    uint word = packed[kv_base + word_idx];
+    uint idx = (word >> (pos_in_word * bits)) & bit_mask;
+
+    T val = centroids[idx] * scale[0];
+
+    // Parallel WHT butterfly
+    threadgroup T shared[256];
+    shared[elem] = val;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    uint h = 1;
+    while (h < dim) {
+        uint block = elem / (2 * h);
+        uint offset = elem % (2 * h);
+        if (offset < h) {
+            uint j = block * 2 * h + offset;
+            T a = shared[j];
+            T b = shared[j + h];
+            shared[j] = a + b;
+            shared[j + h] = a - b;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        h *= 2;
+    }
+
+    // Dequant value + dot product with query
+    T dequant_val = shared[elem] * scale[0] * signs[elem] * norms[head * seq_len + pos];
+    T partial = dequant_val * query[head * dim + elem];
+
+    // Parallel reduction
+    shared[elem] = partial;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint stride = dim / 2; stride > 0; stride >>= 1) {
+        if (elem < stride) {
+            shared[elem] += shared[elem + stride];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    if (elem == 0) {
+        out[head * seq_len + pos] = shared[0];
+    }
+"""
+
+_packed_dequant = None
+_packed_fused_qk = None
+
+
+def packed_dequantize(
+    packed: mx.array,
+    norms: mx.array,
+    centroids: mx.array,
+    signs: mx.array,
+    dim: int,
+    bits: int,
+) -> mx.array:
+    """Dequantize directly from packed uint32 storage via Metal."""
+    global _packed_dequant
+    if _packed_dequant is None:
+        _packed_dequant = mx.fast.metal_kernel(
+            name="tq_packed_dequant",
+            input_names=["packed", "norms", "centroids", "signs", "scale", "dims"],
+            output_names=["out"],
+            source=PACKED_DEQUANT_KERNEL,
+        )
+
+    seq_len = norms.shape[0]
+    p_dim = packed.shape[-1]
+    vpw = {1: 32, 2: 16, 3: 10, 4: 8}[bits]
+    scale = mx.array([1.0 / math.sqrt(dim)], dtype=mx.float32)
+    dims_arr = mx.array([dim, bits, vpw, p_dim], dtype=mx.uint32)
+
+    outputs = _packed_dequant(
+        inputs=[packed.astype(mx.uint32).reshape(-1), norms.astype(mx.float32), centroids, signs, scale, dims_arr],
+        template=[("T", mx.float32)],
+        grid=(seq_len * dim, 1, 1),
+        threadgroup=(dim, 1, 1),
+        output_shapes=[(seq_len, dim)],
+        output_dtypes=[mx.float32],
+    )
+    return outputs[0]
+
+
+def packed_fused_qk_scores(
+    query: mx.array,
+    k_packed: mx.array,
+    k_norms: mx.array,
+    centroids: mx.array,
+    signs: mx.array,
+    dim: int,
+    bits: int,
+) -> mx.array:
+    """Fused Q@K^T reading directly from packed storage."""
+    global _packed_fused_qk
+    if _packed_fused_qk is None:
+        _packed_fused_qk = mx.fast.metal_kernel(
+            name="tq_packed_fused_qk",
+            input_names=["query", "packed", "norms", "centroids", "signs", "scale", "dims"],
+            output_names=["out"],
+            source=PACKED_FUSED_QK_KERNEL,
+        )
+
+    n_heads, seq_len = k_norms.shape
+    p_dim = k_packed.shape[-1]
+    vpw = {1: 32, 2: 16, 3: 10, 4: 8}[bits]
+    scale = mx.array([1.0 / math.sqrt(dim)], dtype=mx.float32)
+    dims_arr = mx.array([dim, seq_len, n_heads, bits, vpw, p_dim], dtype=mx.uint32)
+
+    outputs = _packed_fused_qk(
+        inputs=[
+            query.astype(mx.float32).reshape(n_heads * dim),
+            k_packed.astype(mx.uint32).reshape(n_heads * seq_len * p_dim),
+            k_norms.astype(mx.float32).reshape(n_heads * seq_len),
+            centroids, signs, scale, dims_arr,
+        ],
+        template=[("T", mx.float32)],
+        grid=(seq_len * dim, n_heads, 1),
+        threadgroup=(dim, 1, 1),
+        output_shapes=[(n_heads * seq_len,)],
+        output_dtypes=[mx.float32],
+    )
+    return outputs[0].reshape(n_heads, seq_len)
diff --git a/mlx_lm/models/turboquant_metal.py b/mlx_lm/models/turboquant_metal.py
new file mode 100644
index 000000000..11a1b7ca6
--- /dev/null
+++ b/mlx_lm/models/turboquant_metal.py
@@ -0,0 +1,232 @@
+"""Fused Metal quantize kernel: raw fp16 vector → packed uint32 + norm.
+
+Replaces the Python path: upcast → norm → normalize → signs → WHT → scale →
+nearest centroid → pack. All in one Metal dispatch per batch of vectors.
+
+Also includes fp16-output dequant for decode buffer writes.
+"""
+
+import mlx.core as mx
+import math
+
+# Fused quantize: one threadgroup per vector (dim threads)
+# Input: fp16 vectors. Output: packed uint32 indices + float32 norms.
+FUSED_QUANTIZE_KERNEL = """
+    uint pos = threadgroup_position_in_grid.x;
+    uint elem = thread_position_in_threadgroup.x;
+    uint dim = dims[0];
+    uint bits = dims[1];
+    uint vals_per_word = dims[2];
+    uint packed_dim = dims[3];
+    uint n_centroids = dims[4];
+
+    // Load input vector into shared memory as float32
+    threadgroup float shared[256];
+    shared[elem] = (float)inp[pos * dim + elem];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // Step 1: Compute L2 norm via parallel reduction
+    threadgroup float norm_shared[256];
+    norm_shared[elem] = shared[elem] * shared[elem];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint stride = dim / 2; stride > 0; stride >>= 1) {
+        if (elem < stride) {
+            norm_shared[elem] += norm_shared[elem + stride];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    float vec_norm = sqrt(norm_shared[0]);
+    float safe_norm = max(vec_norm, 1e-8f);
+
+    // Step 2: Normalize
+    shared[elem] = shared[elem] / safe_norm;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // Step 3: Apply signs (randomized Hadamard = signs * WHT)
+    shared[elem] = shared[elem] * signs[elem];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // Step 4: WHT butterfly
+    uint h = 1;
+    while (h < dim) {
+        uint block = elem / (2 * h);
+        uint offset = elem % (2 * h);
+        if (offset < h) {
+            uint j = block * 2 * h + offset;
+            float a = shared[j];
+            float b = shared[j + h];
+            shared[j] = a + b;
+            shared[j + h] = a - b;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        h *= 2;
+    }
+
+    // After raw butterfly (no 1/sqrt(d) normalization), values are already
+    // in N(0,1) space: butterfly(x_unit * signs) ≈ N(0, 1)
+    // No additional scaling needed — butterfly output matches codebook directly
+    float scaled = shared[elem];
+
+    // Step 6: Nearest centroid (count boundaries exceeded)
+    uint idx = 0;
+    for (uint b = 0; b < n_centroids - 1; b++) {
+        if (scaled > boundaries[b]) {
+            idx++;
+        }
+    }
+
+    // Step 7: Pack indices - thread 0 of each pack group collects and packs
+    // First store indices to shared memory
+    threadgroup uint idx_shared[256];
+    idx_shared[elem] = idx;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // Each thread responsible for one packed word writes it
+    uint word_idx = elem / vals_per_word;
+    uint pos_in_word = elem % vals_per_word;
+
+    if (pos_in_word == 0 && word_idx < packed_dim) {
+        uint word = 0;
+        for (uint i = 0; i < vals_per_word && (word_idx * vals_per_word + i) < dim; i++) {
+            word |= (idx_shared[word_idx * vals_per_word + i] & ((1u << bits) - 1u)) << (i * bits);
+        }
+        packed_out[pos * packed_dim + word_idx] = word;
+    }
+
+    // Thread 0 writes the norm
+    if (elem == 0) {
+        norms_out[pos] = vec_norm;
+    }
+"""
+
+# fp16-output dequant: same as v3 but outputs half precision
+DEQUANT_FP16_KERNEL = """
+    uint pos = threadgroup_position_in_grid.x;
+    uint elem = thread_position_in_threadgroup.x;
+    uint dim = dims[0];
+    uint bits = dims[1];
+    uint vals_per_word = dims[2];
+    uint packed_dim = dims[3];
+    uint bit_mask = (1u << bits) - 1u;
+
+    uint word_idx = elem / vals_per_word;
+    uint pos_in_word = elem % vals_per_word;
+    uint word = packed[pos * packed_dim + word_idx];
+    uint idx = (word >> (pos_in_word * bits)) & bit_mask;
+
+    float val = centroids[idx] * scale[0];
+
+    threadgroup float shared[256];
+    shared[elem] = val;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    uint h = 1;
+    while (h < dim) {
+        uint block = elem / (2 * h);
+        uint offset = elem % (2 * h);
+        if (offset < h) {
+            uint j = block * 2 * h + offset;
+            float a = shared[j];
+            float b = shared[j + h];
+            shared[j] = a + b;
+            shared[j + h] = a - b;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        h *= 2;
+    }
+
+    float result = shared[elem] * scale[0] * signs[elem] * norms[pos];
+    out[pos * dim + elem] = (half)result;
+"""
+
+_fused_quantize_kernel = None
+_dequant_fp16_kernel = None
+
+
+def fused_quantize(
+    vectors: mx.array,
+    signs: mx.array,
+    boundaries: mx.array,
+    dim: int,
+    bits: int,
+) -> tuple:
+    """Fused Metal quantize: raw vectors → packed uint32 + norms.
+
+    Args:
+        vectors: (n_vecs, dim) fp16/fp32 input
+        signs: (dim,) rotation signs
+        boundaries: (n_centroids-1,) decision boundaries
+        dim: head dimension
+        bits: quantization bits
+
+    Returns:
+        (packed, norms): packed uint32 (n_vecs, packed_dim), norms float32 (n_vecs,)
+    """
+    global _fused_quantize_kernel
+    if _fused_quantize_kernel is None:
+        _fused_quantize_kernel = mx.fast.metal_kernel(
+            name="tq_fused_quantize",
+            input_names=["inp", "signs", "boundaries", "dims"],
+            output_names=["packed_out", "norms_out"],
+            source=FUSED_QUANTIZE_KERNEL,
+        )
+
+    from mlx_lm.models.turboquant_packing import packed_dim as calc_packed_dim, VALS_PER_WORD
+    n_vecs = vectors.shape[0]
+    vpw = VALS_PER_WORD[bits]
+    p_dim = calc_packed_dim(dim, bits)
+    n_centroids = len(boundaries) + 1
+
+    dims_arr = mx.array([dim, bits, vpw, p_dim, n_centroids], dtype=mx.uint32)
+
+    outputs = _fused_quantize_kernel(
+        inputs=[
+            vectors.reshape(n_vecs * dim).astype(mx.float32),
+            signs.astype(mx.float32),
+            boundaries.astype(mx.float32),
+            dims_arr,
+        ],
+        template=[],
+        grid=(n_vecs * dim, 1, 1),
+        threadgroup=(dim, 1, 1),
+        output_shapes=[(n_vecs * p_dim,), (n_vecs,)],
+        output_dtypes=[mx.uint32, mx.float32],
+    )
+    return outputs[0].reshape(n_vecs, p_dim), outputs[1]
+
+
+def dequant_fp16(
+    packed: mx.array,
+    norms: mx.array,
+    centroids: mx.array,
+    signs: mx.array,
+    dim: int,
+    bits: int,
+) -> mx.array:
+    """Dequantize from packed to fp16 directly (no float32 intermediate)."""
+    global _dequant_fp16_kernel
+    if _dequant_fp16_kernel is None:
+        _dequant_fp16_kernel = mx.fast.metal_kernel(
+            name="tq_dequant_fp16",
+            input_names=["packed", "norms", "centroids", "signs", "scale", "dims"],
+            output_names=["out"],
+            source=DEQUANT_FP16_KERNEL,
+        )
+
+    from mlx_lm.models.turboquant_packing import packed_dim as calc_packed_dim, VALS_PER_WORD
+    seq_len = norms.shape[0]
+    vpw = VALS_PER_WORD[bits]
+    p_dim = calc_packed_dim(dim, bits)
+    scale = mx.array([1.0 / math.sqrt(dim)], dtype=mx.float32)
+    dims_arr = mx.array([dim, bits, vpw, p_dim], dtype=mx.uint32)
+
+    outputs = _dequant_fp16_kernel(
+        inputs=[packed.astype(mx.uint32).reshape(-1), norms.astype(mx.float32), centroids, signs, scale, dims_arr],
+        template=[],
+        grid=(seq_len * dim, 1, 1),
+        threadgroup=(dim, 1, 1),
+        output_shapes=[(seq_len, dim)],
+        output_dtypes=[mx.float16],
+    )
+    return outputs[0]
diff --git a/mlx_lm/models/turboquant_packing.py b/mlx_lm/models/turboquant_packing.py
new file mode 100644
index 000000000..feafe3537
--- /dev/null
+++ b/mlx_lm/models/turboquant_packing.py
@@ -0,0 +1,89 @@
+"""Bit-packing for TurboQuant indices.
+
+Packs multiple small-bit indices into uint32 words:
+- 1-bit: 32 values per uint32
+- 2-bit: 16 values per uint32
+- 3-bit: 10 values per uint32 (30/32 bits used)
+- 4-bit:  8 values per uint32
+
+For 3-bit with dim=128: 13 uint32s per vector (52 bytes) vs 128 bytes (uint8).
+Combined with float32 norm: 56 bytes/vector vs 256 bytes (fp16) = 4.6x compression.
+"""
+
+import mlx.core as mx
+import math
+
+VALS_PER_WORD = {1: 32, 2: 16, 3: 10, 4: 8}
+BIT_MASK = {1: 0x1, 2: 0x3, 3: 0x7, 4: 0xF}
+
+
+def packed_dim(dim: int, bits: int) -> int:
+    """Number of uint32 words needed to pack `dim` values at `bits` each."""
+    vpw = VALS_PER_WORD[bits]
+    return (dim + vpw - 1) // vpw
+
+
+def pack_indices(indices: mx.array, bits: int) -> mx.array:
+    """Pack uint8 indices into uint32 words.
+
+    Args:
+        indices: (..., dim) uint8, values in [0, 2^bits)
+        dim: last dimension
+
+    Returns:
+        (..., packed_dim) uint32
+    """
+    vpw = VALS_PER_WORD[bits]
+    shape = indices.shape
+    dim = shape[-1]
+    flat = indices.reshape(-1, dim).astype(mx.uint32)
+    n_vecs = flat.shape[0]
+    p_dim = packed_dim(dim, bits)
+
+    # Pad to multiple of vpw
+    if dim % vpw != 0:
+        pad_size = vpw - (dim % vpw)
+        flat = mx.concatenate([flat, mx.zeros((n_vecs, pad_size), dtype=mx.uint32)], axis=1)
+
+    # Reshape to (n_vecs, p_dim, vpw) and pack
+    flat = flat.reshape(n_vecs, p_dim, vpw)
+
+    # Shift each value by its position and OR together
+    packed = mx.zeros((n_vecs, p_dim), dtype=mx.uint32)
+    for i in range(vpw):
+        packed = packed | (flat[:, :, i] << (i * bits))
+
+    return packed.reshape(*shape[:-1], p_dim)
+
+
+def unpack_indices(packed: mx.array, bits: int, dim: int) -> mx.array:
+    """Unpack uint32 words back to uint8 indices.
+
+    Args:
+        packed: (..., packed_dim) uint32
+        bits: bit width
+        dim: original dimension
+
+    Returns:
+        (..., dim) uint8
+    """
+    vpw = VALS_PER_WORD[bits]
+    mask = BIT_MASK[bits]
+    shape = packed.shape
+    p_dim = shape[-1]
+    flat = packed.reshape(-1, p_dim)
+    n_vecs = flat.shape[0]
+
+    # Extract each value
+    values = []
+    for i in range(vpw):
+        values.append((flat >> (i * bits)) & mask)
+
+    # Stack and trim to original dim
+    result = mx.concatenate(values, axis=-1)  # wrong order, need interleave
+    # Actually: values[i] has shape (n_vecs, p_dim) = the i-th value from each word
+    # We need to reshape to (n_vecs, p_dim * vpw) then trim
+    result = mx.stack(values, axis=-1)  # (n_vecs, p_dim, vpw)
+    result = result.reshape(n_vecs, p_dim * vpw)[:, :dim]
+
+    return result.reshape(*shape[:-1], dim).astype(mx.uint8)
diff --git a/mlx_lm/models/turboquant_rotation.py b/mlx_lm/models/turboquant_rotation.py
new file mode 100644
index 000000000..55b57ee4d
--- /dev/null
+++ b/mlx_lm/models/turboquant_rotation.py
@@ -0,0 +1,80 @@
+"""Walsh-Hadamard Transform and random rotation for TurboQuant."""
+
+import mlx.core as mx
+import math
+
+
+def walsh_hadamard_transform(x: mx.array) -> mx.array:
+    """Fast Walsh-Hadamard Transform in MLX.
+
+    O(d log d) butterfly operations. Input dimension must be power of 2.
+    Operates on last dimension.
+
+    Args:
+        x: (..., d) where d is power of 2
+
+    Returns:
+        (..., d) transformed array, normalized by 1/sqrt(d)
+    """
+    d = x.shape[-1]
+    assert d > 0 and (d & (d - 1)) == 0, f"Dimension must be power of 2, got {d}"
+
+    h = 1
+    while h < d:
+        # Split into pairs at stride h
+        x_reshaped = x.reshape(*x.shape[:-1], d // (2 * h), 2, h)
+        even = x_reshaped[..., 0, :]
+        odd = x_reshaped[..., 1, :]
+        # Butterfly: [a+b, a-b]
+        new_even = even + odd
+        new_odd = even - odd
+        x = mx.stack([new_even, new_odd], axis=-2).reshape(*x.shape[:-1], d)
+        h *= 2
+
+    return x * (1.0 / math.sqrt(d))
+
+
+def random_diagonal_sign(d: int, seed: int = 42) -> mx.array:
+    """Random ±1 diagonal for randomized Hadamard transform.
+
+    Args:
+        d: dimension
+        seed: random seed
+
+    Returns:
+        (d,) array of ±1 values
+    """
+    key = mx.random.key(seed)
+    mask = mx.random.bernoulli(p=0.5, shape=(d,), key=key)
+    return mx.where(mask, mx.array(1.0), mx.array(-1.0))
+
+
+def randomized_hadamard_transform(x: mx.array, signs: mx.array) -> mx.array:
+    """Randomized Hadamard Transform: WHT(diag(signs) @ x).
+
+    This is the rotation used in PolarQuant. O(d log d).
+
+    Args:
+        x: (..., d)
+        signs: (d,) random ±1 diagonal
+
+    Returns:
+        (..., d) rotated array
+    """
+    return walsh_hadamard_transform(x * signs)
+
+
+def inverse_randomized_hadamard(x: mx.array, signs: mx.array) -> mx.array:
+    """Inverse of randomized Hadamard transform.
+
+    Since WHT is self-inverse (up to scaling) and diag(signs) is self-inverse:
+    inverse = diag(signs) @ WHT(x)
+
+    Args:
+        x: (..., d)
+        signs: (d,) same signs used in forward transform
+
+    Returns:
+        (..., d) inverse-rotated array
+    """
+    return walsh_hadamard_transform(x) * signs

From 530e6a5e7f53c37384970e4ebfda9ba93ec9c89f Mon Sep 17 00:00:00 2001
From: dirtyhandz <antonrozanov@gmail.com>
Date: Sat, 28 Mar 2026 15:13:15 +0100
Subject: [PATCH 2/4] Add architecture compatibility check for TurboQuant

---
 mlx_lm/generate.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mlx_lm/generate.py b/mlx_lm/generate.py
index f34d2b015..f6e75b409 100644
--- a/mlx_lm/generate.py
+++ b/mlx_lm/generate.py
@@ -316,6 +316,17 @@ def make_turboquant_cache(model, bits=3, fp16_layers=1):
     """
     from mlx_lm.models.turboquant_cache import TurboQuantKVCache
 
+    # Check for incompatible architectures
+    if hasattr(model, "make_cache"):
+        default_cache = model.make_cache()
+        if default_cache and not isinstance(default_cache[0], cache.KVCache):
+            cache_type = type(default_cache[0]).__name__
+            raise ValueError(
+                f"[TurboQuant] Incompatible cache type: {cache_type}. "
+                f"TurboQuant only works with standard multi-head attention "
+                f"(KVCache). MLA, SSM, and hybrid architectures are not supported."
+            )
+
     num_layers = len(model.layers)
     caches = []
     for i in range(num_layers):

From de54031586cc4125c9ff412bbf4cd4f147147ce3 Mon Sep 17 00:00:00 2001
From: dirtyhandz <antonrozanov@gmail.com>
Date: Sat, 28 Mar 2026 23:17:53 +0100
Subject: [PATCH 3/4] Rework TurboQuant: to_turbo_quantized(),
 make_prompt_cache routing, CLI args

---
 mlx_lm/generate.py     | 67 ++++++++++++++----------------------------
 mlx_lm/models/cache.py | 43 ++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/mlx_lm/generate.py b/mlx_lm/generate.py
index f6e75b409..b24b8457c 100644
--- a/mlx_lm/generate.py
+++ b/mlx_lm/generate.py
@@ -203,6 +203,20 @@ def setup_arg_parser():
         type=int,
         default=DEFAULT_QUANTIZED_KV_START,
     )
+    parser.add_argument(
+        "--turbo-kv-bits",
+        type=int,
+        help="TurboQuant KV cache compression bits (1-4). "
+        "3-bit gives 4.6x compression. Default: no compression.",
+        default=None,
+    )
+    parser.add_argument(
+        "--turbo-fp16-layers",
+        type=int,
+        help="Number of first/last layers to keep in FP16 "
+        "when using --turbo-kv-bits. Default: 1.",
+        default=1,
+    )
     parser.add_argument(
         "--draft-model",
         type=str,
@@ -300,42 +314,6 @@ def maybe_quantize_kv_cache(prompt_cache, quantized_kv_start, kv_group_size, kv_
             prompt_cache[e] = c.to_quantized(group_size=kv_group_size, bits=kv_bits)
 
 
-def make_turboquant_cache(model, bits=3, fp16_layers=1):
-    """Create layer-adaptive TurboQuant cache.
-
-    First and last ``fp16_layers`` layers use standard FP16 KVCache.
-    Middle layers use TurboQuantKVCache with ``bits``-bit compression.
-
-    Args:
-        model: The model to create caches for.
-        bits (int): Quantization bits (1-4). Default: ``3`` (4.6x compression).
-        fp16_layers (int): Number of first/last layers to keep in FP16. Default: ``1``.
-
-    Returns:
-        List of cache objects (one per layer).
-    """
-    from mlx_lm.models.turboquant_cache import TurboQuantKVCache
-
-    # Check for incompatible architectures
-    if hasattr(model, "make_cache"):
-        default_cache = model.make_cache()
-        if default_cache and not isinstance(default_cache[0], cache.KVCache):
-            cache_type = type(default_cache[0]).__name__
-            raise ValueError(
-                f"[TurboQuant] Incompatible cache type: {cache_type}. "
-                f"TurboQuant only works with standard multi-head attention "
-                f"(KVCache). MLA, SSM, and hybrid architectures are not supported."
-            )
-
-    num_layers = len(model.layers)
-    caches = []
-    for i in range(num_layers):
-        if i < fp16_layers or i >= num_layers - fp16_layers:
-            caches.append(cache.KVCache())
-        else:
-            caches.append(TurboQuantKVCache(bits=bits))
-    return caches
-
 
 def generate_step(
     prompt: mx.array,
@@ -409,15 +387,12 @@ def generate_step(
 
     # Create the KV cache for generation
     if prompt_cache is None:
-        if turbo_kv_bits is not None:
-            prompt_cache = make_turboquant_cache(
-                model, bits=turbo_kv_bits, fp16_layers=turbo_fp16_layers,
-            )
-        else:
-            prompt_cache = cache.make_prompt_cache(
-                model,
-                max_kv_size=max_kv_size,
-            )
+        prompt_cache = cache.make_prompt_cache(
+            model,
+            max_kv_size=max_kv_size,
+            turbo_kv_bits=turbo_kv_bits,
+            turbo_fp16_layers=turbo_fp16_layers,
+        )
 
     prompt_progress_callback = prompt_progress_callback or (lambda *_: None)
 
@@ -1575,6 +1550,8 @@ def main():
         kv_bits=args.kv_bits,
         kv_group_size=args.kv_group_size,
         quantized_kv_start=args.quantized_kv_start,
+        turbo_kv_bits=args.turbo_kv_bits,
+        turbo_fp16_layers=args.turbo_fp16_layers,
         draft_model=draft_model,
         num_draft_tokens=args.num_draft_tokens,
     )
diff --git a/mlx_lm/models/cache.py b/mlx_lm/models/cache.py
index e6993243c..d32c9c50a 100644
--- a/mlx_lm/models/cache.py
+++ b/mlx_lm/models/cache.py
@@ -15,6 +15,8 @@
 def make_prompt_cache(
     model: nn.Module,
     max_kv_size: Optional[int] = None,
+    turbo_kv_bits: Optional[int] = None,
+    turbo_fp16_layers: int = 1,
 ) -> List[Any]:
     """
     Construct the model's cache for use in generation.
@@ -27,11 +29,39 @@ def make_prompt_cache(
         max_kv_size (Optional[int]): If provided and the model does not have a
             ``make_cache`` method, a ``RotatingKVCache`` is used with a maximum
             size of ``max_kv_size``
+        turbo_kv_bits (Optional[int]): If provided, use TurboQuant KV cache
+            compression at the given bit width (1-4). 3-bit gives 4.6x
+            compression. Default: ``None`` (no compression).
+        turbo_fp16_layers (int): Number of first/last layers to keep in FP16
+            when using TurboQuant. Default: ``1``.
     """
     if hasattr(model, "make_cache"):
-        return model.make_cache()
+        default_cache = model.make_cache()
+        if turbo_kv_bits is not None:
+            # Check compatibility
+            if not isinstance(default_cache[0], KVCache):
+                raise ValueError(
+                    f"[TurboQuant] Incompatible cache type: "
+                    f"{type(default_cache[0]).__name__}. "
+                    f"TurboQuant only works with standard multi-head "
+                    f"attention (KVCache)."
+                )
+        else:
+            return default_cache
 
     num_layers = len(model.layers)
+
+    if turbo_kv_bits is not None:
+        from mlx_lm.models.turboquant_cache import TurboQuantKVCache
+
+        caches = []
+        for i in range(num_layers):
+            if i < turbo_fp16_layers or i >= num_layers - turbo_fp16_layers:
+                caches.append(KVCache())
+            else:
+                caches.append(TurboQuantKVCache(bits=turbo_kv_bits))
+        return caches
+
     if max_kv_size is not None:
         return [
             RotatingKVCache(max_size=max_kv_size, keep=4) for _ in range(num_layers)
@@ -390,6 +420,17 @@ def to_quantized(self, group_size: int = 64, bits: int = 4) -> QuantizedKVCache:
             )
         return quant_cache
 
+    def to_turbo_quantized(self, bits: int = 3):
+        from mlx_lm.models.turboquant_cache import TurboQuantKVCache
+
+        tq_cache = TurboQuantKVCache(bits=bits)
+        if self.keys is not None:
+            tq_cache.update_and_fetch(
+                self.keys[..., : self.offset, :],
+                self.values[..., : self.offset, :],
+            )
+        return tq_cache
+
     def make_mask(self, *args, **kwargs):
         return create_attention_mask(*args, offset=self.offset, **kwargs)
 

From 9315fbc14169d9290502eeea887523608d07da6c Mon Sep 17 00:00:00 2001
From: dirtyhandz <antonrozanov@gmail.com>
Date: Sun, 29 Mar 2026 18:03:34 +0200
Subject: [PATCH 4/4] Add TurboQuant tests and fix save/load support

---
 mlx_lm/models/cache.py            |   7 +
 mlx_lm/models/turboquant_cache.py |  21 +
 tests/test_turboquant.py          | 668 ++++++++++++++++++++++++++++++
 3 files changed, 696 insertions(+)
 create mode 100644 tests/test_turboquant.py

diff --git a/mlx_lm/models/cache.py b/mlx_lm/models/cache.py
index d32c9c50a..d372456ba 100644
--- a/mlx_lm/models/cache.py
+++ b/mlx_lm/models/cache.py
@@ -106,6 +106,13 @@ def load_prompt_cache(file_name, return_metadata=False):
     arrays = tree_unflatten(list(arrays.items()))
     cache_metadata = tree_unflatten(list(cache_metadata.items()))
     info, metadata, classes = cache_metadata
+
+    # Ensure TurboQuantKVCache is in globals for deserialization
+    if "TurboQuantKVCache" in classes and "TurboQuantKVCache" not in globals():
+        from mlx_lm.models.turboquant_cache import TurboQuantKVCache
+
+        globals()["TurboQuantKVCache"] = TurboQuantKVCache
+
     cache = [
         globals()[c].from_state(state, meta_state)
         for c, state, meta_state in zip(classes, arrays, info)
diff --git a/mlx_lm/models/turboquant_cache.py b/mlx_lm/models/turboquant_cache.py
index 2f196a862..c5f76d38f 100644
--- a/mlx_lm/models/turboquant_cache.py
+++ b/mlx_lm/models/turboquant_cache.py
@@ -206,3 +206,24 @@ def size(self):
     def make_mask(self, *args, **kwargs):
         from mlx_lm.models.cache import create_attention_mask
         return create_attention_mask(*args, offset=self.offset, **kwargs)
+
+    @classmethod
+    def from_state(cls, state, meta_state):
+        obj = cls.__new__(cls)
+        obj.k_packed = None
+        obj.k_norms = None
+        obj.v_packed = None
+        obj.v_norms = None
+        obj._k_deq_buf = None
+        obj._v_deq_buf = None
+        obj._deq_offset = 0
+        obj._deq_alloc = 0
+        obj._k_q = None
+        obj._v_q = None
+        obj._k_dim = None
+        obj._v_dim = None
+        obj._k_pdim = None
+        obj._v_pdim = None
+        obj.meta_state = meta_state
+        obj.state = state
+        return obj
diff --git a/tests/test_turboquant.py b/tests/test_turboquant.py
new file mode 100644
index 000000000..e3fa62ed9
--- /dev/null
+++ b/tests/test_turboquant.py
@@ -0,0 +1,668 @@
+# Copyright © 2024 Apple Inc.
+
+"""Tests for TurboQuant KV cache compression.
+
+Covers:
+- Bit-packing (pack/unpack roundtrip for all bit widths)
+- Walsh-Hadamard transform (orthogonality, invertibility)
+- TurboQuantKVCache (update, offset, trim, state, nbytes, serialization)
+- Conversion from KVCache via to_turbo_quantized()
+- make_prompt_cache with turbo_kv_bits (mixed cache layers)
+- End-to-end generation with TurboQuant cache
+- Save/load prompt cache with TurboQuantKVCache
+"""
+
+import os
+import tempfile
+import unittest
+
+import mlx.core as mx
+
+from mlx_lm.models.cache import (
+    KVCache,
+    make_prompt_cache,
+    save_prompt_cache,
+    load_prompt_cache,
+    trim_prompt_cache,
+    can_trim_prompt_cache,
+)
+from mlx_lm.models.turboquant_cache import TurboQuantKVCache
+from mlx_lm.models.turboquant_packing import (
+    pack_indices,
+    unpack_indices,
+    packed_dim,
+    VALS_PER_WORD,
+)
+from mlx_lm.models.turboquant_rotation import (
+    walsh_hadamard_transform,
+    random_diagonal_sign,
+    randomized_hadamard_transform,
+    inverse_randomized_hadamard,
+)
+
+
+# ---------------------------------------------------------------------------
+# Packing tests
+# ---------------------------------------------------------------------------
+class TestBitPacking(unittest.TestCase):
+
+    def test_packed_dim(self):
+        self.assertEqual(packed_dim(128, 3), 13)  # ceil(128/10)
+        self.assertEqual(packed_dim(128, 4), 16)  # ceil(128/8)
+        self.assertEqual(packed_dim(128, 2), 8)  # ceil(128/16)
+        self.assertEqual(packed_dim(128, 1), 4)  # ceil(128/32)
+        self.assertEqual(packed_dim(1, 3), 1)
+        self.assertEqual(packed_dim(10, 3), 1)  # exactly 10 vals in one word
+        self.assertEqual(packed_dim(11, 3), 2)
+
+    def test_pack_unpack_roundtrip(self):
+        for bits in [1, 2, 3, 4]:
+            max_val = (1 << bits) - 1
+            for dim in [16, 64, 96, 128]:
+                indices = mx.random.randint(
+                    0, max_val + 1, shape=(4, dim)
+                ).astype(mx.uint8)
+                packed = pack_indices(indices, bits)
+                self.assertEqual(packed.shape[-1], packed_dim(dim, bits))
+                unpacked = unpack_indices(packed, bits, dim)
+                self.assertTrue(
+                    mx.array_equal(indices, unpacked),
+                    f"Roundtrip failed for bits={bits}, dim={dim}",
+                )
+
+    def test_pack_unpack_batched(self):
+        """Test with batch and head dimensions."""
+        for bits in [1, 2, 3, 4]:
+            max_val = (1 << bits) - 1
+            indices = mx.random.randint(
+                0, max_val + 1, shape=(2, 8, 10, 128)
+            ).astype(mx.uint8)
+            packed = pack_indices(indices, bits)
+            unpacked = unpack_indices(packed, bits, 128)
+            self.assertTrue(mx.array_equal(indices, unpacked))
+
+    def test_pack_zeros(self):
+        indices = mx.zeros((4, 128), dtype=mx.uint8)
+        for bits in [1, 2, 3, 4]:
+            packed = pack_indices(indices, bits)
+            self.assertTrue(mx.array_equal(packed, mx.zeros_like(packed)))
+
+    def test_pack_max_values(self):
+        for bits in [1, 2, 3, 4]:
+            max_val = (1 << bits) - 1
+            indices = mx.full((4, 128), max_val, dtype=mx.uint8)
+            packed = pack_indices(indices, bits)
+            unpacked = unpack_indices(packed, bits, 128)
+            self.assertTrue(mx.array_equal(indices, unpacked))
+
+
+# ---------------------------------------------------------------------------
+# Rotation tests
+# ---------------------------------------------------------------------------
+class TestRotation(unittest.TestCase):
+
+    def test_wht_orthogonality(self):
+        """WHT is orthogonal: WHT(WHT(x)) == x."""
+        for d in [16, 64, 128]:
+            x = mx.random.normal(shape=(4, d))
+            y = walsh_hadamard_transform(walsh_hadamard_transform(x))
+            self.assertTrue(
+                mx.allclose(x, y, atol=1e-5),
+                f"WHT not self-inverse for d={d}",
+            )
+
+    def test_wht_preserves_norm(self):
+        """WHT is norm-preserving (isometry)."""
+        x = mx.random.normal(shape=(8, 128))
+        y = walsh_hadamard_transform(x)
+        x_norms = mx.linalg.norm(x, axis=-1)
+        y_norms = mx.linalg.norm(y, axis=-1)
+        self.assertTrue(mx.allclose(x_norms, y_norms, atol=1e-4))
+
+    def test_wht_requires_power_of_2(self):
+        x = mx.random.normal(shape=(4, 7))
+        with self.assertRaises(AssertionError):
+            walsh_hadamard_transform(x)
+
+    def test_random_diagonal_sign(self):
+        signs = random_diagonal_sign(128, seed=42)
+        self.assertEqual(signs.shape, (128,))
+        # All values should be +1 or -1
+        self.assertTrue(mx.all(mx.abs(signs) == 1.0))
+
+    def test_random_diagonal_deterministic(self):
+        s1 = random_diagonal_sign(64, seed=99)
+        s2 = random_diagonal_sign(64, seed=99)
+        self.assertTrue(mx.array_equal(s1, s2))
+
+    def test_randomized_hadamard_invertible(self):
+        """Forward then inverse should recover original."""
+        signs = random_diagonal_sign(128, seed=42)
+        x = mx.random.normal(shape=(4, 128))
+        y = randomized_hadamard_transform(x, signs)
+        x_recovered = inverse_randomized_hadamard(y, signs)
+        self.assertTrue(mx.allclose(x, x_recovered, atol=1e-5))
+
+
+# ---------------------------------------------------------------------------
+# TurboQuantKVCache tests
+# ---------------------------------------------------------------------------
+class TestTurboQuantKVCache(unittest.TestCase):
+
+    def test_init(self):
+        cache = TurboQuantKVCache(bits=3)
+        self.assertEqual(cache.quant_bits, 3)
+        self.assertEqual(cache.offset, 0)
+        self.assertTrue(cache.empty())
+        self.assertEqual(cache.size(), 0)
+        self.assertEqual(cache.nbytes, 0)
+
+    def test_single_update(self):
+        cache = TurboQuantKVCache(bits=3)
+        B, H, S, D = 1, 8, 10, 64
+        k = mx.random.normal(shape=(B, H, S, D))
+        v = mx.random.normal(shape=(B, H, S, D))
+
+        k_ret, v_ret = cache.update_and_fetch(k, v)
+
+        self.assertEqual(cache.offset, 10)
+        self.assertEqual(cache.size(), 10)
+        self.assertFalse(cache.empty())
+        self.assertEqual(k_ret.shape, (B, H, 10, D))
+        self.assertEqual(v_ret.shape, (B, H, 10, D))
+
+    def test_sequential_updates(self):
+        """Simulate prefill then decode tokens."""
+        cache = TurboQuantKVCache(bits=3)
+        B, H, D = 1, 8, 64
+
+        # Prefill: 20 tokens
+        k = mx.random.normal(shape=(B, H, 20, D))
+        v = mx.random.normal(shape=(B, H, 20, D))
+        k_ret, v_ret = cache.update_and_fetch(k, v)
+        self.assertEqual(cache.offset, 20)
+        self.assertEqual(k_ret.shape, (B, H, 20, D))
+
+        # Decode: 5 single tokens
+        for i in range(5):
+            k1 = mx.random.normal(shape=(B, H, 1, D))
+            v1 = mx.random.normal(shape=(B, H, 1, D))
+            k_ret, v_ret = cache.update_and_fetch(k1, v1)
+            self.assertEqual(cache.offset, 21 + i)
+            self.assertEqual(k_ret.shape, (B, H, 21 + i, D))
+            self.assertEqual(v_ret.shape, (B, H, 21 + i, D))
+
+    def test_asymmetric_kv_dims(self):
+        """K and V can have different dimensions (GQA patterns)."""
+        cache = TurboQuantKVCache(bits=3)
+        B, H = 1, 4
+        k = mx.random.normal(shape=(B, H, 5, 128))
+        v = mx.random.normal(shape=(B, H, 5, 64))
+        k_ret, v_ret = cache.update_and_fetch(k, v)
+        self.assertEqual(k_ret.shape, (B, H, 5, 128))
+        self.assertEqual(v_ret.shape, (B, H, 5, 64))
+
+    def test_different_bit_widths(self):
+        for bits in [1, 2, 3, 4]:
+            cache = TurboQuantKVCache(bits=bits)
+            k = mx.random.normal(shape=(1, 4, 8, 64))
+            v = mx.random.normal(shape=(1, 4, 8, 64))
+            k_ret, v_ret = cache.update_and_fetch(k, v)
+            self.assertEqual(cache.offset, 8)
+            self.assertEqual(k_ret.shape, (1, 4, 8, 64))
+
+    def test_quantization_quality(self):
+        """Dequantized values should approximate originals."""
+        cache = TurboQuantKVCache(bits=3)
+        k = mx.random.normal(shape=(1, 4, 16, 128))
+        v = mx.random.normal(shape=(1, 4, 16, 128))
+        k_ret, v_ret = cache.update_and_fetch(k, v)
+
+        # Cosine similarity should be high for 3-bit
+        k_flat = k.reshape(-1, 128)
+        kr_flat = k_ret.reshape(-1, 128)
+        dots = mx.sum(k_flat * kr_flat, axis=-1)
+        norms = mx.linalg.norm(k_flat, axis=-1) * mx.linalg.norm(kr_flat, axis=-1)
+        cos_sim = mx.mean(dots / (norms + 1e-10))
+        mx.eval(cos_sim)
+        self.assertGreater(cos_sim.item(), 0.85, "3-bit cosine similarity too low")
+
+    def test_compression_ratio(self):
+        """TurboQuant should use less memory than FP16."""
+        cache = TurboQuantKVCache(bits=3)
+        B, H, S, D = 1, 8, 100, 128
+        k = mx.random.normal(shape=(B, H, S, D))
+        v = mx.random.normal(shape=(B, H, S, D))
+        cache.update_and_fetch(k, v)
+
+        fp16_bytes = 2 * B * H * S * D * 2  # keys + values, 2 bytes each
+        tq_bytes = cache.nbytes
+        ratio = fp16_bytes / tq_bytes
+        self.assertGreater(ratio, 3.0, f"Compression ratio {ratio:.1f}x < 3x for 3-bit")
+
+    def test_trim(self):
+        cache = TurboQuantKVCache(bits=3)
+        k = mx.random.normal(shape=(1, 4, 20, 64))
+        v = mx.random.normal(shape=(1, 4, 20, 64))
+        cache.update_and_fetch(k, v)
+        self.assertEqual(cache.offset, 20)
+
+        trimmed = cache.trim(5)
+        self.assertEqual(trimmed, 5)
+        self.assertEqual(cache.offset, 15)
+        self.assertEqual(cache.size(), 15)
+
+    def test_trim_more_than_available(self):
+        cache = TurboQuantKVCache(bits=3)
+        k = mx.random.normal(shape=(1, 4, 10, 64))
+        v = mx.random.normal(shape=(1, 4, 10, 64))
+        cache.update_and_fetch(k, v)
+
+        trimmed = cache.trim(100)
+        self.assertEqual(trimmed, 10)
+        self.assertEqual(cache.offset, 0)
+
+    def test_is_trimmable(self):
+        cache = TurboQuantKVCache(bits=3)
+        self.assertTrue(cache.is_trimmable())
+
+    def test_state_property(self):
+        cache = TurboQuantKVCache(bits=3)
+
+        # Empty cache returns empty list
+        self.assertEqual(cache.state, [])
+
+        k = mx.random.normal(shape=(1, 4, 10, 64))
+        v = mx.random.normal(shape=(1, 4, 10, 64))
+        cache.update_and_fetch(k, v)
+
+        state = cache.state
+        self.assertEqual(len(state), 4)  # k_packed, k_norms, v_packed, v_norms
+        self.assertEqual(state[0].shape[2], 10)  # k_packed seq dim
+        self.assertEqual(state[1].shape[2], 10)  # k_norms seq dim
+
+    def test_state_roundtrip(self):
+        """Setting state on a new cache should restore it."""
+        cache = TurboQuantKVCache(bits=3)
+        k = mx.random.normal(shape=(1, 4, 10, 64))
+        v = mx.random.normal(shape=(1, 4, 10, 64))
+        cache.update_and_fetch(k, v)
+
+        state = cache.state
+        meta = cache.meta_state
+
+        new_cache = TurboQuantKVCache(bits=3)
+        new_cache.state = state
+        new_cache.meta_state = meta
+
+        self.assertEqual(new_cache.offset, cache.offset)
+        self.assertEqual(new_cache.quant_bits, cache.quant_bits)
+        self.assertEqual(new_cache.seed, cache.seed)
+
+    def test_meta_state(self):
+        cache = TurboQuantKVCache(bits=3, seed=99)
+        k = mx.random.normal(shape=(1, 4, 10, 64))
+        v = mx.random.normal(shape=(1, 4, 10, 128))
+        cache.update_and_fetch(k, v)
+
+        meta = cache.meta_state
+        parts = meta.split(",")
+        self.assertEqual(int(parts[0]), 10)   # offset
+        self.assertEqual(int(parts[1]), 3)    # bits
+        self.assertEqual(int(parts[2]), 99)   # seed
+        self.assertEqual(int(parts[3]), 64)   # k_dim
+        self.assertEqual(int(parts[4]), 128)  # v_dim
+
+    def test_from_state(self):
+        """from_state classmethod for save/load support."""
+        cache = TurboQuantKVCache(bits=3)
+        k = mx.random.normal(shape=(1, 4, 10, 64))
+        v = mx.random.normal(shape=(1, 4, 10, 64))
+        cache.update_and_fetch(k, v)
+
+        restored = TurboQuantKVCache.from_state(cache.state, cache.meta_state)
+        self.assertEqual(restored.offset, 10)
+        self.assertEqual(restored.quant_bits, 3)
+        for s, rs in zip(cache.state, restored.state):
+            self.assertTrue(mx.array_equal(s, rs))
+
+    def test_incremental_decode_consistency(self):
+        """Incremental decode buffer should match full dequant."""
+        cache = TurboQuantKVCache(bits=3)
+
+        # Prefill
+        k = mx.random.normal(shape=(1, 4, 20, 64))
+        v = mx.random.normal(shape=(1, 4, 20, 64))
+        k_full, v_full = cache.update_and_fetch(k, v)
+
+        # Decode one token
+        k1 = mx.random.normal(shape=(1, 4, 1, 64))
+        v1 = mx.random.normal(shape=(1, 4, 1, 64))
+        k_inc, v_inc = cache.update_and_fetch(k1, v1)
+
+        # The first 20 tokens should match between full and incremental
+        self.assertTrue(
+            mx.allclose(k_full, k_inc[..., :20, :], atol=1e-5),
+            "Incremental decode keys don't match full dequant",
+        )
+        self.assertTrue(
+            mx.allclose(v_full, v_inc[..., :20, :], atol=1e-5),
+            "Incremental decode values don't match full dequant",
+        )
+
+
+# ---------------------------------------------------------------------------
+# Conversion from KVCache
+# ---------------------------------------------------------------------------
+class TestCacheConversion(unittest.TestCase):
+
+    def test_to_turbo_quantized_basic(self):
+        kv_cache = KVCache()
+        k = mx.random.normal(shape=(1, 8, 10, 64))
+        v = mx.random.normal(shape=(1, 8, 10, 64))
+        kv_cache.update_and_fetch(k, v)
+
+        tq_cache = kv_cache.to_turbo_quantized(bits=3)
+        self.assertIsInstance(tq_cache, TurboQuantKVCache)
+        self.assertEqual(tq_cache.offset, 10)
+        self.assertEqual(tq_cache.quant_bits, 3)
+
+    def test_to_turbo_quantized_empty(self):
+        kv_cache = KVCache()
+        tq_cache = kv_cache.to_turbo_quantized(bits=3)
+        self.assertIsInstance(tq_cache, TurboQuantKVCache)
+        self.assertTrue(tq_cache.empty())
+        self.assertEqual(tq_cache.offset, 0)
+
+    def test_to_turbo_quantized_preserves_content(self):
+        """After conversion, dequantized values should approximate originals."""
+        kv_cache = KVCache()
+        k = mx.random.normal(shape=(1, 4, 16, 128))
+        v = mx.random.normal(shape=(1, 4, 16, 128))
+        kv_cache.update_and_fetch(k, v)
+
+        tq_cache = kv_cache.to_turbo_quantized(bits=4)  # 4-bit for higher quality
+
+        # Feed a new token through the converted cache
+        k1 = mx.random.normal(shape=(1, 4, 1, 128))
+        v1 = mx.random.normal(shape=(1, 4, 1, 128))
+        k_ret, v_ret = tq_cache.update_and_fetch(k1, v1)
+
+        self.assertEqual(k_ret.shape, (1, 4, 17, 128))
+        self.assertEqual(tq_cache.offset, 17)
+
+    def test_to_turbo_quantized_different_bits(self):
+        kv_cache = KVCache()
+        k = mx.random.normal(shape=(1, 4, 8, 64))
+        v = mx.random.normal(shape=(1, 4, 8, 64))
+        kv_cache.update_and_fetch(k, v)
+
+        for bits in [1, 2, 3, 4]:
+            tq = kv_cache.to_turbo_quantized(bits=bits)
+            self.assertEqual(tq.quant_bits, bits)
+            self.assertEqual(tq.offset, 8)
+
+
+# ---------------------------------------------------------------------------
+# make_prompt_cache integration
+# ---------------------------------------------------------------------------
+class TestMakePromptCache(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        from mlx_lm.utils import load
+
+        cls.model, cls.tokenizer = load("mlx-community/Qwen1.5-0.5B-Chat-4bit")
+
+    def test_make_prompt_cache_turbo(self):
+        """make_prompt_cache with turbo_kv_bits creates mixed cache."""
+        cache = make_prompt_cache(
+            self.model, turbo_kv_bits=3, turbo_fp16_layers=1
+        )
+        num_layers = len(self.model.layers)
+        self.assertEqual(len(cache), num_layers)
+
+        # First and last layers should be KVCache
+        self.assertIsInstance(cache[0], KVCache)
+        self.assertIsInstance(cache[-1], KVCache)
+
+        # Middle layers should be TurboQuantKVCache
+        if num_layers > 2:
+            self.assertIsInstance(cache[1], TurboQuantKVCache)
+            self.assertIsInstance(cache[-2], TurboQuantKVCache)
+
+    def test_make_prompt_cache_turbo_fp16_layers(self):
+        """Different turbo_fp16_layers values."""
+        num_layers = len(self.model.layers)
+
+        cache = make_prompt_cache(
+            self.model, turbo_kv_bits=3, turbo_fp16_layers=2
+        )
+        # First 2 and last 2 layers should be KVCache
+        self.assertIsInstance(cache[0], KVCache)
+        self.assertIsInstance(cache[1], KVCache)
+        self.assertIsInstance(cache[-1], KVCache)
+        self.assertIsInstance(cache[-2], KVCache)
+        if num_layers > 4:
+            self.assertIsInstance(cache[2], TurboQuantKVCache)
+
+    def test_make_prompt_cache_no_turbo(self):
+        """Without turbo_kv_bits, should return regular caches."""
+        cache = make_prompt_cache(self.model)
+        for c in cache:
+            self.assertIsInstance(c, KVCache)
+
+    def test_turbo_cache_trimmable(self):
+        """Mixed cache should be fully trimmable."""
+        cache = make_prompt_cache(
+            self.model, turbo_kv_bits=3, turbo_fp16_layers=1
+        )
+        self.assertTrue(can_trim_prompt_cache(cache))
+
+    def test_turbo_cache_trim(self):
+        cache = make_prompt_cache(
+            self.model, turbo_kv_bits=3, turbo_fp16_layers=1
+        )
+        # Feed some data
+        for c in cache:
+            k = mx.random.normal(shape=(1, 8, 10, 96))
+            v = mx.random.normal(shape=(1, 8, 10, 96))
+            c.update_and_fetch(k, v)
+
+        trimmed = trim_prompt_cache(cache, 3)
+        self.assertEqual(trimmed, 3)
+        for c in cache:
+            self.assertEqual(c.offset, 7)
+
+
+# ---------------------------------------------------------------------------
+# End-to-end generation
+# ---------------------------------------------------------------------------
+class TestTurboQuantGeneration(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        from mlx_lm.utils import load
+
+        cls.model, cls.tokenizer = load("mlx-community/Qwen1.5-0.5B-Chat-4bit")
+
+    def test_generate_with_turbo_cache(self):
+        """End-to-end generation should produce valid tokens."""
+        from mlx_lm.generate import generate_step
+
+        prompt = self.tokenizer.encode("Hello, how are", return_tensors="mlx")[0]
+        cache = make_prompt_cache(
+            self.model, turbo_kv_bits=3, turbo_fp16_layers=1
+        )
+
+        tokens = []
+        for _, (tok, logits) in zip(
+            range(5), generate_step(prompt, self.model, prompt_cache=cache)
+        ):
+            tokens.append(tok)
+
+        self.assertEqual(len(tokens), 5)
+        # All tokens should be valid vocabulary indices
+        vocab_size = self.model.model.embed_tokens.weight.shape[0]
+        for tok in tokens:
+            self.assertGreaterEqual(tok, 0)
+            self.assertLess(tok, vocab_size)
+
+    def test_generate_turbo_vs_baseline(self):
+        """TurboQuant 4-bit should produce similar outputs to baseline."""
+        from mlx_lm.generate import generate_step
+
+        prompt = self.tokenizer.encode("The capital of France is", return_tensors="mlx")[
+            0
+        ]
+
+        # Baseline generation
+        base_cache = make_prompt_cache(self.model)
+        base_tokens = []
+        base_logits = []
+        for _, (tok, logits) in zip(
+            range(3), generate_step(prompt, self.model, prompt_cache=base_cache)
+        ):
+            base_tokens.append(tok)
+            base_logits.append(logits)
+
+        # TurboQuant 4-bit generation (highest quality)
+        tq_cache = make_prompt_cache(
+            self.model, turbo_kv_bits=4, turbo_fp16_layers=1
+        )
+        tq_tokens = []
+        tq_logits = []
+        for _, (tok, logits) in zip(
+            range(3), generate_step(prompt, self.model, prompt_cache=tq_cache)
+        ):
+            tq_tokens.append(tok)
+            tq_logits.append(logits)
+
+        # First token should match (quantization error is small for 4-bit)
+        # Note: quantization affects KV cache which feeds into attention,
+        # so even the first generated token may differ for some models.
+        # We check that at least the top-1 token is the same OR the logit
+        # distributions are close.
+        if base_tokens[0] != tq_tokens[0]:
+            # Check that the correct token is at least in top-5
+            top5_tq = mx.argsort(tq_logits[0])[-5:]
+            mx.eval(top5_tq)
+            self.assertIn(
+                base_tokens[0],
+                top5_tq.tolist(),
+                "Baseline token not in TurboQuant top-5",
+            )
+
+    def test_generate_with_conversion(self):
+        """Generate some tokens, convert cache, continue generating."""
+        from mlx_lm.generate import generate_step
+
+        prompt = self.tokenizer.encode("this is a prompt", return_tensors="mlx")[0]
+
+        # Generate baseline
+        results = zip(range(4), generate_step(prompt, self.model))
+        toks, all_logits = zip(*(r[1] for r in results))
+
+        # Generate 2 tokens with regular cache, then convert
+        cache = make_prompt_cache(self.model)
+        i = 0
+        for _, (tok, logits) in zip(
+            range(2), generate_step(prompt, self.model, prompt_cache=cache)
+        ):
+            self.assertEqual(tok, toks[i])
+            i += 1
+
+        # Convert to TurboQuant (8-bit for minimal quality loss, same as
+        # test_cache_to_quantized which uses bits=8 for QuantizedKVCache)
+        cache = [c.to_turbo_quantized(bits=4) for c in cache]
+
+        # Continue generating - token may differ due to quantization
+        for _, (tok, logits) in zip(
+            range(1),
+            generate_step(mx.array([toks[i]]), self.model, prompt_cache=cache),
+        ):
+            i += 1
+            # Allow tolerance: correct token in top-5
+            if tok != toks[i]:
+                top5 = mx.argsort(logits)[-5:]
+                mx.eval(top5)
+                self.assertIn(
+                    toks[i],
+                    top5.tolist(),
+                    "Expected token not in TurboQuant top-5 after conversion",
+                )
+
+
+# ---------------------------------------------------------------------------
+# Save / Load
+# ---------------------------------------------------------------------------
+class TestTurboQuantSaveLoad(unittest.TestCase):
+
+    def setUp(self):
+        self.test_dir_fid = tempfile.TemporaryDirectory()
+        self.test_dir = self.test_dir_fid.name
+
+    def tearDown(self):
+        self.test_dir_fid.cleanup()
+
+    def test_save_load_turbo_cache(self):
+        cache = [TurboQuantKVCache(bits=3) for _ in range(4)]
+        for c in cache:
+            k = mx.random.normal(shape=(1, 4, 10, 64))
+            v = mx.random.normal(shape=(1, 4, 10, 64))
+            c.update_and_fetch(k, v)
+
+        cache_file = os.path.join(self.test_dir, "tq_cache.safetensors")
+        save_prompt_cache(cache_file, cache)
+        loaded = load_prompt_cache(cache_file)
+
+        self.assertEqual(len(loaded), 4)
+        for c, lc in zip(cache, loaded):
+            self.assertIsInstance(lc, TurboQuantKVCache)
+            self.assertEqual(c.offset, lc.offset)
+            self.assertEqual(c.quant_bits, lc.quant_bits)
+            self.assertEqual(c.seed, lc.seed)
+            for s, ls in zip(c.state, lc.state):
+                self.assertTrue(mx.array_equal(s, ls))
+
+    def test_save_load_mixed_cache(self):
+        """Save/load a mix of KVCache and TurboQuantKVCache."""
+        cache = [
+            KVCache(),
+            TurboQuantKVCache(bits=3),
+            TurboQuantKVCache(bits=3),
+            KVCache(),
+        ]
+        for c in cache:
+            k = mx.random.normal(shape=(1, 4, 10, 64))
+            v = mx.random.normal(shape=(1, 4, 10, 64))
+            c.update_and_fetch(k, v)
+
+        cache_file = os.path.join(self.test_dir, "mixed_cache.safetensors")
+        save_prompt_cache(cache_file, cache)
+        loaded = load_prompt_cache(cache_file)
+
+        self.assertEqual(len(loaded), 4)
+        self.assertIsInstance(loaded[0], KVCache)
+        self.assertIsInstance(loaded[1], TurboQuantKVCache)
+        self.assertIsInstance(loaded[2], TurboQuantKVCache)
+        self.assertIsInstance(loaded[3], KVCache)
+
+        for c, lc in zip(cache, loaded):
+            self.assertEqual(c.offset, lc.offset)
+
+    def test_save_load_with_metadata(self):
+        cache = [TurboQuantKVCache(bits=3)]
+        k = mx.random.normal(shape=(1, 4, 5, 64))
+        v = mx.random.normal(shape=(1, 4, 5, 64))
+        cache[0].update_and_fetch(k, v)
+
+        cache_file = os.path.join(self.test_dir, "tq_meta.safetensors")
+        metadata = {"model": "test", "version": "1"}
+        save_prompt_cache(cache_file, cache, metadata)
+        _, loaded_meta = load_prompt_cache(cache_file, return_metadata=True)
+        self.assertEqual(metadata, loaded_meta)
+
+
+if __name__ == "__main__":
+    unittest.main()