From f75325160f33fef106ff3405452dcb1c6fe818db Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Tue, 24 Mar 2026 04:16:00 +0000
Subject: [PATCH 1/2] Add challenge 86: Paged KV-Cache Attention (Medium)
Implements decode-phase attention over a non-contiguous paged KV cache,
modeled on the vLLM paged attention architecture. Teaches block-table
indirection, online softmax across scattered memory pages, and the
memory access patterns central to LLM serving workloads.
Co-Authored-By: Claude Sonnet 4.6
+ Implement decode-phase attention over a paged KV cache. In LLM serving systems (e.g., vLLM),
+ the key and value tensors for each sequence are stored in fixed-size memory blocks (pages) that
+ may be scattered non-contiguously across a shared GPU memory pool. A block_table maps each
+ sequence's logical block indices to physical block indices in the cache pool. Given a single
+ query vector per sequence (one new token being generated), compute the attention output by
+ gathering the relevant K/V blocks via the block table and computing scaled dot-product
+ attention over the full context.
+
+ Implementation Requirements
+
+ Implement the function
+
+     solve(Q, K_cache, V_cache, block_table, context_lens, output,
+           batch_size, num_heads, head_dim, block_size, max_blocks_per_seq)
+
+ that computes paged decode-phase attention:
+
+ - Q: query tensor of shape (batch_size, num_heads, head_dim), dtype float32 — one query per sequence
+ - K_cache: paged key cache of shape (num_blocks, block_size, num_heads, head_dim), dtype float32
+ - V_cache: paged value cache of shape (num_blocks, block_size, num_heads, head_dim), dtype float32
+ - block_table: physical block indices of shape (batch_size, max_blocks_per_seq), dtype int32
+ - context_lens: number of valid KV tokens per sequence, shape (batch_size,), dtype int32
+ - output: result of shape (batch_size, num_heads, head_dim), dtype float32
+
+ For each sequence s and each attention head h, compute:
+
+ 1. Gather the first context_lens[s] key and value vectors from the paged cache using
+    block_table[s]. Token at logical position t lives in physical block
+    block_table[s, t / block_size] at offset t % block_size within that block.
+ 2. scores[t] = Q[s, h] · K[s, h, t] / √head_dim
+ 3. Apply softmax over the first context_lens[s] positions to get attention weights.
+ 4. output[s, h] = ∑_t weights[t] × V[s, h, t]
+
+ Write results directly into output. Do not use external libraries beyond the framework you
+ select. Keep the function signature unchanged.
+
+ Example
+
+ With batch_size = 1, num_heads = 1, head_dim = 4, block_size = 2,
+ context_lens = [2], block_table = [[0]]:
+
+ Q[0, 0] = [1.0, 1.0, 0.0, 0.0]
+
+ K_cache[0, 0, 0] = [1.0, 0.0, 0.0, 0.0]   # block 0, token 0
+ K_cache[0, 1, 0] = [0.0, 1.0, 0.0, 0.0]   # block 0, token 1
+
+ V_cache[0, 0, 0] = [2.0, 0.0, 0.0, 0.0]
+ V_cache[0, 1, 0] = [0.0, 4.0, 0.0, 0.0]
+
+ Scores (before softmax):
+ score[0] = (1·1 + 1·0 + 0·0 + 0·0) / √4 = 0.5
+ score[1] = (1·0 + 1·1 + 0·0 + 0·0) / √4 = 0.5
+
+ Attention weights = softmax([0.5, 0.5]) = [0.5, 0.5]
+
+ output[0, 0] = 0.5 × [2, 0, 0, 0] + 0.5 × [0, 4, 0, 0] = [1.0, 2.0, 0.0, 0.0]
+
+ Constraints
+
+ - batch_size ≤ 32
+ - num_heads ≤ 64
+ - head_dim ≤ 256; head_dim is a multiple of 8
+ - block_size ≤ 64; block_size is a power of 2
+ - context_lens[s] ≤ 8,192 for all sequences s
+ - All tensors are float32 (except block_table and context_lens, which are int32)
+ - block_table[s, i] is a valid index into the first dimension of K_cache
+   for all i < ceil(context_lens[s] / block_size)
+ - Performance test: batch_size = 8, num_heads = 32, head_dim = 128,
+   block_size = 16, context_lens[s] = 2,048 for all sequences
diff --git a/challenges/medium/86_paged_attention/challenge.py b/challenges/medium/86_paged_attention/challenge.py
new file mode 100644
index 00000000..824f869b
--- /dev/null
+++ b/challenges/medium/86_paged_attention/challenge.py
@@ -0,0 +1,231 @@
+import ctypes
+import math
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
class Challenge(ChallengeBase):
    """Challenge 86: decode-phase attention over a paged (vLLM-style) KV cache.

    K/V vectors for each sequence live in fixed-size blocks scattered across a
    shared cache pool; ``block_table`` maps each sequence's logical block index
    to a physical block index. Solvers must gather K/V through the table and
    compute scaled dot-product attention for one new query token per sequence.
    """

    def __init__(self):
        super().__init__(
            name="Paged KV-Cache Attention",
            atol=1e-04,
            rtol=1e-04,
            num_gpus=1,
            access_tier="free",
        )

    def reference_impl(
        self,
        Q: torch.Tensor,
        K_cache: torch.Tensor,
        V_cache: torch.Tensor,
        block_table: torch.Tensor,
        context_lens: torch.Tensor,
        output: torch.Tensor,
        batch_size: int,
        num_heads: int,
        head_dim: int,
        block_size: int,
        max_blocks_per_seq: int,
    ) -> None:
        """Compute paged decode-phase attention, writing the result into ``output``.

        Args:
            Q: (batch_size, num_heads, head_dim) float32 queries — one token per sequence.
            K_cache: (num_blocks, block_size, num_heads, head_dim) float32 paged keys.
            V_cache: same shape/dtype as ``K_cache``; paged values.
            block_table: (batch_size, max_blocks_per_seq) int32 logical→physical block map.
            context_lens: (batch_size,) int32 count of valid KV tokens per sequence.
            output: (batch_size, num_heads, head_dim) float32 buffer, written in place.
            batch_size, num_heads, head_dim, block_size, max_blocks_per_seq: problem sizes.

        All tensors must already reside on a CUDA device.
        """
        assert Q.shape == (batch_size, num_heads, head_dim)
        assert K_cache.shape[1] == block_size
        assert K_cache.shape[2] == num_heads
        assert K_cache.shape[3] == head_dim
        assert V_cache.shape == K_cache.shape
        assert block_table.shape == (batch_size, max_blocks_per_seq)
        assert context_lens.shape == (batch_size,)
        assert output.shape == (batch_size, num_heads, head_dim)
        assert Q.dtype == K_cache.dtype == V_cache.dtype == output.dtype == torch.float32
        assert block_table.dtype == context_lens.dtype == torch.int32
        assert Q.device.type == "cuda"
        assert K_cache.device.type == "cuda"
        assert V_cache.device.type == "cuda"
        assert block_table.device.type == "cuda"
        assert context_lens.device.type == "cuda"
        assert output.device.type == "cuda"

        scale = 1.0 / math.sqrt(head_dim)

        for s in range(batch_size):
            ctx_len = context_lens[s].item()
            # Ceil-division: number of blocks holding this sequence's context.
            n_blocks = (ctx_len + block_size - 1) // block_size

            # Physical blocks backing this sequence, in logical order.
            phys_blocks = block_table[s, :n_blocks].long()  # (n_blocks,)

            # Gather the K/V pages: (n_blocks, block_size, num_heads, head_dim).
            K_blocks = K_cache[phys_blocks]
            V_blocks = V_cache[phys_blocks]

            # Flatten tokens and trim the padding in the final partial block.
            K_seq = K_blocks.reshape(-1, num_heads, head_dim)[
                :ctx_len
            ]  # (ctx_len, num_heads, head_dim)
            V_seq = V_blocks.reshape(-1, num_heads, head_dim)[:ctx_len]

            # (num_heads, ctx_len, head_dim) so each head is one bmm batch entry.
            K_seq = K_seq.transpose(0, 1).contiguous()
            V_seq = V_seq.transpose(0, 1).contiguous()

            # Q[s]: (num_heads, head_dim) -> (num_heads, 1, head_dim).
            q = Q[s].unsqueeze(1)

            # Scaled dot-product scores: (num_heads, 1, ctx_len).
            scores = torch.bmm(q, K_seq.transpose(1, 2)) * scale
            attn_weights = torch.softmax(scores, dim=-1)

            # Weighted sum of values: (num_heads, 1, head_dim) -> (num_heads, head_dim).
            out = torch.bmm(attn_weights, V_seq).squeeze(1)
            output[s].copy_(out)

    def get_solve_signature(self) -> Dict[str, tuple]:
        """Return the C ABI signature of ``solve`` as (ctype, direction) pairs."""
        return {
            "Q": (ctypes.POINTER(ctypes.c_float), "in"),
            "K_cache": (ctypes.POINTER(ctypes.c_float), "in"),
            "V_cache": (ctypes.POINTER(ctypes.c_float), "in"),
            "block_table": (ctypes.POINTER(ctypes.c_int), "in"),
            "context_lens": (ctypes.POINTER(ctypes.c_int), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "batch_size": (ctypes.c_int, "in"),
            "num_heads": (ctypes.c_int, "in"),
            "head_dim": (ctypes.c_int, "in"),
            "block_size": (ctypes.c_int, "in"),
            "max_blocks_per_seq": (ctypes.c_int, "in"),
        }

    def _make_test_case(
        self, batch_size, num_heads, head_dim, block_size, context_lens, zero_q=False
    ):
        """Build one randomized test case.

        Args:
            context_lens: either an int (same context length for every sequence)
                or a per-sequence list of lengths.
            zero_q: if True, Q is all-zeros, so softmax is uniform and the
                expected output is the mean of each sequence's valid V vectors.

        Returns:
            Dict of keyword arguments matching ``reference_impl`` / ``solve``.
        """
        if isinstance(context_lens, int):
            context_lens = [context_lens] * batch_size

        max_ctx = max(context_lens)
        max_blocks_per_seq = (max_ctx + block_size - 1) // block_size

        # The pool holds exactly the blocks this batch needs.
        total_blocks = sum((cl + block_size - 1) // block_size for cl in context_lens)

        device = "cuda"
        dtype = torch.float32

        if zero_q:
            Q = torch.zeros(batch_size, num_heads, head_dim, device=device, dtype=dtype)
        else:
            Q = torch.randn(batch_size, num_heads, head_dim, device=device, dtype=dtype)

        K_cache = torch.randn(
            total_blocks, block_size, num_heads, head_dim, device=device, dtype=dtype
        )
        V_cache = torch.randn(
            total_blocks, block_size, num_heads, head_dim, device=device, dtype=dtype
        )

        # Assign physical blocks via a random permutation of the pool so the
        # block table performs genuine indirection. Sequential assignment
        # (0, 1, 2, ...) would let a solution that ignores block_table and
        # reads the cache contiguously pass every test. torch.randperm draws
        # from the global generator, so this stays deterministic under the
        # manual_seed calls in the test generators.
        perm = torch.randperm(total_blocks)
        # Fill on CPU (per-element writes to a CUDA tensor are slow), then move.
        block_table_cpu = torch.zeros(batch_size, max_blocks_per_seq, dtype=torch.int32)
        block_idx = 0
        for s in range(batch_size):
            n_blocks = (context_lens[s] + block_size - 1) // block_size
            for b in range(n_blocks):
                block_table_cpu[s, b] = perm[block_idx]
                block_idx += 1
        block_table = block_table_cpu.to(device)

        ctx_lens_tensor = torch.tensor(context_lens, device=device, dtype=torch.int32)
        output = torch.zeros(batch_size, num_heads, head_dim, device=device, dtype=dtype)

        return {
            "Q": Q,
            "K_cache": K_cache,
            "V_cache": V_cache,
            "block_table": block_table,
            "context_lens": ctx_lens_tensor,
            "output": output,
            "batch_size": batch_size,
            "num_heads": num_heads,
            "head_dim": head_dim,
            "block_size": block_size,
            "max_blocks_per_seq": max_blocks_per_seq,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Hand-computed example matching the challenge description."""
        device = "cuda"
        dtype = torch.float32

        # batch=1, heads=1, head_dim=4, block_size=2, ctx_len=2
        # Q · K / sqrt(4): [1,1,0,0]·[1,0,0,0]/2 = 0.5, [1,1,0,0]·[0,1,0,0]/2 = 0.5
        # attn = softmax([0.5, 0.5]) = [0.5, 0.5]
        # output = 0.5*[2,0,0,0] + 0.5*[0,4,0,0] = [1, 2, 0, 0]
        Q = torch.tensor([[[1.0, 1.0, 0.0, 0.0]]], device=device, dtype=dtype)  # (1, 1, 4)
        K_cache = torch.tensor(
            [[[[1.0, 0.0, 0.0, 0.0]], [[0.0, 1.0, 0.0, 0.0]]]],
            device=device,
            dtype=dtype,
        )  # (1 block, block_size=2, 1 head, head_dim=4)
        V_cache = torch.tensor(
            [[[[2.0, 0.0, 0.0, 0.0]], [[0.0, 4.0, 0.0, 0.0]]]],
            device=device,
            dtype=dtype,
        )
        block_table = torch.tensor(
            [[0]], device=device, dtype=torch.int32
        )  # seq 0 -> physical block 0
        context_lens = torch.tensor([2], device=device, dtype=torch.int32)
        output = torch.zeros(1, 1, 4, device=device, dtype=dtype)

        return {
            "Q": Q,
            "K_cache": K_cache,
            "V_cache": V_cache,
            "block_table": block_table,
            "context_lens": context_lens,
            "output": output,
            "batch_size": 1,
            "num_heads": 1,
            "head_dim": 4,
            "block_size": 2,
            "max_blocks_per_seq": 1,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Deterministic suite covering edge cases and realistic shapes."""
        torch.manual_seed(42)
        tests = []

        # Edge case: single KV token
        tests.append(self._make_test_case(1, 1, 4, 2, 1))

        # Edge case: ctx_len equals block_size exactly
        tests.append(self._make_test_case(1, 2, 8, 4, 4))

        # Zero query: softmax is uniform, output is mean of V
        tests.append(self._make_test_case(2, 2, 8, 4, 8, zero_q=True))

        # Variable context lengths within a batch
        tests.append(self._make_test_case(4, 4, 32, 16, [16, 32, 48, 64]))

        # Power-of-2 context lengths
        tests.append(self._make_test_case(4, 4, 32, 16, 32))

        # Power-of-2, larger
        tests.append(self._make_test_case(4, 8, 64, 16, 128))

        # Non-power-of-2 context length
        tests.append(self._make_test_case(2, 4, 32, 16, 30))

        # Non-power-of-2, straddles multiple blocks
        tests.append(self._make_test_case(4, 4, 64, 16, 100))

        # Mixed variable lengths with non-power-of-2
        tests.append(self._make_test_case(4, 8, 64, 16, [50, 100, 150, 200]))

        # Realistic: LLaMA-3 8B style (8 Q heads), shorter context
        tests.append(self._make_test_case(4, 8, 128, 16, 256))

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        """Single large case used for timing."""
        torch.manual_seed(0)
        # Realistic LLM decode: batch=8, 32 heads, head_dim=128, block_size=16, ctx_len=2048
        return self._make_test_case(8, 32, 128, 16, 2048)
diff --git a/challenges/medium/86_paged_attention/starter/starter.cu b/challenges/medium/86_paged_attention/starter/starter.cu
new file mode 100644
index 00000000..b72d1b9c
--- /dev/null
+++ b/challenges/medium/86_paged_attention/starter/starter.cu
@@ -0,0 +1,7 @@
+#include <cuda_runtime.h>  // NOTE(review): original include target lost in extraction — confirm
[Displaced "Constraints" text recovered from this line:
- batch_size ≤ 32
- num_heads ≤ 64
- head_dim ≤ 256; head_dim is a multiple of 8
- block_size ≤ 64; block_size is a power of 2
- context_lens[s] ≤ 8,192 for all sequences s
- All tensors are float32 (except block_table and context_lens, which are int32)
- block_table[s, i] is a valid index into the first dimension of K_cache for all i < ceil(context_lens[s] / block_size)
- Performance test: batch_size = 8, num_heads = 32, head_dim = 128, block_size = 16, context_lens[s] = 2,048 for all sequences]