diff --git a/challenges/medium/87_speculative_decoding_verification/challenge.html b/challenges/medium/87_speculative_decoding_verification/challenge.html
new file mode 100644
index 00000000..74e3ff19
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/challenge.html
@@ -0,0 +1,187 @@
+<p>
+  Implement the token verification step of speculative decoding. A draft model proposes \(T\) tokens;
+  the target model evaluates them in one forward pass and accepts or rejects each. Given \(B\)
+  sequences, produce the verified output tokens. Probability tensors are <code>float32</code>;
+  token tensors are <code>int32</code>.
+</p>
+
+<p>
+  Notation for each sequence \(b\), at each draft position \(i = 0, \ldots, T{-}1\):
+</p>
+<ul>
+  <li>\(t_i = \texttt{draft_tokens}[b, i]\) &mdash; the token proposed by the draft model</li>
+  <li>\(p_i(v) = \texttt{draft_probs}[b, i, v]\) &mdash; draft model's probability for token \(v\)</li>
+  <li>\(q_i(v) = \texttt{target_probs}[b, i, v]\) &mdash; target model's probability for token \(v\)</li>
+  <li>\(u_i = \texttt{uniform_samples}[b, i]\) &mdash; pre-generated \(U[0,1)\) sample for position \(i\)</li>
+</ul>
+
+<svg width="660" height="310" viewBox="0 0 660 310" xmlns="http://www.w3.org/2000/svg"
+     style="display:block; margin:20px auto; font-family:monospace;">
+  <rect width="660" height="310" fill="#222" rx="8"/>
+
+  <!-- Column headers -->
+  <text x="108" y="18" fill="#666" font-size="9" text-anchor="middle">pos 0</text>
+  <text x="248" y="18" fill="#666" font-size="9" text-anchor="middle">pos 1</text>
+  <text x="388" y="18" fill="#666" font-size="9" text-anchor="middle">pos 2</text>
+  <text x="528" y="18" fill="#666" font-size="9" text-anchor="middle">pos 3</text>
+
+  <!-- Row 1: Draft tokens -->
+  <text x="16" y="42" fill="#888" font-size="10">draft</text>
+  <rect x="56" y="28" width="104" height="24" rx="4" fill="#1e3a5f" stroke="#4477bb" stroke-width="1.5"/>
+  <text x="108" y="45" text-anchor="middle" fill="#8ec4f0" font-size="11">t&#x2080;</text>
+  <rect x="196" y="28" width="104" height="24" rx="4" fill="#1e3a5f" stroke="#4477bb" stroke-width="1.5"/>
+  <text x="248" y="45" text-anchor="middle" fill="#8ec4f0" font-size="11">t&#x2081;</text>
+  <rect x="336" y="28" width="104" height="24" rx="4" fill="#1e3a5f" stroke="#4477bb" stroke-width="1.5"/>
+  <text x="388" y="45" text-anchor="middle" fill="#8ec4f0" font-size="11">t&#x2082;</text>
+  <rect x="476" y="28" width="104" height="24" rx="4" fill="#1e3a5f" stroke="#4477bb" stroke-width="1.5"/>
+  <text x="528" y="45" text-anchor="middle" fill="#8ec4f0" font-size="11">t&#x2083;</text>
+
+  <!-- Row 2: Probabilities -->
+  <text x="16" y="76" fill="#888" font-size="10">probs</text>
+  <rect x="56" y="62" width="104" height="34" rx="4" fill="#1a1a1a" stroke="#666" stroke-width="1"/>
+  <text x="108" y="76" text-anchor="middle" fill="#c060e0" font-size="9">p(t&#x2080;) = 0.60</text>
+  <text x="108" y="89" text-anchor="middle" fill="#e0a040" font-size="9">q(t&#x2080;) = 0.50</text>
+
+  <rect x="196" y="62" width="104" height="34" rx="4" fill="#1a1a1a" stroke="#666" stroke-width="1"/>
+  <text x="248" y="76" text-anchor="middle" fill="#c060e0" font-size="9">p(t&#x2081;) = 0.50</text>
+  <text x="248" y="89" text-anchor="middle" fill="#e0a040" font-size="9">q(t&#x2081;) = 0.20</text>
+
+  <rect x="336" y="62" width="104" height="34" rx="4" fill="#2a2a2a" stroke="#555" stroke-width="1"/>
+  <text x="388" y="80" text-anchor="middle" fill="#555" font-size="9">not reached</text>
+
+  <rect x="476" y="62" width="104" height="34" rx="4" fill="#2a2a2a" stroke="#555" stroke-width="1"/>
+  <text x="528" y="80" text-anchor="middle" fill="#555" font-size="9">not reached</text>
+
+  <!-- Row 3: Alpha + accept/reject -->
+  <text x="16" y="124" fill="#888" font-size="10">&#x3b1;, test</text>
+  <rect x="56" y="108" width="104" height="40" rx="4" fill="#1a3a1a" stroke="#44aa66" stroke-width="1.5"/>
+  <text x="108" y="124" text-anchor="middle" fill="#aaa" font-size="9">&#x3b1; = .50/.60 = .83</text>
+  <text x="108" y="140" text-anchor="middle" fill="#44aa66" font-size="9">u=0.5 &lt; .83 &#x2713;</text>
+
+  <rect x="196" y="108" width="104" height="40" rx="4" fill="#4a1a1a" stroke="#e06060" stroke-width="1.5"/>
+  <text x="248" y="124" text-anchor="middle" fill="#aaa" font-size="9">&#x3b1; = .20/.50 = .40</text>
+  <text x="248" y="140" text-anchor="middle" fill="#e06060" font-size="9">u=0.7 &#x2265; .40 &#x2717;</text>
+
+  <rect x="336" y="108" width="104" height="40" rx="4" fill="#2a2a2a" stroke="#555" stroke-width="1"/>
+  <text x="388" y="132" text-anchor="middle" fill="#555" font-size="9">skipped</text>
+
+  <rect x="476" y="108" width="104" height="40" rx="4" fill="#2a2a2a" stroke="#555" stroke-width="1"/>
+  <text x="528" y="132" text-anchor="middle" fill="#555" font-size="9">skipped</text>
+
+  <!-- Row 4: Resample box -->
+  <rect x="56" y="164" width="524" height="38" rx="5" fill="#1a1a1a" stroke="#e06060" stroke-width="1"/>
+  <text x="318" y="180" text-anchor="middle" fill="#e06060" font-size="10">reject at pos 1 &#x2192; stop, resample from adj(v) = max(0, q(v) &#x2212; p(v))</text>
+  <text x="318" y="194" text-anchor="middle" fill="#aaa" font-size="9">normalize adj, inverse-CDF sample using u[b, T] &#x2192; replacement token t&#x2081;&#x2032;</text>
+
+  <!-- Row 5: Output tokens -->
+  <text x="16" y="224" fill="#888" font-size="10">output</text>
+  <rect x="56" y="212" width="104" height="24" rx="4" fill="#1e3a5f" stroke="#4477bb" stroke-width="1.5"/>
+  <text x="108" y="229" text-anchor="middle" fill="#8ec4f0" font-size="11">t&#x2080;</text>
+  <rect x="196" y="212" width="104" height="24" rx="4" fill="#3a2010" stroke="#e0a040" stroke-width="1.5"/>
+  <text x="248" y="229" text-anchor="middle" fill="#f0b060" font-size="11">t&#x2081;&#x2032;</text>
+  <rect x="336" y="212" width="104" height="24" rx="4" fill="#2a2a2a" stroke="#555" stroke-width="1"/>
+  <text x="388" y="229" text-anchor="middle" fill="#555" font-size="11">0</text>
+  <rect x="476" y="212" width="104" height="24" rx="4" fill="#2a2a2a" stroke="#555" stroke-width="1"/>
+  <text x="528" y="229" text-anchor="middle" fill="#555" font-size="11">0</text>
+
+  <!-- Legend -->
+  <text x="16" y="260" fill="#c060e0" font-size="9">p = draft prob</text>
+  <text x="130" y="260" fill="#e0a040" font-size="9">q = target prob</text>
+  <text x="260" y="260" fill="#aaa" font-size="9">&#x3b1; = min(1, q/p)</text>
+  <text x="400" y="260" fill="#44aa66" font-size="9">&#x25a0; accepted</text>
+  <text x="490" y="260" fill="#e0a040" font-size="9">&#x25a0; resampled</text>
+  <text x="590" y="260" fill="#555" font-size="9">&#x25a0; pad</text>
+
+  <!-- All-accept note -->
+  <text x="330" y="290" text-anchor="middle" fill="#888" font-size="9">If all T tokens accepted: sample bonus token from q at last position using u[b, T]</text>
+</svg>
+
+<p>
+  For each sequence \(b\), process positions \(i = 0, 1, \ldots, T{-}1\) left-to-right:
+</p>
+<ol>
+  <li>Compute acceptance probability: \(\displaystyle \alpha_i = \min\!\left(1,\; \frac{q_i(t_i)}{p_i(t_i)}\right)\)</li>
+  <li>If \(u_i &lt; \alpha_i\): <strong>accept</strong> \(t_i\), continue to position \(i{+}1\).</li>
+  <li>If \(u_i \ge \alpha_i\): <strong>reject</strong>, stop. Sample replacement from:
+    \[\text{adj}(v) = \frac{\max(0,\; q_i(v) - p_i(v))}{\sum_{v'} \max(0,\; q_i(v') - p_i(v'))}\]
+    using inverse CDF with \(r = \texttt{uniform_samples}[b, T]\). If \(\text{adj}\) is all zeros, use uniform \(1/V\).
+  </li>
+  <li>If all \(T\) tokens accepted: sample a <strong>bonus token</strong> from \(q_{T-1}\) using \(\texttt{uniform_samples}[b, T]\).</li>
+</ol>
+<p>
+  Write results into <code>output_tokens[b, :]</code> (shape \([B, T{+}1]\)): if \(n\) draft tokens are accepted,
+  positions \(0, \ldots, n{-}1\) hold them, position \(n\) holds the resampled (or, if \(n = T\), bonus) token, and the remaining positions are zero.
+</p>
+
+<h2>Implementation Requirements</h2>
+<ul>
+  <li>Implement <code>solve(draft_tokens, draft_probs, target_probs, uniform_samples, output_tokens, B, T, V)</code>.</li>
+  <li>Do not change the function signature or use external libraries beyond the standard GPU frameworks.</li>
+  <li>Write results into the provided <code>output_tokens</code> buffer (shape <code>[B, T+1]</code>, <code>int32</code>).</li>
+  <li>Memory layout is row-major: <code>draft_probs[b, i, v]</code> is at offset <code>b*T*V + i*V + v</code>.</li>
+  <li>
+    Inverse CDF sampling: given a normalized distribution \(d\) (\(\text{adj}\) after a rejection,
+    \(q_{T-1}\) for the bonus token), find the smallest index \(k\) where \(\sum_{v=0}^{k} d(v) \ge r\), where
+    \(r = \texttt{uniform_samples}[b, T]\). Clamp the result to \([0, V-1]\).
+  </li>
+  <li>
+    If the adjusted distribution is all zeros (i.e., \(q_i \le p_i\) everywhere), fall back to
+    the uniform distribution over \(V\) tokens.
+  </li>
+</ul>
+
+<h2>Example</h2>
+<p>
+  Input: \(B = 1,\; T = 3,\; V = 4\)
+</p>
+<p>
+  \(\text{draft_tokens} = [1, 2, 0]\)
+</p>
+<p>
+  Draft probabilities \(p_i\) and target probabilities \(q_i\) per position:
+  \[
+  p_0 = \begin{bmatrix} 0.10 & 0.60 & 0.20 & 0.10 \end{bmatrix}, \quad
+  q_0 = \begin{bmatrix} 0.10 & 0.50 & 0.20 & 0.20 \end{bmatrix}
+  \]
+  \[
+  p_1 = \begin{bmatrix} 0.10 & 0.20 & 0.50 & 0.20 \end{bmatrix}, \quad
+  q_1 = \begin{bmatrix} 0.30 & 0.20 & 0.20 & 0.30 \end{bmatrix}
+  \]
+  \[
+  \text{uniform_samples} = \begin{bmatrix} 0.50 & 0.70 & 0.30 & 0.90 \end{bmatrix}
+  \]
+</p>
+<p>
+  <strong>Position 0</strong> (draft token = 1):
+  \(\alpha_0 = \min\!\left(1,\, \frac{q_0(1)}{p_0(1)}\right) = \min\!\left(1,\, \frac{0.50}{0.60}\right) \approx 0.833\).
+  Since \(u_0 = 0.50 &lt; 0.833\), <strong>accept</strong> token 1.
+</p>
+<p>
+  <strong>Position 1</strong> (draft token = 2):
+  \(\alpha_1 = \min\!\left(1,\, \frac{q_1(2)}{p_1(2)}\right) = \min\!\left(1,\, \frac{0.20}{0.50}\right) = 0.40\).
+  Since \(u_1 = 0.70 \ge 0.40\), <strong>reject</strong>. Resample from adjusted distribution:
+  \[
+  \text{adj}(v) = \max(0,\, q_1(v) - p_1(v)) = [0.20,\, 0,\, 0,\, 0.10]
+  \]
+  \[
+  \text{normalized} = \left[\tfrac{2}{3},\, 0,\, 0,\, \tfrac{1}{3}\right], \quad
+  \text{CDF} = [0.667,\, 0.667,\, 0.667,\, 1.0]
+  \]
+  With \(r = \text{uniform_samples}[0, T] = 0.90\), inverse CDF gives token <strong>3</strong>.
+</p>
+<p>
+  Output:
+  \[\text{output_tokens} = \begin{bmatrix} 1 & 3 & 0 & 0 \end{bmatrix}\]
+</p>
+
+<h2>Constraints</h2>
+<ul>
+  <li>1 &le; <code>B</code> &le; 256</li>
+  <li>1 &le; <code>T</code> &le; 16</li>
+  <li>2 &le; <code>V</code> &le; 131,072</li>
+  <li><code>draft_probs[b, i, :]</code> and <code>target_probs[b, i, :]</code> are valid probability distributions (non-negative, sum to 1)</li>
+  <li><code>draft_probs[b, i, draft_tokens[b, i]]</code> &gt; 0 for all <code>b</code>, <code>i</code></li>
+  <li><code>uniform_samples</code> values are in \([0, 1)\)</li>
+  <li>All floating-point tensors use <code>float32</code>; token tensors use <code>int32</code></li>
+  <li>Performance is measured with <code>B</code> = 64, <code>T</code> = 8, <code>V</code> = 32,768</li>
+</ul>
diff --git a/challenges/medium/87_speculative_decoding_verification/challenge.py b/challenges/medium/87_speculative_decoding_verification/challenge.py
new file mode 100644
index 00000000..363c849a
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/challenge.py
@@ -0,0 +1,300 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    def __init__(self):
+        super().__init__(
+            name="Speculative Decoding Verification",
+            atol=1e-05,
+            rtol=1e-05,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(
+        self,
+        draft_tokens: torch.Tensor,
+        draft_probs: torch.Tensor,
+        target_probs: torch.Tensor,
+        uniform_samples: torch.Tensor,
+        output_tokens: torch.Tensor,
+        B: int,
+        T: int,
+        V: int,
+    ):
+        assert draft_tokens.shape == (B, T)
+        assert draft_probs.shape == (B, T, V)
+        assert target_probs.shape == (B, T, V)
+        assert uniform_samples.shape == (B, T + 1)
+        assert output_tokens.shape == (B, T + 1)
+        assert draft_tokens.dtype == torch.int32
+        assert draft_probs.dtype == torch.float32
+        assert target_probs.dtype == torch.float32
+        assert uniform_samples.dtype == torch.float32
+        assert output_tokens.dtype == torch.int32
+        assert draft_tokens.device.type == "cuda"
+        assert draft_probs.device.type == "cuda"
+        assert target_probs.device.type == "cuda"
+        assert uniform_samples.device.type == "cuda"
+        assert output_tokens.device.type == "cuda"
+
+        output_tokens.fill_(0)
+
+        for b in range(B):
+            for i in range(T):
+                tok = int(draft_tokens[b, i].item())
+                p = draft_probs[b, i, tok].item()
+                q = target_probs[b, i, tok].item()
+                alpha = min(1.0, q / p)
+
+                if uniform_samples[b, i].item() < alpha:
+                    output_tokens[b, i] = tok
+                else:
+                    adjusted = torch.clamp(target_probs[b, i] - draft_probs[b, i], min=0.0)
+                    total = adjusted.sum().item()
+                    if total > 0.0:
+                        adjusted = adjusted / total
+                    else:
+                        adjusted = (
+                            torch.ones(V, dtype=torch.float32, device=draft_tokens.device) / V
+                        )
+                    cdf = torch.cumsum(adjusted, dim=0)
+                    r = float(uniform_samples[b, T].item())
+                    new_tok = int(torch.searchsorted(cdf.contiguous(), r).item())
+                    output_tokens[b, i] = min(new_tok, V - 1)
+                    break
+            else:
+                cdf = torch.cumsum(target_probs[b, T - 1], dim=0)
+                r = float(uniform_samples[b, T].item())
+                bonus_tok = int(torch.searchsorted(cdf.contiguous(), r).item())
+                output_tokens[b, T] = min(bonus_tok, V - 1)
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        return {
+            "draft_tokens": (ctypes.POINTER(ctypes.c_int), "in"),
+            "draft_probs": (ctypes.POINTER(ctypes.c_float), "in"),
+            "target_probs": (ctypes.POINTER(ctypes.c_float), "in"),
+            "uniform_samples": (ctypes.POINTER(ctypes.c_float), "in"),
+            "output_tokens": (ctypes.POINTER(ctypes.c_int), "out"),
+            "B": (ctypes.c_int, "in"),
+            "T": (ctypes.c_int, "in"),
+            "V": (ctypes.c_int, "in"),
+        }
+
+    def _make_sparse_probs(self, B, T, V, K, device):
+        """Generate sparse probability distributions: only K tokens have nonzero probability.
+
+        With sparse inputs the adjusted distribution clamp(q-p, 0) is nonzero only where
+        q > 0, i.e. in at most K entries, so few rounding steps enter the CDF. This greatly
+        reduces (but does not eliminate) sensitivity to summation order for large V.
+        """
+        K = min(K, V)
+        flat = B * T
+        # For each (b, i), sample K distinct token indices
+        idx = torch.stack([torch.randperm(V, device=device)[:K] for _ in range(flat)])
+        idx = idx.view(B, T, K)
+        # Random weights summing to 1
+        weights = torch.rand(B, T, K, device=device)
+        weights = weights / weights.sum(dim=-1, keepdim=True)
+        # Scatter into full V-dimensional probability vector
+        probs = torch.zeros(B, T, V, device=device)
+        probs.scatter_(2, idx, weights)
+        return probs, idx
+
+    def _make_test_case(self, B, T, V, seed=42):
+        torch.manual_seed(seed)
+        device = "cuda"
+
+        # K=64 active tokens per position: enough diversity while keeping the adjusted
+        # distribution sparse (at most 64 nonzero entries, where q > 0), so CDF sums are
+        # only weakly sensitive to floating-point summation order.
+        K = min(64, V)
+        draft_probs, draft_idx = self._make_sparse_probs(B, T, V, K, device)
+        target_probs, _ = self._make_sparse_probs(B, T, V, K, device)
+
+        # Sample draft tokens from the active K tokens
+        weights = draft_probs.gather(2, draft_idx)  # (B, T, K)
+        flat_w = weights.view(B * T, K)
+        chosen = torch.multinomial(flat_w, 1).view(B, T)  # index within the K tokens
+        draft_tokens = draft_idx.gather(2, chosen.unsqueeze(-1)).squeeze(-1).to(torch.int32)
+
+        uniform_samples = torch.rand(B, T + 1, device=device)
+        output_tokens = torch.zeros(B, T + 1, device=device, dtype=torch.int32)
+
+        return {
+            "draft_tokens": draft_tokens,
+            "draft_probs": draft_probs,
+            "target_probs": target_probs,
+            "uniform_samples": uniform_samples,
+            "output_tokens": output_tokens,
+            "B": B,
+            "T": T,
+            "V": V,
+        }
+
+    def _make_accept_all_case(self, B, T, V, seed=42):
+        """All draft tokens accepted: target_probs == draft_probs so alpha == 1 everywhere."""
+        torch.manual_seed(seed)
+        device = "cuda"
+
+        K = min(64, V)
+        draft_probs, draft_idx = self._make_sparse_probs(B, T, V, K, device)
+        target_probs = draft_probs.clone()  # alpha = min(1, q/p) = 1 → always accept
+
+        weights = draft_probs.gather(2, draft_idx)
+        flat_w = weights.view(B * T, K)
+        chosen = torch.multinomial(flat_w, 1).view(B, T)
+        draft_tokens = draft_idx.gather(2, chosen.unsqueeze(-1)).squeeze(-1).to(torch.int32)
+
+        # All acceptance samples set to 0 (< 1.0 = alpha) to guarantee acceptance
+        uniform_samples = torch.zeros(B, T + 1, device=device)
+        uniform_samples[:, T] = torch.rand(B, device=device)  # bonus sampling sample
+
+        output_tokens = torch.zeros(B, T + 1, device=device, dtype=torch.int32)
+
+        return {
+            "draft_tokens": draft_tokens,
+            "draft_probs": draft_probs,
+            "target_probs": target_probs,
+            "uniform_samples": uniform_samples,
+            "output_tokens": output_tokens,
+            "B": B,
+            "T": T,
+            "V": V,
+        }
+
+    def _make_reject_first_case(self, B, T, V, seed=42):
+        """First draft token always rejected: draft_probs high, target low for that token."""
+        torch.manual_seed(seed)
+        device = "cuda"
+
+        draft_probs = torch.softmax(torch.randn(B, T, V, device=device), dim=-1)
+        target_probs = torch.softmax(torch.randn(B, T, V, device=device), dim=-1)
+
+        flat = draft_probs.view(B * T, V)
+        draft_tokens = torch.multinomial(flat, 1).view(B, T).to(torch.int32)
+
+        # Force rejection at position 0 for every sequence:
+        # set alpha[b,0] very small and uniform_sample[b,0] high enough to reject
+        for b in range(B):
+            tok = int(draft_tokens[b, 0].item())
+            # Make draft prob ~0.9 for the chosen token (high p)
+            draft_probs[b, 0] = torch.full((V,), 0.1 / max(V - 1, 1), device=device)
+            draft_probs[b, 0, tok] = 0.9
+            draft_probs[b, 0] = draft_probs[b, 0] / draft_probs[b, 0].sum()
+            # Make target prob ~1/V for the same token (low q)
+            target_probs[b, 0] = torch.ones(V, device=device) / V
+
+        uniform_samples = torch.rand(B, T + 1, device=device)
+        # Force uniform[b, 0] = 0.99 > alpha = (1/V) / 0.9, which is at most ~0.556 (V = 2)
+        uniform_samples[:, 0] = 0.99
+
+        output_tokens = torch.zeros(B, T + 1, device=device, dtype=torch.int32)
+
+        return {
+            "draft_tokens": draft_tokens,
+            "draft_probs": draft_probs,
+            "target_probs": target_probs,
+            "uniform_samples": uniform_samples,
+            "output_tokens": output_tokens,
+            "B": B,
+            "T": T,
+            "V": V,
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        device = "cuda"
+
+        # B=1, T=3, V=4: position 0 accepted, position 1 rejected, token resampled
+        draft_tokens = torch.tensor([[1, 2, 0]], device=device, dtype=torch.int32)
+
+        draft_probs = torch.tensor(
+            [
+                [
+                    [0.10, 0.60, 0.20, 0.10],  # pos 0: draft_tokens[0,0]=1, p=0.60
+                    [0.10, 0.20, 0.50, 0.20],  # pos 1: draft_tokens[0,1]=2, p=0.50
+                    [0.40, 0.20, 0.20, 0.20],  # pos 2: draft_tokens[0,2]=0, p=0.40
+                ]
+            ],
+            device=device,
+            dtype=torch.float32,
+        )
+
+        target_probs = torch.tensor(
+            [
+                [
+                    [0.10, 0.50, 0.20, 0.20],  # pos 0: q=0.50, alpha=min(1,0.50/0.60)=0.833
+                    [0.30, 0.20, 0.20, 0.30],  # pos 1: q=0.20, alpha=min(1,0.20/0.50)=0.400
+                    [0.30, 0.20, 0.30, 0.20],  # pos 2: not reached
+                ]
+            ],
+            device=device,
+            dtype=torch.float32,
+        )
+
+        # uniform_samples[0, 0]=0.50 < 0.833 → ACCEPT token 1
+        # uniform_samples[0, 1]=0.70 > 0.400 → REJECT token 2
+        #   adjusted = clamp([0.20, 0, -0.30, 0.10], min=0) = [0.20, 0, 0, 0.10]
+        #   normalized CDF = [0.667, 0.667, 0.667, 1.0]
+        #   uniform_samples[0, T=3]=0.90 → searchsorted → token 3
+        # output_tokens[0] = [1, 3, 0, 0]
+        uniform_samples = torch.tensor(
+            [[0.50, 0.70, 0.30, 0.90]], device=device, dtype=torch.float32
+        )
+
+        output_tokens = torch.zeros(1, 4, device=device, dtype=torch.int32)
+
+        return {
+            "draft_tokens": draft_tokens,
+            "draft_probs": draft_probs,
+            "target_probs": target_probs,
+            "uniform_samples": uniform_samples,
+            "output_tokens": output_tokens,
+            "B": 1,
+            "T": 3,
+            "V": 4,
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        tests = []
+
+        # Edge: T=1, rejected immediately
+        tests.append(self._make_reject_first_case(1, 1, 4, seed=1))
+
+        # Edge: T=1, all accepted (bonus token sampled)
+        tests.append(self._make_accept_all_case(1, 1, 4, seed=2))
+
+        # Edge: T=2, first rejected
+        tests.append(self._make_reject_first_case(1, 2, 8, seed=3))
+
+        # Edge: T=4, all accepted
+        tests.append(self._make_accept_all_case(2, 4, 8, seed=4))
+
+        # Batch of 4: uniform_samples[:, 0] = 0.99 forces rejection at pos 0
+        tests.append(self._make_reject_first_case(4, 4, 16, seed=5))
+
+        # Power-of-2 vocab, mixed acceptance
+        tests.append(self._make_test_case(4, 8, 64, seed=10))
+
+        # Larger vocab, mixed acceptance
+        tests.append(self._make_test_case(8, 8, 256, seed=20))
+
+        # Non-power-of-2 vocab
+        tests.append(self._make_test_case(4, 6, 30, seed=30))
+
+        # All sequences accept all tokens (bonus sampling)
+        tests.append(self._make_accept_all_case(8, 8, 128, seed=40))
+
+        # Realistic small batch
+        tests.append(self._make_test_case(16, 8, 1000, seed=50))
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        torch.manual_seed(0)
+        # B=64 sequences, T=8 draft tokens, V=32768 (2**15, near Mistral/LLaMA-2 ~32k vocabs)
+        return self._make_test_case(64, 8, 32768, seed=0)
diff --git a/challenges/medium/87_speculative_decoding_verification/starter/starter.cu b/challenges/medium/87_speculative_decoding_verification/starter/starter.cu
new file mode 100644
index 00000000..6a9731a8
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/starter/starter.cu
@@ -0,0 +1,5 @@
+#include <cuda_runtime.h>
+
+// draft_tokens, draft_probs, target_probs, uniform_samples, output_tokens are device pointers
+extern "C" void solve(const int* draft_tokens, const float* draft_probs, const float* target_probs,
+                      const float* uniform_samples, int* output_tokens, int B, int T, int V) {}
diff --git a/challenges/medium/87_speculative_decoding_verification/starter/starter.cute.py b/challenges/medium/87_speculative_decoding_verification/starter/starter.cute.py
new file mode 100644
index 00000000..320555b2
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/starter/starter.cute.py
@@ -0,0 +1,17 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# draft_tokens, draft_probs, target_probs, uniform_samples, output_tokens are tensors on the GPU
+@cute.jit
+def solve(
+    draft_tokens: cute.Tensor,
+    draft_probs: cute.Tensor,
+    target_probs: cute.Tensor,
+    uniform_samples: cute.Tensor,
+    output_tokens: cute.Tensor,
+    B: cute.Int32,
+    T: cute.Int32,
+    V: cute.Int32,
+):
+    pass
diff --git a/challenges/medium/87_speculative_decoding_verification/starter/starter.jax.py b/challenges/medium/87_speculative_decoding_verification/starter/starter.jax.py
new file mode 100644
index 00000000..bd7e8e18
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/starter/starter.jax.py
@@ -0,0 +1,17 @@
+import jax
+import jax.numpy as jnp
+
+
+# draft_tokens, draft_probs, target_probs, uniform_samples are tensors on GPU
+@jax.jit
+def solve(
+    draft_tokens: jax.Array,
+    draft_probs: jax.Array,
+    target_probs: jax.Array,
+    uniform_samples: jax.Array,
+    B: int,
+    T: int,
+    V: int,
+) -> jax.Array:
+    # return output tensor directly
+    pass
diff --git a/challenges/medium/87_speculative_decoding_verification/starter/starter.mojo b/challenges/medium/87_speculative_decoding_verification/starter/starter.mojo
new file mode 100644
index 00000000..89c9fca2
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/starter/starter.mojo
@@ -0,0 +1,16 @@
+from gpu.host import DeviceContext
+from memory import UnsafePointer
+
+# draft_tokens, draft_probs, target_probs, uniform_samples, output_tokens are device pointers
+@export
+def solve(
+    draft_tokens: UnsafePointer[Int32],
+    draft_probs: UnsafePointer[Float32],
+    target_probs: UnsafePointer[Float32],
+    uniform_samples: UnsafePointer[Float32],
+    output_tokens: UnsafePointer[Int32],
+    B: Int32,
+    T: Int32,
+    V: Int32,
+):
+    pass
diff --git a/challenges/medium/87_speculative_decoding_verification/starter/starter.pytorch.py b/challenges/medium/87_speculative_decoding_verification/starter/starter.pytorch.py
new file mode 100644
index 00000000..3cce7fae
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/starter/starter.pytorch.py
@@ -0,0 +1,15 @@
+import torch
+
+
+# draft_tokens, draft_probs, target_probs, uniform_samples, output_tokens are tensors on the GPU
+def solve(
+    draft_tokens: torch.Tensor,
+    draft_probs: torch.Tensor,
+    target_probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    output_tokens: torch.Tensor,
+    B: int,
+    T: int,
+    V: int,
+):
+    pass
diff --git a/challenges/medium/87_speculative_decoding_verification/starter/starter.triton.py b/challenges/medium/87_speculative_decoding_verification/starter/starter.triton.py
new file mode 100644
index 00000000..1347ec9c
--- /dev/null
+++ b/challenges/medium/87_speculative_decoding_verification/starter/starter.triton.py
@@ -0,0 +1,17 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# draft_tokens, draft_probs, target_probs, uniform_samples, output_tokens are tensors on the GPU
+def solve(
+    draft_tokens: torch.Tensor,
+    draft_probs: torch.Tensor,
+    target_probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    output_tokens: torch.Tensor,
+    B: int,
+    T: int,
+    V: int,
+):
+    pass