From b87bf1ed980ac347ad78f10316b4d592d050f47c Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Thu, 19 Mar 2026 04:23:23 +0000 Subject: [PATCH 1/2] Add challenge 84: SwiGLU MLP Block (Medium) Co-Authored-By: Claude Sonnet 4.6 --- .../medium/84_swiglu_mlp_block/challenge.html | 103 +++++++++++ .../medium/84_swiglu_mlp_block/challenge.py | 162 ++++++++++++++++++ .../84_swiglu_mlp_block/starter/starter.cu | 5 + .../starter/starter.cute.py | 17 ++ .../starter/starter.jax.py | 17 ++ .../84_swiglu_mlp_block/starter/starter.mojo | 9 + .../starter/starter.pytorch.py | 15 ++ .../starter/starter.triton.py | 17 ++ 8 files changed, 345 insertions(+) create mode 100644 challenges/medium/84_swiglu_mlp_block/challenge.html create mode 100644 challenges/medium/84_swiglu_mlp_block/challenge.py create mode 100644 challenges/medium/84_swiglu_mlp_block/starter/starter.cu create mode 100644 challenges/medium/84_swiglu_mlp_block/starter/starter.cute.py create mode 100644 challenges/medium/84_swiglu_mlp_block/starter/starter.jax.py create mode 100644 challenges/medium/84_swiglu_mlp_block/starter/starter.mojo create mode 100644 challenges/medium/84_swiglu_mlp_block/starter/starter.pytorch.py create mode 100644 challenges/medium/84_swiglu_mlp_block/starter/starter.triton.py diff --git a/challenges/medium/84_swiglu_mlp_block/challenge.html b/challenges/medium/84_swiglu_mlp_block/challenge.html new file mode 100644 index 00000000..fb253c3a --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/challenge.html @@ -0,0 +1,103 @@ +

+ Implement the SwiGLU MLP block — the feedforward network used in LLaMA, Mistral, Gemma, and most modern large language models. Given an input matrix x of shape [M, d_model] and three weight matrices W_gate, W_up (each [d_model, d_ffn]), and W_down ([d_ffn, d_model]), compute: output = (SiLU(x × W_gate) ⊙ (x × W_up)) × W_down, where SiLU(z) = z × sigmoid(z) and ⊙ denotes element-wise multiplication. All tensors are float32.

+ + + + + + x + + + + + + x⋅W_gate + + + x⋅W_up + + + SiLU + + + + + + + + + + ⋅W_down + + + output + + + + + gate projection + gate + up projection + + + + + + + +

Implementation Requirements

+ + +

Example

+
+M = 2, d_model = 2, d_ffn = 4
+
+x      = [[1.0, 0.0],
+           [0.0, 1.0]]
+
+W_gate = [[1.0, 0.0, 0.0, 0.0],
+           [0.0, 1.0, 0.0, 0.0]]
+
+W_up   = [[1.0, 0.0, 0.0, 0.0],
+           [0.0, 1.0, 0.0, 0.0]]
+
+W_down = [[1.0, 0.0],
+           [0.0, 1.0],
+           [0.0, 0.0],
+           [0.0, 0.0]]
+
+Intermediate steps:
+  gate   = x @ W_gate = [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]
+  up     = x @ W_up   = [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]
+  SiLU(1.0) = 1.0 * sigmoid(1.0) ≈ 0.7311
+  hidden = SiLU(gate) * up = [[0.7311, 0.0, 0.0, 0.0], [0.0, 0.7311, 0.0, 0.0]]
+
+output = hidden @ W_down ≈
+  [[0.7311, 0.0],
+   [0.0,    0.7311]]
+
+ +

Constraints

+ diff --git a/challenges/medium/84_swiglu_mlp_block/challenge.py b/challenges/medium/84_swiglu_mlp_block/challenge.py new file mode 100644 index 00000000..87408f23 --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/challenge.py @@ -0,0 +1,162 @@ +import ctypes +from typing import Any, Dict, List + +import torch +import torch.nn.functional as F +from core.challenge_base import ChallengeBase + + +class Challenge(ChallengeBase): + def __init__(self): + super().__init__( + name="SwiGLU MLP Block", + atol=1e-04, + rtol=1e-04, + num_gpus=1, + access_tier="free", + ) + + def reference_impl( + self, + x: torch.Tensor, + W_gate: torch.Tensor, + W_up: torch.Tensor, + W_down: torch.Tensor, + output: torch.Tensor, + M: int, + d_model: int, + d_ffn: int, + ): + assert x.shape == (M, d_model) + assert W_gate.shape == (d_model, d_ffn) + assert W_up.shape == (d_model, d_ffn) + assert W_down.shape == (d_ffn, d_model) + assert output.shape == (M, d_model) + assert ( + x.dtype == W_gate.dtype == W_up.dtype == W_down.dtype == output.dtype == torch.float32 + ) + assert x.device.type == "cuda" + assert W_gate.device.type == "cuda" + assert W_up.device.type == "cuda" + assert W_down.device.type == "cuda" + assert output.device.type == "cuda" + + gate = x @ W_gate # [M, d_ffn] + up = x @ W_up # [M, d_ffn] + hidden = F.silu(gate) * up # [M, d_ffn] + output.copy_(hidden @ W_down) # [M, d_model] + + def get_solve_signature(self) -> Dict[str, tuple]: + return { + "x": (ctypes.POINTER(ctypes.c_float), "in"), + "W_gate": (ctypes.POINTER(ctypes.c_float), "in"), + "W_up": (ctypes.POINTER(ctypes.c_float), "in"), + "W_down": (ctypes.POINTER(ctypes.c_float), "in"), + "output": (ctypes.POINTER(ctypes.c_float), "out"), + "M": (ctypes.c_int, "in"), + "d_model": (ctypes.c_int, "in"), + "d_ffn": (ctypes.c_int, "in"), + } + + def _make_test_case(self, M, d_model, d_ffn, zero_x=False): + device = "cuda" + dtype = torch.float32 + if zero_x: + x = torch.zeros(M, d_model, device=device, dtype=dtype) + 
else: + x = torch.randn(M, d_model, device=device, dtype=dtype) * 0.1 + W_gate = torch.randn(d_model, d_ffn, device=device, dtype=dtype) * 0.02 + W_up = torch.randn(d_model, d_ffn, device=device, dtype=dtype) * 0.02 + W_down = torch.randn(d_ffn, d_model, device=device, dtype=dtype) * 0.02 + output = torch.empty(M, d_model, device=device, dtype=dtype) + return { + "x": x, + "W_gate": W_gate, + "W_up": W_up, + "W_down": W_down, + "output": output, + "M": M, + "d_model": d_model, + "d_ffn": d_ffn, + } + + def generate_example_test(self) -> Dict[str, Any]: + device = "cuda" + dtype = torch.float32 + M, d_model, d_ffn = 2, 2, 4 + # x: each row is a basis vector + x = torch.tensor( + [[1.0, 0.0], [0.0, 1.0]], + device=device, + dtype=dtype, + ) + # W_gate: [d_model=2, d_ffn=4] — first two columns are identity, rest zeros + W_gate = torch.tensor( + [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], + device=device, + dtype=dtype, + ) + # W_up: same layout as W_gate + W_up = torch.tensor( + [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], + device=device, + dtype=dtype, + ) + # W_down: [d_ffn=4, d_model=2] — top 2x2 is identity, rest zeros + W_down = torch.tensor( + [[1.0, 0.0], [0.0, 1.0], [0.0, 0.0], [0.0, 0.0]], + device=device, + dtype=dtype, + ) + output = torch.empty(M, d_model, device=device, dtype=dtype) + return { + "x": x, + "W_gate": W_gate, + "W_up": W_up, + "W_down": W_down, + "output": output, + "M": M, + "d_model": d_model, + "d_ffn": d_ffn, + } + + def generate_functional_test(self) -> List[Dict[str, Any]]: + torch.manual_seed(42) + tests = [] + + # Edge cases: single row + tests.append(self._make_test_case(1, 4, 8)) + + # Edge case: two rows + tests.append(self._make_test_case(2, 4, 8)) + + # Zero input + tests.append(self._make_test_case(4, 8, 16, zero_x=True)) + + # Power-of-2 sizes + tests.append(self._make_test_case(16, 32, 64)) + + # Power-of-2 larger + tests.append(self._make_test_case(64, 64, 128)) + + # Non-power-of-2 M + 
tests.append(self._make_test_case(30, 32, 64)) + + # Non-power-of-2 all dims + tests.append(self._make_test_case(100, 60, 120)) + + # Non-power-of-2 M, medium size + tests.append(self._make_test_case(255, 64, 128)) + + # Realistic small inference batch (LLaMA-style ratios) + tests.append(self._make_test_case(128, 256, 512)) + + # Realistic medium inference batch + tests.append(self._make_test_case(256, 512, 1024)) + + return tests + + def generate_performance_test(self) -> Dict[str, Any]: + torch.manual_seed(0) + # LLaMA-3 8B style: d_model=4096, d_ffn=14336, M=512 (batch=4 x seq=128) + return self._make_test_case(512, 4096, 14336) diff --git a/challenges/medium/84_swiglu_mlp_block/starter/starter.cu b/challenges/medium/84_swiglu_mlp_block/starter/starter.cu new file mode 100644 index 00000000..118724bc --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/starter/starter.cu @@ -0,0 +1,5 @@ +#include + +// x, W_gate, W_up, W_down, output are device pointers +extern "C" void solve(const float* x, const float* W_gate, const float* W_up, const float* W_down, + float* output, int M, int d_model, int d_ffn) {} diff --git a/challenges/medium/84_swiglu_mlp_block/starter/starter.cute.py b/challenges/medium/84_swiglu_mlp_block/starter/starter.cute.py new file mode 100644 index 00000000..c0833020 --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/starter/starter.cute.py @@ -0,0 +1,17 @@ +import cutlass +import cutlass.cute as cute + + +# x, W_gate, W_up, W_down, output are tensors on the GPU +@cute.jit +def solve( + x: cute.Tensor, + W_gate: cute.Tensor, + W_up: cute.Tensor, + W_down: cute.Tensor, + output: cute.Tensor, + M: cute.Int32, + d_model: cute.Int32, + d_ffn: cute.Int32, +): + pass diff --git a/challenges/medium/84_swiglu_mlp_block/starter/starter.jax.py b/challenges/medium/84_swiglu_mlp_block/starter/starter.jax.py new file mode 100644 index 00000000..219948bb --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/starter/starter.jax.py @@ -0,0 +1,17 
@@ +import jax +import jax.numpy as jnp + + +# x, W_gate, W_up, W_down are tensors on GPU +@jax.jit +def solve( + x: jax.Array, + W_gate: jax.Array, + W_up: jax.Array, + W_down: jax.Array, + M: int, + d_model: int, + d_ffn: int, +) -> jax.Array: + # return output tensor directly + pass diff --git a/challenges/medium/84_swiglu_mlp_block/starter/starter.mojo b/challenges/medium/84_swiglu_mlp_block/starter/starter.mojo new file mode 100644 index 00000000..184038dc --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/starter/starter.mojo @@ -0,0 +1,9 @@ +from gpu.host import DeviceContext +from gpu.id import block_dim, block_idx, thread_idx +from memory import UnsafePointer +from math import ceildiv + +# x, W_gate, W_up, W_down, output are device pointers +@export +def solve(x: UnsafePointer[Float32], W_gate: UnsafePointer[Float32], W_up: UnsafePointer[Float32], W_down: UnsafePointer[Float32], output: UnsafePointer[Float32], M: Int32, d_model: Int32, d_ffn: Int32): + pass diff --git a/challenges/medium/84_swiglu_mlp_block/starter/starter.pytorch.py b/challenges/medium/84_swiglu_mlp_block/starter/starter.pytorch.py new file mode 100644 index 00000000..2af6572f --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/starter/starter.pytorch.py @@ -0,0 +1,15 @@ +import torch + + +# x, W_gate, W_up, W_down, output are tensors on the GPU +def solve( + x: torch.Tensor, + W_gate: torch.Tensor, + W_up: torch.Tensor, + W_down: torch.Tensor, + output: torch.Tensor, + M: int, + d_model: int, + d_ffn: int, +): + pass diff --git a/challenges/medium/84_swiglu_mlp_block/starter/starter.triton.py b/challenges/medium/84_swiglu_mlp_block/starter/starter.triton.py new file mode 100644 index 00000000..47552d0c --- /dev/null +++ b/challenges/medium/84_swiglu_mlp_block/starter/starter.triton.py @@ -0,0 +1,17 @@ +import torch +import triton +import triton.language as tl + + +# x, W_gate, W_up, W_down, output are tensors on the GPU +def solve( + x: torch.Tensor, + W_gate: torch.Tensor, 
+ W_up: torch.Tensor, + W_down: torch.Tensor, + output: torch.Tensor, + M: int, + d_model: int, + d_ffn: int, +): + pass From 9afb67fd9ec5f7f67e791df1681cb7d6f045b98e Mon Sep 17 00:00:00 2001 From: James Song Date: Thu, 26 Mar 2026 21:43:05 -0400 Subject: [PATCH 2/2] Improve SwiGLU MLP block SVG diagram and convert example to LaTeX Fix overlapping text in SVG: separate gate/up branch labels, add tensor shape annotations at each stage, color-code converging arrows. Convert example from
<pre> to LaTeX bmatrix with proper math notation for each
intermediate computation step.

Co-Authored-By: Claude Opus 4.6 (1M context) 
---
 .../medium/84_swiglu_mlp_block/challenge.html | 180 +++++++++++-------
 1 file changed, 114 insertions(+), 66 deletions(-)

diff --git a/challenges/medium/84_swiglu_mlp_block/challenge.html b/challenges/medium/84_swiglu_mlp_block/challenge.html
index fb253c3a..b93b5801 100644
--- a/challenges/medium/84_swiglu_mlp_block/challenge.html
+++ b/challenges/medium/84_swiglu_mlp_block/challenge.html
@@ -9,48 +9,71 @@
   multiplication. All tensors are float32.
 

- - - - - x - - - - - - x⋅W_gate - - - x⋅W_up - - - SiLU - - - - - - - - - - ⋅W_down - - - output - - - - - gate projection - gate - up projection + + - - + + + + + + x + [M, d_model] + + + + + x · W_gate + gate projection + + + + + x · W_up + up projection + + + [M, d_ffn] + [M, d_ffn] + + + + + + + SiLU + + + + + + + + + + + + + + + + + · W_down + [M, d_ffn] + + + + + + + output + [M, d_model] + + + z · sigmoid(z)

Implementation Requirements

@@ -61,33 +84,58 @@

Implementation Requirements

Example

-
-M = 2, d_model = 2, d_ffn = 4
-
-x      = [[1.0, 0.0],
-           [0.0, 1.0]]
-
-W_gate = [[1.0, 0.0, 0.0, 0.0],
-           [0.0, 1.0, 0.0, 0.0]]
-
-W_up   = [[1.0, 0.0, 0.0, 0.0],
-           [0.0, 1.0, 0.0, 0.0]]
-
-W_down = [[1.0, 0.0],
-           [0.0, 1.0],
-           [0.0, 0.0],
-           [0.0, 0.0]]
-
-Intermediate steps:
-  gate   = x @ W_gate = [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]
-  up     = x @ W_up   = [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]
-  SiLU(1.0) = 1.0 * sigmoid(1.0) ≈ 0.7311
-  hidden = SiLU(gate) * up = [[0.7311, 0.0, 0.0, 0.0], [0.0, 0.7311, 0.0, 0.0]]
-
-output = hidden @ W_down ≈
-  [[0.7311, 0.0],
-   [0.0,    0.7311]]
-
+

+ Input: M = 2, d_model = 2, d_ffn = 4 +

+

+ \(x\) (float32, \(2 \times 2\)): + \[ + x = \begin{bmatrix} 1.0 & 0.0 \\ 0.0 & 1.0 \end{bmatrix} + \] + \(W_\text{gate}\) and \(W_\text{up}\) (both \(2 \times 4\)): + \[ + W_\text{gate} = W_\text{up} = + \begin{bmatrix} + 1.0 & 0.0 & 0.0 & 0.0 \\ + 0.0 & 1.0 & 0.0 & 0.0 + \end{bmatrix} + \] + \(W_\text{down}\) (\(4 \times 2\)): + \[ + W_\text{down} = + \begin{bmatrix} + 1.0 & 0.0 \\ + 0.0 & 1.0 \\ + 0.0 & 0.0 \\ + 0.0 & 0.0 + \end{bmatrix} + \] +

+

+ Intermediate steps: + \[ + \text{gate} = x \cdot W_\text{gate} = + \begin{bmatrix} 1.0 & 0.0 & 0.0 & 0.0 \\ 0.0 & 1.0 & 0.0 & 0.0 \end{bmatrix} + \] + \[ + \text{up} = x \cdot W_\text{up} = + \begin{bmatrix} 1.0 & 0.0 & 0.0 & 0.0 \\ 0.0 & 1.0 & 0.0 & 0.0 \end{bmatrix} + \] + \[ + \text{SiLU}(1.0) = 1.0 \times \sigma(1.0) \approx 0.7311 + \] + \[ + \text{hidden} = \text{SiLU}(\text{gate}) \odot \text{up} = + \begin{bmatrix} 0.7311 & 0.0 & 0.0 & 0.0 \\ 0.0 & 0.7311 & 0.0 & 0.0 \end{bmatrix} + \] +

+

+ Output: + \[ + \text{output} = \text{hidden} \cdot W_\text{down} \approx + \begin{bmatrix} 0.7311 & 0.0 \\ 0.0 & 0.7311 \end{bmatrix} + \] +

Constraints