Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions challenges/medium/74_layer_normalization/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<p>
Implement the forward pass of layer normalization for a 2D input tensor. Given an input tensor of shape [N, C] where N is the batch size and C is the number of features, normalize each sample independently across its C features, then apply learnable scale (<code>weight</code>) and shift (<code>bias</code>) parameters. Layer normalization is a core building block of transformer architectures.
</p>

<p>
For each sample \(i\), layer normalization computes:
\[
\begin{align}
\mu_i &= \frac{1}{C} \sum_{j=0}^{C-1} x_{i,j} \\
\sigma_i^2 &= \frac{1}{C} \sum_{j=0}^{C-1} (x_{i,j} - \mu_i)^2 \\
y_{i,j} &= \text{weight}_j \cdot \frac{x_{i,j} - \mu_i}{\sqrt{\sigma_i^2 + \varepsilon}} + \text{bias}_j
\end{align}
\]
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only the native features of your chosen language or framework (additional external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the <code>output</code> tensor</li>
</ul>

<h2>Example</h2>
<p>
Input:<br>
\(\text{input}\) (N=2, C=4):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 & 4.0 \\
-1.0 & 0.0 & 0.0 & 1.0
\end{bmatrix}
\]
\(\text{weight}\):
\[
\begin{bmatrix}
1.0 & 1.0 & 1.0 & 1.0
\end{bmatrix}
\]
\(\text{bias}\):
\[
\begin{bmatrix}
0.0 & 0.0 & 0.0 & 0.0
\end{bmatrix}
\]
\(\varepsilon\) = 1e-5<br><br>
Output:<br>
\(\text{output}\) (N=2, C=4):
\[
\begin{bmatrix}
-1.3416 & -0.4472 & 0.4472 & 1.3416 \\
-1.4142 & 0.0 & 0.0 & 1.4142
\end{bmatrix}
\]
</p>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>N</code> &le; 65,536</li>
<li>1 &le; <code>C</code> &le; 4,096</li>
<li><code>eps</code> = 1e-5</li>
<li>Input values are in the range [-100.0, 100.0]</li>
<li>Weight values are in the range [0.1, 10.0]</li>
<li>Bias values are in the range [-10.0, 10.0]</li>
<li>Performance is measured with <code>N</code> = 65,536, <code>C</code> = 512</li>
</ul>
232 changes: 232 additions & 0 deletions challenges/medium/74_layer_normalization/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    """Layer Normalization challenge.

    Contestants must normalize each row of an [N, C] float32 tensor across its
    C features (per-sample mean/variance), then apply a per-feature scale
    (``weight``) and shift (``bias``).
    """

    def __init__(self):
        super().__init__(
            name="Layer Normalization", atol=1e-04, rtol=1e-04, num_gpus=1, access_tier="free"
        )

    def reference_impl(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        output: torch.Tensor,
        N: int,
        C: int,
        eps: float,
    ):
        """Ground truth: y = weight * (x - mean) / sqrt(var + eps) + bias.

        Mean and variance are computed per row (over the C feature dimension)
        and the result is written into ``output`` in place.
        """
        assert input.shape == output.shape == (N, C)
        assert weight.shape == bias.shape == (C,)
        assert input.dtype == weight.dtype == bias.dtype == output.dtype
        assert input.device == weight.device == bias.device == output.device
        assert str(input.device).startswith("cuda")

        mean = input.mean(dim=1, keepdim=True)
        # unbiased=False: population variance (divide by C), matching the
        # layer-norm definition in the challenge statement.
        var = input.var(dim=1, keepdim=True, unbiased=False)
        normalized = (input - mean) / torch.sqrt(var + eps)
        output.copy_(weight * normalized + bias)

    def get_solve_signature(self) -> Dict[str, tuple]:
        """C ABI of the contestant's ``solve`` entry point."""
        return {
            "input": (ctypes.POINTER(ctypes.c_float), "in"),
            "weight": (ctypes.POINTER(ctypes.c_float), "in"),
            "bias": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "N": (ctypes.c_int, "in"),
            "C": (ctypes.c_int, "in"),
            "eps": (ctypes.c_float, "in"),
        }

    @staticmethod
    def _make_case(
        input: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        eps: float = 1e-5,
    ) -> Dict[str, Any]:
        """Pack one test case, allocating an uninitialized output buffer.

        N and C are derived from ``input``'s shape so they can never drift
        out of sync with the tensors.
        """
        N, C = input.shape
        return {
            "input": input,
            "weight": weight,
            "bias": bias,
            "output": torch.empty((N, C), device="cuda", dtype=input.dtype),
            "N": N,
            "C": C,
            "eps": eps,
        }

    @classmethod
    def _uniform_case(
        cls,
        N: int,
        C: int,
        in_range: tuple,
        w_range: tuple = (0.5, 2.0),
        b_range: tuple = (-1.0, 1.0),
    ) -> Dict[str, Any]:
        """Build a randomized case with uniformly sampled input/weight/bias."""
        dtype = torch.float32
        return cls._make_case(
            torch.empty((N, C), device="cuda", dtype=dtype).uniform_(*in_range),
            torch.empty(C, device="cuda", dtype=dtype).uniform_(*w_range),
            torch.empty(C, device="cuda", dtype=dtype).uniform_(*b_range),
        )

    def generate_example_test(self) -> Dict[str, Any]:
        """The worked example from the challenge statement (N=2, C=4)."""
        dtype = torch.float32
        return self._make_case(
            torch.tensor(
                [[1.0, 2.0, 3.0, 4.0], [-1.0, 0.0, 0.0, 1.0]], device="cuda", dtype=dtype
            ),
            torch.ones(4, device="cuda", dtype=dtype),
            torch.zeros(4, device="cuda", dtype=dtype),
        )

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Edge cases, power-of-2 sizes, odd sizes, and one realistic size."""
        dtype = torch.float32
        tests = []

        # edge: single element per row
        tests.append(
            self._make_case(
                torch.tensor([[3.0]], device="cuda", dtype=dtype),
                torch.tensor([1.0], device="cuda", dtype=dtype),
                torch.tensor([0.5], device="cuda", dtype=dtype),
            )
        )

        # edge: 2x2, all zeros (variance is zero; eps keeps the divide finite)
        tests.append(
            self._make_case(
                torch.zeros((2, 2), device="cuda", dtype=dtype),
                torch.ones(2, device="cuda", dtype=dtype),
                torch.zeros(2, device="cuda", dtype=dtype),
            )
        )

        # edge: 4x4, negative values and a constant row
        tests.append(
            self._make_case(
                torch.tensor(
                    [
                        [-1.0, -2.0, -3.0, -4.0],
                        [1.0, 2.0, 3.0, 4.0],
                        [0.0, 0.0, 0.0, 0.0],
                        [-2.0, 0.0, 2.0, 4.0],
                    ],
                    device="cuda",
                    dtype=dtype,
                ),
                torch.tensor([1.0, 2.0, 1.0, 0.5], device="cuda", dtype=dtype),
                torch.tensor([0.0, 0.0, 1.0, -1.0], device="cuda", dtype=dtype),
            )
        )

        # power-of-2 shapes
        tests.append(self._uniform_case(8, 16, (-5.0, 5.0)))
        tests.append(self._uniform_case(32, 64, (-10.0, 10.0), b_range=(-2.0, 2.0)))
        tests.append(self._uniform_case(128, 256, (-10.0, 10.0), b_range=(-2.0, 2.0)))

        # non-power-of-2: 7x30 with identity weight/bias
        tests.append(
            self._make_case(
                torch.empty((7, 30), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
                torch.ones(30, device="cuda", dtype=dtype),
                torch.zeros(30, device="cuda", dtype=dtype),
            )
        )

        # non-power-of-2 shapes; 15x100 stresses the full stated input range
        tests.append(
            self._uniform_case(15, 100, (-100.0, 100.0), w_range=(0.1, 3.0), b_range=(-5.0, 5.0))
        )
        tests.append(self._uniform_case(25, 255, (-10.0, 10.0)))

        # realistic: 512x768 (BERT hidden size)
        tests.append(self._uniform_case(512, 768, (-5.0, 5.0)))

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        """Timed configuration: N=65,536, C=512, as advertised in the statement."""
        return self._uniform_case(65536, 512, (-5.0, 10.0))
5 changes: 5 additions & 0 deletions challenges/medium/74_layer_normalization/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include <cuda_runtime.h>

// input, weight, bias, output are device pointers
// Starter stub for contestants. Expected behavior: for each of the N rows of
// length C, compute the row mean and (population) variance, then write
//   output[i*C + j] = weight[j] * (input[i*C + j] - mean_i) / sqrt(var_i + eps) + bias[j]
extern "C" void solve(const float* input, const float* weight, const float* bias, float* output,
                      int N, int C, float eps) {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import cutlass
import cutlass.cute as cute


# input, weight, bias, output are tensors on the GPU
@cute.jit
def solve(
    input: cute.Tensor,
    weight: cute.Tensor,
    bias: cute.Tensor,
    output: cute.Tensor,
    N: cute.Int32,
    C: cute.Int32,
    eps: cute.Float32,
):
    """Starter stub: layer-normalize each of the N rows of ``input`` (length C)
    and write ``weight * normalized + bias`` into ``output``.
    """
    pass
11 changes: 11 additions & 0 deletions challenges/medium/74_layer_normalization/starter/starter.jax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import jax
import jax.numpy as jnp


# input, weight, bias are tensors on the GPU
@jax.jit
def solve(
    input: jax.Array, weight: jax.Array, bias: jax.Array, N: int, C: int, eps: float
) -> jax.Array:
    """Starter stub: return the layer-normalized [N, C] result.

    NOTE: unlike the other starters, the JAX variant takes no ``output``
    argument — the normalized tensor is returned directly.
    """
    # return output tensor directly
    pass
11 changes: 11 additions & 0 deletions challenges/medium/74_layer_normalization/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# input, weight, bias, output are device pointers
# Starter stub: layer-normalize each of the N rows (length C) of `input`
# into `output`, applying per-feature `weight` and `bias`.
@export
def solve(input: UnsafePointer[Float32], weight: UnsafePointer[Float32],
          bias: UnsafePointer[Float32], output: UnsafePointer[Float32],
          N: Int32, C: Int32, eps: Float32):
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import torch


# input, weight, bias, output are tensors on the GPU
def solve(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    output: torch.Tensor,
    N: int,
    C: int,
    eps: float,
):
    """Starter stub: layer-normalize each row of the [N, C] ``input`` across
    its C features and store ``weight * normalized + bias`` in ``output``.
    """
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import torch
import triton
import triton.language as tl


# input, weight, bias, output are tensors on the GPU
def solve(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    output: torch.Tensor,
    N: int,
    C: int,
    eps: float,
):
    """Starter stub: launch a Triton kernel that layer-normalizes each row of
    the [N, C] ``input`` and stores ``weight * normalized + bias`` in ``output``.
    """
    pass