diff --git a/challenges/medium/74_layer_normalization/challenge.html b/challenges/medium/74_layer_normalization/challenge.html
new file mode 100644
index 00000000..2f9f0e92
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/challenge.html
@@ -0,0 +1,65 @@
+
+ Implement the forward pass of layer normalization for a 2D input tensor. Given an input tensor of shape [N, C] where N is the batch size and C is the number of features, normalize each sample independently across its C features, then apply learnable scale (weight) and shift (bias) parameters. Layer normalization is a core building block of transformer architectures.
+
+
+
+ For each sample \(i\), layer normalization computes:
+ \[
+ \begin{align}
+ \mu_i &= \frac{1}{C} \sum_{j=0}^{C-1} x_{i,j} \\
+ \sigma_i^2 &= \frac{1}{C} \sum_{j=0}^{C-1} (x_{i,j} - \mu_i)^2 \\
+ y_{i,j} &= \text{weight}_j \cdot \frac{x_{i,j} - \mu_i}{\sqrt{\sigma_i^2 + \varepsilon}} + \text{bias}_j
+ \end{align}
+ \]
+
+
+Implementation Requirements
+
+ - Use only native features (external libraries are not permitted)
+ - The solve function signature must remain unchanged
+ - The final result must be stored in the output tensor
+
+
+Example
+
+Input:
+\(\text{input}\) (N=2, C=4):
+\[
+\begin{bmatrix}
+1.0 & 2.0 & 3.0 & 4.0 \\
+-1.0 & 0.0 & 0.0 & 1.0
+\end{bmatrix}
+\]
+\(\text{weight}\):
+\[
+\begin{bmatrix}
+1.0 & 1.0 & 1.0 & 1.0
+\end{bmatrix}
+\]
+\(\text{bias}\):
+\[
+\begin{bmatrix}
+0.0 & 0.0 & 0.0 & 0.0
+\end{bmatrix}
+\]
+\(\varepsilon\) = 1e-5
+Output:
+\(\text{output}\) (N=2, C=4):
+\[
+\begin{bmatrix}
+-1.3416 & -0.4472 & 0.4472 & 1.3416 \\
+-1.4142 & 0.0 & 0.0 & 1.4142
+\end{bmatrix}
+\]
+
+
+Constraints
+
+ - 1 ≤ N ≤ 65,536
+ - 1 ≤ C ≤ 4,096
+ - eps = 1e-5
+ - Input values are in the range [-100.0, 100.0]
+ - Weight values are in the range [0.1, 10.0]
+ - Bias values are in the range [-10.0, 10.0]
+ - Performance is measured with N = 65,536, C = 512
+
diff --git a/challenges/medium/74_layer_normalization/challenge.py b/challenges/medium/74_layer_normalization/challenge.py
new file mode 100644
index 00000000..025b843f
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/challenge.py
@@ -0,0 +1,232 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+ def __init__(self):
+ super().__init__(
+ name="Layer Normalization", atol=1e-04, rtol=1e-04, num_gpus=1, access_tier="free"
+ )
+
+ def reference_impl(
+ self,
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ output: torch.Tensor,
+ N: int,
+ C: int,
+ eps: float,
+ ):
+ assert input.shape == output.shape == (N, C)
+ assert weight.shape == bias.shape == (C,)
+ assert input.dtype == weight.dtype == bias.dtype == output.dtype
+ assert input.device == weight.device == bias.device == output.device
+ assert str(input.device).startswith("cuda")
+
+ mean = input.mean(dim=1, keepdim=True)
+ var = input.var(dim=1, keepdim=True, unbiased=False)
+ normalized = (input - mean) / torch.sqrt(var + eps)
+ output.copy_(weight * normalized + bias)
+
+ def get_solve_signature(self) -> Dict[str, tuple]:
+ return {
+ "input": (ctypes.POINTER(ctypes.c_float), "in"),
+ "weight": (ctypes.POINTER(ctypes.c_float), "in"),
+ "bias": (ctypes.POINTER(ctypes.c_float), "in"),
+ "output": (ctypes.POINTER(ctypes.c_float), "out"),
+ "N": (ctypes.c_int, "in"),
+ "C": (ctypes.c_int, "in"),
+ "eps": (ctypes.c_float, "in"),
+ }
+
+ def generate_example_test(self) -> Dict[str, Any]:
+ dtype = torch.float32
+ N, C = 2, 4
+ input = torch.tensor(
+ [[1.0, 2.0, 3.0, 4.0], [-1.0, 0.0, 0.0, 1.0]], device="cuda", dtype=dtype
+ )
+ weight = torch.ones(C, device="cuda", dtype=dtype)
+ bias = torch.zeros(C, device="cuda", dtype=dtype)
+ output = torch.empty((N, C), device="cuda", dtype=dtype)
+ eps = 1e-5
+ return {
+ "input": input,
+ "weight": weight,
+ "bias": bias,
+ "output": output,
+ "N": N,
+ "C": C,
+ "eps": eps,
+ }
+
+ def generate_functional_test(self) -> List[Dict[str, Any]]:
+ dtype = torch.float32
+ tests = []
+
+ # edge: single element per row
+ N, C = 1, 1
+ tests.append(
+ {
+ "input": torch.tensor([[3.0]], device="cuda", dtype=dtype),
+ "weight": torch.tensor([1.0], device="cuda", dtype=dtype),
+ "bias": torch.tensor([0.5], device="cuda", dtype=dtype),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # edge: 2x2, all zeros
+ N, C = 2, 2
+ tests.append(
+ {
+ "input": torch.zeros((N, C), device="cuda", dtype=dtype),
+ "weight": torch.ones(C, device="cuda", dtype=dtype),
+ "bias": torch.zeros(C, device="cuda", dtype=dtype),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # edge: 4x4, negative values
+ N, C = 4, 4
+ tests.append(
+ {
+ "input": torch.tensor(
+ [
+ [-1.0, -2.0, -3.0, -4.0],
+ [1.0, 2.0, 3.0, 4.0],
+ [0.0, 0.0, 0.0, 0.0],
+ [-2.0, 0.0, 2.0, 4.0],
+ ],
+ device="cuda",
+ dtype=dtype,
+ ),
+ "weight": torch.tensor([1.0, 2.0, 1.0, 0.5], device="cuda", dtype=dtype),
+ "bias": torch.tensor([0.0, 0.0, 1.0, -1.0], device="cuda", dtype=dtype),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # power-of-2: 8x16
+ N, C = 8, 16
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # power-of-2: 32x64
+ N, C = 32, 64
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-2.0, 2.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # power-of-2: 128x256
+ N, C = 128, 256
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-2.0, 2.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # non-power-of-2: 7x30
+ N, C = 7, 30
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
+ "weight": torch.ones(C, device="cuda", dtype=dtype),
+ "bias": torch.zeros(C, device="cuda", dtype=dtype),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # non-power-of-2: 15x100
+ N, C = 15, 100
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-100.0, 100.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.1, 3.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # non-power-of-2: 25x255
+ N, C = 25, 255
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ # realistic: 512x768 (BERT hidden size)
+ N, C = 512, 768
+ tests.append(
+ {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
+ )
+
+ return tests
+
+ def generate_performance_test(self) -> Dict[str, Any]:
+ dtype = torch.float32
+ N, C = 65536, 512
+ return {
+ "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 10.0),
+ "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
+ "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+ "output": torch.empty((N, C), device="cuda", dtype=dtype),
+ "N": N,
+ "C": C,
+ "eps": 1e-5,
+ }
diff --git a/challenges/medium/74_layer_normalization/starter/starter.cu b/challenges/medium/74_layer_normalization/starter/starter.cu
new file mode 100644
index 00000000..746181d4
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/starter/starter.cu
@@ -0,0 +1,5 @@
+#include <cuda_runtime.h>
+
+// input, weight, bias, output are device pointers
+extern "C" void solve(const float* input, const float* weight, const float* bias, float* output,
+ int N, int C, float eps) {}
diff --git a/challenges/medium/74_layer_normalization/starter/starter.cute.py b/challenges/medium/74_layer_normalization/starter/starter.cute.py
new file mode 100644
index 00000000..b76597d6
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/starter/starter.cute.py
@@ -0,0 +1,16 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# input, weight, bias, output are tensors on the GPU
+@cute.jit
+def solve(
+ input: cute.Tensor,
+ weight: cute.Tensor,
+ bias: cute.Tensor,
+ output: cute.Tensor,
+ N: cute.Int32,
+ C: cute.Int32,
+ eps: cute.Float32,
+):
+ pass
diff --git a/challenges/medium/74_layer_normalization/starter/starter.jax.py b/challenges/medium/74_layer_normalization/starter/starter.jax.py
new file mode 100644
index 00000000..d4ae384d
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/starter/starter.jax.py
@@ -0,0 +1,11 @@
+import jax
+import jax.numpy as jnp
+
+
+# input, weight, bias are tensors on the GPU
+@jax.jit
+def solve(
+ input: jax.Array, weight: jax.Array, bias: jax.Array, N: int, C: int, eps: float
+) -> jax.Array:
+ # return output tensor directly
+ pass
diff --git a/challenges/medium/74_layer_normalization/starter/starter.mojo b/challenges/medium/74_layer_normalization/starter/starter.mojo
new file mode 100644
index 00000000..4c948458
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/starter/starter.mojo
@@ -0,0 +1,11 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# input, weight, bias, output are device pointers
+@export
+def solve(input: UnsafePointer[Float32], weight: UnsafePointer[Float32],
+ bias: UnsafePointer[Float32], output: UnsafePointer[Float32],
+ N: Int32, C: Int32, eps: Float32):
+ pass
diff --git a/challenges/medium/74_layer_normalization/starter/starter.pytorch.py b/challenges/medium/74_layer_normalization/starter/starter.pytorch.py
new file mode 100644
index 00000000..7bfa8d83
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/starter/starter.pytorch.py
@@ -0,0 +1,14 @@
+import torch
+
+
+# input, weight, bias, output are tensors on the GPU
+def solve(
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ output: torch.Tensor,
+ N: int,
+ C: int,
+ eps: float,
+):
+ pass
diff --git a/challenges/medium/74_layer_normalization/starter/starter.triton.py b/challenges/medium/74_layer_normalization/starter/starter.triton.py
new file mode 100644
index 00000000..411638f1
--- /dev/null
+++ b/challenges/medium/74_layer_normalization/starter/starter.triton.py
@@ -0,0 +1,16 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# input, weight, bias, output are tensors on the GPU
+def solve(
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ output: torch.Tensor,
+ N: int,
+ C: int,
+ eps: float,
+):
+ pass