diff --git a/challenges/medium/74_layer_normalization/challenge.html b/challenges/medium/74_layer_normalization/challenge.html new file mode 100644 index 00000000..2f9f0e92 --- /dev/null +++ b/challenges/medium/74_layer_normalization/challenge.html @@ -0,0 +1,65 @@ +

+ Implement the forward pass of layer normalization for a 2D input tensor. Given an input tensor of shape [N, C] where N is the batch size and C is the number of features, normalize each sample independently across its C features, then apply learnable scale (weight) and shift (bias) parameters. Layer normalization is a core building block of transformer architectures. +

+ +

+ For each sample \(i\), layer normalization computes: + \[ + \begin{align} + \mu_i &= \frac{1}{C} \sum_{j=0}^{C-1} x_{i,j} \\ + \sigma_i^2 &= \frac{1}{C} \sum_{j=0}^{C-1} (x_{i,j} - \mu_i)^2 \\ + y_{i,j} &= \text{weight}_j \cdot \frac{x_{i,j} - \mu_i}{\sqrt{\sigma_i^2 + \varepsilon}} + \text{bias}_j + \end{align} + \] +

+ +

Implementation Requirements

+ + +

Example

+

+Input:
+\(\text{input}\) (N=2, C=4): +\[ +\begin{bmatrix} +1.0 & 2.0 & 3.0 & 4.0 \\ +-1.0 & 0.0 & 0.0 & 1.0 +\end{bmatrix} +\] +\(\text{weight}\): +\[ +\begin{bmatrix} +1.0 & 1.0 & 1.0 & 1.0 +\end{bmatrix} +\] +\(\text{bias}\): +\[ +\begin{bmatrix} +0.0 & 0.0 & 0.0 & 0.0 +\end{bmatrix} +\] +\(\varepsilon\) = 1e-5

+Output:
+\(\text{output}\) (N=2, C=4): +\[ +\begin{bmatrix} +-1.3416 & -0.4472 & 0.4472 & 1.3416 \\ +-1.4142 & 0.0 & 0.0 & 1.4142 +\end{bmatrix} +\] +

+ +

Constraints

+ diff --git a/challenges/medium/74_layer_normalization/challenge.py b/challenges/medium/74_layer_normalization/challenge.py new file mode 100644 index 00000000..025b843f --- /dev/null +++ b/challenges/medium/74_layer_normalization/challenge.py @@ -0,0 +1,232 @@ +import ctypes +from typing import Any, Dict, List + +import torch +from core.challenge_base import ChallengeBase + + +class Challenge(ChallengeBase): + def __init__(self): + super().__init__( + name="Layer Normalization", atol=1e-04, rtol=1e-04, num_gpus=1, access_tier="free" + ) + + def reference_impl( + self, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + output: torch.Tensor, + N: int, + C: int, + eps: float, + ): + assert input.shape == output.shape == (N, C) + assert weight.shape == bias.shape == (C,) + assert input.dtype == weight.dtype == bias.dtype == output.dtype + assert input.device == weight.device == bias.device == output.device + assert str(input.device).startswith("cuda") + + mean = input.mean(dim=1, keepdim=True) + var = input.var(dim=1, keepdim=True, unbiased=False) + normalized = (input - mean) / torch.sqrt(var + eps) + output.copy_(weight * normalized + bias) + + def get_solve_signature(self) -> Dict[str, tuple]: + return { + "input": (ctypes.POINTER(ctypes.c_float), "in"), + "weight": (ctypes.POINTER(ctypes.c_float), "in"), + "bias": (ctypes.POINTER(ctypes.c_float), "in"), + "output": (ctypes.POINTER(ctypes.c_float), "out"), + "N": (ctypes.c_int, "in"), + "C": (ctypes.c_int, "in"), + "eps": (ctypes.c_float, "in"), + } + + def generate_example_test(self) -> Dict[str, Any]: + dtype = torch.float32 + N, C = 2, 4 + input = torch.tensor( + [[1.0, 2.0, 3.0, 4.0], [-1.0, 0.0, 0.0, 1.0]], device="cuda", dtype=dtype + ) + weight = torch.ones(C, device="cuda", dtype=dtype) + bias = torch.zeros(C, device="cuda", dtype=dtype) + output = torch.empty((N, C), device="cuda", dtype=dtype) + eps = 1e-5 + return { + "input": input, + "weight": weight, + "bias": bias, + "output": 
output, + "N": N, + "C": C, + "eps": eps, + } + + def generate_functional_test(self) -> List[Dict[str, Any]]: + dtype = torch.float32 + tests = [] + + # edge: single element per row + N, C = 1, 1 + tests.append( + { + "input": torch.tensor([[3.0]], device="cuda", dtype=dtype), + "weight": torch.tensor([1.0], device="cuda", dtype=dtype), + "bias": torch.tensor([0.5], device="cuda", dtype=dtype), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # edge: 2x2, all zeros + N, C = 2, 2 + tests.append( + { + "input": torch.zeros((N, C), device="cuda", dtype=dtype), + "weight": torch.ones(C, device="cuda", dtype=dtype), + "bias": torch.zeros(C, device="cuda", dtype=dtype), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # edge: 4x4, negative values + N, C = 4, 4 + tests.append( + { + "input": torch.tensor( + [ + [-1.0, -2.0, -3.0, -4.0], + [1.0, 2.0, 3.0, 4.0], + [0.0, 0.0, 0.0, 0.0], + [-2.0, 0.0, 2.0, 4.0], + ], + device="cuda", + dtype=dtype, + ), + "weight": torch.tensor([1.0, 2.0, 1.0, 0.5], device="cuda", dtype=dtype), + "bias": torch.tensor([0.0, 0.0, 1.0, -1.0], device="cuda", dtype=dtype), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # power-of-2: 8x16 + N, C = 8, 16 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 5.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0), + "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # power-of-2: 32x64 + N, C = 32, 64 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-10.0, 10.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0), + "bias": torch.empty(C, device="cuda", 
dtype=dtype).uniform_(-2.0, 2.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # power-of-2: 128x256 + N, C = 128, 256 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-10.0, 10.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0), + "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-2.0, 2.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # non-power-of-2: 7x30 + N, C = 7, 30 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 5.0), + "weight": torch.ones(C, device="cuda", dtype=dtype), + "bias": torch.zeros(C, device="cuda", dtype=dtype), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # non-power-of-2: 15x100 + N, C = 15, 100 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-100.0, 100.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.1, 3.0), + "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-5.0, 5.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # non-power-of-2: 25x255 + N, C = 25, 255 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-10.0, 10.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0), + "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + # realistic: 512x768 (BERT hidden size) + N, C = 512, 768 + tests.append( + { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 5.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0), + "bias": torch.empty(C, device="cuda", 
dtype=dtype).uniform_(-1.0, 1.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } + ) + + return tests + + def generate_performance_test(self) -> Dict[str, Any]: + dtype = torch.float32 + N, C = 65536, 512 + return { + "input": torch.empty((N, C), device="cuda", dtype=dtype).uniform_(-5.0, 10.0), + "weight": torch.empty(C, device="cuda", dtype=dtype).uniform_(0.5, 2.0), + "bias": torch.empty(C, device="cuda", dtype=dtype).uniform_(-1.0, 1.0), + "output": torch.empty((N, C), device="cuda", dtype=dtype), + "N": N, + "C": C, + "eps": 1e-5, + } diff --git a/challenges/medium/74_layer_normalization/starter/starter.cu b/challenges/medium/74_layer_normalization/starter/starter.cu new file mode 100644 index 00000000..746181d4 --- /dev/null +++ b/challenges/medium/74_layer_normalization/starter/starter.cu @@ -0,0 +1,5 @@ +#include <cuda_runtime.h> + +// input, weight, bias, output are device pointers +extern "C" void solve(const float* input, const float* weight, const float* bias, float* output, + int N, int C, float eps) {} diff --git a/challenges/medium/74_layer_normalization/starter/starter.cute.py b/challenges/medium/74_layer_normalization/starter/starter.cute.py new file mode 100644 index 00000000..b76597d6 --- /dev/null +++ b/challenges/medium/74_layer_normalization/starter/starter.cute.py @@ -0,0 +1,16 @@ +import cutlass +import cutlass.cute as cute + + +# input, weight, bias, output are tensors on the GPU +@cute.jit +def solve( + input: cute.Tensor, + weight: cute.Tensor, + bias: cute.Tensor, + output: cute.Tensor, + N: cute.Int32, + C: cute.Int32, + eps: cute.Float32, +): + pass diff --git a/challenges/medium/74_layer_normalization/starter/starter.jax.py b/challenges/medium/74_layer_normalization/starter/starter.jax.py new file mode 100644 index 00000000..d4ae384d --- /dev/null +++ b/challenges/medium/74_layer_normalization/starter/starter.jax.py @@ -0,0 +1,11 @@ +import jax +import jax.numpy as jnp + + +# input, weight, bias are 
tensors on the GPU +@jax.jit +def solve( + input: jax.Array, weight: jax.Array, bias: jax.Array, N: int, C: int, eps: float +) -> jax.Array: + # return output tensor directly + pass diff --git a/challenges/medium/74_layer_normalization/starter/starter.mojo b/challenges/medium/74_layer_normalization/starter/starter.mojo new file mode 100644 index 00000000..4c948458 --- /dev/null +++ b/challenges/medium/74_layer_normalization/starter/starter.mojo @@ -0,0 +1,11 @@ +from gpu.host import DeviceContext +from gpu.id import block_dim, block_idx, thread_idx +from memory import UnsafePointer +from math import ceildiv + +# input, weight, bias, output are device pointers +@export +def solve(input: UnsafePointer[Float32], weight: UnsafePointer[Float32], + bias: UnsafePointer[Float32], output: UnsafePointer[Float32], + N: Int32, C: Int32, eps: Float32): + pass diff --git a/challenges/medium/74_layer_normalization/starter/starter.pytorch.py b/challenges/medium/74_layer_normalization/starter/starter.pytorch.py new file mode 100644 index 00000000..7bfa8d83 --- /dev/null +++ b/challenges/medium/74_layer_normalization/starter/starter.pytorch.py @@ -0,0 +1,14 @@ +import torch + + +# input, weight, bias, output are tensors on the GPU +def solve( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + output: torch.Tensor, + N: int, + C: int, + eps: float, +): + pass diff --git a/challenges/medium/74_layer_normalization/starter/starter.triton.py b/challenges/medium/74_layer_normalization/starter/starter.triton.py new file mode 100644 index 00000000..411638f1 --- /dev/null +++ b/challenges/medium/74_layer_normalization/starter/starter.triton.py @@ -0,0 +1,16 @@ +import torch +import triton +import triton.language as tl + + +# input, weight, bias, output are tensors on the GPU +def solve( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + output: torch.Tensor, + N: int, + C: int, + eps: float, +): + pass