Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions challenges/medium/74_layer_normalization/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<p>
Implement the forward pass of layer normalization for a 2D input tensor. Given an input tensor of shape [N, C] where N is the batch size and C is the number of features, normalize each sample independently across its C features, then apply learnable scale (<code>weight</code>) and shift (<code>bias</code>) parameters. Layer normalization is a core building block of transformer architectures.
</p>

<p>
For each sample \(i\), layer normalization computes:
\[
\begin{align}
\mu_i &= \frac{1}{C} \sum_{j=0}^{C-1} x_{i,j} \\
\sigma_i^2 &= \frac{1}{C} \sum_{j=0}^{C-1} (x_{i,j} - \mu_i)^2 \\
y_{i,j} &= \text{weight}_j \cdot \frac{x_{i,j} - \mu_i}{\sqrt{\sigma_i^2 + \varepsilon}} + \text{bias}_j
\end{align}
\]
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only the native features of your chosen language or framework (additional external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the <code>output</code> tensor</li>
</ul>

<h2>Example</h2>
<p>
Input:<br>
\(\text{input}\) (N=2, C=4):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 & 4.0 \\
-1.0 & 0.0 & 0.0 & 1.0
\end{bmatrix}
\]
\(\text{weight}\):
\[
\begin{bmatrix}
1.0 & 1.0 & 1.0 & 1.0
\end{bmatrix}
\]
\(\text{bias}\):
\[
\begin{bmatrix}
0.0 & 0.0 & 0.0 & 0.0
\end{bmatrix}
\]
\(\varepsilon\) = 1e-5<br><br>
Output:<br>
\(\text{output}\) (N=2, C=4):
\[
\begin{bmatrix}
-1.3416 & -0.4472 & 0.4472 & 1.3416 \\
-1.4142 & 0.0 & 0.0 & 1.4142
\end{bmatrix}
\]
</p>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>N</code> &le; 65,536</li>
<li>1 &le; <code>C</code> &le; 4,096</li>
<li><code>eps</code> = 1e-5</li>
<li>Input values are in the range [-100.0, 100.0]</li>
<li>Weight values are in the range [0.1, 10.0]</li>
<li>Bias values are in the range [-10.0, 10.0]</li>
<li>Performance is measured with <code>N</code> = 65,536, <code>C</code> = 512</li>
</ul>
232 changes: 232 additions & 0 deletions challenges/medium/74_layer_normalization/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    """Layer Normalization challenge.

    Contestants must normalize each row of an [N, C] float32 tensor across its
    C features (per-sample mean/variance), then apply a per-feature scale
    (``weight``) and shift (``bias``).
    """

    def __init__(self):
        super().__init__(
            name="Layer Normalization", atol=1e-04, rtol=1e-04, num_gpus=1, access_tier="free"
        )

    def reference_impl(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        output: torch.Tensor,
        N: int,
        C: int,
        eps: float,
    ):
        """Ground truth: y = weight * (x - mean) / sqrt(var + eps) + bias.

        Mean and variance are computed per row (over the C feature dimension)
        and the result is written into ``output`` in place.
        """
        assert input.shape == output.shape == (N, C)
        assert weight.shape == bias.shape == (C,)
        assert input.dtype == weight.dtype == bias.dtype == output.dtype
        assert input.device == weight.device == bias.device == output.device
        assert str(input.device).startswith("cuda")

        mean = input.mean(dim=1, keepdim=True)
        # unbiased=False: population variance (divide by C), matching the
        # layer-norm definition in the challenge statement.
        var = input.var(dim=1, keepdim=True, unbiased=False)
        normalized = (input - mean) / torch.sqrt(var + eps)
        output.copy_(weight * normalized + bias)

    def get_solve_signature(self) -> Dict[str, tuple]:
        """C ABI of the contestant's ``solve`` entry point."""
        return {
            "input": (ctypes.POINTER(ctypes.c_float), "in"),
            "weight": (ctypes.POINTER(ctypes.c_float), "in"),
            "bias": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "N": (ctypes.c_int, "in"),
            "C": (ctypes.c_int, "in"),
            "eps": (ctypes.c_float, "in"),
        }

    @staticmethod
    def _make_case(
        input: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        eps: float = 1e-5,
    ) -> Dict[str, Any]:
        """Pack one test case, allocating an uninitialized output buffer.

        N and C are derived from ``input``'s shape so they can never drift
        out of sync with the tensors.
        """
        N, C = input.shape
        return {
            "input": input,
            "weight": weight,
            "bias": bias,
            "output": torch.empty((N, C), device="cuda", dtype=input.dtype),
            "N": N,
            "C": C,
            "eps": eps,
        }

    @classmethod
    def _uniform_case(
        cls,
        N: int,
        C: int,
        in_range: tuple,
        w_range: tuple = (0.5, 2.0),
        b_range: tuple = (-1.0, 1.0),
    ) -> Dict[str, Any]:
        """Build a randomized case with uniformly sampled input/weight/bias."""
        dtype = torch.float32
        return cls._make_case(
            torch.empty((N, C), device="cuda", dtype=dtype).uniform_(*in_range),
            torch.empty(C, device="cuda", dtype=dtype).uniform_(*w_range),
            torch.empty(C, device="cuda", dtype=dtype).uniform_(*b_range),
        )

    def generate_example_test(self) -> Dict[str, Any]:
        """The worked example from the challenge statement (N=2, C=4)."""
        dtype = torch.float32
        return self._make_case(
            torch.tensor(
                [[1.0, 2.0, 3.0, 4.0], [-1.0, 0.0, 0.0, 1.0]], device="cuda", dtype=dtype
            ),
            torch.ones(4, device="cuda", dtype=dtype),
            torch.zeros(4, device="cuda", dtype=dtype),
        )

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Edge cases, power-of-2 sizes, odd sizes, and one realistic size."""
        dtype = torch.float32
        tests = []

        # edge: single element per row
        tests.append(
            self._make_case(
                torch.tensor([[3.0]], device="cuda", dtype=dtype),
                torch.tensor([1.0], device="cuda", dtype=dtype),
                torch.tensor([0.5], device="cuda", dtype=dtype),
            )
        )

        # edge: 2x2, all zeros (variance is zero; eps keeps the divide finite)
        tests.append(
            self._make_case(
                torch.zeros((2, 2), device="cuda", dtype=dtype),
                torch.ones(2, device="cuda", dtype=dtype),
                torch.zeros(2, device="cuda", dtype=dtype),
            )
        )

        # edge: 4x4, negative values and a constant row
        tests.append(
            self._make_case(
                torch.tensor(
                    [
                        [-1.0, -2.0, -3.0, -4.0],
                        [1.0, 2.0, 3.0, 4.0],
                        [0.0, 0.0, 0.0, 0.0],
                        [-2.0, 0.0, 2.0, 4.0],
                    ],
                    device="cuda",
                    dtype=dtype,
                ),
                torch.tensor([1.0, 2.0, 1.0, 0.5], device="cuda", dtype=dtype),
                torch.tensor([0.0, 0.0, 1.0, -1.0], device="cuda", dtype=dtype),
            )
        )

        # power-of-2 shapes
        tests.append(self._uniform_case(8, 16, (-5.0, 5.0)))
        tests.append(self._uniform_case(32, 64, (-10.0, 10.0), b_range=(-2.0, 2.0)))
        tests.append(self._uniform_case(128, 256, (-10.0, 10.0), b_range=(-2.0, 2.0)))

        # non-power-of-2: 7x30 with identity weight/bias
        tests.append(
            self._make_case(
                torch.empty((7, 30), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
                torch.ones(30, device="cuda", dtype=dtype),
                torch.zeros(30, device="cuda", dtype=dtype),
            )
        )

        # non-power-of-2 shapes; 15x100 stresses the full stated input range
        tests.append(
            self._uniform_case(15, 100, (-100.0, 100.0), w_range=(0.1, 3.0), b_range=(-5.0, 5.0))
        )
        tests.append(self._uniform_case(25, 255, (-10.0, 10.0)))

        # realistic: 512x768 (BERT hidden size)
        tests.append(self._uniform_case(512, 768, (-5.0, 5.0)))

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        """Timed configuration: N=65,536, C=512, as advertised in the statement."""
        return self._uniform_case(65536, 512, (-5.0, 10.0))
5 changes: 5 additions & 0 deletions challenges/medium/74_layer_normalization/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include <cuda_runtime.h>

// input, weight, bias, output are device pointers
// Starter stub for contestants. Expected behavior: for each of the N rows of
// length C, compute the row mean and (population) variance, then write
//   output[i*C + j] = weight[j] * (input[i*C + j] - mean_i) / sqrt(var_i + eps) + bias[j]
extern "C" void solve(const float* input, const float* weight, const float* bias, float* output,
                      int N, int C, float eps) {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import cutlass
import cutlass.cute as cute


# input, weight, bias, output are tensors on the GPU
@cute.jit
def solve(
    input: cute.Tensor,
    weight: cute.Tensor,
    bias: cute.Tensor,
    output: cute.Tensor,
    N: cute.Int32,
    C: cute.Int32,
    eps: cute.Float32,
):
    """Starter stub: layer-normalize each of the N rows of ``input`` (length C)
    and write ``weight * normalized + bias`` into ``output``.
    """
    pass
11 changes: 11 additions & 0 deletions challenges/medium/74_layer_normalization/starter/starter.jax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import jax
import jax.numpy as jnp


# input, weight, bias are tensors on the GPU
@jax.jit
def solve(
    input: jax.Array, weight: jax.Array, bias: jax.Array, N: int, C: int, eps: float
) -> jax.Array:
    """Starter stub: return the layer-normalized [N, C] result.

    NOTE: unlike the other starters, the JAX variant takes no ``output``
    argument — the normalized tensor is returned directly.
    """
    # return output tensor directly
    pass
11 changes: 11 additions & 0 deletions challenges/medium/74_layer_normalization/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# input, weight, bias, output are device pointers
# Starter stub: layer-normalize each of the N rows (length C) of `input`
# into `output`, applying per-feature `weight` and `bias`.
@export
def solve(input: UnsafePointer[Float32], weight: UnsafePointer[Float32],
          bias: UnsafePointer[Float32], output: UnsafePointer[Float32],
          N: Int32, C: Int32, eps: Float32):
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import torch


# input, weight, bias, output are tensors on the GPU
def solve(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    output: torch.Tensor,
    N: int,
    C: int,
    eps: float,
):
    """Starter stub: layer-normalize each row of the [N, C] ``input`` across
    its C features and store ``weight * normalized + bias`` in ``output``.
    """
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import torch
import triton
import triton.language as tl


# input, weight, bias, output are tensors on the GPU
def solve(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    output: torch.Tensor,
    N: int,
    C: int,
    eps: float,
):
    """Starter stub: launch a Triton kernel that layer-normalizes each row of
    the [N, C] ``input`` and stores ``weight * normalized + bias`` in ``output``.
    """
    pass