From 4ae659dc55e98a8fa54670cf7670c97d06a7ae94 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 24 Feb 2026 09:59:13 +0000 Subject: [PATCH] Add challenge 73: Bilateral Filter (Medium) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new medium-difficulty challenge implementing the bilateral filter — an edge-preserving image smoothing operation where per-pixel weights combine a spatial Gaussian and an intensity-range Gaussian, preventing blurring across sharp edges. This teaches GPU programmers about data-dependent stencil weights, 2D halo memory access patterns, and per-pixel normalization — concepts not covered by any existing challenge (Gaussian blur uses fixed precomputed kernels). Co-Authored-By: Claude Sonnet 4.6 --- .../medium/73_bilateral_filter/challenge.html | 64 ++++ .../medium/73_bilateral_filter/challenge.py | 273 ++++++++++++++++++ .../73_bilateral_filter/starter/starter.cu | 5 + .../starter/starter.cute.py | 16 + .../starter/starter.jax.py | 11 + .../73_bilateral_filter/starter/starter.mojo | 18 ++ .../starter/starter.pytorch.py | 14 + .../starter/starter.triton.py | 16 + 8 files changed, 417 insertions(+) create mode 100644 challenges/medium/73_bilateral_filter/challenge.html create mode 100644 challenges/medium/73_bilateral_filter/challenge.py create mode 100644 challenges/medium/73_bilateral_filter/starter/starter.cu create mode 100644 challenges/medium/73_bilateral_filter/starter/starter.cute.py create mode 100644 challenges/medium/73_bilateral_filter/starter/starter.jax.py create mode 100644 challenges/medium/73_bilateral_filter/starter/starter.mojo create mode 100644 challenges/medium/73_bilateral_filter/starter/starter.pytorch.py create mode 100644 challenges/medium/73_bilateral_filter/starter/starter.triton.py diff --git a/challenges/medium/73_bilateral_filter/challenge.html b/challenges/medium/73_bilateral_filter/challenge.html new file mode 
100644 index 00000000..861fd4b8 --- /dev/null +++ b/challenges/medium/73_bilateral_filter/challenge.html @@ -0,0 +1,64 @@ +

+ Implement a bilateral filter on a 2D image of 32-bit floating point values. + The bilateral filter is an edge-preserving smoothing filter: for each pixel it computes a + weighted average of its neighbors, where the weight combines a spatial Gaussian (favoring + nearby pixels) and a range Gaussian (favoring pixels with similar intensity), so that sharp + edges are preserved while flat regions are smoothed. + The image is stored in row-major order and boundary pixels are handled by clamping to the + nearest valid index (border replication). +

+ +

Implementation Requirements

+ + +

Example

+

+ Input image (\(H=3, W=3\)), spatial_sigma = 1.0,
+ range_sigma = 0.5, radius = 1:
+ \[
+ \begin{bmatrix}
+ 1.0 & 1.0 & 1.0 \\
+ 1.0 & 0.0 & 1.0 \\
+ 1.0 & 1.0 & 1.0
+ \end{bmatrix}
+ \]
+ Output:
+ \[
+ \begin{bmatrix}
+ 0.9891 & 0.9812 & 0.9891 \\
+ 0.9812 & 0.3453 & 0.9812 \\
+ 0.9891 & 0.9812 & 0.9891
+ \end{bmatrix}
+ \]
+ The center pixel (value 0.0) is surrounded by neighbors all equal to 1.0. Because the
+ intensity difference of 1.0 is large relative to range_sigma = 0.5, the range
+ weights strongly suppress those neighbors, so the output at the center (0.3453) is far lower
+ than a plain Gaussian blur would produce (≈ 0.80). The outer pixels remain close to
+ 1.0 because their neighbors are mostly equal to themselves.
+

+ +

Constraints

+
diff --git a/challenges/medium/73_bilateral_filter/challenge.py b/challenges/medium/73_bilateral_filter/challenge.py
new file mode 100644
index 00000000..ac1cf05d
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/challenge.py
@@ -0,0 +1,301 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+import torch.nn.functional as F
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    """Bilateral filter challenge: reference implementation plus test-case generators."""
+
+    def __init__(self):
+        """Pass the challenge name, atol/rtol, GPU count and access tier to the base class."""
+        super().__init__(
+            name="Bilateral Filter",
+            atol=1e-04,
+            rtol=1e-04,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(
+        self,
+        image: torch.Tensor,
+        output: torch.Tensor,
+        H: int,
+        W: int,
+        spatial_sigma: float,
+        range_sigma: float,
+        radius: int,
+    ):
+        """Write the bilateral-filtered ``image`` into ``output``.
+
+        Each neighbor inside the ``(2 * radius + 1)``-wide square window is weighted by a
+        spatial Gaussian of its offset times a range Gaussian of its intensity difference
+        to the center pixel; the weighted sum is divided by the sum of the weights.
+        Neighbors outside the image replicate the nearest border pixel.
+
+        Args:
+            image: flat float32 CUDA tensor of ``H * W`` values, viewed as ``(H, W)``.
+            output: flat float32 CUDA tensor of ``H * W`` values; overwritten in place.
+            H: image height in pixels.
+            W: image width in pixels.
+            spatial_sigma: standard deviation of the spatial Gaussian.
+            range_sigma: standard deviation of the range (intensity) Gaussian.
+            radius: half-width of the neighborhood window.
+
+        Raises:
+            AssertionError: if a tensor has the wrong shape, dtype or device.
+        """
+        assert image.shape == (H * W,)
+        assert output.shape == (H * W,)
+        assert image.dtype == torch.float32
+        assert output.dtype == torch.float32
+        assert image.device.type == "cuda"
+        assert output.device.type == "cuda"
+
+        r = int(radius)
+        img = image.view(H, W)
+
+        # The spatial Gaussian depends only on the window offset, so build it once.
+        yy = torch.arange(-r, r + 1, device=image.device, dtype=torch.float32)
+        xx = torch.arange(-r, r + 1, device=image.device, dtype=torch.float32)
+        grid_y, grid_x = torch.meshgrid(yy, xx, indexing="ij")
+        spatial_weights = torch.exp(-(grid_y**2 + grid_x**2) / (2.0 * float(spatial_sigma) ** 2))
+
+        # Replicate-pad a temporary (1, 1, H, W) view, then drop the two leading dims again.
+        padded = (
+            F.pad(img.unsqueeze(0).unsqueeze(0), (r, r, r, r), mode="replicate")
+            .squeeze(0)
+            .squeeze(0)
+        )
+
+        out = torch.zeros(H, W, device=image.device, dtype=torch.float32)
+        norm = torch.zeros(H, W, device=image.device, dtype=torch.float32)
+        inv_2rs2 = 1.0 / (2.0 * float(range_sigma) ** 2)
+
+        # Each (dy, dx) slice of `padded` is the whole image shifted by one window offset.
+        for dy in range(2 * r + 1):
+            for dx in range(2 * r + 1):
+                neighbor = padded[dy : dy + H, dx : dx + W]
+                range_weight = torch.exp(-((neighbor - img) ** 2) * inv_2rs2)
+                weight = spatial_weights[dy, dx] * range_weight
+                out += weight * neighbor
+                norm += weight
+
+        output.copy_(out.view(-1) / norm.view(-1))
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        """Map each ``solve`` argument name to its ``(ctypes type, direction)`` pair."""
+        return {
+            "image": (ctypes.POINTER(ctypes.c_float), "in"),
+            "output": (ctypes.POINTER(ctypes.c_float), "out"),
+            "H": (ctypes.c_int, "in"),
+            "W": (ctypes.c_int, "in"),
+            "spatial_sigma": (ctypes.c_float, "in"),
+            "range_sigma": (ctypes.c_float, "in"),
+            "radius": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        """Return the 3x3 case of ones around a single zero center pixel."""
+        dtype = torch.float32
+        H, W = 3, 3
+        image = torch.tensor(
+            [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0], device="cuda", dtype=dtype
+        )
+        output = torch.zeros(H * W, device="cuda", dtype=dtype)
+        return {
+            "image": image,
+            "output": output,
+            "H": H,
+            "W": W,
+            "spatial_sigma": 1.0,
+            "range_sigma": 0.5,
+            "radius": 1,
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        """Return ten cases from 1x1 up to 1000x1000, six of them with random images."""
+        dtype = torch.float32
+        device = "cuda"
+        tests = []
+
+        # single_pixel
+        H, W = 1, 1
+        tests.append(
+            {
+                "image": torch.tensor([0.5], device=device, dtype=dtype),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 1.0,
+                "range_sigma": 0.5,
+                "radius": 1,
+            }
+        )
+
+        # two_by_two_zeros
+        H, W = 2, 2
+        tests.append(
+            {
+                "image": torch.zeros(H * W, device=device, dtype=dtype),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 1.0,
+                "range_sigma": 0.5,
+                "radius": 1,
+            }
+        )
+
+        # three_by_three_ring (matches example)
+        H, W = 3, 3
+        tests.append(
+            {
+                "image": torch.tensor(
+                    [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0], device=device, dtype=dtype
+                ),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 1.0,
+                "range_sigma": 0.5,
+                "radius": 1,
+            }
+        )
+
+        # four_by_four_negatives
+        H, W = 4, 4
+        tests.append(
+            {
+                "image": torch.tensor(
+                    [
+                        -1.0,
+                        -1.0,
+                        1.0,
+                        1.0,
+                        -1.0,
+                        -1.0,
+                        1.0,
+                        1.0,
+                        -1.0,
+                        -1.0,
+                        1.0,
+                        1.0,
+                        -1.0,
+                        -1.0,
+                        1.0,
+                        1.0,
+                    ],
+                    device=device,
+                    dtype=dtype,
+                ),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 1.5,
+                "range_sigma": 0.8,
+                "radius": 1,
+            }
+        )
+
+        # power_of_two_16x16
+        H, W = 16, 16
+        tests.append(
+            {
+                "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 1.5,
+                "range_sigma": 0.3,
+                "radius": 2,
+            }
+        )
+
+        # power_of_two_64x64_mixed
+        H, W = 64, 64
+        tests.append(
+            {
+                "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(-1.0, 1.0),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 2.0,
+                "range_sigma": 0.5,
+                "radius": 2,
+            }
+        )
+
+        # non_power_of_two_100x100
+        H, W = 100, 100
+        tests.append(
+            {
+                "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 2.0,
+                "range_sigma": 0.3,
+                "radius": 3,
+            }
+        )
+
+        # non_power_of_two_255x255_mixed
+        H, W = 255, 255
+        tests.append(
+            {
+                "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(-1.0, 1.0),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 1.5,
+                "range_sigma": 0.4,
+                "radius": 2,
+            }
+        )
+
+        # realistic_512x512
+        H, W = 512, 512
+        tests.append(
+            {
+                "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 2.0,
+                "range_sigma": 0.2,
+                "radius": 3,
+            }
+        )
+
+        # realistic_1000x1000
+        H, W = 1000, 1000
+        tests.append(
+            {
+                "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+                "output": torch.zeros(H * W, device=device, dtype=dtype),
+                "H": H,
+                "W": W,
+                "spatial_sigma": 3.0,
+                "range_sigma": 0.1,
+                "radius": 5,
+            }
+        )
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        """Return a 2048x2048 uniform-random image to be filtered with radius 5."""
+        dtype = torch.float32
+        H, W = 2048, 2048
+        return {
+            "image": torch.empty(H * W, device="cuda", dtype=dtype).uniform_(0.0, 1.0),
+            "output": torch.zeros(H * W, device="cuda", dtype=dtype),
+            "H": H,
+            "W": W,
+            "spatial_sigma": 3.0,
+            "range_sigma": 0.1,
+            "radius": 5,
+        }
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.cu b/challenges/medium/73_bilateral_filter/starter/starter.cu
new file mode 100644
index 00000000..76760898
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.cu
@@ -0,0 +1,5 @@
+#include <cuda_runtime.h>
+
+// image, output are device pointers
+extern "C" void solve(const float* image, float* output, int H, int W, float spatial_sigma,
+                      float range_sigma, int radius) {}
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.cute.py b/challenges/medium/73_bilateral_filter/starter/starter.cute.py
new file mode 100644
index 00000000..ef93ace3
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.cute.py
@@ -0,0 +1,16 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# image, output are tensors on the GPU
+@cute.jit
+def solve(
+    image: cute.Tensor,
+    output: cute.Tensor,
+    H: cute.Int32,
+    W: cute.Int32,
+    spatial_sigma: cute.Float32,
+    range_sigma: cute.Float32,
+    radius: cute.Int32,
+):
+    pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.jax.py b/challenges/medium/73_bilateral_filter/starter/starter.jax.py
new file mode 100644
index 00000000..3869e954
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.jax.py
@@ -0,0 +1,11 @@
+import jax
+import jax.numpy as jnp
+
+
+# image is a tensor on GPU
+@jax.jit
+def solve(
+    image: jax.Array, H: int, W: int, spatial_sigma: float, range_sigma: float, radius: int
+) -> jax.Array:
+    # return output tensor directly
+    pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.mojo b/challenges/medium/73_bilateral_filter/starter/starter.mojo
new file mode 100644
index 00000000..ad6b1a11
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.mojo
@@ -0,0 +1,18 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+
+# image, output are device pointers
+@export
+def solve(
+    image: UnsafePointer[Float32],
+    output: UnsafePointer[Float32],
+    H: Int32,
+    W: Int32,
+    spatial_sigma: Float32,
+    range_sigma: Float32,
+    radius: Int32,
+):
+    pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.pytorch.py b/challenges/medium/73_bilateral_filter/starter/starter.pytorch.py
new file mode 100644
index 00000000..363a237b
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.pytorch.py
@@ -0,0 +1,14 @@
+import torch
+
+
+# image, output are tensors on the GPU
+def solve(
+    image: torch.Tensor,
+    output: torch.Tensor,
+    H: int,
+    W: int,
+    spatial_sigma: float,
+    range_sigma: float,
+    radius: int,
+):
+    pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.triton.py b/challenges/medium/73_bilateral_filter/starter/starter.triton.py
new file mode 100644
index 00000000..84bc0f46
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.triton.py
@@ -0,0 +1,16 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# image, output are tensors on the GPU
+def solve(
+    image: torch.Tensor,
+    output: torch.Tensor,
+    H: int,
+    W: int,
+    spatial_sigma: float,
+    range_sigma: float,
+    radius: int,
+):
+    pass