diff --git a/challenges/medium/76_bilinear_image_scaling/challenge.html b/challenges/medium/76_bilinear_image_scaling/challenge.html new file mode 100644 index 00000000..cd1e5b85 --- /dev/null +++ b/challenges/medium/76_bilinear_image_scaling/challenge.html @@ -0,0 +1,48 @@ +

+ Given an input image of dimensions H × W (stored as 32-bit floats
+ in row-major order), produce a scaled output image of dimensions H_out ×
+ W_out using bilinear interpolation with the align-corners convention. For each output
+ pixel (i, j), compute its corresponding source coordinate in the input image and
+ blend the four surrounding input pixels using bilinear weights.
+

+ +

Implementation Requirements

+ + +

Example

+

+Input image (H = 2, W = 2):
+\[
+\begin{bmatrix}
+1.0 & 3.0 \\
+7.0 & 9.0
+\end{bmatrix}
+\]
+H_out = 3, W_out = 3

+Source coordinates (align-corners): \(\text{src\_y} = i \cdot \frac{H-1}{H_\text{out}-1}\), \(\text{src\_x} = j \cdot \frac{W-1}{W_\text{out}-1}\)
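+For output size 3 and input size 2 this gives a scale of \(\frac{2-1}{3-1} = 0.5\), so moving one output pixel advances the source coordinate by 0.5 input pixels. If an output dimension is 1 the ratio above is undefined (division by zero); use a scale of 0 for that axis, so the single output row or column samples source coordinate 0, which is how the PyTorch reference behaves.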

+Output (H_out = 3, W_out = 3):
+\[
+\begin{bmatrix}
+1.0 & 2.0 & 3.0 \\
+4.0 & 5.0 & 6.0 \\
+7.0 & 8.0 & 9.0
+\end{bmatrix}
+\]
+For example, output pixel \((1, 1)\): \(\text{src\_y} = 0.5\), \(\text{src\_x} = 0.5\), so each of the four neighbours gets weight \(0.5 \times 0.5 = 0.25\):
+\(0.25 \times (1.0 + 3.0 + 7.0 + 9.0) = 5.0\).
+

+ +

Constraints

class Challenge(ChallengeBase):
    """Bilinear image scaling of a single-channel float32 image.

    The reference result comes from ``torch.nn.functional.interpolate`` with
    ``mode="bilinear"`` and ``align_corners=True``.
    """

    def __init__(self):
        settings = dict(
            name="Bilinear Image Scaling",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=1,
            access_tier="free",
        )
        super().__init__(**settings)

    def reference_impl(
        self,
        image: torch.Tensor,
        output: torch.Tensor,
        H: int,
        W: int,
        H_out: int,
        W_out: int,
    ):
        """Resize ``image`` to ``(H_out, W_out)`` and copy the result into ``output``."""
        in_shape = (H, W)
        out_shape = (H_out, W_out)
        assert image.shape == in_shape, f"Expected image.shape=({H},{W}), got {image.shape}"
        assert (
            output.shape == out_shape
        ), f"Expected output.shape=({H_out},{W_out}), got {output.shape}"
        assert image.dtype == torch.float32
        assert output.dtype == torch.float32
        assert image.device.type == "cuda"

        # interpolate() is called on a 4-D tensor, so add two leading
        # singleton dimensions: (H, W) -> (1, 1, H, W).
        batched = image[None, None]
        resized = torch.nn.functional.interpolate(
            batched, size=out_shape, mode="bilinear", align_corners=True
        )
        # Drop the two leading singleton dimensions again before copying.
        output.copy_(resized[0, 0])

    def get_solve_signature(self) -> Dict[str, tuple]:
        """Return the ctypes type and direction of every ``solve`` argument."""
        float_ptr = ctypes.POINTER(ctypes.c_float)
        signature: Dict[str, tuple] = {
            "image": (float_ptr, "in"),
            "output": (float_ptr, "out"),
        }
        for dim_name in ("H", "W", "H_out", "W_out"):
            signature[dim_name] = (ctypes.c_int, "in")
        return signature

    @staticmethod
    def _uniform_image(height: int, width: int, low: float, high: float) -> torch.Tensor:
        """Return a float32 CUDA image filled in place by ``uniform_(low, high)``."""
        blank = torch.empty((height, width), device="cuda", dtype=torch.float32)
        return blank.uniform_(low, high)

    @staticmethod
    def _scaling_case(image: torch.Tensor, H_out: int, W_out: int) -> Dict[str, Any]:
        """Bundle ``image`` with an uninitialised output buffer and all four sizes."""
        in_h, in_w = image.shape
        return {
            "image": image,
            "output": torch.empty((H_out, W_out), device="cuda", dtype=torch.float32),
            "H": in_h,
            "W": in_w,
            "H_out": H_out,
            "W_out": W_out,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Return the 2x2 -> 3x3 example case."""
        corners = torch.tensor(
            [[1.0, 3.0], [7.0, 9.0]],
            device="cuda",
            dtype=torch.float32,
        )
        return self._scaling_case(corners, 3, 3)

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Return the functional cases, from a single pixel up to 1024x1024 -> 2048x2048."""
        f32 = torch.float32

        def literal(rows):
            return torch.tensor(rows, device="cuda", dtype=f32)

        def zeros(height, width):
            return torch.zeros((height, width), device="cuda", dtype=f32)

        case = self._scaling_case
        rand = self._uniform_image

        # List elements are evaluated top to bottom, so the random images are
        # drawn in the order they appear here.
        return [
            # Edge cases
            case(literal([[5.0]]), 1, 1),  # single pixel, nothing to interpolate
            case(literal([[-1.0, 2.0], [3.0, -4.0]]), 2, 2),  # same size in and out
            case(zeros(3, 3), 5, 5),  # small upsampling of an all-zero image
            case(rand(4, 4, -5.0, 5.0), 4, 12),  # only the width changes (3x)
            # Power-of-two sizes
            case(zeros(16, 16), 32, 32),  # all zeros
            case(rand(64, 64, -10.0, 0.0), 128, 128),  # drawn from (-10, 0)
            case(rand(256, 256, -1.0, 1.0), 512, 512),  # drawn from (-1, 1)
            # Non-power-of-two sizes
            case(rand(30, 40, -3.0, 3.0), 60, 80),
            case(rand(100, 150, -2.0, 2.0), 255, 400),  # non-integer scale factors
            # Realistic size
            case(rand(1024, 1024, -5.0, 5.0), 2048, 2048),
        ]

    def generate_performance_test(self) -> Dict[str, Any]:
        """Return the 4096x4096 -> 8192x8192 performance case."""
        side_in = 4096
        side_out = 8192
        source = self._uniform_image(side_in, side_in, -1.0, 1.0)
        return self._scaling_case(source, side_out, side_out)
H_out: cute.Int32, + W_out: cute.Int32, +): + pass diff --git a/challenges/medium/76_bilinear_image_scaling/starter/starter.jax.py b/challenges/medium/76_bilinear_image_scaling/starter/starter.jax.py new file mode 100644 index 00000000..1960179b --- /dev/null +++ b/challenges/medium/76_bilinear_image_scaling/starter/starter.jax.py @@ -0,0 +1,9 @@ +import jax +import jax.numpy as jnp + + +# image is a tensor on the GPU +@jax.jit +def solve(image: jax.Array, H: int, W: int, H_out: int, W_out: int) -> jax.Array: + # return output tensor directly + pass diff --git a/challenges/medium/76_bilinear_image_scaling/starter/starter.mojo b/challenges/medium/76_bilinear_image_scaling/starter/starter.mojo new file mode 100644 index 00000000..bd694c62 --- /dev/null +++ b/challenges/medium/76_bilinear_image_scaling/starter/starter.mojo @@ -0,0 +1,16 @@ +from gpu.host import DeviceContext +from gpu.id import block_dim, block_idx, thread_idx +from memory import UnsafePointer +from math import ceildiv + +# image, output are device pointers +@export +def solve( + image: UnsafePointer[Float32], + output: UnsafePointer[Float32], + H: Int32, + W: Int32, + H_out: Int32, + W_out: Int32, +): + pass diff --git a/challenges/medium/76_bilinear_image_scaling/starter/starter.pytorch.py b/challenges/medium/76_bilinear_image_scaling/starter/starter.pytorch.py new file mode 100644 index 00000000..50610f09 --- /dev/null +++ b/challenges/medium/76_bilinear_image_scaling/starter/starter.pytorch.py @@ -0,0 +1,6 @@ +import torch + + +# image, output are tensors on the GPU +def solve(image: torch.Tensor, output: torch.Tensor, H: int, W: int, H_out: int, W_out: int): + pass diff --git a/challenges/medium/76_bilinear_image_scaling/starter/starter.triton.py b/challenges/medium/76_bilinear_image_scaling/starter/starter.triton.py new file mode 100644 index 00000000..917d2689 --- /dev/null +++ b/challenges/medium/76_bilinear_image_scaling/starter/starter.triton.py @@ -0,0 +1,8 @@ +import torch +import triton 
+import triton.language as tl + + +# image, output are tensors on the GPU +def solve(image: torch.Tensor, output: torch.Tensor, H: int, W: int, H_out: int, W_out: int): + pass