From 733272790ee924b4e24398af997c8ffbd5a3fadf Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Thu, 19 Feb 2026 10:02:26 +0000
Subject: [PATCH] Add challenge 70: Merge Sorted Arrays (Medium)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces a parallel merge challenge requiring solvers to merge two sorted
float32 arrays into a single sorted output. Unlike a trivial sequential merge,
an efficient GPU implementation must use the merge path (co-ranking)
technique — each thread independently binary-searches to determine which
slice of A and B it is responsible for, then merges locally without serial
dependencies.

Key GPU concepts exercised: independent work decomposition, binary search on
GPU, merge path algorithm, memory access patterns.

Includes 13 functional tests (edge cases, powers-of-2, non-powers, negative
values, non-overlapping ranges, realistic sizes) and a performance test at
M = N = 10,000,000.

Co-Authored-By: Claude Sonnet 4.6
---
 .../70_merge_sorted_arrays/challenge.html     |  43 ++++
 .../70_merge_sorted_arrays/challenge.py       | 218 ++++++++++++++++++
 .../70_merge_sorted_arrays/starter/starter.cu |   4 +
 .../starter/starter.cute.py                   |   8 +
 .../starter/starter.jax.py                    |   9 +
 .../starter/starter.mojo                      |   9 +
 .../starter/starter.pytorch.py                |   6 +
 .../starter/starter.triton.py                 |   8 +
 8 files changed, 305 insertions(+)
 create mode 100644 challenges/medium/70_merge_sorted_arrays/challenge.html
 create mode 100644 challenges/medium/70_merge_sorted_arrays/challenge.py
 create mode 100644 challenges/medium/70_merge_sorted_arrays/starter/starter.cu
 create mode 100644 challenges/medium/70_merge_sorted_arrays/starter/starter.cute.py
 create mode 100644 challenges/medium/70_merge_sorted_arrays/starter/starter.jax.py
 create mode 100644 challenges/medium/70_merge_sorted_arrays/starter/starter.mojo
 create mode 100644 challenges/medium/70_merge_sorted_arrays/starter/starter.pytorch.py
 create mode 100644 challenges/medium/70_merge_sorted_arrays/starter/starter.triton.py

diff --git a/challenges/medium/70_merge_sorted_arrays/challenge.html b/challenges/medium/70_merge_sorted_arrays/challenge.html
new file mode 100644
index 00000000..136f01ed
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/challenge.html
@@ -0,0 +1,43 @@
+
+ Given two arrays A of M elements and B of N elements,
+ each sorted in non-decreasing order and containing 32-bit floating point numbers, produce a single sorted
+ output array C of M + N elements containing all values from A
+ and B in non-decreasing order.
+ The sequential merge algorithm processes elements one by one, but an efficient GPU implementation must
+ determine each output element's origin independently — without a serial dependency chain.
+
+
+
+Implementation Requirements
+
+
+
+
+Example 1
+
+
+Input:
+A = [1.0, 3.0, 5.0, 7.0]  (M = 4)
+B = [2.0, 4.0, 6.0, 8.0]  (N = 4)
+
+Output:
+C = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
+
+
+
+
+Example 2
+
+
+Input:
+A = [1.0, 2.0, 3.0]  (M = 3)
+B = [4.0]             (N = 1)
+
+Output:
+C = [1.0, 2.0, 3.0, 4.0]
+
+
+
+
+Constraints
+
diff --git a/challenges/medium/70_merge_sorted_arrays/challenge.py b/challenges/medium/70_merge_sorted_arrays/challenge.py
new file mode 100644
index 00000000..fea1314c
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/challenge.py
@@ -0,0 +1,218 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    def __init__(self):
+        super().__init__(
+            name="Merge Sorted Arrays",
+            atol=1e-05,
+            rtol=1e-05,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int):
+        assert A.shape == (M,)
+        assert B.shape == (N,)
+        assert C.shape == (M + N,)
+        assert A.dtype == torch.float32
+        assert B.dtype == torch.float32
+        assert C.dtype == torch.float32
+        assert A.device.type == "cuda"
+        assert B.device.type == "cuda"
+        assert C.device.type == "cuda"
+        result = torch.sort(torch.cat([A, B]))[0]
+        C.copy_(result)
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        return {
+            "A": (ctypes.POINTER(ctypes.c_float), "in"),
+            "B": (ctypes.POINTER(ctypes.c_float), "in"),
+            "C": (ctypes.POINTER(ctypes.c_float), "out"),
+            "M": (ctypes.c_int, "in"),
+            "N": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        dtype = torch.float32
+        A = torch.tensor([1.0, 3.0, 5.0, 7.0], device="cuda", dtype=dtype)
+        B = torch.tensor([2.0, 4.0, 6.0, 8.0], device="cuda", dtype=dtype)
+        C = torch.empty(8, device="cuda", dtype=dtype)
+        return {"A": A, "B": B, "C": C, "M": 4, "N": 4}
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        dtype = torch.float32
+        tests = []
+
+        def make_sorted(size, low=-10.0, high=10.0):
+            return torch.sort(torch.empty(size, device="cuda", dtype=dtype).uniform_(low, high))[0]
+
+        # edge: M=1, N=1
+        tests.append(
+            {
+                "A": torch.tensor([2.0], device="cuda", dtype=dtype),
+                "B": torch.tensor([1.0], device="cuda", dtype=dtype),
+                "C": torch.empty(2, device="cuda", dtype=dtype),
+                "M": 1,
+                "N": 1,
+            }
+        )
+
+        # edge: single vs three, B entirely below A
+        tests.append(
+            {
+                "A": torch.tensor([5.0], device="cuda", dtype=dtype),
+                "B": torch.tensor([1.0, 2.0, 3.0], device="cuda", dtype=dtype),
+                "C": torch.empty(4, device="cuda", dtype=dtype),
+                "M": 1,
+                "N": 3,
+            }
+        )
+
+        # edge: three vs one, A entirely below B
+        tests.append(
+            {
+                "A": torch.tensor([1.0, 2.0, 3.0], device="cuda", dtype=dtype),
+                "B": torch.tensor([4.0], device="cuda", dtype=dtype),
+                "C": torch.empty(4, device="cuda", dtype=dtype),
+                "M": 3,
+                "N": 1,
+            }
+        )
+
+        # edge: all zeros (zero inputs)
+        tests.append(
+            {
+                "A": torch.zeros(4, device="cuda", dtype=dtype),
+                "B": torch.zeros(4, device="cuda", dtype=dtype),
+                "C": torch.empty(8, device="cuda", dtype=dtype),
+                "M": 4,
+                "N": 4,
+            }
+        )
+
+        # power-of-2: M=16, N=16, interleaved ranges
+        A16 = make_sorted(16, 0.0, 1.0)
+        B16 = make_sorted(16, 0.5, 1.5)
+        tests.append(
+            {"A": A16, "B": B16, "C": torch.empty(32, device="cuda", dtype=dtype), "M": 16, "N": 16}
+        )
+
+        # power-of-2: M=64, N=128
+        A64 = make_sorted(64, -5.0, 5.0)
+        B128 = make_sorted(128, -5.0, 5.0)
+        tests.append(
+            {
+                "A": A64,
+                "B": B128,
+                "C": torch.empty(192, device="cuda", dtype=dtype),
+                "M": 64,
+                "N": 128,
+            }
+        )
+
+        # power-of-2: M=512, N=256, all-negative inputs
+        A512 = make_sorted(512, -100.0, -1.0)
+        B256 = make_sorted(256, -50.0, -0.5)
+        tests.append(
+            {
+                "A": A512,
+                "B": B256,
+                "C": torch.empty(768, device="cuda", dtype=dtype),
+                "M": 512,
+                "N": 256,
+            }
+        )
+
+        # non-power-of-2: M=30, N=25, mixed values
+        A30 = make_sorted(30, -10.0, 10.0)
+        B25 = make_sorted(25, -10.0, 10.0)
+        tests.append(
+            {
+                "A": A30,
+                "B": B25,
+                "C": torch.empty(55, device="cuda", dtype=dtype),
+                "M": 30,
+                "N": 25,
+            }
+        )
+
+        # non-power-of-2: M=100, N=73
+        A100 = make_sorted(100, -1.0, 1.0)
+        B73 = make_sorted(73, -1.0, 1.0)
+        tests.append(
+            {
+                "A": A100,
+                "B": B73,
+                "C": torch.empty(173, device="cuda", dtype=dtype),
+                "M": 100,
+                "N": 73,
+            }
+        )
+
+        # non-power-of-2: M=255, N=200
+        A255 = make_sorted(255, -100.0, 100.0)
+        B200 = make_sorted(200, -100.0, 100.0)
+        tests.append(
+            {
+                "A": A255,
+                "B": B200,
+                "C": torch.empty(455, device="cuda", dtype=dtype),
+                "M": 255,
+                "N": 200,
+            }
+        )
+
+        # non-overlapping ranges (A fully below B — concat is already sorted)
+        A_low = make_sorted(64, -10.0, 0.0)
+        B_high = make_sorted(64, 1.0, 10.0)
+        tests.append(
+            {
+                "A": A_low,
+                "B": B_high,
+                "C": torch.empty(128, device="cuda", dtype=dtype),
+                "M": 64,
+                "N": 64,
+            }
+        )
+
+        # realistic: M=1000, N=500
+        A1000 = make_sorted(1000, -50.0, 50.0)
+        B500 = make_sorted(500, -50.0, 50.0)
+        tests.append(
+            {
+                "A": A1000,
+                "B": B500,
+                "C": torch.empty(1500, device="cuda", dtype=dtype),
+                "M": 1000,
+                "N": 500,
+            }
+        )
+
+        # realistic: M=5000, N=3000
+        A5000 = make_sorted(5000, -1000.0, 1000.0)
+        B3000 = make_sorted(3000, -1000.0, 1000.0)
+        tests.append(
+            {
+                "A": A5000,
+                "B": B3000,
+                "C": torch.empty(8000, device="cuda", dtype=dtype),
+                "M": 5000,
+                "N": 3000,
+            }
+        )
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        dtype = torch.float32
+        M = 10_000_000
+        N = 10_000_000
+        A = torch.sort(torch.empty(M, device="cuda", dtype=dtype).uniform_(-1e6, 1e6))[0]
+        B = torch.sort(torch.empty(N, device="cuda", dtype=dtype).uniform_(-1e6, 1e6))[0]
+        C = torch.empty(M + N, device="cuda", dtype=dtype)
+        return {"A": A, "B": B, "C": C, "M": M, "N": N}
diff --git a/challenges/medium/70_merge_sorted_arrays/starter/starter.cu b/challenges/medium/70_merge_sorted_arrays/starter/starter.cu
new file mode 100644
index 00000000..ed724738
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/starter/starter.cu
@@ -0,0 +1,4 @@
+#include <cuda_runtime.h>
+
+// A, B, C are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* A, const float* B, float* C, int M, int N) {}
diff --git a/challenges/medium/70_merge_sorted_arrays/starter/starter.cute.py b/challenges/medium/70_merge_sorted_arrays/starter/starter.cute.py
new file mode 100644
index 00000000..89bc6024
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/starter/starter.cute.py
@@ -0,0 +1,8 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# A, B, C are tensors on the GPU
+@cute.jit
+def solve(A: cute.Tensor, B: cute.Tensor, C: cute.Tensor, M: cute.Int32, N: cute.Int32):
+    pass
diff --git a/challenges/medium/70_merge_sorted_arrays/starter/starter.jax.py b/challenges/medium/70_merge_sorted_arrays/starter/starter.jax.py
new file mode 100644
index 00000000..c167dbfd
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/starter/starter.jax.py
@@ -0,0 +1,9 @@
+import jax
+import jax.numpy as jnp
+
+
+# A, B are tensors on the GPU
+@jax.jit
+def solve(A: jax.Array, B: jax.Array, M: int, N: int) -> jax.Array:
+    # return output tensor directly
+    pass
diff --git a/challenges/medium/70_merge_sorted_arrays/starter/starter.mojo b/challenges/medium/70_merge_sorted_arrays/starter/starter.mojo
new file mode 100644
index 00000000..a7218413
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/starter/starter.mojo
@@ -0,0 +1,9 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# A, B, C are device pointers (i.e. pointers to memory on the GPU)
+@export
+def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], M: Int32, N: Int32):
+    pass
diff --git a/challenges/medium/70_merge_sorted_arrays/starter/starter.pytorch.py b/challenges/medium/70_merge_sorted_arrays/starter/starter.pytorch.py
new file mode 100644
index 00000000..c1759236
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/starter/starter.pytorch.py
@@ -0,0 +1,6 @@
+import torch
+
+
+# A, B, C are tensors on the GPU
+def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int):
+    pass
diff --git a/challenges/medium/70_merge_sorted_arrays/starter/starter.triton.py b/challenges/medium/70_merge_sorted_arrays/starter/starter.triton.py
new file mode 100644
index 00000000..db556ebb
--- /dev/null
+++ b/challenges/medium/70_merge_sorted_arrays/starter/starter.triton.py
@@ -0,0 +1,8 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# A, B, C are tensors on the GPU
+def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int):
+    pass
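
For reference, the merge path (co-ranking) approach described in the commit message can be sketched in CUDA roughly as follows. This is a minimal, untuned illustration written against the solve() signature in starter.cu; it is not part of the patch. The co_rank helper, the merge_path_kernel name, the per-thread tile size, and the launch configuration are assumptions chosen for clarity rather than performance.

#include <cuda_runtime.h>

// Co-rank binary search: returns i such that the first k elements of the
// merged output are exactly A[0..i) and B[0..k-i). Ties go to A first.
__device__ int co_rank(int k, const float* A, int m, const float* B, int n) {
    int i = min(k, m);          // candidate count taken from A
    int j = k - i;              // remaining count taken from B
    int i_low = max(0, k - n);  // smallest feasible i
    int j_low = max(0, k - m);  // smallest feasible j
    while (true) {
        if (i > 0 && j < n && A[i - 1] > B[j]) {
            // took too many from A: move the split toward B
            int delta = (i - i_low + 1) >> 1;
            j_low = j;
            j += delta;
            i -= delta;
        } else if (j > 0 && i < m && B[j - 1] >= A[i]) {
            // took too many from B: move the split toward A
            int delta = (j - j_low + 1) >> 1;
            i_low = i;
            i += delta;
            j -= delta;
        } else {
            return i;  // partition is valid
        }
    }
}

// Each thread owns a contiguous tile of the output, finds its private slices
// of A and B with two co-rank searches, then merges them sequentially.
__global__ void merge_path_kernel(const float* A, int m, const float* B, int n,
                                  float* C, int per_thread) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total = m + n;
    int k_start = tid * per_thread;
    if (k_start >= total) return;
    int k_end = min(k_start + per_thread, total);

    int i = co_rank(k_start, A, m, B, n);
    int j = k_start - i;
    int i_end = co_rank(k_end, A, m, B, n);
    int j_end = k_end - i_end;

    for (int k = k_start; k < k_end; ++k) {
        if (i < i_end && (j >= j_end || A[i] <= B[j])) {
            C[k] = A[i++];
        } else {
            C[k] = B[j++];
        }
    }
}

// Matches the solve() signature in starter.cu; tile size and block size are
// illustrative, untuned choices.
extern "C" void solve(const float* A, const float* B, float* C, int M, int N) {
    const int per_thread = 8;
    int total = M + N;
    int threads = (total + per_thread - 1) / per_thread;
    int block = 256;
    int grid = (threads + block - 1) / block;
    merge_path_kernel<<<grid, block>>>(A, M, B, N, C, per_thread);
    cudaDeviceSynchronize();
}

On Example 1 with per_thread = 4, thread 0 co-ranks k = 0 and k = 4 to the split i = 2 and merges [1.0, 3.0] with [2.0, 4.0], while thread 1 merges [5.0, 7.0] with [6.0, 8.0]. A tuned kernel would additionally stage each block's slices of A and B in shared memory before the per-thread merge, which is where the "memory access patterns" concept from the commit message comes in.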