diff --git a/challenges/medium/90_causal_depthwise_conv1d/challenge.html b/challenges/medium/90_causal_depthwise_conv1d/challenge.html
new file mode 100644
index 0000000..628f5bf
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/challenge.html
@@ -0,0 +1,144 @@
+
+ Implement a causal depthwise 1D convolution over a batched sequence tensor
+ x of shape (B, L, D), producing an output of the same shape.
+ In a depthwise convolution, each channel d is convolved independently using its
+ own kernel weight[d, :] — there is no mixing across channels.
+ The convolution is causal: output position l may only depend on
+ input positions 0, 1, …, l (past and present), never future positions.
+ This operation is a key component of state-space models such as Mamba, where it is applied
+ before the selective scan to mix local context within each feature channel.
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Causal Depthwise Conv1d (K=3, one channel shown)
+
+
+ x[d]
+
+
+
+ x₀
+
+
+ x₁
+
+
+ x₂
+
+
+ x₃
+
+
+ x₄
+
+
+ x₅
+
+
+ w[d]
+
+ w₀
+
+ w₁
+
+ w₂
+
+
+ kernel at l=4: reads x₂,x₃,x₄
+
+
+
+
+
+ y[d]
+
+
+
+ y₀
+
+
+ y₁
+
+
+ y₂
+
+
+ y₃
+
+
+ y₄
+
+
+ y₅
+
+
+
+ y[d,l] = bias[d] + Σ w[d,k] · x[d, l−k] (x[d,l−k] = 0 if l−k < 0)
+
+
+
+
+ Formally, for each batch element b, sequence position l, and channel d:
+
+
+\[
+\text{output}[b,\, l,\, d]
+= \text{bias}[d]
++ \sum_{k=0}^{K-1} \text{weight}[d,\, k] \cdot x[b,\, l - k,\, d]
+\]
+
+
+ where positions l − k < 0 are treated as zero (zero-pad the left boundary).
+ The tensor layout is channels-last: x[b, l, d] is stored at offset
+ b × L × D + l × D + d.
+
+
+Implementation Requirements
+
+ The solve function signature must remain unchanged
+ The result must be written into the output tensor
+ Use only native features (external libraries are not permitted)
+ Input positions before the start of the sequence (i.e. indices l − k < 0) must be treated as zero
+
+
+Example
+
+With B = 1, L = 4, D = 2, K = 3:
+
+
+x = [[[1.0, 2.0], # l=0
+ [3.0, 4.0], # l=1
+ [5.0, 6.0], # l=2
+ [7.0, 8.0]]] # l=3 shape (1, 4, 2)
+
+weight = [[ 1.0, 0.0, -1.0], # channel d=0
+ [ 1.0, 1.0, 1.0]] # channel d=1 shape (2, 3)
+
+bias = [0.0, 0.0]
+
+output = [[[1.0, 2.0], # l=0: d0: 1*1=1 d1: 1*2=2
+ [3.0, 6.0], # l=1: d0: 3*1+1*0=3 d1: 4*1+2*1=6
+ [4.0, 12.0], # l=2: d0: 5*1+3*0+1*(-1)=4 d1: 6+4+2=12
+ [4.0, 18.0]]] # l=3: d0: 7*1+5*0+3*(-1)=4 d1: 8+6+4=18
+
+
+Constraints
+
+ 1 ≤ B ≤ 16 (batch size)
+ 1 ≤ L ≤ 8,192 (sequence length)
+ 1 ≤ D ≤ 8,192 (number of channels)
+ 1 ≤ K ≤ 8 (kernel size; typically 3 or 4 in practice)
+ All tensors use 32-bit floating point
+ Tensor x and output use channels-last layout: shape (B, L, D)
+ Performance is measured with B = 8, L = 2,048, D = 4,096, K = 4
+
diff --git a/challenges/medium/90_causal_depthwise_conv1d/challenge.py b/challenges/medium/90_causal_depthwise_conv1d/challenge.py
new file mode 100644
index 0000000..db0c8fb
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/challenge.py
@@ -0,0 +1,187 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+import torch.nn.functional as F
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    """Causal depthwise 1D convolution challenge.
+
+    Contract (channels-last layout, shapes visible in ``reference_impl``):
+        output[b, l, d] = bias[d] + sum_{k=0}^{K-1} weight[d, k] * x[b, l - k, d]
+    with x[b, l - k, d] treated as zero when l - k < 0.
+    """
+
+    def __init__(self):
+        # Registers the challenge with float32 comparison tolerances and a
+        # single-GPU requirement.
+        super().__init__(
+            name="Causal Depthwise Conv1d",
+            atol=1e-04,
+            rtol=1e-04,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(
+        self,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        output: torch.Tensor,
+        B: int,
+        L: int,
+        D: int,
+        K: int,
+    ):
+        """Compute the reference result on GPU and write it into ``output``.
+
+        Args:
+            x: (B, L, D) float32 CUDA tensor, channels-last input.
+            weight: (D, K) float32 CUDA tensor; one kernel per channel.
+            bias: (D,) float32 CUDA tensor.
+            output: (B, L, D) float32 CUDA tensor, overwritten in place.
+            B, L, D, K: batch, sequence length, channels, kernel size.
+        """
+        assert x.shape == (B, L, D)
+        assert weight.shape == (D, K)
+        assert bias.shape == (D,)
+        assert output.shape == (B, L, D)
+        assert x.dtype == weight.dtype == bias.dtype == output.dtype == torch.float32
+        assert x.device.type == "cuda"
+        assert weight.device.type == "cuda"
+        assert bias.device.type == "cuda"
+        assert output.device.type == "cuda"
+
+        # Reshape to (B, D, L) for conv1d (conv1d expects channels-first).
+        x_t = x.permute(0, 2, 1).contiguous()  # (B, D, L)
+
+        # Causal padding: pad K-1 zeros on the left so each output position
+        # only sees current and past input positions.
+        x_padded = F.pad(x_t, (K - 1, 0))  # (B, D, L + K - 1)
+
+        # Depthwise conv: weight (D, K) -> (D, 1, K), groups=D.
+        # Flip the kernel so weight[d, 0] applies to the current position (l-0)
+        # and weight[d, K-1] applies to the oldest position (l-(K-1)).
+        # F.conv1d uses cross-correlation (no implicit flip), so we flip explicitly.
+        w = weight.flip(1).unsqueeze(1)  # (D, 1, K)
+        result = F.conv1d(x_padded, w, bias=bias, groups=D)  # (B, D, L)
+
+        # Back to channels-last; copy_ writes into the caller-provided buffer.
+        output.copy_(result.permute(0, 2, 1))  # (B, L, D)
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        """Map each solve() parameter to its ctypes type and in/out direction."""
+        return {
+            "x": (ctypes.POINTER(ctypes.c_float), "in"),
+            "weight": (ctypes.POINTER(ctypes.c_float), "in"),
+            "bias": (ctypes.POINTER(ctypes.c_float), "in"),
+            "output": (ctypes.POINTER(ctypes.c_float), "out"),
+            "B": (ctypes.c_int, "in"),
+            "L": (ctypes.c_int, "in"),
+            "D": (ctypes.c_int, "in"),
+            "K": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        """Return the worked example from the challenge statement (B=1, L=4, D=2, K=3)."""
+        B, L, D, K = 1, 4, 2, 3
+        x = torch.tensor(
+            [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]],
+            device="cuda",
+            dtype=torch.float32,
+        )
+        weight = torch.tensor(
+            [[1.0, 0.0, -1.0], [1.0, 1.0, 1.0]], device="cuda", dtype=torch.float32
+        )
+        bias = torch.zeros(D, device="cuda", dtype=torch.float32)
+        output = torch.empty(B, L, D, device="cuda", dtype=torch.float32)
+        return {
+            "x": x,
+            "weight": weight,
+            "bias": bias,
+            "output": output,
+            "B": B,
+            "L": L,
+            "D": D,
+            "K": K,
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        """Build the functional test suite: edge sizes, zeros, and typical shapes."""
+        dtype = torch.float32
+        test_cases = []
+
+        def make_case(B, L, D, K, x_vals=None, w_vals=None, b_vals=None):
+            # Build one test-case dict; explicit *_vals override random data.
+            if x_vals is not None:
+                x = torch.tensor(x_vals, device="cuda", dtype=dtype)
+            else:
+                x = torch.randn(B, L, D, device="cuda", dtype=dtype)
+            if w_vals is not None:
+                weight = torch.tensor(w_vals, device="cuda", dtype=dtype)
+            else:
+                weight = torch.randn(D, K, device="cuda", dtype=dtype)
+            if b_vals is not None:
+                bias = torch.tensor(b_vals, device="cuda", dtype=dtype)
+            else:
+                bias = torch.randn(D, device="cuda", dtype=dtype)
+            output = torch.empty(B, L, D, device="cuda", dtype=dtype)
+            return {
+                "x": x,
+                "weight": weight,
+                "bias": bias,
+                "output": output,
+                "B": B,
+                "L": L,
+                "D": D,
+                "K": K,
+            }
+
+        # Example test (matches generate_example_test)
+        test_cases.append(
+            make_case(
+                1,
+                4,
+                2,
+                3,
+                x_vals=[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]],
+                w_vals=[[1.0, 0.0, -1.0], [1.0, 1.0, 1.0]],
+                b_vals=[0.0, 0.0],
+            )
+        )
+
+        # Edge cases: minimal sizes
+        test_cases.append(make_case(1, 1, 1, 1))  # single element, kernel=1
+        test_cases.append(make_case(1, 2, 1, 2))  # L == K: early outputs rely on zero padding
+        test_cases.append(make_case(2, 3, 4, 3))  # small batch, B=2
+
+        # Zero input tensor (output should reduce to the bias per channel)
+        x_zero = torch.zeros(1, 8, 4, device="cuda", dtype=dtype)
+        w_zero = torch.randn(4, 3, device="cuda", dtype=dtype)
+        b_zero = torch.randn(4, device="cuda", dtype=dtype)
+        test_cases.append(
+            {
+                "x": x_zero,
+                "weight": w_zero,
+                "bias": b_zero,
+                "output": torch.empty(1, 8, 4, device="cuda", dtype=dtype),
+                "B": 1,
+                "L": 8,
+                "D": 4,
+                "K": 3,
+            }
+        )
+
+        # Mixed-sign random values (randn already yields negatives)
+        test_cases.append(make_case(1, 16, 8, 4))
+
+        # Power-of-2 sizes
+        test_cases.append(make_case(2, 32, 16, 4))
+        test_cases.append(make_case(4, 64, 32, 4))
+
+        # Non-power-of-2 sizes
+        test_cases.append(make_case(3, 30, 12, 3))
+        test_cases.append(make_case(2, 100, 24, 4))
+
+        # Realistic inference size (Mamba-like small)
+        test_cases.append(make_case(2, 256, 128, 4))
+
+        return test_cases
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        """Return the benchmark case: B=8, L=2048, D=4096, K=4 (float32)."""
+        B, L, D, K = 8, 2048, 4096, 4
+        dtype = torch.float32
+        x = torch.randn(B, L, D, device="cuda", dtype=dtype)
+        weight = torch.randn(D, K, device="cuda", dtype=dtype)
+        bias = torch.randn(D, device="cuda", dtype=dtype)
+        output = torch.empty(B, L, D, device="cuda", dtype=dtype)
+        return {
+            "x": x,
+            "weight": weight,
+            "bias": bias,
+            "output": output,
+            "B": B,
+            "L": L,
+            "D": D,
+            "K": K,
+        }
diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu
new file mode 100644
index 0000000..b6fd040
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu
@@ -0,0 +1,5 @@
+#include <cuda_runtime.h>
+
+// x, weight, bias, output are device pointers.
+// Contract: output[(b*L + l)*D + d] = bias[d] + sum_{k<K} weight[d*K + k] * x[(b*L + (l-k))*D + d],
+// where terms with l - k < 0 are zero. Starter stub: implement and launch a CUDA kernel here.
+// NOTE: the original line was a bare `#include` (header name lost in HTML processing);
+// restored to the CUDA runtime header used by these starters.
+extern "C" void solve(const float* x, const float* weight, const float* bias, float* output, int B,
+                      int L, int D, int K) {}
diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py
new file mode 100644
index 0000000..177b7fc
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py
@@ -0,0 +1,17 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# x, weight, bias, output are tensors on the GPU
+# Starter stub: write the causal depthwise conv result into `output` in place.
+# x and output are (B, L, D) channels-last; weight is (D, K); bias is (D,).
+@cute.jit
+def solve(
+    x: cute.Tensor,
+    weight: cute.Tensor,
+    bias: cute.Tensor,
+    output: cute.Tensor,
+    B: cute.Int32,
+    L: cute.Int32,
+    D: cute.Int32,
+    K: cute.Int32,
+):
+    pass
diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py
new file mode 100644
index 0000000..4f61597
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py
@@ -0,0 +1,11 @@
+import jax
+import jax.numpy as jnp
+
+
+# x, weight, bias are tensors on GPU
+# Unlike the other starters there is no `output` argument: build and
+# return the (B, L, D) result array.
+# NOTE(review): B, L, D, K are traced by jax.jit rather than marked static
+# (no static_argnums) — confirm this matches the harness's calling convention.
+@jax.jit
+def solve(
+    x: jax.Array, weight: jax.Array, bias: jax.Array, B: int, L: int, D: int, K: int
+) -> jax.Array:
+    # return output tensor directly
+    pass
diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo
new file mode 100644
index 0000000..315e67f
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo
@@ -0,0 +1,16 @@
+from gpu.host import DeviceContext
+from memory import UnsafePointer
+
+# x, weight, bias, output are device pointers
+# Starter stub: write the causal depthwise conv result into `output`.
+# Layout is channels-last: element (b, l, d) lives at offset b*L*D + l*D + d.
+@export
+def solve(
+    x: UnsafePointer[Float32],
+    weight: UnsafePointer[Float32],
+    bias: UnsafePointer[Float32],
+    output: UnsafePointer[Float32],
+    B: Int32,
+    L: Int32,
+    D: Int32,
+    K: Int32,
+):
+    pass
diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py
new file mode 100644
index 0000000..c812b9a
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py
@@ -0,0 +1,15 @@
+import torch
+
+
+# x, weight, bias, output are tensors on the GPU
+# Starter stub: compute the causal depthwise conv and write it into `output`
+# in place (the harness reads `output`; the return value is ignored).
+# x, output: (B, L, D) channels-last; weight: (D, K); bias: (D,).
+def solve(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    output: torch.Tensor,
+    B: int,
+    L: int,
+    D: int,
+    K: int,
+):
+    pass
diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py
new file mode 100644
index 0000000..c13da18
--- /dev/null
+++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py
@@ -0,0 +1,17 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# x, weight, bias, output are tensors on the GPU
+# Starter stub: `solve` is the host-side entry point — define a @triton.jit
+# kernel and launch it from here, writing the result into `output` in place.
+# x, output: (B, L, D) channels-last; weight: (D, K); bias: (D,).
+def solve(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    output: torch.Tensor,
+    B: int,
+    L: int,
+    D: int,
+    K: int,
+):
+    pass