AlphaGPU · claude · Mar 7, 2026
@@ -0,0 +1,42 @@
+<p>
+  Given an undirected graph with <code>N</code> vertices (numbered 0 to N&minus;1) and <code>M</code> edges
+  represented as two arrays <code>src</code> and <code>dst</code> of 32-bit integers, compute the connected
+  component label for every vertex. The label for vertex <code>i</code> must equal the minimum vertex ID in
+  the connected component containing <code>i</code>.
+</p>
+
+<h2>Implementation Requirements</h2>
+<ul>
+  <li>Use only native features (external libraries are not permitted)</li>
+  <li>The <code>solve</code> function signature must remain unchanged</li>
+  <li>Store the label for vertex <code>i</code> in <code>labels[i]</code></li>
+  <li>Each edge <code>(src[e], dst[e])</code> is undirected; vertex IDs satisfy 0 &le; <code>src[e]</code>, <code>dst[e]</code> &lt; <code>N</code></li>
+  <li>The label for every vertex must be the minimum vertex ID in its component</li>
+</ul>
+
+<h2>Example</h2>
+<pre>
+Input:
+  N = 5, M = 3
+  src = [0, 1, 3]
+  dst = [1, 2, 4]
+
+Graph edges: 0-1, 1-2, 3-4
+
+Output:
+  labels = [0, 0, 0, 3, 3]
+
+Explanation:
+  Vertices 0, 1, 2 are connected (min ID = 0).
+  Vertices 3, 4 are connected (min ID = 3).
+</pre>
+
+<h2>Constraints</h2>
+<ul>
+  <li>1 &le; <code>N</code> &le; 1,000,000</li>
+  <li>1 &le; <code>M</code> &le; 5,000,000</li>
+  <li>0 &le; <code>src[e]</code>, <code>dst[e]</code> &lt; <code>N</code> for all edges</li>
+  <li>The graph may contain self-loops and duplicate edges</li>
+  <li>All values are 32-bit integers (<code>int32</code>)</li>
+  <li>Performance is measured with <code>N</code> = 1,000,000, <code>M</code> = 5,000,000</li>
+</ul>
@@ -0,0 +1,223 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    def __init__(self):
+        super().__init__(
+            name="Connected Components",
+            atol=0,
+            rtol=0,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(
+        self,
+        src: torch.Tensor,
+        dst: torch.Tensor,
+        labels: torch.Tensor,
+        N: int,
+        M: int,
+    ):
+        assert src.dtype == torch.int32
+        assert dst.dtype == torch.int32
+        assert labels.dtype == torch.int32
+        assert src.shape == (M,)
+        assert dst.shape == (M,)
+        assert labels.shape == (N,)
+        assert src.device.type == "cuda"
+        assert dst.device.type == "cuda"
+        assert labels.device.type == "cuda"
+
+        # Label propagation: labels[i] converges to the minimum vertex ID
+        # in the connected component containing vertex i.
+        comp = torch.arange(N, dtype=torch.int32, device="cuda")
+
+        if M > 0:
+            # Build undirected edge list (each edge appears in both directions)
+            src_long = src.long()
+            dst_long = dst.long()
+            edge_src = torch.cat([src_long, dst_long])
+            edge_dst = torch.cat([dst_long, src_long])
+
+            for _ in range(N):
+                neighbor_labels = comp[edge_dst]
+                new_comp = comp.clone()
+                new_comp.scatter_reduce_(
+                    0, edge_src, neighbor_labels, reduce="amin", include_self=True
+                )
+                if torch.equal(new_comp, comp):
+                    break
+                comp = new_comp
+
+        labels.copy_(comp)
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        return {
+            "src": (ctypes.POINTER(ctypes.c_int), "in"),
+            "dst": (ctypes.POINTER(ctypes.c_int), "in"),
+            "labels": (ctypes.POINTER(ctypes.c_int), "out"),
+            "N": (ctypes.c_int, "in"),
+            "M": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        # Graph: 5 vertices, 3 edges
+        # Edges: (0,1), (1,2), (3,4)
+        # Components: {0,1,2} -> label 0, {3,4} -> label 3
+        N, M = 5, 3
+        src = torch.tensor([0, 1, 3], dtype=torch.int32, device="cuda")
+        dst = torch.tensor([1, 2, 4], dtype=torch.int32, device="cuda")
+        labels = torch.zeros(N, dtype=torch.int32, device="cuda")
+        return {"src": src, "dst": dst, "labels": labels, "N": N, "M": M}
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        tests = []
+
+        # Test 1: Single vertex, no edges
+        tests.append(
+            {
+                "src": torch.zeros(1, dtype=torch.int32, device="cuda"),
+                "dst": torch.zeros(1, dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(1, dtype=torch.int32, device="cuda"),
+                "N": 1,
+                "M": 1,
+            }
+        )
+
+        # Test 2: Two vertices, one edge (single component)
+        tests.append(
+            {
+                "src": torch.tensor([0], dtype=torch.int32, device="cuda"),
+                "dst": torch.tensor([1], dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(2, dtype=torch.int32, device="cuda"),
+                "N": 2,
+                "M": 1,
+            }
+        )
+
+        # Test 3: Four isolated vertices (no edges; each is its own component)
+        tests.append(
+            {
+                "src": torch.tensor([0], dtype=torch.int32, device="cuda"),
+                "dst": torch.tensor([0], dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(4, dtype=torch.int32, device="cuda"),
+                "N": 4,
+                "M": 1,
+            }
+        )
+
+        # Test 4: Two disjoint pairs
+        tests.append(
+            {
+                "src": torch.tensor([0, 2], dtype=torch.int32, device="cuda"),
+                "dst": torch.tensor([1, 3], dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(4, dtype=torch.int32, device="cuda"),
+                "N": 4,
+                "M": 2,
+            }
+        )
+
+        # Test 5: Line graph 0-1-2-3-4-5-6-7 (chain, power-of-2 size)
+        N, M = 8, 7
+        tests.append(
+            {
+                "src": torch.arange(0, M, dtype=torch.int32, device="cuda"),
+                "dst": torch.arange(1, N, dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(N, dtype=torch.int32, device="cuda"),
+                "N": N,
+                "M": M,
+            }
+        )
+
+        # Test 6: Star graph — center=0, leaves=1..15 (power-of-2 N=16)
+        N, M = 16, 15
+        tests.append(
+            {
+                "src": torch.zeros(M, dtype=torch.int32, device="cuda"),
+                "dst": torch.arange(1, N, dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(N, dtype=torch.int32, device="cuda"),
+                "N": N,
+                "M": M,
+            }
+        )
+
+        # Test 7: 10x10 grid graph (N=100, non-power-of-2)
+        N = 100
+        edges = []
+        for r in range(10):
+            for c in range(10):
+                v = r * 10 + c
+                if c + 1 < 10:
+                    edges.append((v, v + 1))
+                if r + 1 < 10:
+                    edges.append((v, v + 10))
+        M = len(edges)
+        src_list, dst_list = zip(*edges)
+        tests.append(
+            {
+                "src": torch.tensor(src_list, dtype=torch.int32, device="cuda"),
+                "dst": torch.tensor(dst_list, dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(N, dtype=torch.int32, device="cuda"),
+                "N": N,
+                "M": M,
+            }
+        )
+
+        # Test 8: Three components of sizes 5, 3, 7 (N=15, non-power-of-2)
+        N = 15
+        src_list = [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 13]
+        dst_list = [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14]
+        M = len(src_list)
+        tests.append(
+            {
+                "src": torch.tensor(src_list, dtype=torch.int32, device="cuda"),
+                "dst": torch.tensor(dst_list, dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(N, dtype=torch.int32, device="cuda"),
+                "N": N,
+                "M": M,
+            }
+        )
+
+        # Test 9: Random sparse graph, realistic size (N=1000, M=3000)
+        torch.manual_seed(42)
+        N, M = 1000, 3000
+        tests.append(
+            {
+                "src": torch.randint(0, N, (M,), dtype=torch.int32, device="cuda"),
+                "dst": torch.randint(0, N, (M,), dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(N, dtype=torch.int32, device="cuda"),
+                "N": N,
+                "M": M,
+            }
+        )
+
+        # Test 10: Complete graph K4 (all vertices connected)
+        N = 4
+        src_list = [0, 0, 0, 1, 1, 2]
+        dst_list = [1, 2, 3, 2, 3, 3]
+        M = len(src_list)
+        tests.append(
+            {
+                "src": torch.tensor(src_list, dtype=torch.int32, device="cuda"),
+                "dst": torch.tensor(dst_list, dtype=torch.int32, device="cuda"),
+                "labels": torch.zeros(N, dtype=torch.int32, device="cuda"),
+                "N": N,
+                "M": M,
+            }
+        )
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        torch.manual_seed(42)
+        N = 1_000_000
+        M = 5_000_000
+        src = torch.randint(0, N, (M,), dtype=torch.int32, device="cuda")
+        dst = torch.randint(0, N, (M,), dtype=torch.int32, device="cuda")
+        labels = torch.zeros(N, dtype=torch.int32, device="cuda")
+        return {"src": src, "dst": dst, "labels": labels, "N": N, "M": M}
@@ -0,0 +1,4 @@
+#include <cuda_runtime.h>
+
+// src, dst, labels are device pointers
+// Starter stub, intentionally empty. Per the challenge statement, an
+// implementation must store in labels[i] the minimum vertex ID of the
+// connected component containing vertex i, for a graph of N vertices and
+// M undirected edges (src[e], dst[e]).
+extern "C" void solve(const int* src, const int* dst, int* labels, int N, int M) {}
@@ -0,0 +1,14 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# src, dst, labels are tensors on the GPU
+@cute.jit
+def solve(
+    src: cute.Tensor,
+    dst: cute.Tensor,
+    labels: cute.Tensor,
+    N: cute.Int32,
+    M: cute.Int32,
+):
+    """Starter stub for the connected-components challenge (no-op as shipped).
+
+    Per the challenge statement, an implementation must store in ``labels[i]``
+    the minimum vertex ID of the connected component containing vertex ``i``.
+    ``src``/``dst`` hold the endpoints of the ``M`` undirected edges and ``N``
+    is the number of vertices.
+    """
+    pass
@@ -0,0 +1,9 @@
+import jax
+import jax.numpy as jnp
+
+
+# src, dst are tensors on the GPU
+@jax.jit
+def solve(src: jax.Array, dst: jax.Array, N: int, M: int) -> jax.Array:
+    """Starter stub for the connected-components challenge (returns None as shipped).
+
+    Unlike the sibling starters there is no ``labels`` argument: the result is
+    returned. Per the challenge statement, entry ``i`` of that result must be
+    the minimum vertex ID of the connected component containing vertex ``i``.
+    ``src``/``dst`` hold the endpoints of the ``M`` undirected edges and ``N``
+    is the number of vertices.
+    """
+    # NOTE(review): under a plain jax.jit, N and M are traced values rather
+    # than Python ints; using N as an output shape would presumably require
+    # marking it static -- confirm how the harness invokes this function.
+    # return output tensor directly
+    pass
@@ -0,0 +1,6 @@
+from memory import UnsafePointer
+
+# src, dst, labels are device pointers
+# Starter stub, intentionally a no-op. Per the challenge statement, an
+# implementation must store in labels[i] the minimum vertex ID of the
+# connected component containing vertex i, for a graph of N vertices and
+# M undirected edges (src[e], dst[e]).
+@export
+def solve(src: UnsafePointer[Int32], dst: UnsafePointer[Int32], labels: UnsafePointer[Int32], N: Int32, M: Int32):
+    pass
@@ -0,0 +1,6 @@
+import torch
+
+
+# src, dst, labels are tensors on the GPU
+def solve(src: torch.Tensor, dst: torch.Tensor, labels: torch.Tensor, N: int, M: int):
+    """Starter stub for the connected-components challenge (no-op as shipped).
+
+    Per the challenge statement, an implementation must store in ``labels[i]``
+    the minimum vertex ID of the connected component containing vertex ``i``.
+    ``src``/``dst`` hold the endpoints of the ``M`` undirected edges and ``N``
+    is the number of vertices.
+    """
+    pass
@@ -0,0 +1,8 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# src, dst, labels are tensors on the GPU
+def solve(src: torch.Tensor, dst: torch.Tensor, labels: torch.Tensor, N: int, M: int):
+    """Starter stub for the connected-components challenge (no-op as shipped).
+
+    Per the challenge statement, an implementation must store in ``labels[i]``
+    the minimum vertex ID of the connected component containing vertex ``i``.
+    ``src``/``dst`` hold the endpoints of the ``M`` undirected edges and ``N``
+    is the number of vertices. The file imports Triton, so kernels are
+    presumably meant to be launched from here.
+    """
+    pass