add vector_add example

leofang · leofang · commit f0c155cdf4d5 · 2024-10-07T20:55:12.000Z
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
@@ -1,3 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 import sys
 
 from cuda.core import Device
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.core import Device
+from cuda.core import LaunchConfig, launch
+from cuda.core import Program
+
+import cupy as cp
+
+
+# CUDA C++ source for c = a + b; each thread walks the arrays in a grid-stride loop
+code = """
+template<typename T>
+__global__ void vector_add(const T* A,
+                           const T* B,
+                           T* C,
+                           size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        C[i] = A[i] + B[i];  // index by i, not tid, so strided iterations reach new elements
+    }
+}
+"""
+
+
+# Create a Device handle (no explicit ID given), make it current, and get a stream.
+device = Device()
+device.set_current()
+stream = device.create_stream()
+
+# Compile the templated source to a cubin targeting this device's compute
+# capability; name_expressions lists the instantiation fetched by name below.
+kernel_name = "vector_add<float>"
+program = Program(code, code_type="c++")
+major_minor = "".join(f"{i}" for i in device.compute_capability)
+module = program.compile(
+    "cubin",
+    options=("-std=c++17", "-arch=sm_" + major_minor),
+    name_expressions=(kernel_name,),
+)
+kernel = module.get_kernel(kernel_name)
+
+# Single-precision random inputs, plus an output array shaped like them.
+n_elems = 50000
+x = cp.random.random(n_elems, dtype=cp.float32)
+y = cp.random.random(n_elems, dtype=cp.float32)
+out = cp.empty_like(x)
+
+# cupy runs on a different stream from ours, so sync before accessing
+device.sync()
+
+# One thread per element: block count is n_elems / 256 rounded up.
+threads_per_block = 256
+n_blocks = -(-n_elems // threads_per_block)
+launch_cfg = LaunchConfig(grid=n_blocks, block=threads_per_block, stream=stream)
+
+# Launch on our stream, wait for completion, then verify.
+launch(kernel, launch_cfg, x.data.ptr, y.data.ptr, out.data.ptr, cp.uint64(n_elems))
+stream.sync()
+assert cp.allclose(out, x + y)  # compare against cupy's own element-wise sum
+print("done!")