Skip to content

Commit f0c155c

Browse files
committed
add vector_add example
1 parent 7e1c8f5 commit f0c155c

File tree

2 files changed

+66
-0
lines changed

2 files changed

+66
-0
lines changed

cuda_core/examples/saxpy.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
15
import sys
26

37
from cuda.core import Device

cuda_core/examples/vector_add.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
5+
from cuda.core import Device
6+
from cuda.core import LaunchConfig, launch
7+
from cuda.core import Program
8+
9+
import cupy as cp
10+
11+
12+
# compute c = a + b
13+
code = """
14+
template<typename T>
15+
__global__ void vector_add(const T* A,
16+
const T* B,
17+
T* C,
18+
size_t N) {
19+
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
20+
for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
21+
C[tid] = A[tid] + B[tid];
22+
}
23+
}
24+
"""
25+
26+
27+
dev = Device()
28+
dev.set_current()
29+
s = dev.create_stream()
30+
31+
# prepare program
32+
prog = Program(code, code_type="c++")
33+
mod = prog.compile(
34+
"cubin",
35+
options=("-std=c++17", "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
36+
name_expressions=("vector_add<float>",))
37+
38+
# run in single precision
39+
ker = mod.get_kernel("vector_add<float>")
40+
dtype = cp.float32
41+
42+
# prepare input/output
43+
size = 50000
44+
a = cp.random.random(size, dtype=dtype)
45+
b = cp.random.random(size, dtype=dtype)
46+
c = cp.empty_like(a)
47+
48+
# cupy runs on a different stream from s, so sync before accessing
49+
dev.sync()
50+
51+
# prepare launch
52+
block = 256
53+
grid = (size + block - 1) // block
54+
config = LaunchConfig(grid=grid, block=block, stream=s)
55+
56+
# launch kernel on stream s
57+
launch(ker, config, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
58+
s.sync()
59+
60+
# check result
61+
assert cp.allclose(c, a+b)
62+
print("done!")

0 commit comments

Comments
 (0)