diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 51fcf0a6b4..3a35b6ad23 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -57,7 +57,13 @@ elif [[ "${test_module}" == "core" ]]; then
     echo "Installing core wheel"
     pwd
     ls
-    pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}","test-cu${TEST_CUDA_MAJOR}"]
+    if [[ "${LOCAL_CTK}" == 1 ]]; then
+        # We already installed cuda-bindings, and all CTK components exist locally,
+        # so just install the test dependencies.
+        pip install $(ls *.whl)["test-cu${TEST_CUDA_MAJOR}"]
+    else
+        pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}","test-cu${TEST_CUDA_MAJOR}"]
+    fi
     popd
     pushd ./cuda_core
     echo "Running core tests"
@@ -69,4 +75,4 @@ elif [[ "${test_module}" == "core" ]]; then
         ${SANITIZER_CMD} pytest -rxXs -v tests/cython
     fi
     popd
-fi
\ No newline at end of file
+fi
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 7f9f814d41..01501c52f6 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -21,6 +21,8 @@ CUDA runtime
    GraphBuilder
    launch
    Buffer
+   Stream
+   Event
    MemoryResource
    DeviceMemoryResource
    LegacyPinnedMemoryResource
diff --git a/cuda_core/docs/source/getting-started.md b/cuda_core/docs/source/getting-started.md
new file mode 100644
index 0000000000..a1eccb0fe4
--- /dev/null
+++ b/cuda_core/docs/source/getting-started.md
@@ -0,0 +1,112 @@
+# Overview
+
+## What is `cuda.core`?
+
+`cuda.core` provides a Pythonic interface to the CUDA runtime and other functionality,
+including:
+
+- Compiling and launching CUDA kernels
+- Asynchronous concurrent execution with CUDA graphs, streams and events
+- Coordinating work across multiple CUDA devices
+- Allocating, transferring, and managing device memory
+- Runtime linking of device code with Link-Time Optimization (LTO)
+- and much more!
+
+Rather than providing 1:1 equivalents of the CUDA driver and runtime APIs
+(for that, see [`cuda.bindings`][bindings]), `cuda.core` provides high-level constructs such as:
+
+- {class}`Device <cuda.core.experimental.Device>` class for GPU device operations and context management.
+- {class}`Buffer <cuda.core.experimental.Buffer>` and {class}`MemoryResource <cuda.core.experimental.MemoryResource>` classes for memory allocation and management.
+- {class}`Program <cuda.core.experimental.Program>` for JIT compilation of CUDA kernels.
+- {class}`GraphBuilder <cuda.core.experimental.GraphBuilder>` for building and executing CUDA graphs.
+- {class}`Stream <cuda.core.experimental.Stream>` and {class}`Event <cuda.core.experimental.Event>` for asynchronous execution and timing.
+
+## Example: Compiling and Launching a CUDA kernel
+
+To get a taste for `cuda.core`, let's walk through a simple example that compiles and launches a vector addition kernel.
+You can find the complete example in [`vector_add.py`][vector_add_example].
+
+First, we define a string containing the CUDA C++ kernel. Note that this is a templated kernel:
+
+```python
+# compute c = a + b
+code = """
+template<typename T>
+__global__ void vector_add(const T* A,
+                           const T* B,
+                           T* C,
+                           size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        C[i] = A[i] + B[i];
+    }
+}
+"""
+```
+
+Next, we create a {class}`Device <cuda.core.experimental.Device>` object
+and a corresponding {class}`Stream <cuda.core.experimental.Stream>`.
+Don't forget to use {meth}`Device.set_current() <cuda.core.experimental.Device.set_current>`!
+
+```python
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+```
+
+Next, we compile the CUDA C++ kernel from earlier using the {class}`Program <cuda.core.experimental.Program>` class.
+The result of the compilation is saved as a CUBIN.
+Note the use of the `name_expressions` parameter to the {meth}`Program.compile() <cuda.core.experimental.Program.compile>` method to specify which kernel template instantiations to compile:
+
+```python
+arch = "".join(f"{i}" for i in dev.compute_capability)
+program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+prog = Program(code, code_type="c++", options=program_options)
+mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
+```
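+
+If you want to see the compiler output while iterating on a kernel, `Program.compile()`
+also accepts a `logs` argument (the saxpy example in this repository passes
+`logs=sys.stdout`). A minimal sketch, reusing the same `prog` object as above:
+
+```python
+import sys
+
+# Same compile step as above, but stream compiler diagnostics to stdout.
+mod = prog.compile("cubin", logs=sys.stdout, name_expressions=("vector_add<float>",))
+```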
+
+Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration.
+We're using [CuPy][cupy] arrays as inputs for this example, but you can use PyTorch tensors too
+(we show how to do this in one of our [examples][examples]).
+
+```python
+import cupy as cp
+
+ker = mod.get_kernel("vector_add<float>")
+
+# Prepare input/output arrays (using CuPy)
+rng = cp.random.default_rng()
+size = 50000
+a = rng.random(size, dtype=cp.float32)
+b = rng.random(size, dtype=cp.float32)
+c = cp.empty_like(a)
+
+# Configure launch parameters
+block = 256
+grid = (size + block - 1) // block
+config = LaunchConfig(grid=grid, block=block)
+```
+
+Finally, we use the {func}`launch <cuda.core.experimental.launch>` function to execute our kernel on the specified stream with the given configuration and arguments. Note the use of `.data.ptr` to get the pointer to the array data.
+
+```python
+launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+s.sync()
+```
+
+This example demonstrates one of the core workflows enabled by `cuda.core`: compiling and launching CUDA code.
+Note the clean, Pythonic interface and the absence of any direct calls to the CUDA runtime/driver APIs.
+
+## Examples and Recipes
+
+As we mentioned before, `cuda.core` can do much more than just compile and launch kernels.
+
+The best way to explore and learn the different features of `cuda.core` is through
+our [`examples`][examples]. Find one that matches your use case, and modify it to fit your needs!
+
+
+[bindings]: https://nvidia.github.io/cuda-python/cuda-bindings/latest/
+[cai]: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
+[cupy]: https://cupy.dev/
+[dlpack]: https://dmlc.github.io/dlpack/latest/
+[examples]: https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples
+[vector_add_example]: https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples/vector_add.py
diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst
index 69d3d25c9f..9832b775c8 100644
--- a/cuda_core/docs/source/index.rst
+++ b/cuda_core/docs/source/index.rst
@@ -1,23 +1,30 @@
 .. SPDX-License-Identifier: Apache-2.0
 
-``cuda.core``: Pythonic access to CUDA core functionalities
-===========================================================
+``cuda.core``: Pythonic access to CUDA core functionality
+==========================================================
 
-The new Python module ``cuda.core`` offers idiomatic, pythonic access to CUDA runtime
-and other functionalities.
+Welcome to the documentation for ``cuda.core``.
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
 
-   release
-   install.md
+   getting-started
+   install
    interoperability
    api
    contribute
-   conduct.md
+
+.. toctree::
+   :maxdepth: 1
+
+   conduct
    license
 
+.. toctree::
+   :maxdepth: 2
+
+   release
 
 Indices and tables
 ==================
diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
new file mode 100644
index 0000000000..2db89f37f7
--- /dev/null
+++ b/cuda_core/examples/cuda_graphs.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo illustrates how to use CUDA graphs to capture and execute
+# multiple kernel launches with minimal overhead. The graph performs a
+# sequence of vector operations: add, multiply, and subtract.
+#
+# ################################################################################
+
+import time
+
+import cupy as cp
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+
+def main():
+    # CUDA kernels for vector operations
+    code = """
+    template <typename T>
+    __global__ void vector_add(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] + B[i];
+        }
+    }
+
+    template <typename T>
+    __global__ void vector_multiply(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] * B[i];
+        }
+    }
+
+    template <typename T>
+    __global__ void vector_subtract(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] - B[i];
+        }
+    }
+    """
+
+    # Initialize device and stream
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+    # tell CuPy to use our stream as the current stream:
+    cp.cuda.ExternalStream(int(stream.handle)).use()
+
+    # Compile the program
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin", name_expressions=("vector_add<float>", "vector_multiply<float>", "vector_subtract<float>")
+    )
+
+    # Get kernel functions
+    add_kernel = mod.get_kernel("vector_add<float>")
+    multiply_kernel = mod.get_kernel("vector_multiply<float>")
+    subtract_kernel = mod.get_kernel("vector_subtract<float>")
+
+    # Prepare data
+    size = 1000000
+    dtype = cp.float32
+
+    # Create input arrays
+    rng = cp.random.default_rng(42)  # Fixed seed for reproducibility
+    a = rng.random(size, dtype=dtype)
+    b = rng.random(size, dtype=dtype)
+    c = rng.random(size, dtype=dtype)
+
+    # Create output arrays
+    result1 = cp.empty_like(a)
+    result2 = cp.empty_like(a)
+    result3 = cp.empty_like(a)
+
+    # Prepare launch configuration
+    block_size = 256
+    grid_size = (size + block_size - 1) // block_size
+    config = LaunchConfig(grid=grid_size, block=block_size)
+
+    # Sync before graph capture
+    dev.sync()
+
+    print("Building CUDA graph...")
+
+    # Build the graph
+    graph_builder = stream.create_graph_builder()
+    graph_builder.begin_building()
+
+    # Add multiple kernel launches to the graph
+    # Kernel 1: result1 = a + b
+    launch(graph_builder, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size))
+
+    # Kernel 2: result2 = result1 * c
+    launch(graph_builder, config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size))
+
+    # Kernel 3: result3 = result2 - a
+    launch(graph_builder, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size))
+
+    # Complete the graph
+    graph = graph_builder.end_building().complete()
+
+    print("Graph built successfully!")
+
+    # Upload the graph to the stream
+    graph.upload(stream)
+
+    # Execute the entire graph with a single launch
+    print("Executing graph...")
+    start_time = 
time.time() + graph.launch(stream) + stream.sync() + end_time = time.time() + + graph_execution_time = end_time - start_time + print(f"Graph execution time: {graph_execution_time:.6f} seconds") + + # Verify results + expected_result1 = a + b + expected_result2 = expected_result1 * c + expected_result3 = expected_result2 - a + + print("Verifying results...") + assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" + assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" + assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" + print("All results verified successfully!") + + # Demonstrate performance benefit by running the same operations without graph + print("\nRunning same operations without graph for comparison...") + + # Reset results + result1.fill(0) + result2.fill(0) + result3.fill(0) + + start_time = time.time() + + # Individual kernel launches + launch(stream, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size)) + launch(stream, config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size)) + launch(stream, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size)) + + stream.sync() + end_time = time.time() + + individual_execution_time = end_time - start_time + print(f"Individual kernel execution time: {individual_execution_time:.6f} seconds") + + # Calculate speedup + speedup = individual_execution_time / graph_execution_time + print(f"Graph provides {speedup:.2f}x speedup") + + # Verify results again + assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" + assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" + assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" + + cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream + + print("\nExample completed successfully!") + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index a44c5a4c45..5fe95eeccb 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -4,7 +4,7 @@ # ################################################################################ # -# This demo aims to illustrate a couple takeaways: +# This demo illustrates: # # 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together # 2. That linking allows for libraries to modify workflows dynamically at runtime diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 4e8ce55337..0eb86f2af4 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -2,8 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 -## Usage: pip install "cuda-core[cu12]" -## python python_example.py +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to compile a CUDA kernel +# and launch it using PyTorch tensors as inputs. 
+# +# ## Usage: pip install "cuda-core[cu12]" +# ## python pytorch_example.py +# +# ################################################################################ + import sys import torch diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 2255311418..fedf7aafd4 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -2,6 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to compile a templated CUDA kernel +# and launch it using `cupy` arrays as inputs. This is a simple example of a +# templated kernel, where the kernel is instantiated for both `float` and `double` +# data types. +# +# ################################################################################ + import sys import cupy as cp @@ -32,6 +41,10 @@ arch = "".join(f"{i}" for i in dev.compute_capability) program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}") prog = Program(code, code_type="c++", options=program_options) + +# Note the use of the `name_expressions` argument to specify the template +# instantiations of the kernel that we will use. For non-templated kernels, +# `name_expressions` will simply contain the name of the kernels. mod = prog.compile( "cubin", logs=sys.stdout, diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index 318ea41fb7..3d5ac63d4a 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to show the properties of the +# CUDA devices in the system. +# +# ################################################################################ + import sys from cuda.core.experimental import Device, system diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index a9b006b31d..63d5ebf4ee 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to compile and launch kernels +# on multiple GPUs. +# +# ################################################################################ + import sys import cupy as cp diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index 0fa8f38e6c..066f1afc18 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -4,14 +4,13 @@ # ################################################################################ # -# This demo aims to illustrate two takeaways: +# This demo illustrates: # # 1. The similarity between CPU and GPU JIT-compilation with C++ sources # 2. How to use StridedMemoryView to interface with foreign C/C++ functions # -# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instructions. -# We also use NumPy/CuPy as the CPU/GPU array container. 
+# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be +# easily installed from pip or conda following their instructions. # # ################################################################################ diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 10d12fd30c..57cfa6f3cf 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -4,14 +4,13 @@ # ################################################################################ # -# This demo aims to illustrate two takeaways: +# This demo illustrates: # # 1. The similarity between CPU and GPU JIT-compilation with C++ sources # 2. How to use StridedMemoryView to interface with foreign C/C++ functions # -# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instructions. -# We also use NumPy/CuPy as the CPU/GPU array container. +# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be +# easily installed from pip or conda following their instructions. # # ################################################################################ diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e761fbb47d..ce337c7af7 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates the use of thread block clusters in the CUDA launch +# configuration. +# +# ################################################################################ + import os import sys diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 9f48d31f0e..94ca201efc 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to compile and launch a simple +# vector addition kernel. +# +# ################################################################################ + import cupy as cp from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 8518a2e282..fe046e5f9e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -46,8 +46,8 @@ dependencies = [ ] [project.optional-dependencies] -cu11 = ["cuda-bindings==11.8.*"] -cu12 = ["cuda-bindings==12.*"] +cu11 = ["cuda-bindings[all]==11.8.*"] +cu12 = ["cuda-bindings[all]==12.*"] test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"] test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy