From 1d1171f8e5e7950015f090885560040e819d4d63 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 19 Jun 2025 15:41:38 -0400
Subject: [PATCH 01/10] Add a CUDA graphs example

---
 cuda_core/examples/cuda_graphs.py | 168 ++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 cuda_core/examples/cuda_graphs.py

diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
new file mode 100644
index 0000000000..0f9a5b4374
--- /dev/null
+++ b/cuda_core/examples/cuda_graphs.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo illustrates how to use CUDA graphs to capture and execute
+# multiple kernel launches with minimal overhead. The graph performs a
+# sequence of vector operations: add, multiply, and subtract.
+#
+# ################################################################################
+
+import time
+
+import cupy as cp
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+
+def main():
+    # CUDA kernels for vector operations
+    code = """
+    template <typename T>
+    __global__ void vector_add(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] + B[i];
+        }
+    }
+
+    template <typename T>
+    __global__ void vector_multiply(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] * B[i];
+        }
+    }
+
+    template <typename T>
+    __global__ void vector_subtract(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] - B[i];
+        }
+    }
+    """
+
+    # Initialize device and stream
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+
+    # Compile the program
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin", name_expressions=("vector_add<float>", "vector_multiply<float>", "vector_subtract<float>")
+    )
+
+    # Get kernel functions
+    add_kernel = mod.get_kernel("vector_add<float>")
+    multiply_kernel = mod.get_kernel("vector_multiply<float>")
+    subtract_kernel = mod.get_kernel("vector_subtract<float>")
+
+    # Prepare data
+    size = 1000000
+    dtype = cp.float32
+
+    # Create input arrays
+    rng = cp.random.default_rng(42)  # Fixed seed for reproducibility
+    a = rng.random(size, dtype=dtype)
+    b = rng.random(size, dtype=dtype)
+    c = rng.random(size, dtype=dtype)
+
+    # Create output arrays
+    result1 = cp.empty_like(a)
+    result2 = cp.empty_like(a)
+    result3 = cp.empty_like(a)
+
+    # Prepare launch configuration
+    block_size = 256
+    grid_size = (size + block_size - 1) // block_size
+    config = LaunchConfig(grid=grid_size, block=block_size)
+
+    # Sync before graph capture
+    dev.sync()
+
+    print("Building CUDA graph...")
+
+    # Build the graph
+    graph_builder = stream.create_graph_builder()
+    graph_builder.begin_building()
+
+    # Add multiple kernel launches to the graph
+    # Kernel 1: result1 = a + b
+    launch(graph_builder, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size))
+
+    # Kernel 2: result2 = result1 * c
+    launch(graph_builder, 
config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size)) + + # Kernel 3: result3 = result2 - a + launch(graph_builder, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size)) + + # Complete the graph + graph = graph_builder.end_building().complete() + + print("Graph built successfully!") + + # Upload the graph to the stream + graph.upload(stream) + + # Execute the entire graph with a single launch + print("Executing graph...") + start_time = time.time() + graph.launch(stream) + stream.sync() + end_time = time.time() + + graph_execution_time = end_time - start_time + print(f"Graph execution time: {graph_execution_time:.6f} seconds") + + # Verify results + expected_result1 = a + b + expected_result2 = expected_result1 * c + expected_result3 = expected_result2 - a + + print("Verifying results...") + assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" + assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" + assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" + print("All results verified successfully!") + + # Demonstrate performance benefit by running the same operations without graph + print("\nRunning same operations without graph for comparison...") + + # Reset results + result1.fill(0) + result2.fill(0) + result3.fill(0) + + start_time = time.time() + + # Individual kernel launches + launch(stream, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size)) + launch(stream, config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size)) + launch(stream, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size)) + + stream.sync() + end_time = time.time() + + individual_execution_time = end_time - start_time + print(f"Individual kernel execution time: {individual_execution_time:.6f} seconds") + + # Calculate speedup + speedup = individual_execution_time / graph_execution_time + print(f"Graph provides {speedup:.2f}x speedup") + + # Verify results again + assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" + assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" + assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" + + print("\nExample completed successfully!") + + +if __name__ == "__main__": + main() From f161652e564d5fe412f6cbee0677108515084206 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 19 Jun 2025 15:42:31 -0400 Subject: [PATCH 02/10] Add a brief description for each of the examples --- cuda_core/examples/jit_lto_fractal.py | 2 +- cuda_core/examples/pytorch_example.py | 12 ++++++++++-- cuda_core/examples/saxpy.py | 13 +++++++++++++ cuda_core/examples/show_device_properties.py | 7 +++++++ cuda_core/examples/simple_multi_gpu_example.py | 7 +++++++ cuda_core/examples/strided_memory_view_cpu.py | 7 +++---- cuda_core/examples/strided_memory_view_gpu.py | 7 +++---- cuda_core/examples/thread_block_cluster.py | 7 +++++++ cuda_core/examples/vector_add.py | 7 +++++++ 9 files changed, 58 insertions(+), 11 deletions(-) diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index a44c5a4c45..5fe95eeccb 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -4,7 +4,7 @@ # ################################################################################ # -# 
This demo aims to illustrate a couple takeaways:
+# This demo illustrates:
 #
 # 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
 # 2. That linking allows for libraries to modify workflows dynamically at runtime
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py
index 4e8ce55337..0eb86f2af4 100644
--- a/cuda_core/examples/pytorch_example.py
+++ b/cuda_core/examples/pytorch_example.py
@@ -2,8 +2,16 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-## Usage: pip install "cuda-core[cu12]"
-## python python_example.py
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to compile a CUDA kernel
+# and launch it using PyTorch tensors as inputs.
+#
+# ## Usage: pip install "cuda-core[cu12]"
+# ## python pytorch_example.py
+#
+# ################################################################################
+
 import sys
 
 import torch
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index 2255311418..fedf7aafd4 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -2,6 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to compile a templated CUDA kernel
+# and launch it using `cupy` arrays as inputs. This is a simple example of a
+# templated kernel, where the kernel is instantiated for both `float` and `double`
+# data types.
+#
+# ################################################################################
+
 import sys
 
 import cupy as cp
@@ -32,6 +41,10 @@
 arch = "".join(f"{i}" for i in dev.compute_capability)
 program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
 prog = Program(code, code_type="c++", options=program_options)
+
+# Note the use of the `name_expressions` argument to specify the template
+# instantiations of the kernel that we will use. For non-templated kernels,
+# `name_expressions` will simply contain the names of the kernels.
 mod = prog.compile(
     "cubin",
     logs=sys.stdout,
diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py
index 318ea41fb7..3d5ac63d4a 100644
--- a/cuda_core/examples/show_device_properties.py
+++ b/cuda_core/examples/show_device_properties.py
@@ -2,6 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to show the properties of the
+# CUDA devices in the system.
+#
+# ################################################################################
+
 import sys
 
 from cuda.core.experimental import Device, system
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index a9b006b31d..63d5ebf4ee 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -2,6 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to compile and launch kernels
+# on multiple GPUs. 
+# +# ################################################################################ + import sys import cupy as cp diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index 0fa8f38e6c..066f1afc18 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -4,14 +4,13 @@ # ################################################################################ # -# This demo aims to illustrate two takeaways: +# This demo illustrates: # # 1. The similarity between CPU and GPU JIT-compilation with C++ sources # 2. How to use StridedMemoryView to interface with foreign C/C++ functions # -# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instructions. -# We also use NumPy/CuPy as the CPU/GPU array container. +# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be +# easily installed from pip or conda following their instructions. # # ################################################################################ diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 10d12fd30c..57cfa6f3cf 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -4,14 +4,13 @@ # ################################################################################ # -# This demo aims to illustrate two takeaways: +# This demo illustrates: # # 1. The similarity between CPU and GPU JIT-compilation with C++ sources # 2. How to use StridedMemoryView to interface with foreign C/C++ functions # -# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instructions. -# We also use NumPy/CuPy as the CPU/GPU array container. +# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be +# easily installed from pip or conda following their instructions. # # ################################################################################ diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e761fbb47d..ce337c7af7 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates the use of thread block clusters in the CUDA launch +# configuration. +# +# ################################################################################ + import os import sys diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 9f48d31f0e..94ca201efc 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to compile and launch a simple +# vector addition kernel. 
+
+# ################################################################################
+
 import cupy as cp
 
 from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

From dc63296559ba253f81def3579235ba342eb4d801 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 19 Jun 2025 15:49:40 -0400
Subject: [PATCH 03/10] Add getting started page

---
 cuda_core/docs/source/api.rst            |   2 +
 cuda_core/docs/source/getting-started.md | 112 +++++++++++++++++++++++
 cuda_core/docs/source/index.rst          |  21 +++--
 3 files changed, 128 insertions(+), 7 deletions(-)
 create mode 100644 cuda_core/docs/source/getting-started.md

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 7f9f814d41..01501c52f6 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -21,6 +21,8 @@ CUDA runtime
    GraphBuilder
    launch
    Buffer
+   Stream
+   Event
    MemoryResource
    DeviceMemoryResource
    LegacyPinnedMemoryResource
diff --git a/cuda_core/docs/source/getting-started.md b/cuda_core/docs/source/getting-started.md
new file mode 100644
index 0000000000..44490270d4
--- /dev/null
+++ b/cuda_core/docs/source/getting-started.md
@@ -0,0 +1,112 @@
+# Overview
+
+## What is `cuda.core`?
+
+`cuda.core` provides a Pythonic interface to the CUDA runtime and other functionality,
+including:
+
+- Compiling and launching CUDA kernels
+- Asynchronous concurrent execution with CUDA graphs, streams and events
+- Coordinating work across multiple CUDA devices
+- Allocating, transferring, and managing device memory
+- Runtime linking of device code with Link-Time Optimization (LTO)
+- and much more!
+
+Rather than providing 1:1 equivalents of the CUDA driver and runtime APIs
+(for that, see [`cuda.bindings`][bindings]), `cuda.core` provides high-level constructs such as:
+
+- {class}`Device <cuda.core.experimental.Device>` class for GPU device operations and context management.
+- {class}`Buffer <cuda.core.experimental.Buffer>` and {class}`MemoryResource <cuda.core.experimental.MemoryResource>` classes for memory allocation and management.
+- {class}`Program <cuda.core.experimental.Program>` for JIT compilation of CUDA kernels.
+- {class}`GraphBuilder <cuda.core.experimental.GraphBuilder>` for building and executing CUDA graphs.
+- {class}`Stream <cuda.core.experimental.Stream>` and {class}`Event <cuda.core.experimental.Event>` for asynchronous execution and timing.
+
+## Example: Compiling and Launching a CUDA kernel
+
+To get a taste for `cuda.core`, let's walk through a simple example that compiles and launches a vector addition kernel.
+You can find the complete example in [`vector_add.py`][vector_add_example].
+
+First, we define a string containing the CUDA C++ kernel. Note that this is a templated kernel:
+
+```python
+# compute c = a + b
+code = """
+template <typename T>
+__global__ void vector_add(const T* A,
+                           const T* B,
+                           T* C,
+                           size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        C[i] = A[i] + B[i];
+    }
+}
+"""
+```
+
+Before we can compile this kernel, we need a {class}`Device <cuda.core.experimental.Device>` object
+and a corresponding {class}`Stream <cuda.core.experimental.Stream>`.
+Don't forget to use {meth}`Device.set_current() <cuda.core.experimental.Device.set_current>`!
+
+```python
+import cupy as cp
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+```
+
+Next, we compile the CUDA C++ kernel from earlier using the {class}`Program <cuda.core.experimental.Program>` class.
+The result of the compilation is saved as a CUBIN. 
Note the use of the `name_expressions` parameter to the {meth}`Program.compile() <cuda.core.experimental.Program.compile>` method to specify which kernel template instantiations to compile:
+
+```python
+arch = "".join(f"{i}" for i in dev.compute_capability)
+program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+prog = Program(code, code_type="c++", options=program_options)
+mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
+```
+
+Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration.
+We're using [CuPy][cupy] arrays as inputs for this example, but you can use PyTorch tensors too
+(we show how to do this in one of our [examples][examples]).
+
+```python
+ker = mod.get_kernel("vector_add<float>")
+
+# Prepare input/output arrays (using CuPy)
+rng = cp.random.default_rng()
+size = 50000
+a = rng.random(size, dtype=cp.float32)
+b = rng.random(size, dtype=cp.float32)
+c = cp.empty_like(a)
+
+# Configure launch parameters
+block = 256
+grid = (size + block - 1) // block
+config = LaunchConfig(grid=grid, block=block)
+```
+
+Finally, we use the {func}`launch <cuda.core.experimental.launch>` function to execute our kernel on the specified stream with the given configuration and arguments. Note the use of `.data.ptr` to get the pointer to the array data.
+
+```python
+launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+s.sync()
+```
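+
+As a quick sanity check, we can confirm on the host that the kernel produced
+the expected result (the full `vector_add.py` example performs the same check):
+
+```python
+assert cp.allclose(c, a + b)
+```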
+
+This example demonstrates one of the core workflows enabled by `cuda.core`: compiling and launching CUDA code.
+Note the clean, Pythonic interface, and absence of any direct calls to the CUDA runtime/driver APIs!
+
+## Examples and Recipes
+
+As we mentioned before, `cuda.core` can do much more than just compile and launch kernels!
+
+The best way to explore and learn the different features of `cuda.core` is through
+our [`examples`][examples]. Find one that matches your use-case, and modify it to fit your needs!
+
+
+[bindings]: https://nvidia.github.io/cuda-python/cuda-bindings/latest/
+[cai]: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
+[cupy]: https://cupy.dev/
+[dlpack]: https://dmlc.github.io/dlpack/latest/
+[examples]: https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples
+[vector_add_example]: https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples/vector_add.py
diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst
index 69d3d25c9f..9832b775c8 100644
--- a/cuda_core/docs/source/index.rst
+++ b/cuda_core/docs/source/index.rst
@@ -1,23 +1,30 @@
 .. SPDX-License-Identifier: Apache-2.0
 
-``cuda.core``: Pythonic access to CUDA core functionalities
-===========================================================
+``cuda.core``: Pythonic access to CUDA core functionality
+==========================================================
 
-The new Python module ``cuda.core`` offers idiomatic, pythonic access to CUDA runtime
-and other functionalities.
+Welcome to the documentation for ``cuda.core``.
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
 
-   release
-   install.md
+   getting-started
+   install
    interoperability
    api
    contribute
-   conduct.md
+
+.. toctree::
+   :maxdepth: 1
+
+   conduct
    license
 
+.. toctree::
+   :maxdepth: 2
+
+   release
+
 Indices and tables
 ==================

From 08919d92e830728cc30ee9d7a0b7b78f36f2fb28 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 19 Jun 2025 15:50:14 -0400
Subject: [PATCH 04/10] Load nvrtc using path_finder

---
 cuda_core/cuda/core/__init__.py | 3 +++
 cuda_core/pyproject.toml        | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 281374f649..516ab3f9d4 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -2,4 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from cuda.bindings.path_finder import _load_nvidia_dynamic_library
 from cuda.core._version import __version__
+
+""" _load_nvidia_dynamic_library("nvrtc") """
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 8518a2e282..fe046e5f9e 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -46,8 +46,8 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-cu11 = ["cuda-bindings==11.8.*"]
-cu12 = ["cuda-bindings==12.*"]
+cu11 = ["cuda-bindings[all]==11.8.*"]
+cu12 = ["cuda-bindings[all]==12.*"]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
 test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
 test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy

From aa0e36f54a04544be544b14cfcb94cd353917b12 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Tue, 24 Jun 2025 09:21:23 -0400
Subject: [PATCH 05/10] Tell CuPy to use our stream

---
 cuda_core/examples/cuda_graphs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
index 0f9a5b4374..2db89f37f7 100644
--- a/cuda_core/examples/cuda_graphs.py
+++ b/cuda_core/examples/cuda_graphs.py
@@ -49,6 +49,8 @@ def main():
     dev = Device()
     dev.set_current()
     stream = dev.create_stream()
+    # tell CuPy to use our stream as the current stream:
+    cp.cuda.ExternalStream(int(stream.handle)).use()
 
     # Compile the program
     arch = "".join(f"{i}" for i in dev.compute_capability)
@@ -161,6 +163,8 @@ def main():
     assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch"
     assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch"
 
+    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
+
     print("\nExample completed successfully!")

From 5f12bb81aa2e1b96c8a1644dde8123c89561a8d4 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 09:51:32 -0400
Subject: [PATCH 06/10] Fix dependency specification

---
 cuda_core/pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index fe046e5f9e..84f3c75181 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -49,8 +49,8 @@ dependencies = [
 cu11 = ["cuda-bindings[all]==11.8.*"]
 cu12 = ["cuda-bindings[all]==12.*"]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
-test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
-test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
+test-cu11 = ["cuda-core[cu11,test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
+test-cu12 = ["cuda-core[cu12,test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
 
 [project.urls]
 homepage = "https://nvidia.github.io/cuda-python/"

From 0c9f25fb444e1ff09e329b30c13aa2e7af25108a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 09:55:17 -0400
Subject: [PATCH 07/10] Too exciting

---
 cuda_core/docs/source/getting-started.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/docs/source/getting-started.md b/cuda_core/docs/source/getting-started.md
index 44490270d4..a1eccb0fe4 100644
--- a/cuda_core/docs/source/getting-started.md
+++ b/cuda_core/docs/source/getting-started.md
@@ -94,11 +94,11 @@ s.sync()
 ```
 
 This example demonstrates one of the core workflows enabled by `cuda.core`: compiling and launching CUDA code.
-Note the clean, Pythonic interface, and absence of any direct calls to the CUDA runtime/driver APIs!
+Note the clean, Pythonic interface, and absence of any direct calls to the CUDA runtime/driver APIs.
 
 ## Examples and Recipes
 
-As we mentioned before, `cuda.core` can do much more than just compile and launch kernels!
+As we mentioned before, `cuda.core` can do much more than just compile and launch kernels.

From 4af0bb72e58fbd57a3f3e4c7b02fdb42027b9f10 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 10:37:49 -0400
Subject: [PATCH 08/10] Don't load nvrtc in cuda.core

---
 cuda_core/cuda/core/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 516ab3f9d4..281374f649 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -2,7 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.bindings.path_finder import _load_nvidia_dynamic_library
 from cuda.core._version import __version__
-
-""" _load_nvidia_dynamic_library("nvrtc") """

From 310f3912f74f6ce50209dd1d75af4e3831d591d1 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 16:03:29 -0400
Subject: [PATCH 09/10] Revert

---
 cuda_core/pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 84f3c75181..fe046e5f9e 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -49,8 +49,8 @@ dependencies = [
 cu11 = ["cuda-bindings[all]==11.8.*"]
 cu12 = ["cuda-bindings[all]==12.*"]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
-test-cu11 = ["cuda-core[cu11,test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
-test-cu12 = ["cuda-core[cu12,test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
+test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
+test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
 
 [project.urls]
 homepage = "https://nvidia.github.io/cuda-python/"

From 806b9a7e3567f4e15df872cf08de877af0c56580 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 26 Jun 2025 21:49:03 +0000
Subject: [PATCH 10/10] update CI handling of installing cuda.core dependencies
 when testing against local CTK

---
 ci/tools/run-tests | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 51fcf0a6b4..3a35b6ad23 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -57,7 +57,13 @@ elif [[ "${test_module}" == 
"core" ]]; then echo "Installing core wheel" pwd ls - pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}","test-cu${TEST_CUDA_MAJOR}"] + if [[ "${LOCAL_CTK}" == 1 ]]; then + # We already installed cuda-bindings, and all CTK components exist locally, + # so just install the test dependencies. + pip install $(ls *.whl)["test-cu${TEST_CUDA_MAJOR}"] + else + pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}","test-cu${TEST_CUDA_MAJOR}"] + fi popd pushd ./cuda_core echo "Running core tests" @@ -69,4 +75,4 @@ elif [[ "${test_module}" == "core" ]]; then ${SANITIZER_CMD} pytest -rxXs -v tests/cython fi popd -fi \ No newline at end of file +fi