From 1d1171f8e5e7950015f090885560040e819d4d63 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 19 Jun 2025 15:41:38 -0400
Subject: [PATCH 01/10] Add a CUDA graphs example

---
 cuda_core/examples/cuda_graphs.py | 168 ++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 cuda_core/examples/cuda_graphs.py

diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
new file mode 100644
index 0000000000..0f9a5b4374
--- /dev/null
+++ b/cuda_core/examples/cuda_graphs.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo illustrates how to use CUDA graphs to capture and execute
+# multiple kernel launches with minimal overhead. The graph performs a
+# sequence of vector operations: add, multiply, and subtract.
+#
+# ################################################################################
+
+import time
+
+import cupy as cp
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+
+def main():
+    # CUDA kernels for vector operations
+    code = """
+    template <typename T>
+    __global__ void vector_add(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] + B[i];
+        }
+    }
+
+    template <typename T>
+    __global__ void vector_multiply(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] * B[i];
+        }
+    }
+
+    template <typename T>
+    __global__ void vector_subtract(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] - B[i];
+        }
+    }
+    """
+
+    # Initialize device and stream
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+
+    # Compile the program
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin", name_expressions=("vector_add<float>", "vector_multiply<float>", "vector_subtract<float>")
+    )
+
+    # Get kernel functions
+    add_kernel = mod.get_kernel("vector_add<float>")
+    multiply_kernel = mod.get_kernel("vector_multiply<float>")
+    subtract_kernel = mod.get_kernel("vector_subtract<float>")
+
+    # Prepare data
+    size = 1000000
+    dtype = cp.float32
+
+    # Create input arrays
+    rng = cp.random.default_rng(42)  # Fixed seed for reproducibility
+    a = rng.random(size, dtype=dtype)
+    b = rng.random(size, dtype=dtype)
+    c = rng.random(size, dtype=dtype)
+
+    # Create output arrays
+    result1 = cp.empty_like(a)
+    result2 = cp.empty_like(a)
+    result3 = cp.empty_like(a)
+
+    # Prepare launch configuration
+    block_size = 256
+    grid_size = (size + block_size - 1) // block_size
+    config = LaunchConfig(grid=grid_size, block=block_size)
+
+    # Sync before graph capture
+    dev.sync()
+
+    print("Building CUDA graph...")
+
+    # Build the graph
+    graph_builder = stream.create_graph_builder()
+    graph_builder.begin_building()
+
+    # Add multiple kernel launches to the graph
+    # Kernel 1: result1 = a + b
+    launch(graph_builder, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size))
+
+    # Kernel 2: result2 = result1 * c
+    launch(graph_builder, 
config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size)) + + # Kernel 3: result3 = result2 - a + launch(graph_builder, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size)) + + # Complete the graph + graph = graph_builder.end_building().complete() + + print("Graph built successfully!") + + # Upload the graph to the stream + graph.upload(stream) + + # Execute the entire graph with a single launch + print("Executing graph...") + start_time = time.time() + graph.launch(stream) + stream.sync() + end_time = time.time() + + graph_execution_time = end_time - start_time + print(f"Graph execution time: {graph_execution_time:.6f} seconds") + + # Verify results + expected_result1 = a + b + expected_result2 = expected_result1 * c + expected_result3 = expected_result2 - a + + print("Verifying results...") + assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" + assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" + assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" + print("All results verified successfully!") + + # Demonstrate performance benefit by running the same operations without graph + print("\nRunning same operations without graph for comparison...") + + # Reset results + result1.fill(0) + result2.fill(0) + result3.fill(0) + + start_time = time.time() + + # Individual kernel launches + launch(stream, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size)) + launch(stream, config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size)) + launch(stream, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size)) + + stream.sync() + end_time = time.time() + + individual_execution_time = end_time - start_time + print(f"Individual kernel execution time: {individual_execution_time:.6f} seconds") + + # Calculate speedup + speedup = individual_execution_time / graph_execution_time + print(f"Graph provides {speedup:.2f}x speedup") + + # Verify results again + assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch" + assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch" + assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch" + + print("\nExample completed successfully!") + + +if __name__ == "__main__": + main() From f161652e564d5fe412f6cbee0677108515084206 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 19 Jun 2025 15:42:31 -0400 Subject: [PATCH 02/10] Add a brief description for each of the examples --- cuda_core/examples/jit_lto_fractal.py | 2 +- cuda_core/examples/pytorch_example.py | 12 ++++++++++-- cuda_core/examples/saxpy.py | 13 +++++++++++++ cuda_core/examples/show_device_properties.py | 7 +++++++ cuda_core/examples/simple_multi_gpu_example.py | 7 +++++++ cuda_core/examples/strided_memory_view_cpu.py | 7 +++---- cuda_core/examples/strided_memory_view_gpu.py | 7 +++---- cuda_core/examples/thread_block_cluster.py | 7 +++++++ cuda_core/examples/vector_add.py | 7 +++++++ 9 files changed, 58 insertions(+), 11 deletions(-) diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index a44c5a4c45..5fe95eeccb 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -4,7 +4,7 @@ # ################################################################################ # -# 
This demo aims to illustrate a couple takeaways:
+# This demo illustrates:
 #
 # 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
 # 2. That linking allows for libraries to modify workflows dynamically at runtime
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py
index 4e8ce55337..0eb86f2af4 100644
--- a/cuda_core/examples/pytorch_example.py
+++ b/cuda_core/examples/pytorch_example.py
@@ -2,8 +2,16 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-## Usage: pip install "cuda-core[cu12]"
-## python python_example.py
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to compile a CUDA kernel
+# and launch it using PyTorch tensors as inputs.
+#
+# ## Usage: pip install "cuda-core[cu12]"
+# ## python pytorch_example.py
+#
+# ################################################################################
+
 import sys
 
 import torch
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index 2255311418..fedf7aafd4 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -2,6 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to compile a templated CUDA kernel
+# and launch it using `cupy` arrays as inputs. This is a simple example of a
+# templated kernel, where the kernel is instantiated for both `float` and `double`
+# data types.
+#
+# ################################################################################
+
 import sys
 
 import cupy as cp
@@ -32,6 +41,10 @@
 arch = "".join(f"{i}" for i in dev.compute_capability)
 program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
 prog = Program(code, code_type="c++", options=program_options)
+
+# Note the use of the `name_expressions` argument to specify the template
+# instantiations of the kernel that we will use. For non-templated kernels,
+# `name_expressions` will simply contain the names of the kernels.
 mod = prog.compile(
     "cubin",
     logs=sys.stdout,
diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py
index 318ea41fb7..3d5ac63d4a 100644
--- a/cuda_core/examples/show_device_properties.py
+++ b/cuda_core/examples/show_device_properties.py
@@ -2,6 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to show the properties of the
+# CUDA devices in the system.
+#
+# ################################################################################
+
 import sys
 
 from cuda.core.experimental import Device, system
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index a9b006b31d..63d5ebf4ee 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -2,6 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+# ################################################################################
+#
+# This demo illustrates how to use `cuda.core` to compile and launch kernels
+# on multiple GPUs. 
+# +# ################################################################################ + import sys import cupy as cp diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index 0fa8f38e6c..066f1afc18 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -4,14 +4,13 @@ # ################################################################################ # -# This demo aims to illustrate two takeaways: +# This demo illustrates: # # 1. The similarity between CPU and GPU JIT-compilation with C++ sources # 2. How to use StridedMemoryView to interface with foreign C/C++ functions # -# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instructions. -# We also use NumPy/CuPy as the CPU/GPU array container. +# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be +# easily installed from pip or conda following their instructions. # # ################################################################################ diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 10d12fd30c..57cfa6f3cf 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -4,14 +4,13 @@ # ################################################################################ # -# This demo aims to illustrate two takeaways: +# This demo illustrates: # # 1. The similarity between CPU and GPU JIT-compilation with C++ sources # 2. How to use StridedMemoryView to interface with foreign C/C++ functions # -# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instructions. -# We also use NumPy/CuPy as the CPU/GPU array container. +# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be +# easily installed from pip or conda following their instructions. # # ################################################################################ diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e761fbb47d..ce337c7af7 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates the use of thread block clusters in the CUDA launch +# configuration. +# +# ################################################################################ + import os import sys diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 9f48d31f0e..94ca201efc 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +# ################################################################################ +# +# This demo illustrates how to use `cuda.core` to compile and launch a simple +# vector addition kernel. 
+
+# ################################################################################
+
 import cupy as cp
 
 from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

From dc63296559ba253f81def3579235ba342eb4d801 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 19 Jun 2025 15:49:40 -0400
Subject: [PATCH 03/10] Add getting started page

---
 cuda_core/docs/source/api.rst            |   2 +
 cuda_core/docs/source/getting-started.md | 112 +++++++++++++++++++++++
 cuda_core/docs/source/index.rst          |  21 +++--
 3 files changed, 128 insertions(+), 7 deletions(-)
 create mode 100644 cuda_core/docs/source/getting-started.md

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 7f9f814d41..01501c52f6 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -21,6 +21,8 @@ CUDA runtime
    GraphBuilder
    launch
    Buffer
+   Stream
+   Event
    MemoryResource
    DeviceMemoryResource
    LegacyPinnedMemoryResource
diff --git a/cuda_core/docs/source/getting-started.md b/cuda_core/docs/source/getting-started.md
new file mode 100644
index 0000000000..44490270d4
--- /dev/null
+++ b/cuda_core/docs/source/getting-started.md
@@ -0,0 +1,112 @@
+# Overview
+
+## What is `cuda.core`?
+
+`cuda.core` provides a Pythonic interface to the CUDA runtime and other functionality,
+including:
+
+- Compiling and launching CUDA kernels
+- Asynchronous concurrent execution with CUDA graphs, streams and events
+- Coordinating work across multiple CUDA devices
+- Allocating, transferring, and managing device memory
+- Runtime linking of device code with Link-Time Optimization (LTO)
+- and much more!
+
+Rather than providing 1:1 equivalents of the CUDA driver and runtime APIs
+(for that, see [`cuda.bindings`][bindings]), `cuda.core` provides high-level constructs such as:
+
+- {class}`Device <cuda.core.experimental.Device>` class for GPU device operations and context management.
+- {class}`Buffer <cuda.core.experimental.Buffer>` and {class}`MemoryResource <cuda.core.experimental.MemoryResource>` classes for memory allocation and management.
+- {class}`Program <cuda.core.experimental.Program>` for JIT compilation of CUDA kernels.
+- {class}`GraphBuilder <cuda.core.experimental.GraphBuilder>` for building and executing CUDA graphs.
+- {class}`Stream <cuda.core.experimental.Stream>` and {class}`Event <cuda.core.experimental.Event>` for asynchronous execution and timing.
+
+## Example: Compiling and Launching a CUDA kernel
+
+To get a taste for `cuda.core`, let's walk through a simple example that compiles and launches a vector addition kernel.
+You can find the complete example in [`vector_add.py`][vector_add_example].
+
+First, we define a string containing the CUDA C++ kernel. Note that this is a templated kernel:
+
+```python
+# compute c = a + b
+code = """
+template <typename T>
+__global__ void vector_add(const T* A,
+                           const T* B,
+                           T* C,
+                           size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        C[i] = A[i] + B[i];
+    }
+}
+"""
+```
+
+Before we can compile this kernel, we need a {class}`Device <cuda.core.experimental.Device>` object
+and a corresponding {class}`Stream <cuda.core.experimental.Stream>`.
+Don't forget to use {meth}`Device.set_current() <cuda.core.experimental.Device.set_current>`!
+
+```python
+import cupy as cp
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+```
+
+Next, we compile the CUDA C++ kernel from earlier using the {class}`Program <cuda.core.experimental.Program>` class.
+The result of the compilation is saved as a CUBIN. 
Note the use of the `name_expressions` parameter to the {meth}`Program.compile() <cuda.core.experimental.Program.compile>` method to specify which kernel template instantiations to compile:
+
+```python
+arch = "".join(f"{i}" for i in dev.compute_capability)
+program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+prog = Program(code, code_type="c++", options=program_options)
+mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
+```
+
+Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration.
+We're using [CuPy][cupy] arrays as inputs for this example, but you can use PyTorch tensors too
+(we show how to do this in one of our [examples][examples]).
+
+```python
+ker = mod.get_kernel("vector_add<float>")
+
+# Prepare input/output arrays (using CuPy)
+rng = cp.random.default_rng()
+size = 50000
+a = rng.random(size, dtype=cp.float32)
+b = rng.random(size, dtype=cp.float32)
+c = cp.empty_like(a)
+
+# Configure launch parameters
+block = 256
+grid = (size + block - 1) // block
+config = LaunchConfig(grid=grid, block=block)
+```
+
+Finally, we use the {func}`launch <cuda.core.experimental.launch>` function to execute our kernel on the specified stream with the given configuration and arguments. Note the use of `.data.ptr` to get the pointer to the array data.
+
+```python
+launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+s.sync()
+```
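+
+As a quick sanity check, we can confirm on the host that the kernel produced
+the expected result (the full `vector_add.py` example performs the same check):
+
+```python
+assert cp.allclose(c, a + b)
+```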
+
+This example demonstrates one of the core workflows enabled by `cuda.core`: compiling and launching CUDA code.
+Note the clean, Pythonic interface, and absence of any direct calls to the CUDA runtime/driver APIs!
+
+## Examples and Recipes
+
+As we mentioned before, `cuda.core` can do much more than just compile and launch kernels!
+
+The best way to explore and learn the different features of `cuda.core` is through
+our [`examples`][examples]. Find one that matches your use-case, and modify it to fit your needs!
+
+
+[bindings]: https://nvidia.github.io/cuda-python/cuda-bindings/latest/
+[cai]: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
+[cupy]: https://cupy.dev/
+[dlpack]: https://dmlc.github.io/dlpack/latest/
+[examples]: https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples
+[vector_add_example]: https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples/vector_add.py
diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst
index 69d3d25c9f..9832b775c8 100644
--- a/cuda_core/docs/source/index.rst
+++ b/cuda_core/docs/source/index.rst
@@ -1,23 +1,30 @@
 .. SPDX-License-Identifier: Apache-2.0
 
-``cuda.core``: Pythonic access to CUDA core functionalities
-===========================================================
+``cuda.core``: Pythonic access to CUDA core functionality
+==========================================================
 
-The new Python module ``cuda.core`` offers idiomatic, pythonic access to CUDA runtime
-and other functionalities.
+Welcome to the documentation for ``cuda.core``.
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
 
-   release
-   install.md
+   getting-started
+   install
    interoperability
    api
    contribute
-   conduct.md
+
+.. toctree::
+   :maxdepth: 1
+
+   conduct
    license
 
+.. toctree::
+   :maxdepth: 2
+
+   release
+
 Indices and tables
 ==================

From 08919d92e830728cc30ee9d7a0b7b78f36f2fb28 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 19 Jun 2025 15:50:14 -0400
Subject: [PATCH 04/10] Load nvrtc using path_finder

---
 cuda_core/cuda/core/__init__.py | 3 +++
 cuda_core/pyproject.toml        | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 281374f649..516ab3f9d4 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -2,4 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from cuda.bindings.path_finder import _load_nvidia_dynamic_library
 from cuda.core._version import __version__
+
+""" _load_nvidia_dynamic_library("nvrtc") """
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 8518a2e282..fe046e5f9e 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -46,8 +46,8 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-cu11 = ["cuda-bindings==11.8.*"]
-cu12 = ["cuda-bindings==12.*"]
+cu11 = ["cuda-bindings[all]==11.8.*"]
+cu12 = ["cuda-bindings[all]==12.*"]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
 test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
 test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy

From aa0e36f54a04544be544b14cfcb94cd353917b12 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Tue, 24 Jun 2025 09:21:23 -0400
Subject: [PATCH 05/10] Tell CuPy to use our stream

---
 cuda_core/examples/cuda_graphs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
index 0f9a5b4374..2db89f37f7 100644
--- a/cuda_core/examples/cuda_graphs.py
+++ b/cuda_core/examples/cuda_graphs.py
@@ -49,6 +49,8 @@ def main():
     dev = Device()
     dev.set_current()
     stream = dev.create_stream()
+    # tell CuPy to use our stream as the current stream:
+    cp.cuda.ExternalStream(int(stream.handle)).use()
 
     # Compile the program
     arch = "".join(f"{i}" for i in dev.compute_capability)
@@ -161,6 +163,8 @@ def main():
     assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch"
     assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch"
 
+    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
+
     print("\nExample completed successfully!")

From 5f12bb81aa2e1b96c8a1644dde8123c89561a8d4 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 09:51:32 -0400
Subject: [PATCH 06/10] Fix dependency specification

---
 cuda_core/pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index fe046e5f9e..84f3c75181 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -49,8 +49,8 @@ dependencies = [
 cu11 = ["cuda-bindings[all]==11.8.*"]
 cu12 = ["cuda-bindings[all]==12.*"]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
-test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
-test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
+test-cu11 = ["cuda-core[cu11,test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
+test-cu12 = ["cuda-core[cu12,test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
 
 [project.urls]
 homepage = "https://nvidia.github.io/cuda-python/"

From 0c9f25fb444e1ff09e329b30c13aa2e7af25108a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 09:55:17 -0400
Subject: [PATCH 07/10] Too exciting

---
 cuda_core/docs/source/getting-started.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/docs/source/getting-started.md b/cuda_core/docs/source/getting-started.md
index 44490270d4..a1eccb0fe4 100644
--- a/cuda_core/docs/source/getting-started.md
+++ b/cuda_core/docs/source/getting-started.md
@@ -94,11 +94,11 @@ s.sync()
 ```
 
 This example demonstrates one of the core workflows enabled by `cuda.core`: compiling and launching CUDA code.
-Note the clean, Pythonic interface, and absence of any direct calls to the CUDA runtime/driver APIs!
+Note the clean, Pythonic interface, and absence of any direct calls to the CUDA runtime/driver APIs.
 
 ## Examples and Recipes
 
-As we mentioned before, `cuda.core` can do much more than just compile and launch kernels!
+As we mentioned before, `cuda.core` can do much more than just compile and launch kernels.

From 4af0bb72e58fbd57a3f3e4c7b02fdb42027b9f10 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 10:37:49 -0400
Subject: [PATCH 08/10] Don't load nvrtc in cuda.core

---
 cuda_core/cuda/core/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 516ab3f9d4..281374f649 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -2,7 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.bindings.path_finder import _load_nvidia_dynamic_library
 from cuda.core._version import __version__
-
-""" _load_nvidia_dynamic_library("nvrtc") """

From 310f3912f74f6ce50209dd1d75af4e3831d591d1 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath
Date: Thu, 26 Jun 2025 16:03:29 -0400
Subject: [PATCH 09/10] Revert

---
 cuda_core/pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 84f3c75181..fe046e5f9e 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -49,8 +49,8 @@ dependencies = [
 cu11 = ["cuda-bindings[all]==11.8.*"]
 cu12 = ["cuda-bindings[all]==12.*"]
 test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"]
-test-cu11 = ["cuda-core[cu11,test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
-test-cu12 = ["cuda-core[cu12,test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
+test-cu11 = ["cuda-core[test]", "cupy-cuda11x", "nvidia-cuda-runtime-cu11"] # runtime headers needed by CuPy
+test-cu12 = ["cuda-core[test]", "cupy-cuda12x", "nvidia-cuda-runtime-cu12"] # runtime headers needed by CuPy
 
 [project.urls]
 homepage = "https://nvidia.github.io/cuda-python/"

From 806b9a7e3567f4e15df872cf08de877af0c56580 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 26 Jun 2025 21:49:03 +0000
Subject: [PATCH 10/10] update CI handling of installing cuda.core dependencies
 when testing against local CTK

---
 ci/tools/run-tests | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 51fcf0a6b4..3a35b6ad23 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -57,7 +57,13 @@ elif [[ "${test_module}" == 
"core" ]]; then echo "Installing core wheel" pwd ls - pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}","test-cu${TEST_CUDA_MAJOR}"] + if [[ "${LOCAL_CTK}" == 1 ]]; then + # We already installed cuda-bindings, and all CTK components exist locally, + # so just install the test dependencies. + pip install $(ls *.whl)["test-cu${TEST_CUDA_MAJOR}"] + else + pip install $(ls *.whl)["cu${TEST_CUDA_MAJOR}","test-cu${TEST_CUDA_MAJOR}"] + fi popd pushd ./cuda_core echo "Running core tests" @@ -69,4 +75,4 @@ elif [[ "${test_module}" == "core" ]]; then ${SANITIZER_CMD} pytest -rxXs -v tests/cython fi popd -fi \ No newline at end of file +fi