Skip to content

Commit b9c76b3

Browse files
authored
Implements GraphMemoryResource (#1235)
* Implement non-pooling memory allocation. * Add GraphMemoryResource. * Remove mempool_enabled option now that GraphMemoryResource is ready. * Add docstring and make GraphMemoryResource a singleton. * Move tests to a separate file. * Add errors for DeviceMemoryResource and GraphMemoryResource when graph capture state is not as expected. * Add tests for attributes and memory allocation escaping graphs. * Simplify logic for converting IsStreamT arguments. * Standardize Stream arguments to IsStreamT. Update Buffer and MemoryResource methods to take any kind of stream-providing object. Update graph allocation tests. * Add tests for IsStreamT conversions. * Expand files named _gmr.*. Add __eq__ and __hash__ support to StreamWrapper (testing only). * Fix format/lint issues. * Minor clean up. * Change public signatures to accept "Stream | GraphBuilder" where only streams were accepted. Add a helper Stream_accept to accept Stream-like arguments throughout. Revert changes to Stream._init that relaxed acceptance criteria for first argument. Revert addition of StreamWrapper and associated tests. Suppress invalid context errors in deallocate to avoid noise during shutdown. * Add deprecation warning when stream protocol is used with launch. * Fix builds post-merge. * Simplify GraphMemoryResourceAttributes. * Simplify Stream_accept. Default arguments can more easily be handled outside that function. * Adjust tests for platform-dependent behavior. * Disable additional tests for platform-dependent behavior. * Adjust deallocation stream for legacy memory resources to avoid platform-dependent errors. Add dependence on mempool_device where needed for certain tests. Touch-ups.
1 parent b347255 commit b9c76b3

20 files changed

+740
-136
lines changed

cuda_core/cuda/core/experimental/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
Buffer,
4343
DeviceMemoryResource,
4444
DeviceMemoryResourceOptions,
45+
GraphMemoryResource,
4546
LegacyPinnedMemoryResource,
4647
MemoryResource,
4748
VirtualMemoryResource,

cuda_core/cuda/core/experimental/_device.pyx

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ from cuda.bindings cimport cydriver
99
from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
1010

1111
import threading
12-
from typing import Union, TYPE_CHECKING
12+
from typing import Optional, TYPE_CHECKING, Union
1313

1414
from cuda.core.experimental._context import Context, ContextOptions
1515
from cuda.core.experimental._event import Event, EventOptions
@@ -1306,7 +1306,7 @@ class Device:
13061306
ctx = self._get_current_context()
13071307
return Event._init(self._id, ctx, options, True)
13081308

1309-
def allocate(self, size, stream: Stream | None = None) -> Buffer:
1309+
def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
13101310
"""Allocate device memory from a specified stream.
13111311

13121312
Allocates device memory of `size` bytes on the specified `stream`
@@ -1333,8 +1333,6 @@ class Device:
13331333

13341334
"""
13351335
self._check_context_initialized()
1336-
if stream is None:
1337-
stream = default_stream()
13381336
return self.memory_resource.allocate(size, stream)
13391337

13401338
def sync(self):

cuda_core/cuda/core/experimental/_launcher.pyx

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
#
33
# SPDX-License-Identifier: Apache-2.0
4+
from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
5+
from cuda.core.experimental._stream cimport Stream_accept
46

5-
from libc.stdint cimport uintptr_t
6-
7-
from cuda.core.experimental._stream cimport _try_to_get_stream_ptr
8-
9-
from typing import Union
107

118
from cuda.core.experimental._kernel_arg_handler import ParamHolder
12-
from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
139
from cuda.core.experimental._module import Kernel
14-
from cuda.core.experimental._stream import IsStreamT, Stream
10+
from cuda.core.experimental._stream import Stream
1511
from cuda.core.experimental._utils.clear_error_support import assert_type
1612
from cuda.core.experimental._utils.cuda_utils import (
1713
_reduce_3_tuple,
@@ -39,13 +35,13 @@ def _lazy_init():
3935
_inited = True
4036

4137

42-
def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args):
38+
def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
4339
"""Launches a :obj:`~_module.Kernel`
4440
object with launch-time configuration.
4541
4642
Parameters
4743
----------
48-
stream : :obj:`~_stream.Stream`
44+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
4945
The stream establishing the stream ordering semantic of a
5046
launch.
5147
config : :obj:`LaunchConfig`
@@ -58,17 +54,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
5854
launching kernel.
5955
6056
"""
61-
if stream is None:
62-
raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__")
63-
try:
64-
stream_handle = stream.handle
65-
except AttributeError:
66-
try:
67-
stream_handle = driver.CUstream(<uintptr_t>(_try_to_get_stream_ptr(stream)))
68-
except Exception:
69-
raise ValueError(
70-
f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
71-
) from None
57+
stream = Stream_accept(stream, allow_stream_protocol=True)
7258
assert_type(kernel, Kernel)
7359
_lazy_init()
7460
config = check_or_create_options(LaunchConfig, config, "launch config")
@@ -85,20 +71,20 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
8571
# rich.
8672
if _use_ex:
8773
drv_cfg = _to_native_launch_config(config)
88-
drv_cfg.hStream = stream_handle
74+
drv_cfg.hStream = stream.handle
8975
if config.cooperative_launch:
9076
_check_cooperative_launch(kernel, config, stream)
9177
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
9278
else:
9379
# TODO: check if config has any unsupported attrs
9480
handle_return(
9581
driver.cuLaunchKernel(
96-
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0
82+
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
9783
)
9884
)
9985

10086

101-
def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
87+
cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
10288
dev = stream.device
10389
num_sm = dev.properties.multiprocessor_count
10490
max_grid_size = (

cuda_core/cuda/core/experimental/_memory/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from ._buffer import * # noqa: F403
66
from ._device_memory_resource import * # noqa: F403
7+
from ._graph_memory_resource import * # noqa: F403
78
from ._ipc import * # noqa: F403
89
from ._legacy import * # noqa: F403
910
from ._virtual_memory_resource import * # noqa: F403

cuda_core/cuda/core/experimental/_memory/_buffer.pyx

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ from libc.stdint cimport uintptr_t
99
from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
1010
from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor
1111
from cuda.core.experimental._memory cimport _ipc
12-
from cuda.core.experimental._stream cimport default_stream, Stream
12+
from cuda.core.experimental._stream cimport Stream_accept, Stream
1313
from cuda.core.experimental._utils.cuda_utils cimport (
1414
_check_driver_error as raise_if_driver_error,
1515
)
@@ -102,21 +102,21 @@ cdef class Buffer:
102102
"""Export a buffer allocated for sharing between processes."""
103103
return _ipc.Buffer_get_ipc_descriptor(self)
104104

105-
def close(self, stream: Stream = None):
105+
def close(self, stream: Stream | GraphBuilder | None = None):
106106
"""Deallocate this buffer asynchronously on the given stream.
107107
108108
This buffer is released back to its memory resource
109109
asynchronously on the given stream.
110110
111111
Parameters
112112
----------
113-
stream : Stream, optional
113+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
114114
The stream object to use for asynchronous deallocation. If None,
115115
the behavior depends on the underlying memory resource.
116116
"""
117117
Buffer_close(self, stream)
118118

119-
def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
119+
def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer:
120120
"""Copy from this buffer to the dst buffer asynchronously on the given stream.
121121

122122
Copies the data from this buffer to the provided dst buffer.
@@ -127,14 +127,12 @@ cdef class Buffer:
127127
----------
128128
dst : :obj:`~_memory.Buffer`
129129
Destination buffer to copy data to
130-
stream : Stream
130+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
131131
Keyword argument specifying the stream for the
132132
asynchronous copy
133133

134134
"""
135-
if stream is None:
136-
raise ValueError("stream must be provided")
137-
135+
stream = Stream_accept(stream)
138136
cdef size_t src_size = self._size
139137

140138
if dst is None:
@@ -152,21 +150,19 @@ cdef class Buffer:
152150
raise_if_driver_error(err)
153151
return dst
154152

155-
def copy_from(self, src: Buffer, *, stream: Stream):
153+
def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder):
156154
"""Copy from the src buffer to this buffer asynchronously on the given stream.
157155
158156
Parameters
159157
----------
160158
src : :obj:`~_memory.Buffer`
161159
Source buffer to copy data from
162-
stream : Stream
160+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
163161
Keyword argument specifying the stream for the
164162
asynchronous copy
165163
166164
"""
167-
if stream is None:
168-
raise ValueError("stream must be provided")
169-
165+
stream = Stream_accept(stream)
170166
cdef size_t dst_size = self._size
171167
cdef size_t src_size = src._size
172168

@@ -274,17 +270,10 @@ cdef class Buffer:
274270

275271
# Buffer Implementation
276272
# ---------------------
277-
cdef Buffer_close(Buffer self, stream):
273+
cdef inline void Buffer_close(Buffer self, stream):
278274
cdef Stream s
279275
if self._ptr and self._memory_resource is not None:
280-
if stream is None:
281-
if self._alloc_stream is not None:
282-
s = self._alloc_stream
283-
else:
284-
# TODO: remove this branch when from_handle takes a stream
285-
s = <Stream>(default_stream())
286-
else:
287-
s = <Stream>stream
276+
s = Stream_accept(stream) if stream is not None else self._alloc_stream
288277
self._memory_resource.deallocate(self._ptr, self._size, s)
289278
self._ptr = 0
290279
self._memory_resource = None
@@ -305,14 +294,14 @@ cdef class MemoryResource:
305294
"""
306295

307296
@abc.abstractmethod
308-
def allocate(self, size_t size, stream: Stream = None) -> Buffer:
297+
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
309298
"""Allocate a buffer of the requested size.
310299

311300
Parameters
312301
----------
313302
size : int
314303
The size of the buffer to allocate, in bytes.
315-
stream : Stream, optional
304+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
316305
The stream on which to perform the allocation asynchronously.
317306
If None, it is up to each memory resource implementation to decide
318307
and document the behavior.
@@ -326,7 +315,7 @@ cdef class MemoryResource:
326315
...
327316

328317
@abc.abstractmethod
329-
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
318+
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
330319
"""Deallocate a buffer previously allocated by this resource.
331320
332321
Parameters
@@ -335,7 +324,7 @@ cdef class MemoryResource:
335324
The pointer or handle to the buffer to deallocate.
336325
size : int
337326
The size of the buffer to deallocate, in bytes.
338-
stream : Stream, optional
327+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
339328
The stream on which to perform the deallocation asynchronously.
340329
If None, it is up to each memory resource implementation to decide
341330
and document the behavior.

cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ from cuda.bindings cimport cydriver
1212
from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
1313
from cuda.core.experimental._memory cimport _ipc
1414
from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData
15-
from cuda.core.experimental._stream cimport default_stream, Stream
15+
from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
1616
from cuda.core.experimental._utils.cuda_utils cimport (
1717
check_or_create_options,
1818
HANDLE_RETURN,
@@ -65,6 +65,12 @@ cdef class DeviceMemoryResourceAttributes:
6565
self._mr_weakref = mr
6666
return self
6767

68+
def __repr__(self):
69+
return f"{self.__class__.__name__}(%s)" % ", ".join(
70+
f"{attr}={getattr(self, attr)}" for attr in dir(self)
71+
if not attr.startswith("_")
72+
)
73+
6874
cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1:
6975
cdef DeviceMemoryResource mr = <DeviceMemoryResource>(self._mr_weakref())
7076
if mr is None:
@@ -133,7 +139,7 @@ cdef class DeviceMemoryResourceAttributes:
133139

134140
cdef class DeviceMemoryResource(MemoryResource):
135141
"""
136-
Create a device memory resource managing a stream-ordered memory pool.
142+
A device memory resource managing a stream-ordered memory pool.
137143
138144
Parameters
139145
----------
@@ -309,14 +315,14 @@ cdef class DeviceMemoryResource(MemoryResource):
309315
raise RuntimeError("Imported memory resource cannot be exported")
310316
return self._ipc_data._alloc_handle
311317

312-
def allocate(self, size_t size, stream: Stream = None) -> Buffer:
318+
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
313319
"""Allocate a buffer of the requested size.
314320

315321
Parameters
316322
----------
317323
size : int
318324
The size of the buffer to allocate, in bytes.
319-
stream : Stream, optional
325+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
320326
The stream on which to perform the allocation asynchronously.
321327
If None, an internal stream is used.
322328

@@ -328,11 +334,10 @@ cdef class DeviceMemoryResource(MemoryResource):
328334
"""
329335
if self.is_mapped:
330336
raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
331-
if stream is None:
332-
stream = default_stream()
333-
return DMR_allocate(self, size, <Stream>stream)
337+
stream = Stream_accept(stream) if stream is not None else default_stream()
338+
return DMR_allocate(self, size, <Stream> stream)
334339

335-
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
340+
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
336341
"""Deallocate a buffer previously allocated by this resource.
337342
338343
Parameters
@@ -341,15 +346,17 @@ cdef class DeviceMemoryResource(MemoryResource):
341346
The pointer or handle to the buffer to deallocate.
342347
size : int
343348
The size of the buffer to deallocate, in bytes.
344-
stream : Stream, optional
349+
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
345350
The stream on which to perform the deallocation asynchronously.
346351
If the buffer is deallocated without an explicit stream, the allocation stream
347352
is used.
348353
"""
349-
DMR_deallocate(self, <uintptr_t>ptr, size, <Stream>stream)
354+
stream = Stream_accept(stream) if stream is not None else default_stream()
355+
DMR_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
350356

351357
@property
352358
def attributes(self) -> DeviceMemoryResourceAttributes:
359+
"""Memory pool attributes."""
353360
if self._attributes is None:
354361
ref = weakref.ref(self)
355362
self._attributes = DeviceMemoryResourceAttributes._init(ref)
@@ -467,10 +474,21 @@ cdef void DMR_init_create(
467474
self._ipc_data = IPCData(alloc_handle, mapped=False)
468475

469476

470-
cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
477+
# Raise an exception if the given stream is capturing.
478+
# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
479+
cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
480+
cdef cydriver.CUstreamCaptureStatus capturing
481+
HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
482+
if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE:
483+
raise RuntimeError("DeviceMemoryResource cannot perform memory operations on "
484+
"a capturing stream (consider using GraphMemoryResource).")
485+
486+
487+
cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
471488
cdef cydriver.CUstream s = stream._handle
472489
cdef cydriver.CUdeviceptr devptr
473490
with nogil:
491+
check_not_capturing(s)
474492
HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
475493
cdef Buffer buf = Buffer.__new__(Buffer)
476494
buf._ptr = <uintptr_t>(devptr)
@@ -481,16 +499,19 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
481499
return buf
482500

483501

484-
cdef void DMR_deallocate(
502+
cdef inline void DMR_deallocate(
485503
DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream
486504
) noexcept:
487505
cdef cydriver.CUstream s = stream._handle
488506
cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
507+
cdef cydriver.CUresult r
489508
with nogil:
490-
HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
509+
r = cydriver.cuMemFreeAsync(devptr, s)
510+
if r != cydriver.CUDA_ERROR_INVALID_CONTEXT:
511+
HANDLE_RETURN(r)
491512

492513

493-
cdef DMR_close(DeviceMemoryResource self):
514+
cdef inline DMR_close(DeviceMemoryResource self):
494515
if self._handle == NULL:
495516
return
496517

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from cuda.core.experimental._memory._buffer cimport MemoryResource
6+
7+
8+
cdef class cyGraphMemoryResource(MemoryResource):
9+
cdef:
10+
int _dev_id

0 commit comments

Comments
 (0)