Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
685 changes: 685 additions & 0 deletions cuda_core/cuda/core/experimental/_layout.pxd

Large diffs are not rendered by default.

1,323 changes: 1,323 additions & 0 deletions cuda_core/cuda/core/experimental/_layout.pyx

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions cuda_core/cuda/core/experimental/_memory/_buffer.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,22 @@ from libc.stdint cimport uintptr_t
from cuda.core.experimental._stream cimport Stream


cdef struct _MemAttrs:
int device_id
bint is_device_accessible
bint is_host_accessible


cdef class Buffer:
cdef:
uintptr_t _ptr
size_t _size
MemoryResource _memory_resource
object _owner
object _ptr_obj
Stream _alloc_stream
_MemAttrs _mem_attrs
bint _mem_attrs_inited


cdef class MemoryResource:
Expand Down
110 changes: 101 additions & 9 deletions cuda_core/cuda/core/experimental/_memory/_buffer.pyx
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@leofang,

NB.

from cuda.bindings import driver
driver.cuPointerGetAttributes(1, [driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL], 0)

returns

(<CUresult.CUDA_SUCCESS: 0>, [4294967294])
while the actual value returned by the underlying API is -2. Looks like some unsigned/signed type mismatch.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from __future__ import annotations

cimport cython
from libc.stdint cimport uintptr_t

from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
Expand All @@ -12,7 +13,9 @@ from cuda.core.experimental._memory cimport _ipc
from cuda.core.experimental._stream cimport Stream_accept, Stream
from cuda.core.experimental._utils.cuda_utils cimport (
_check_driver_error as raise_if_driver_error,
HANDLE_RETURN,
)
from cuda.bindings cimport cydriver

import abc
from typing import TypeVar, Union
Expand Down Expand Up @@ -47,6 +50,8 @@ cdef class Buffer:
self._memory_resource = None
self._ptr_obj = None
self._alloc_stream = None
self._owner = None
self._mem_attrs_inited = False

def __init__(self, *args, **kwargs):
raise RuntimeError("Buffer objects cannot be instantiated directly. "
Expand All @@ -55,14 +60,17 @@ cdef class Buffer:
@classmethod
def _init(
cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
stream: Stream | None = None
stream: Stream | None = None, owner : object | None = None,
):
cdef Buffer self = Buffer.__new__(cls)
self._ptr = <uintptr_t>(int(ptr))
self._ptr_obj = ptr
self._size = size
if mr is not None and owner is not None:
raise ValueError("owner and memory resource cannot be both specified together")
self._memory_resource = mr
self._alloc_stream = <Stream>(stream) if stream is not None else None
self._owner = owner
return self

def __dealloc__(self):
Expand All @@ -74,7 +82,8 @@ cdef class Buffer:

@staticmethod
def from_handle(
ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None
ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
owner: object | None = None,
) -> Buffer:
"""Create a new :class:`Buffer` object from a pointer.

Expand All @@ -86,9 +95,13 @@ cdef class Buffer:
Memory size of the buffer
mr : :obj:`~_memory.MemoryResource`, optional
Memory resource associated with the buffer
owner : object, optional
An object holding external allocation that the ``ptr`` points to.
The reference is kept as long as the buffer is alive.
The ``owner`` and ``mr`` cannot be specified together.
"""
# TODO: It is better to take a stream for latter deallocation
return Buffer._init(ptr, size, mr=mr)
return Buffer._init(ptr, size, mr=mr, owner=owner)

@classmethod
def from_ipc_descriptor(
Expand Down Expand Up @@ -224,7 +237,9 @@ cdef class Buffer:
"""Return the device ordinal of this buffer."""
if self._memory_resource is not None:
return self._memory_resource.device_id
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
else:
Buffer_init_mem_attrs(self)
return self._mem_attrs.device_id

@property
def handle(self) -> DevicePointerT:
Expand All @@ -248,14 +263,18 @@ cdef class Buffer:
"""Return True if this buffer can be accessed by the GPU, otherwise False."""
if self._memory_resource is not None:
return self._memory_resource.is_device_accessible
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
else:
Buffer_init_mem_attrs(self)
return self._mem_attrs.is_device_accessible

@property
def is_host_accessible(self) -> bool:
"""Return True if this buffer can be accessed by the CPU, otherwise False."""
if self._memory_resource is not None:
return self._memory_resource.is_host_accessible
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
else:
Buffer_init_mem_attrs(self)
return self._mem_attrs.is_host_accessible

@property
def memory_resource(self) -> MemoryResource:
Expand All @@ -267,20 +286,93 @@ cdef class Buffer:
"""Return the memory size of this buffer."""
return self._size

@property
def owner(self) -> object:
"""Return the object holding external allocation."""
return self._owner


# Buffer Implementation
# ---------------------
cdef inline void Buffer_close(Buffer self, stream):
cdef Stream s
if self._ptr and self._memory_resource is not None:
s = Stream_accept(stream) if stream is not None else self._alloc_stream
self._memory_resource.deallocate(self._ptr, self._size, s)
if self._ptr:
if self._memory_resource is not None:
s = Stream_accept(stream) if stream is not None else self._alloc_stream
self._memory_resource.deallocate(self._ptr, self._size, s)
self._ptr = 0
self._memory_resource = None
self._owner = None
self._ptr_obj = None
self._alloc_stream = None


cdef Buffer_init_mem_attrs(Buffer self):
if not self._mem_attrs_inited:
query_memory_attrs(self._mem_attrs, self._ptr)
self._mem_attrs_inited = True


cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil:
cdef unsigned int memory_type = 0
cdef int is_managed = 0
cdef int device_id = 0
_query_memory_attrs(memory_type, is_managed, device_id, <cydriver.CUdeviceptr>ptr)

if memory_type == 0:
# unregistered host pointer
out.is_host_accessible = True
out.is_device_accessible = False
out.device_id = -1
# for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE,
# so we need to check it first not to falsely claim it is not
# host accessible.
elif (
is_managed
or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
):
# For pinned memory allocated with cudaMallocHost or paged-locked
# with cudaHostRegister, the memory_type is
# cydriver.CUmemorytype.CU_MEMORYTYPE_HOST.
# TODO(ktokarski): In some cases, the registered memory requires
# using different ptr for device and host, we could check
# cuMemHostGetDevicePointer and
# CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM
# to double check the device accessibility.
out.is_host_accessible = True
out.is_device_accessible = True
out.device_id = device_id
elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
out.is_host_accessible = False
out.is_device_accessible = True
out.device_id = device_id
else:
raise ValueError(f"Unsupported memory type: {memory_type}")
return 0


cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil:
cdef cydriver.CUpointer_attribute attrs[3]
cdef uintptr_t vals[3]
attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE
attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED
attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
vals[0] = <uintptr_t><void*>&memory_type
vals[1] = <uintptr_t><void*>&is_managed
vals[2] = <uintptr_t><void*>&device_id

cdef cydriver.CUresult ret
ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED:
with cython.gil:
# Device class handles the cuInit call internally
from cuda.core.experimental import Device
Device()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we call cuInit if that's what we're after, instead of calling Device?

ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
HANDLE_RETURN(ret)
return 0


cdef class MemoryResource:
"""Abstract base class for memory resources that manage allocation and
deallocation of buffers.
Expand Down
Loading