diff --git a/README.md b/README.md index 2131d3f1..27eed85e 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ if __name__ == "__main__": ### Quick Installation > [!NOTE] -> **Requirements**: Python 3.10+, PyTorch 2.0+ (ROCm version), ROCm 6.3.1+ HIP runtime, and Triton +> **Requirements**: Python 3.10+, PyTorch 2.0+ (ROCm version), ROCm 6.3.1+ HIP runtime, Triton, and setuptools>=61 For a quick installation directly from the repository: diff --git a/iris/hip.py b/iris/hip.py index ba9e2051..89807860 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -8,73 +8,126 @@ import subprocess import os -rt_path = "libamdhip64.so" -hip_runtime = ctypes.cdll.LoadLibrary(rt_path) +# Auto-detect backend +_is_amd_backend = True +try: + rt_path = "libamdhip64.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) +except OSError: + try: + rt_path = "libcudart.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) + _is_amd_backend = False + except OSError: + rt_path = "libamdhip64.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) -def hip_try(err): +def gpu_try(err): if err != 0: - hip_runtime.hipGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") - raise RuntimeError(f"HIP error code {err}: {error_string}") + if _is_amd_backend: + gpu_runtime.hipGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"HIP error code {err}: {error_string}") + else: + gpu_runtime.cudaGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"CUDA error code {err}: {error_string}") + +def get_ipc_handle_size(): + """Return the IPC handle size for the current backend.""" + return 64 if _is_amd_backend else 128 -class hipIpcMemHandle_t(ctypes.Structure): - _fields_ = [("reserved", ctypes.c_char * 64)] + +class gpuIpcMemHandle_t(ctypes.Structure): + _fields_ = [("reserved", ctypes.c_char * get_ipc_handle_size())] def open_ipc_handle(ipc_handle_data, rank): ptr = ctypes.c_void_p() - hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - hip_runtime.hipIpcOpenMemHandle.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, - ctypes.c_uint, - ] + handle_size = get_ipc_handle_size() + + if _is_amd_backend: + hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + gpu_runtime.hipIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + gpuIpcMemHandle_t, + ctypes.c_uint, + ] + else: + gpu_runtime.cudaIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + gpuIpcMemHandle_t, + ctypes.c_uint, + ] + cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + if isinstance(ipc_handle_data, np.ndarray): - if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 64: - raise ValueError("ipc_handle_data must be a 64-element uint8 numpy array") + if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != handle_size: + raise ValueError(f"ipc_handle_data must be a {handle_size}-element uint8 numpy array") ipc_handle_bytes = ipc_handle_data.tobytes() - ipc_handle_data = (ctypes.c_char * 64).from_buffer_copy(ipc_handle_bytes) + ipc_handle_data = (ctypes.c_char * handle_size).from_buffer_copy(ipc_handle_bytes) else: - raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 64 elements") + raise TypeError(f"ipc_handle_data must be a numpy.ndarray of dtype uint8 with {handle_size} elements") - raw_memory = ctypes.create_string_buffer(64) - ctypes.memset(raw_memory, 0x00, 64) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + raw_memory = ctypes.create_string_buffer(handle_size) + ctypes.memset(raw_memory, 0x00, handle_size) + ipc_handle_struct = gpuIpcMemHandle_t.from_buffer(raw_memory) ipc_handle_data_bytes = bytes(ipc_handle_data) - ctypes.memmove(raw_memory, ipc_handle_data_bytes, 64) - - hip_try( - hip_runtime.hipIpcOpenMemHandle( - ctypes.byref(ptr), - ipc_handle_struct, - hipIpcMemLazyEnablePeerAccess, + ctypes.memmove(raw_memory, ipc_handle_data_bytes, handle_size) + + if _is_amd_backend: + gpu_try( + gpu_runtime.hipIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + hipIpcMemLazyEnablePeerAccess, + ) + ) + else: + gpu_try( + gpu_runtime.cudaIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + cudaIpcMemLazyEnablePeerAccess, + ) ) - ) return ptr.value def get_ipc_handle(ptr, rank): - ipc_handle = hipIpcMemHandle_t() - hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + ipc_handle = gpuIpcMemHandle_t() + if _is_amd_backend: + gpu_try(gpu_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + else: + gpu_try(gpu_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) return ipc_handle def count_devices(): device_count = ctypes.c_int() - hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + if _is_amd_backend: + gpu_try(gpu_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + else: + gpu_try(gpu_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) return device_count.value def set_device(gpu_id): - hip_try(hip_runtime.hipSetDevice(gpu_id)) + if _is_amd_backend: + gpu_try(gpu_runtime.hipSetDevice(gpu_id)) + else: + gpu_try(gpu_runtime.cudaSetDevice(gpu_id)) def get_device_id(): device_id = ctypes.c_int() - hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) + if _is_amd_backend: + gpu_try(gpu_runtime.hipGetDevice(ctypes.byref(device_id))) + else: + gpu_try(gpu_runtime.cudaGetDevice(ctypes.byref(device_id))) return device_id.value @@ -82,15 +135,25 @@ def get_cu_count(device_id=None): if device_id is None: device_id = get_device_id() - hipDeviceAttributeMultiprocessorCount = 63 cu_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) + if _is_amd_backend: + hipDeviceAttributeMultiprocessorCount = 63 + gpu_try( + gpu_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id) + ) + else: + cudaDevAttrMultiProcessorCount = 16 + gpu_try(gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id)) return cu_count.value def get_rocm_version(): + if not _is_amd_backend: + # Not applicable for CUDA + return (-1, -1) + major, minor = -1, -1 # Try hipconfig --path first @@ -119,47 +182,76 @@ def get_rocm_version(): def get_wall_clock_rate(device_id): - hipDeviceAttributeWallClockRate = 10017 wall_clock_rate = ctypes.c_int() - status = hip_runtime.hipDeviceGetAttribute( - ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id - ) - hip_try(status) + + if _is_amd_backend: + hipDeviceAttributeWallClockRate = 10017 + status = gpu_runtime.hipDeviceGetAttribute( + ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id + ) + else: + cudaDevAttrClockRate = 13 + status = gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id) + + gpu_try(status) return wall_clock_rate.value def get_arch_string(device_id=None): if device_id is None: device_id = get_device_id() - arch_full = torch.cuda.get_device_properties(device_id).gcnArchName - arch_name = arch_full.split(":")[0] - return arch_name + + if _is_amd_backend: + arch_full = torch.cuda.get_device_properties(device_id).gcnArchName + arch_name = arch_full.split(":")[0] + return arch_name + else: + # For CUDA, return compute capability + props = torch.cuda.get_device_properties(device_id) + return f"sm_{props.major}{props.minor}" def get_num_xcc(device_id=None): if device_id is None: device_id = get_device_id() + + if not _is_amd_backend: + # XCC is AMD-specific, return 1 for CUDA + return 1 + rocm_major, _ = get_rocm_version() if rocm_major < 7: return 8 hipDeviceAttributeNumberOfXccs = 10018 xcc_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) + gpu_try(gpu_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) return xcc_count.value def malloc_fine_grained(size): - hipDeviceMallocFinegrained = 0x1 ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + + if _is_amd_backend: + hipDeviceMallocFinegrained = 0x1 + gpu_try(gpu_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + else: + # CUDA doesn't have direct equivalent, use regular malloc + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) + return ptr def hip_malloc(size): ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) + if _is_amd_backend: + gpu_try(gpu_runtime.hipMalloc(ctypes.byref(ptr), size)) + else: + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) return ptr def hip_free(ptr): - hip_try(hip_runtime.hipFree(ptr)) + if _is_amd_backend: + gpu_try(gpu_runtime.hipFree(ptr)) + else: + gpu_try(gpu_runtime.cudaFree(ptr)) diff --git a/iris/iris.py b/iris/iris.py index be91eacb..22627058 100644 --- a/iris/iris.py +++ b/iris/iris.py @@ -39,6 +39,7 @@ get_ipc_handle, open_ipc_handle, get_wall_clock_rate, + get_ipc_handle_size, ) import numpy as np import math @@ -89,13 +90,16 @@ def __init__(self, heap_size=1 << 30): heap_bases = np.zeros(num_ranks, dtype=np.uint64) heap_bases[cur_rank] = heap_base - ipc_handles = np.zeros((num_ranks, 64), dtype=np.uint8) + ipc_handle_size = get_ipc_handle_size() + ipc_handles = np.zeros((num_ranks, ipc_handle_size), dtype=np.uint8) ipc_handle = get_ipc_handle(heap_base_ptr, cur_rank) distributed_barrier() - all_ipc_handles = distributed_allgather(np.frombuffer(ipc_handle, dtype=np.uint8)) - all_heap_bases = distributed_allgather(np.array([heap_bases[cur_rank]], dtype=np.uint64)) + all_ipc_handles = distributed_allgather(np.frombuffer(ipc_handle, dtype=np.uint8).copy()) + heap_base_bytes = np.array([heap_bases[cur_rank]], dtype=np.uint64).tobytes() + all_heap_bases_bytes = distributed_allgather(np.frombuffer(heap_base_bytes, dtype=np.uint8).copy()) + all_heap_bases = np.frombuffer(all_heap_bases_bytes.tobytes(), dtype=np.uint64).reshape(num_ranks, -1) distributed_barrier()