From 443bfa8692c945539bec21e2b910fc22f67f7bc0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:27:00 +0800 Subject: [PATCH 01/60] more --- csrc/deep_ep.cpp | 6 ++++++ csrc/deep_ep.hpp | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 9c90178b..8bae5e79 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -10,6 +10,12 @@ #include "kernels/api.cuh" #include "kernels/configs.cuh" +namespace shared_memory { + void get_mem_handle(bool enable_fabric, ) { + } + +} + namespace deep_ep { Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode): diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index dfa2202d..f984735c 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -20,6 +20,13 @@ #define TORCH_EXTENSION_NAME deep_ep_cpp #endif +namespace shared_memory { +typedef union { + cudaIpcMemHandle_t cuda_ipc_mem_handle; + CUmemFabricHandle cu_mem_fabric_handle; +} MemHandle; +} + namespace deep_ep { struct Buffer { From b986cce27bf39167488415fefcf0ee2dfee24e0c Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:27:50 +0800 Subject: [PATCH 02/60] more --- csrc/deep_ep.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 8bae5e79..6f71f71f 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,7 +11,12 @@ #include "kernels/configs.cuh" namespace shared_memory { - void get_mem_handle(bool enable_fabric, ) { + void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) { + if (enable_fabric) { + TODO; + } else { + CUDA_CHECK(cudaIpcGetMemHandle(handle, ptr)); + } } } @@ -52,7 +57,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles CUDA_CHECK(cudaMalloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); - CUDA_CHECK(cudaIpcGetMemHandle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); + CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals From 3ea6f58265cfccec4112bc00a195458dda5503d5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:29:26 +0800 Subject: [PATCH 03/60] more --- csrc/deep_ep.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6f71f71f..a9b72efc 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,6 +11,14 @@ #include "kernels/configs.cuh" namespace shared_memory { + void malloc(void** ptr, size_t size) { + if (enable_fabric) { + TODO; + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); + } + } + void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) { if (enable_fabric) { TODO; @@ -56,7 +64,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - CUDA_CHECK(cudaMalloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); + CUDA_CHECK(shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); buffer_ptrs_gpu 
= reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); From 5d3513bbd3fda45cbd83b40214ecaa736267f4f0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:29:37 +0800 Subject: [PATCH 04/60] more --- csrc/deep_ep.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index f984735c..3c4284f0 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -51,7 +51,7 @@ struct Buffer { int num_device_sms; int rank, rdma_rank, nvl_rank; int num_ranks, num_rdma_ranks, num_nvl_ranks; - cudaIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS]; + shared_memory::MemHandle ipc_handles[NUM_MAX_NVL_PEERS]; // Stream for communication at::cuda::CUDAStream comm_stream; From bda56951540b9ea4b7c110e27bb6f877cc69786b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:30:00 +0800 Subject: [PATCH 05/60] more --- csrc/deep_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a9b72efc..90304b09 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -23,7 +23,7 @@ namespace shared_memory { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcGetMemHandle(handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(handle->cuda_ipc_mem_handle, ptr)); } } From 3740762ccfe54edc147472c7c6af81196cfb1243 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:30:13 +0800 Subject: [PATCH 06/60] more --- csrc/deep_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 90304b09..d1a299eb 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -23,7 +23,7 @@ namespace shared_memory { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcGetMemHandle(handle->cuda_ipc_mem_handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(&handle->cuda_ipc_mem_handle, ptr)); } } From ad4aee8bfcaa3b6c02c0eda2ec5affb637574722 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:31:16 +0800 Subject: [PATCH 07/60] more --- csrc/deep_ep.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index d1a299eb..ab04202a 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -161,7 +161,8 @@ int Buffer::get_local_device_id() const { } pybind11::bytearray Buffer::get_local_ipc_handle() const { - return {ipc_handles[nvl_rank].reserved, CUDA_IPC_HANDLE_SIZE}; + const shared_memory::MemHandle& handle = ipc_handles[nvl_rank]; + return {reinterpret_cast(&handle), sizeof(handle)}; } pybind11::bytearray Buffer::get_local_nvshmem_unique_id() const { From b5e4aad6597e7d3487718a05bfd74dac1c7e3687 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:33:05 +0800 Subject: [PATCH 08/60] more --- csrc/deep_ep.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index ab04202a..3c8709a4 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -27,6 +27,13 @@ namespace shared_memory { } } + void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* handle) { + if (enable_fabric) { + TODO; + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle, cudaIpcMemLazyEnablePeerAccess)); + } + } } namespace deep_ep { @@ -198,7 +205,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == CUDA_IPC_HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE); - CUDA_CHECK(cudaIpcOpenMemHandle(&buffer_ptrs[i], ipc_handles[i], 
cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]));
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
             EP_HOST_ASSERT(std::memcmp(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE) == 0);

From 240d0582533b6b0b3864f1a9835e2596bd9fc2cd Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:41:25 +0800
Subject: [PATCH 09/60] more

---
 csrc/deep_ep.cpp | 6 +++---
 csrc/deep_ep.hpp | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 3c8709a4..7174e8a7 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -202,13 +202,13 @@ void Buffer::sync(const std::vector<int> &device_ids,
     for (int i = 0, offset = rdma_rank * num_nvl_ranks; i < num_nvl_ranks; ++ i) {
         EP_HOST_ASSERT(all_gathered_handles[offset + i].has_value());
         auto handle_str = std::string(all_gathered_handles[offset + i].value());
-        EP_HOST_ASSERT(handle_str.size() == CUDA_IPC_HANDLE_SIZE);
+        EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE);
         if (offset + i != rank) {
-            std::memcpy(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE);
+            std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
             CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]));
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
-            EP_HOST_ASSERT(std::memcmp(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE) == 0);
+            EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);
         }
     }
 
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 3c4284f0..dbb4df72 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -25,6 +25,8 @@ typedef union {
     cudaIpcMemHandle_t cuda_ipc_mem_handle;
     CUmemFabricHandle cu_mem_fabric_handle;
 } MemHandle;
+
+constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 }

From 5379d59f6889acdb2063df8177165c282348b4a1 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:41:58 +0800
Subject: [PATCH 10/60] more

---
 csrc/deep_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 7174e8a7..9667961c 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -31,7 +31,7 @@ namespace shared_memory {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle, cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
 }

From 4fc8e79646295def0d90b49859c44154a3ccbda9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:43:21 +0800
Subject: [PATCH 11/60] more

---
 csrc/deep_ep.cpp | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 9667961c..fa2e9def 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -19,6 +19,14 @@ namespace shared_memory {
         }
     }
 
+    void free(void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank]));
+        }
+    }
+
     void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) {
         if (enable_fabric) {
             TODO;
@@ -34,6 +42,14 @@ namespace shared_memory {
             CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
+
+    void close_mem_handle(bool enable_fabric, void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i])); + } + } } namespace deep_ep { @@ -71,8 +87,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - CUDA_CHECK(shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); - CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); + shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); + shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals @@ -118,11 +134,11 @@ Buffer::~Buffer() noexcept(false) { // Close remote IPC if (is_available()) { for (int i = 0; i < num_nvl_ranks; ++ i) if (i != nvl_rank) - CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i])); + shared_memory::close_mem_handle(buffer_ptrs[i]); } // Free local buffer and error flag - CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank])); + shared_memory::free(buffer_ptrs[nvl_rank])); } // Free NVSHMEM @@ -205,7 +221,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i])); + shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); From 2e90afea36cbfe613f89fd7be34268d3e0c3ed2d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:43:35 +0800 Subject: [PATCH 12/60] more --- csrc/deep_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index fa2e9def..453993f2 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -47,7 +47,7 @@ namespace shared_memory { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i])); + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } } From 3639a57cd511d4bb9539bc2aaef4938eaa0c6614 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:48:24 +0800 Subject: [PATCH 13/60] more --- csrc/deep_ep.cpp | 35 ++++++++++++++++++++++++++++++++++- csrc/kernels/exception.cuh | 10 ++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 453993f2..40cfff62 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -13,7 +13,40 @@ namespace shared_memory { void malloc(void** ptr, size_t size) { if (enable_fabric) { - TODO; + CUmemGenericAllocationHandle handle; + + int cudaDev; + CUDA_CHECK(cudaGetDevice(&cudaDev)); + + CUdevice currentDev; + CU_CHECK(cuDeviceGet(¤tDev, cudaDev)); + + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = currentDev; + + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size = (size + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + + 
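+            // CUDA VMM sequence: cuMemCreate() obtains physical memory,
+            // cuMemAddressReserve() reserves a VA range, cuMemMap() binds the
+            // two, and cuMemSetAccess() must grant access before first use.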
CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+
+            int device_count;
+            CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+            CUmemAccessDesc accessDesc[device_count];
+            for (int idx = 0; idx < device_count; ++idx) {
+                accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                accessDesc[idx].location.id = idx;
+                accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            }
+            CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
         } else {
             CUDA_CHECK(cudaMalloc(ptr, size));
         }
diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh
index 7db0ddb7..9eeedadd 100644
--- a/csrc/kernels/exception.cuh
+++ b/csrc/kernels/exception.cuh
@@ -31,6 +31,16 @@ do { \
 } while (0)
 #endif
 
+#ifndef CU_CHECK
+#define CU_CHECK(cmd) \
+do { \
+    CUresult e = (cmd); \
+    if (e != CUDA_SUCCESS) { \
+        throw EPException("CUDA", __FILE__, __LINE__, std::to_string(static_cast<int>(e))); \
+    } \
+} while (0)
+#endif
+
 #ifndef EP_HOST_ASSERT
 #define EP_HOST_ASSERT(cond) \
 do { \

From 4ef8f05afb241c810ee064438f98ef1a5a38f400 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:48:38 +0800
Subject: [PATCH 14/60] more

---
 csrc/deep_ep.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 40cfff62..5fbf20ac 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,8 +13,6 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-
             int cudaDev;
             CUDA_CHECK(cudaGetDevice(&cudaDev));
 
@@ -33,6 +31,7 @@ namespace shared_memory {
             size = (size + granularity - 1) & ~(granularity - 1);
             if (size == 0) size = granularity;
 
+            CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));

From 047656e98affce1903a9987f5fa0d789d1323bc8 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:49:13 +0800
Subject: [PATCH 15/60] more

---
 csrc/deep_ep.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 5fbf20ac..50b67923 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,17 +13,14 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            int cudaDev;
-            CUDA_CHECK(cudaGetDevice(&cudaDev));
-
-            CUdevice currentDev;
-            CU_CHECK(cuDeviceGet(&currentDev, cudaDev));
+            CUdevice device;
+            CURESULT_CHECK(cuCtxGetDevice(&device));
 
             CUmemAllocationProp prop = {};
             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
             prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-            prop.location.id = currentDev;
+            prop.location.id = device;
 
             size_t granularity = 0;
             CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));

From c21f36dccdd7ce0061cc3f362f65312095d9620a Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:51:01 +0800
Subject: [PATCH 16/60] more

---
 csrc/deep_ep.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 50b67923..cf5abd8f 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -50,7 +50,15 @@ namespace shared_memory {
 
     void free(void* ptr) {
         if (enable_fabric) {
-            TODO;
+            CUmemGenericAllocationHandle handle;
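+            // cuMemRetainAllocationHandle() recovers the allocation handle
+            // behind a VMM-mapped pointer and takes a reference on it; the
+            // cuMemRelease() below drops that reference again.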
CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); } else { CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank])); } From 7f3e4c088c268a700e928bf7a3edfd47f552d9c3 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:52:50 +0800 Subject: [PATCH 17/60] more --- csrc/deep_ep.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index cf5abd8f..0c7e63d4 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -64,19 +64,22 @@ namespace shared_memory { } } - void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) { + void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { if (enable_fabric) { - TODO; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); } else { - CUDA_CHECK(cudaIpcGetMemHandle(&handle->cuda_ipc_mem_handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->cuda_ipc_mem_handle, ptr)); } } - void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* handle) { + void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } } From 92fb573703c5977c0c3af8adfd4f99d32aa927ba Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:54:01 +0800 Subject: [PATCH 18/60] more --- csrc/deep_ep.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0c7e63d4..e4b1d5e7 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -77,7 +77,24 @@ namespace shared_memory { void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { - TODO; + CUmemFabricHandle export_handle; + memcpy(&export_handle, output_buffer.data(), sizeof(export_handle)); + void *shm_addr = nullptr; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &export_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)&shm_addr, entry.length, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)shm_addr, entry.length, 0, handle, 0)); + + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc accessDesc[device_count]; + for (int device_id = 0; device_id < device_count; ++device_id) { + accessDesc[device_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc[device_id].location.id = device_id; + accessDesc[device_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } + CU_CHECK(cuMemSetAccess((CUdeviceptr)shm_addr, entry.length, accessDesc, device_count)); } else { CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } From 29f86f3537cf363207b5968b5f1a0db0c1b11314 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:55:06 +0800 Subject: [PATCH 19/60] more --- csrc/deep_ep.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index e4b1d5e7..dfbb103e 100644 --- 
a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -77,13 +77,12 @@ namespace shared_memory { void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { - CUmemFabricHandle export_handle; - memcpy(&export_handle, output_buffer.data(), sizeof(export_handle)); - void *shm_addr = nullptr; + TODO_size; + CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &export_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)&shm_addr, entry.length, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)shm_addr, entry.length, 0, handle, 0)); + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); int device_count; CUDA_CHECK(cudaGetDeviceCount(&device_count)); From 5557e70cb3562d31fd87c811d0c640e24aba6c18 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:56:07 +0800 Subject: [PATCH 20/60] more --- csrc/deep_ep.cpp | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index dfbb103e..6b530f4a 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,6 +11,20 @@ #include "kernels/configs.cuh" namespace shared_memory { + void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc accessDesc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc[idx].location.id = idx; + accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } + + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); + } + void malloc(void** ptr, size_t size) { if (enable_fabric) { CUdevice device; @@ -33,16 +47,7 @@ namespace shared_memory { CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc accessDesc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc[idx].location.id = idx; - accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); + cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaMalloc(ptr, size)); } @@ -87,13 +92,7 @@ namespace shared_memory { int device_count; CUDA_CHECK(cudaGetDeviceCount(&device_count)); - CUmemAccessDesc accessDesc[device_count]; - for (int device_id = 0; device_id < device_count; ++device_id) { - accessDesc[device_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc[device_id].location.id = device_id; - accessDesc[device_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - CU_CHECK(cuMemSetAccess((CUdeviceptr)shm_addr, entry.length, accessDesc, device_count)); + cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } From 9fd34e757bc5c35aee509a173d7d5d008a789d2b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:57:53 +0800 Subject: [PATCH 21/60] more --- csrc/deep_ep.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6b530f4a..a994f918 
100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -86,6 +86,7 @@ namespace shared_memory { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); From 64173931ff7887bd3b40dcca5100e83798348f3d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:58:39 +0800 Subject: [PATCH 22/60] more --- csrc/deep_ep.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a994f918..bf576ded 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -101,7 +101,15 @@ namespace shared_memory { void close_mem_handle(bool enable_fabric, void* ptr) { if (enable_fabric) { - TODO; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); } else { CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } From faaeaadfdaf741f482eb5b3292782fe85c2feeb0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:00:13 +0800 Subject: [PATCH 23/60] more --- csrc/deep_ep.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index bf576ded..ef4e940e 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -25,7 +25,7 @@ namespace shared_memory { CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); } - void malloc(void** ptr, size_t size) { + void malloc(bool enable_fabric, void** ptr, size_t size) { if (enable_fabric) { CUdevice device; CURESULT_CHECK(cuCtxGetDevice(&device)); @@ -53,7 +53,7 @@ namespace shared_memory { } } - void free(void* ptr) { + void free(bool enable_fabric, void* ptr) { if (enable_fabric) { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); @@ -101,15 +101,7 @@ namespace shared_memory { void close_mem_handle(bool enable_fabric, void* ptr) { if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); + free(true, ptr); } else { CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } From c38dbeddd903279cf6235c76e524e134ff250a59 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:00:53 +0800 Subject: [PATCH 24/60] more --- csrc/deep_ep.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index ef4e940e..e0d6e047 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -25,6 +25,18 @@ namespace shared_memory { CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); } + void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); + } + void malloc(bool enable_fabric, void** ptr, 
size_t size) { if (enable_fabric) { CUdevice device; @@ -55,17 +67,9 @@ namespace shared_memory { void free(bool enable_fabric, void* ptr) { if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); + cu_mem_free(ptr); } else { - CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank])); + CUDA_CHECK(cudaFree(ptr)); } } @@ -101,7 +105,7 @@ namespace shared_memory { void close_mem_handle(bool enable_fabric, void* ptr) { if (enable_fabric) { - free(true, ptr); + cu_mem_free(ptr); } else { CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } From dc74c0a9f3428c52201ad14bcbbcd64e304d8669 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:01:24 +0800 Subject: [PATCH 25/60] more --- csrc/deep_ep.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index e0d6e047..a48a7e15 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -56,9 +56,9 @@ namespace shared_memory { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaMalloc(ptr, size)); @@ -93,10 +93,6 @@ namespace shared_memory { CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); From 61dea30b60ac87409148213a96e3bce0d5c2ab26 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:06:00 +0800 Subject: [PATCH 26/60] more --- csrc/deep_ep.cpp | 12 +++++++----- csrc/deep_ep.hpp | 10 ++++++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a48a7e15..54b3a941 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -74,28 +74,30 @@ namespace shared_memory { } void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { + mem_handle->size = TODO; + if (enable_fabric) { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->cuda_ipc_mem_handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } } void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { - TODO_size; + size_t size = mem_handle->size; CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); cu_mem_set_access_all(*ptr, size); } else { - 
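            // Legacy path: cudaIpcOpenMemHandle() maps a peer allocation that
            // was exported with cudaIpcGetMemHandle(); it only works for
            // cudaMalloc()-backed memory, so cuMemCreate() allocations must
            // use the fabric branch above.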
CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } } diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index dbb4df72..1a015f5a 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -21,10 +21,16 @@ #endif namespace shared_memory { -typedef union { + +union MemHandleInner { cudaIpcMemHandle_t cuda_ipc_mem_handle; CUmemFabricHandle cu_mem_fabric_handle; -} MemHandle; +}; + +struct MemHandle { + MemHandleInner inner; + size_t size; +}; constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); } From 7d4bc93e2d204cc1f16c10a1b67e594e8cb65491 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:07:00 +0800 Subject: [PATCH 27/60] more --- csrc/deep_ep.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 54b3a941..cf350ce3 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -74,7 +74,10 @@ namespace shared_memory { } void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { - mem_handle->size = TODO; + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + mem_handle->size = size; if (enable_fabric) { CUmemGenericAllocationHandle handle; From 5b78f2243341a6bd314bfd98faefdf8cbad3e31e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:07:18 +0800 Subject: [PATCH 28/60] more --- csrc/deep_ep.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index cf350ce3..a16583b8 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -15,14 +15,14 @@ namespace shared_memory { int device_count; CUDA_CHECK(cudaGetDeviceCount(&device_count)); - CUmemAccessDesc accessDesc[device_count]; + CUmemAccessDesc access_desc[device_count]; for (int idx = 0; idx < device_count; ++idx) { - accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc[idx].location.id = idx; - accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; } - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); } void cu_mem_free(void* ptr) { From 75351cd380ba45cff664683ee3312740a01968ad Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:08:34 +0800 Subject: [PATCH 29/60] more --- csrc/deep_ep.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a16583b8..991c94b0 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -37,7 +37,16 @@ namespace shared_memory { CU_CHECK(cuMemRelease(handle)); } - void malloc(bool enable_fabric, void** ptr, size_t size) { + void get_size_align_to_granularity(size_t size_raw) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; + } + + void malloc(bool enable_fabric, void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; CURESULT_CHECK(cuCtxGetDevice(&device)); @@ -47,12 +56,8 @@ namespace shared_memory { prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.requestedHandleTypes = 
CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - - size = (size + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; + + size_t size = get_size_align_to_granularity(size_raw); CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); From 7bb12d4e66569af1f8d0a6c37840e139cd8570b0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:08:49 +0800 Subject: [PATCH 30/60] more --- csrc/deep_ep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 991c94b0..fe099af2 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -37,7 +37,7 @@ namespace shared_memory { CU_CHECK(cuMemRelease(handle)); } - void get_size_align_to_granularity(size_t size_raw) { + void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { size_t granularity = 0; CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); @@ -57,7 +57,7 @@ namespace shared_memory { prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - size_t size = get_size_align_to_granularity(size_raw); + size_t size = get_size_align_to_granularity(size_raw, prop); CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); From 0e5a15509dceb76f82feea6aac775111cc28b4c5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:11:56 +0800 Subject: [PATCH 31/60] more --- csrc/deep_ep.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index fe099af2..42537a2a 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -46,6 +46,21 @@ namespace shared_memory { return size; } + bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } + } + + return true; + } + void malloc(bool enable_fabric, void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; @@ -56,7 +71,7 @@ namespace shared_memory { prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - + size_t size = get_size_align_to_granularity(size_raw, prop); CUmemGenericAllocationHandle handle; From 87b398034dee42cde66fa362e70f992f2ca08ffa Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:13:52 +0800 Subject: [PATCH 32/60] more --- csrc/deep_ep.cpp | 10 +++++----- csrc/deep_ep.hpp | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 42537a2a..0b2cbbe1 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -61,7 +61,7 @@ namespace shared_memory { return true; } - void malloc(bool enable_fabric, void** ptr, size_t size_raw) { + void malloc(void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; CURESULT_CHECK(cuCtxGetDevice(&device)); @@ -85,7 +85,7 @@ namespace shared_memory { } } - void free(bool enable_fabric, void* ptr) { + void free(void* ptr) { if (enable_fabric) { cu_mem_free(ptr); } else { @@ -93,7 +93,7 @@ namespace shared_memory { } } - void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { + void 
get_mem_handle(MemHandle* mem_handle, void* ptr) { size_t size = 0; CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); @@ -109,7 +109,7 @@ namespace shared_memory { } } - void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { + void open_mem_handle(void** ptr, MemHandle* mem_handle) { if (enable_fabric) { size_t size = mem_handle->size; @@ -124,7 +124,7 @@ namespace shared_memory { } } - void close_mem_handle(bool enable_fabric, void* ptr) { + void close_mem_handle(void* ptr) { if (enable_fabric) { cu_mem_free(ptr); } else { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 1a015f5a..017c465a 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -33,6 +33,11 @@ struct MemHandle { }; constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); + +class SharedMemoryAllocator { +private: + bool enable_fabric; +}; } namespace deep_ep { From 4398b5ce9737b0f2e2d92f9897a981f5567e5a6d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:14:21 +0800 Subject: [PATCH 33/60] more --- csrc/deep_ep.cpp | 189 ++++++++++++++++++++++++----------------------- csrc/deep_ep.hpp | 4 - 2 files changed, 98 insertions(+), 95 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0b2cbbe1..bae8a622 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,127 +11,134 @@ #include "kernels/configs.cuh" namespace shared_memory { - void cu_mem_set_access_all(void* ptr, size_t size) { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc access_desc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc[idx].location.id = idx; - access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc access_desc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; } - void cu_mem_free(void* ptr) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +} - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); - } + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); +} - size_t size = (size_raw + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; - return size; - } +void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); 
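    // Align-up idiom: the granularity returned above is in practice a power
    // of two, so (size_raw + granularity - 1) & ~(granularity - 1) rounds
    // size_raw up to the next multiple, and a zero-sized request is bumped
    // to one full granule.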
- bool support_fabric() { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; +} - for (int device = 0; device < device_count; ++device) { - int support = 0; - CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if (!support) { - return false; - } - } +bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - return true; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - void malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + return true; +} - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; +class SharedMemoryAllocator { +public: + void malloc(void** ptr, size_t size_raw); +private: + bool enable_fabric; +}; - size_t size = get_size_align_to_granularity(size_raw, prop); +void malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); - } + size_t size = get_size_align_to_granularity(size_raw, prop); + + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); } +} - void free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); - } +void free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaFree(ptr)); } +} - void get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); - } + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + 
CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } +} - void open_mem_handle(void** ptr, MemHandle* mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; +void open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); - } + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } +} - void close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); - } +void close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } +} namespace deep_ep { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 017c465a..14bbaf7a 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -34,10 +34,6 @@ struct MemHandle { constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); -class SharedMemoryAllocator { -private: - bool enable_fabric; -}; } namespace deep_ep { From d7e9ce380c689a5ca42de187a6cdf6633fdf5d5a Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:16:08 +0800 Subject: [PATCH 34/60] more --- csrc/deep_ep.cpp | 189 +++++++++++++++++++++++------------------------ csrc/deep_ep.hpp | 10 +++ 2 files changed, 101 insertions(+), 98 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index bae8a622..a2650617 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,134 +11,127 @@ #include "kernels/configs.cuh" namespace shared_memory { -void cu_mem_set_access_all(void* ptr, size_t size) { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc access_desc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc[idx].location.id = idx; - access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } + void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc access_desc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); -} + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); + } -void cu_mem_free(void* ptr) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - size_t size = 0; - 
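    // cuMemGetAddressRange() reports the base address and size of the VA
    // range backing ptr, so the unmap / address-free below cover the full
    // mapping.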
CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); -} + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); + } -void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - size_t size = (size_raw + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; - return size; -} + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; + } -bool support_fabric() { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); + bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - for (int device = 0; device < device_count; ++device) { - int support = 0; - CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if (!support) { - return false; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - } - - return true; -} -class SharedMemoryAllocator { -public: - void malloc(void** ptr, size_t size_raw); -private: - bool enable_fabric; -}; + return true; + } -void malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - size_t size = get_size_align_to_granularity(size_raw, prop); + size_t size = get_size_align_to_granularity(size_raw, prop); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); + } } -} -void free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); + void SharedMemoryAllocator::free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + 
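        // Non-fabric memory was obtained with plain cudaMalloc(), so
        // cudaFree() is the matching release call.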
CUDA_CHECK(cudaFree(ptr)); + } } -} -void get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); + } } -} -void open_mem_handle(void** ptr, MemHandle* mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; + void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + } } -} -void close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); + void SharedMemoryAllocator::close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); + } } } -} namespace deep_ep { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 14bbaf7a..5df87429 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -34,6 +34,16 @@ struct MemHandle { constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); +class SharedMemoryAllocator { +public: + void malloc(void** ptr, size_t size); + void free(void* ptr); + void get_mem_handle(MemHandle* mem_handle, void* ptr); + void open_mem_handle(void** ptr, MemHandle* mem_handle); + void close_mem_handle(void* ptr); +private: + bool enable_fabric; +}; } namespace deep_ep { From 5b83cb85878f39b897432f9b38184bcd17a36f9d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:16:41 +0800 Subject: [PATCH 35/60] more --- csrc/deep_ep.cpp | 2 ++ csrc/deep_ep.hpp | 1 + 2 files changed, 3 insertions(+) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a2650617..6e2c084f 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -60,6 +60,8 @@ namespace shared_memory { return true; } + + SharedMemoryAllocator::SharedMemoryAllocator() : 
enable_fabric(support_fabric()) {} void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { if (enable_fabric) { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 5df87429..a8c73f75 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -36,6 +36,7 @@ constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); class SharedMemoryAllocator { public: + SharedMemoryAllocator(); void malloc(void** ptr, size_t size); void free(void* ptr); void get_mem_handle(MemHandle* mem_handle, void* ptr); From f024df5938cb2f81060974cd4ef238b2e596f1bf Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:16:45 +0800 Subject: [PATCH 36/60] more --- csrc/deep_ep.cpp | 186 +++++++++++++++++++++++------------------------ 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6e2c084f..6d6deda4 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,129 +11,129 @@ #include "kernels/configs.cuh" namespace shared_memory { - void cu_mem_set_access_all(void* ptr, size_t size) { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc access_desc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc[idx].location.id = idx; - access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc access_desc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; } - void cu_mem_free(void* ptr) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +} - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); - } + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); +} - size_t size = (size_raw + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; - return size; - } +void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - bool support_fabric() { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; +} - for (int device = 0; device < device_count; ++device) { - int support = 0; - CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if 
(!support) { - return false; - } - } +bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - return true; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - - SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {} - void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + return true; +} - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; +SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {} - size_t size = get_size_align_to_granularity(size_raw, prop); +void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); - } + size_t size = get_size_align_to_granularity(size_raw, prop); + + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); } +} - void SharedMemoryAllocator::free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); - } +void SharedMemoryAllocator::free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaFree(ptr)); } +} - void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); - } + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } +} - void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; +void 
SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); - } + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } +} - void SharedMemoryAllocator::close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); - } +void SharedMemoryAllocator::close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } +} namespace deep_ep { From 5a7b2f2ab15ceb2492d9bd13eb6300a5fc71325e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:20:10 +0800 Subject: [PATCH 37/60] more --- csrc/deep_ep.cpp | 10 +++++----- csrc/deep_ep.hpp | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6d6deda4..8bdfc3e9 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -170,8 +170,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); - shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); + shared_memory_allocator.malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); + shared_memory_allocator.get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals @@ -217,11 +217,11 @@ Buffer::~Buffer() noexcept(false) { // Close remote IPC if (is_available()) { for (int i = 0; i < num_nvl_ranks; ++ i) if (i != nvl_rank) - shared_memory::close_mem_handle(buffer_ptrs[i]); + shared_memory_allocator.close_mem_handle(buffer_ptrs[i]); } // Free local buffer and error flag - shared_memory::free(buffer_ptrs[nvl_rank])); + shared_memory_allocator.free(buffer_ptrs[nvl_rank])); } // Free NVSHMEM @@ -304,7 +304,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); + shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); diff --git 
a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index a8c73f75..8ceee158 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -98,6 +98,8 @@ struct Buffer { volatile int* moe_recv_rdma_counter = nullptr; int* moe_recv_rdma_counter_mapped = nullptr; + SharedMemoryAllocator shared_memory_allocator; + public: Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode); From 60523797d95a353b61754e885654115ff1123858 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:22:32 +0800 Subject: [PATCH 38/60] more --- csrc/kernels/exception.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh index 9eeedadd..4be59122 100644 --- a/csrc/kernels/exception.cuh +++ b/csrc/kernels/exception.cuh @@ -36,7 +36,7 @@ do { \ do { \ CUresult e = (cmd); \ if (e != CUDA_SUCCESS) { \ - throw EPException("CUDA", __FILE__, __LINE__, std::string(e)); \ + throw EPException("CU", __FILE__, __LINE__, cuGetErrorName(e)); \ } \ } while (0) #endif From befcd27066f1524737892446900113344dd22dbd Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:22:50 +0800 Subject: [PATCH 39/60] more --- csrc/kernels/exception.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh index 4be59122..80aae935 100644 --- a/csrc/kernels/exception.cuh +++ b/csrc/kernels/exception.cuh @@ -36,7 +36,7 @@ do { \ do { \ CUresult e = (cmd); \ if (e != CUDA_SUCCESS) { \ - throw EPException("CU", __FILE__, __LINE__, cuGetErrorName(e)); \ + throw EPException("CU", __FILE__, __LINE__, cuGetErrorString(e)); \ } \ } while (0) #endif From df598ea7ac8306ab8a80a5130133eab3650d6fd5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:24:54 +0800 Subject: [PATCH 40/60] more --- csrc/kernels/exception.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh index 80aae935..3026374b 100644 --- a/csrc/kernels/exception.cuh +++ b/csrc/kernels/exception.cuh @@ -36,7 +36,9 @@ do { \ do { \ CUresult e = (cmd); \ if (e != CUDA_SUCCESS) { \ - throw EPException("CU", __FILE__, __LINE__, cuGetErrorString(e)); \ + const char *error_str = NULL; \ + cuGetErrorString(e, &error_str); \ + throw EPException("CU", __FILE__, __LINE__, std::string(error_str)); \ } \ } while (0) #endif From 5b23a8ad2190514697523f375d8e18b2571aff4b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:25:15 +0800 Subject: [PATCH 41/60] more --- csrc/deep_ep.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 8ceee158..9b99d5e8 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -32,7 +32,7 @@ struct MemHandle { size_t size; }; -constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); +constexpr size_t HANDLE_SIZE = sizeof(MemHandle); class SharedMemoryAllocator { public: From 210e4997026bd372b69632f3589551a454aea81f Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:26:28 +0800 Subject: [PATCH 42/60] more --- csrc/deep_ep.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 9b99d5e8..185fd33b 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -98,7 +98,7 @@ struct Buffer { volatile int* moe_recv_rdma_counter = nullptr; int* moe_recv_rdma_counter_mapped = nullptr; - SharedMemoryAllocator shared_memory_allocator; + shared_memory::SharedMemoryAllocator shared_memory_allocator; 
public: Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode); From 379ac2447d8415679c493edd8d78855d8e742d5e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:28:11 +0800 Subject: [PATCH 43/60] more --- csrc/deep_ep.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 8bdfc3e9..0e725f88 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -37,10 +37,7 @@ void cu_mem_free(void* ptr) { CU_CHECK(cuMemRelease(handle)); } -void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - +size_t get_size_align_to_granularity(size_t size_raw, size_t granularity) { size_t size = (size_raw + granularity - 1) & ~(granularity - 1); if (size == 0) size = granularity; return size; @@ -66,7 +63,7 @@ SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + CU_CHECK(cuCtxGetDevice(&device)); CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; @@ -74,7 +71,10 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - size_t size = get_size_align_to_granularity(size_raw, prop); + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t size = get_size_align_to_granularity(size_raw, granularity); CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); From 43999dc0d16aaf1436b168ec10f803c4eeaa3142 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:29:02 +0800 Subject: [PATCH 44/60] more --- csrc/deep_ep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0e725f88..40b3cd1e 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -303,11 +303,11 @@ void Buffer::sync(const std::vector &device_ids, auto handle_str = std::string(all_gathered_handles[offset + i].value()); EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { - std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); + std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { - EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); + EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); } } From 791601101bdfb82881b6e7446c17b6bbf9c28815 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:30:16 +0800 Subject: [PATCH 45/60] more --- csrc/deep_ep.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 40b3cd1e..d872c037 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -83,7 +83,7 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); cu_mem_set_access_all(*ptr, size); } else { - CUDA_CHECK(cudaMalloc(ptr, size)); + 
CUDA_CHECK(cudaMalloc(ptr, size_raw)); } } @@ -221,7 +221,7 @@ Buffer::~Buffer() noexcept(false) { } // Free local buffer and error flag - shared_memory_allocator.free(buffer_ptrs[nvl_rank])); + shared_memory_allocator.free(buffer_ptrs[nvl_rank]); } // Free NVSHMEM @@ -304,7 +304,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); + shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); From 0525f8f79e8b96b8e934130d01bd853c3386b5c2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:44:00 +0800 Subject: [PATCH 46/60] more --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b16310a7..93294f74 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ include_dirs = ['csrc/'] library_dirs = [] nvcc_dlink = [] - extra_link_args = [] + extra_link_args = ['-lcuda'] # NVSHMEM flags if disable_nvshmem: From 2bf764cfd442b2423a13d423bd72135a03187896 Mon Sep 17 00:00:00 2001 From: shifangx Date: Thu, 24 Jul 2025 22:14:17 -0700 Subject: [PATCH 47/60] support NVFP4 data format in low latency dispatch --- csrc/deep_ep.cpp | 26 +++- csrc/deep_ep.hpp | 4 +- csrc/kernels/api.cuh | 4 +- csrc/kernels/internode_ll.cu | 237 ++++++++++++++++++++++++++++++++--- deep_ep/buffer.py | 19 ++- tests/test_low_latency.py | 52 ++++++-- tests/utils.py | 62 ++++++++- 7 files changed, 367 insertions(+), 37 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0789cd58..67393fbc 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -1087,12 +1087,14 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int #endif } -std::tuple, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> +std::tuple, std::optional, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx, const std::optional& cumulative_local_expert_recv_stats, const std::optional& dispatch_wait_recv_cost_stats, + const std::optional& x_sf_scale, int num_max_dispatch_tokens_per_rank, int num_experts, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, bool async, bool return_recv_hook) { #ifndef DISABLE_NVSHMEM EP_HOST_ASSERT(low_latency_mode); @@ -1137,8 +1139,9 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i stream_wait(launch_stream, compute_stream); // Allocate packed tensors - auto packed_recv_x = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden}, - x.options().dtype(use_fp8 ? torch::kFloat8_e4m3fn: torch::kBFloat16)); + constexpr int NUM_ELEMS_PER_PACK = 8; + auto packed_recv_x = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, use_nvfp4 ? hidden / NUM_ELEMS_PER_PACK : hidden}, + x.options().dtype(use_nvfp4 ? torch::kInt32 : (use_fp8 ? 
torch::kFloat8_e4m3fn: torch::kBFloat16))); auto packed_recv_src_info = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank}, torch::dtype(torch::kInt32).device(torch::kCUDA)); auto packed_recv_layout_range = torch::empty({num_local_experts, num_ranks}, torch::dtype(torch::kInt64).device(torch::kCUDA)); auto packed_recv_count = torch::empty({num_local_experts}, torch::dtype(torch::kInt32).device(torch::kCUDA)); @@ -1146,6 +1149,8 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i // Allocate column-majored scales auto packed_recv_x_scales = std::optional(); void* packed_recv_x_scales_ptr = nullptr; + auto packed_recv_x_sf_scale = std::optional(); + void* packed_recv_x_sf_scale_ptr = nullptr; EP_HOST_ASSERT((num_ranks * num_max_dispatch_tokens_per_rank) % 4 == 0 and "TMA requires the number of tokens to be multiple of 4"); if (use_fp8) { @@ -1161,16 +1166,26 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i } packed_recv_x_scales = torch::transpose(packed_recv_x_scales.value(), 1, 2); packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr(); + }else if (use_nvfp4) { + constexpr int SF_VEC_SIZE = 16; + constexpr int NUM_SF_ELEMS_PER_PACK = 4; + packed_recv_x_scales = torch::empty({num_local_experts, hidden / (SF_VEC_SIZE * NUM_SF_ELEMS_PER_PACK), num_ranks * num_max_dispatch_tokens_per_rank}, + torch::dtype(torch::kInt).device(torch::kCUDA)); + packed_recv_x_scales = torch::transpose(packed_recv_x_scales.value(), 1, 2); + packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr(); + packed_recv_x_sf_scale = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank}, torch::dtype(torch::kFloat32).device(torch::kCUDA)); + packed_recv_x_sf_scale_ptr = packed_recv_x_sf_scale->data_ptr(); } // Kernel launch auto next_clean_meta = next_buffer.clean_meta(); auto launcher = [=](int phases) { - internode_ll::dispatch(packed_recv_x.data_ptr(), packed_recv_x_scales_ptr, + internode_ll::dispatch(packed_recv_x.data_ptr(), packed_recv_x_scales_ptr, packed_recv_x_sf_scale_ptr, packed_recv_src_info.data_ptr(), packed_recv_layout_range.data_ptr(), packed_recv_count.data_ptr(), cumulative_local_expert_recv_stats.has_value() ? cumulative_local_expert_recv_stats->data_ptr() : nullptr, dispatch_wait_recv_cost_stats.has_value() ? dispatch_wait_recv_cost_stats->data_ptr() : nullptr, + x_sf_scale.has_value() ? 
x_sf_scale->data_ptr() : nullptr, buffer.dispatch_rdma_recv_data_buffer, buffer.dispatch_rdma_recv_count_buffer, buffer.dispatch_rdma_send_buffer, x.data_ptr(), topk_idx.data_ptr(), @@ -1178,6 +1193,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i num_tokens, hidden, num_max_dispatch_tokens_per_rank, num_topk, num_experts, rank, num_ranks, use_fp8, round_scale, use_ue8m0, + use_nvfp4, use_ue8m0_for_nvfp4_sf, workspace, num_device_sms, launch_stream, phases); }; @@ -1199,7 +1215,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i recv_hook = [=]() { launcher(LOW_LATENCY_RECV_PHASE); }; // Return values - return {packed_recv_x, packed_recv_x_scales, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, recv_hook}; + return {packed_recv_x, packed_recv_x_scales, packed_recv_x_sf_scale, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, recv_hook}; #else EP_HOST_ASSERT(false and "NVSHMEM is disabled during compilation"); return {}; diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index aa62ccb0..27ff4951 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -143,12 +143,14 @@ struct Buffer { void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts); - std::tuple, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> + std::tuple, std::optional, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx, const std::optional& cumulative_local_expert_recv_stats, const std::optional& dispatch_wait_recv_cost_stats, + const std::optional& x_sf_scale, int num_max_dispatch_tokens_per_rank, int num_experts, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, bool async, bool return_recv_hook); std::tuple, std::optional>> diff --git a/csrc/kernels/api.cuh b/csrc/kernels/api.cuh index d34775fd..6540443c 100644 --- a/csrc/kernels/api.cuh +++ b/csrc/kernels/api.cuh @@ -139,17 +139,19 @@ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0, int* clean_1, int num_clean_int_1, cudaStream_t stream); -void dispatch(void* packed_recv_x, void* packed_recv_x_scales, +void dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf_scale, int* packed_recv_src_info, int64_t* packed_recv_layout_range, int* packed_recv_count, int* cumulative_local_expert_recv_stats, int64_t* dispatch_wait_recv_cost_stats, + const float* x_sf_scale, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, const void* x, const int64_t* topk_idx, int* next_clean, int num_next_clean_int, int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, int rank, int num_ranks, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, void* workspace, int num_device_sms, cudaStream_t stream, int phases); diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu index 391a4b3d..f13880f7 100644 --- a/csrc/kernels/internode_ll.cu +++ b/csrc/kernels/internode_ll.cu @@ -36,13 +36,158 @@ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0, clean_0, num_clean_int_0, clean_1, num_clean_int_1); } -template +constexpr int CVT_ELTS_PER_THREAD = 8; +constexpr int SF_VEC_SIZE = 16; + +struct PackedVec { + __nv_bfloat162 elts[4]; +}; + +using Type = __nv_bfloat16; + +__device__ __forceinline__ float exp2f_rcp(uint8_t exp) { + 
constexpr uint32_t FP32_EXPONENT_BIAS = 127; + return (exp == 0) ? 1 : exp2f(FP32_EXPONENT_BIAS - static_cast(exp)); +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), "f"(array[2].x), + "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; + #else + // static_assert(false, "not supported."); + return 0; + #endif +} + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), "f"(array[4]), "f"(array[5]), + "f"(array[6]), "f"(array[7])); + return val; + #else + // static_assert(false, "not supported."); + return 0; + #endif +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, uint8_t* SFout) { + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(vec.elts[0]); + +// Local maximum value. +#pragma unroll + for (int i = 1; i < CVT_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(vec.elts[i])); + } + + constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD; + EP_STATIC_ASSERT(CVT_NUM_THREADS_PER_SF == 2 or CVT_NUM_THREADS_PER_SF == 4, "Invalid number of threads per SF"); + // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32). + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + if constexpr (CVT_NUM_THREADS_PER_SF == 4) { + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 2), localMax); + } + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // 8 bits representation of the SF. + uint8_t fp8SFVal; + float outputScale; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + __nv_fp8_e8m0 tmp; + // Scale the max value to the range of E2m1. + vecMax *= reciprocal_approximate_ftz(6.0f); + tmp.__x = __nv_cvt_float_to_e8m0(vecMax, __NV_SATFINITE, cudaRoundPosInf); + fp8SFVal = tmp.__x; + outputScale = exp2f_rcp(fp8SFVal); + } else { + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. 
+ auto SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + fp8SFVal = tmp.__x; + SFValue = static_cast(tmp); + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal)) * reciprocal(SFScaleVal)) + outputScale = SFValue != 0 + ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + } + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. + float2 fp2Vals[CVT_ELTS_PER_THREAD / 2]; + +#pragma unroll + for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) { + fp2Vals[i] = __bfloat1622float2(vec.elts[i]); + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +} + +template __global__ __launch_bounds__(1024, 1) void -dispatch(void* packed_recv_x, void* packed_recv_x_scales, +dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf_scale, int* packed_recv_src_info, int64_t* packed_recv_layout_range, int* packed_recv_count, int* cumulative_local_expert_recv_stats, int64_t* dispatch_wait_recv_cost_stats, + const float* x_sf_scale, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, const void* x, const int64_t* topk_idx, int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, @@ -62,20 +207,28 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id; // May extract UE8M0 from the scales - using scale_t = std::conditional_t; - using packed_t = std::conditional_t; + using scale_t = std::conditional_t; + using packed_t = std::conditional_t; EP_STATIC_ASSERT(sizeof(packed_t) % sizeof(scale_t) == 0, "Invalid vector length"); + EP_STATIC_ASSERT(!(kUseFP8 && kUseNVFP4), "FP8 and NVFP4 cannot be used together"); // FP8 staffs - constexpr int kNumPerChannels = 128; + constexpr int kNumPerChannels = kUseNVFP4 ? 16 : 128; const int num_scales = kHidden / kNumPerChannels; - const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16)); + constexpr size_t hidden_bytes = + kUseNVFP4 + ? kHidden * sizeof(__nv_fp8_storage_t) / 2 + : kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16)); const size_t hidden_int4 = hidden_bytes / sizeof(int4); // Message package: index at source (int), 3 reserved int fields, hidden data, FP8 scales // NOTES: currently we have 3 reserved int fields for future use - using vec_t = std::conditional_t; - const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16))); + using vec_t = std::conditional_t< + kUseNVFP4, + int32_t, + std::conditional_t>; + using rdma_x_scale_t = std::conditional_t; + const size_t num_bytes_per_msg = sizeof(int4) + ((kUseFP8 || kUseNVFP4) ? 
(hidden_bytes + num_scales * sizeof(rdma_x_scale_t)) : hidden_bytes); const size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4); EP_DEVICE_ASSERT(num_bytes_per_msg % sizeof(int4) == 0); @@ -100,12 +253,24 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) { const auto x_int4 = static_cast(x) + token_idx * hidden_bf16_int4; const auto rdma_x_src_idx = reinterpret_cast(static_cast(rdma_x) + token_idx * num_bytes_per_msg); + const auto rdma_x_sf_scale = reinterpret_cast(reinterpret_cast(rdma_x_src_idx) + sizeof(int)); const auto rdma_x_vec = reinterpret_cast(reinterpret_cast(rdma_x_src_idx) + sizeof(int4)); - const auto rdma_x_scales = reinterpret_cast(reinterpret_cast(rdma_x_vec) + hidden_bytes); + const auto rdma_x_scales = reinterpret_cast(reinterpret_cast(rdma_x_vec) + hidden_bytes); // Overlap top-k index read and source token index writes auto dst_expert_idx = warp_id < num_topk ? static_cast(__ldg(topk_idx + token_idx * num_topk + warp_id)) : -1; thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0; + float SFScaleVal = 1.0f; + if constexpr (kUseNVFP4) { + // Get scaling value: if x_sf_scale is nullptr, use 1.0f; otherwise, read value at token_idx + if (x_sf_scale != nullptr) { + SFScaleVal = *(static_cast(x_sf_scale) + token_idx); + } + // Only thread 0 writes scaling value to rdma_x_sf_scale + if (thread_id == 0) { + *rdma_x_sf_scale = SFScaleVal; + } + } // FP8 cast EP_STATIC_ASSERT(hidden_bf16_int4 % 32 == 0, "Must use the full warp to reduce"); @@ -141,6 +306,20 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, fp8x2_values[j / 2] = __nv_cvt_float2_to_fp8x2(fp32x2, __NV_SATFINITE, __NV_E4M3); } rdma_x_vec[i] = int2_value; + } else if constexpr (kUseNVFP4) { + // Convert to NVFP4 + uint8_t sf_val; + PackedVec vec = *reinterpret_cast(&int4_value); + uint32_t result = cvt_warp_fp16_to_fp4(vec, SFScaleVal, &sf_val); + + // Write scale to send buffer + if (lane_id % 2 == 0){ + EP_DEVICE_ASSERT((i * kNumElemsPerRead) % 16 == 0); + int rdma_x_scale_idx = i * kNumElemsPerRead / 16; + rdma_x_scales[rdma_x_scale_idx] = sf_val; + } + // Cast into send buffer + rdma_x_vec[i] = *reinterpret_cast(&result); } else { // Reinterpret-cast is for C++14 compatibility rdma_x_vec[i] = *reinterpret_cast(&int4_value); @@ -262,6 +441,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, const auto recv_x_int4 = static_cast(packed_recv_x) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_int4; const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank; + const auto recv_sf_scale = static_cast(packed_recv_x_sf_scale) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank; const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks; const auto num_aligned_scales = align(num_scales, sizeof(float) / sizeof(scale_t)); const auto recv_x_scales = static_cast(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales; @@ -294,12 +474,17 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, recv_token_begin_idx = shared_recv_token_begin_idx[warp_group_id]; // Copy tokens - EP_DEVICE_ASSERT(num_scales <= 64); for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) { // Copy source info const auto src_src_idx = reinterpret_cast(rdma_recv_x_uint8 + i * num_bytes_per_msg); if (lane_id == 0) recv_src_info[recv_token_begin_idx + i] 
= ld_nc_global(src_src_idx); + if constexpr (kUseNVFP4) { + const auto src_sf_scale_for_nvfp4 = reinterpret_cast(rdma_recv_x_uint8 + i * num_bytes_per_msg + sizeof(int)); + if (lane_id == 0) + recv_sf_scale[recv_token_begin_idx + i] = ld_nc_global(src_sf_scale_for_nvfp4); + } + __syncwarp(); // Copy data @@ -310,6 +495,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, // Copy scales if constexpr (kUseFP8) { + EP_DEVICE_ASSERT(num_scales <= 64); // Equivalent CuTe layout: // (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1)) const auto src_scales = reinterpret_cast(reinterpret_cast(src_data) + hidden_bytes); @@ -329,22 +515,40 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, auto scale = extract_required_scale_format(ld_nc_global(src_scales + lane_id + 32)); recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale; } + } else if constexpr (kUseNVFP4) { + // Equivalent CuTe layout: + // (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1)) + const auto src_scales = reinterpret_cast(reinterpret_cast(src_data) + hidden_bytes); + const auto num_elems_per_pack = static_cast(sizeof(packed_t) / sizeof(scale_t)); + const auto token_idx = recv_token_begin_idx + i; + const auto token_stride = num_elems_per_pack; + const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack; + #pragma unroll + for (int j = lane_id; j < num_scales; j += 32) { + const auto pack_idx = j / num_elems_per_pack; + const auto elem_idx = j % num_elems_per_pack; + auto scale = ld_nc_global(src_scales + j); + recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale; + } } } } } void dispatch(void* packed_recv_x, void* packed_recv_x_scales, + void* packed_recv_x_sf_scale, int* packed_recv_src_info, int64_t* packed_recv_layout_range, int* packed_recv_count, int* cumulative_local_expert_recv_stats, int64_t* dispatch_wait_recv_cost_stats, + const float* x_sf_scale, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, const void* x, const int64_t* topk_idx, int* next_clean, int num_next_clean_int, int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, int rank, int num_ranks, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, void* workspace, int num_device_sms, cudaStream_t stream, int phases) { constexpr int kNumMaxTopK = 9; @@ -367,17 +571,22 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`"); #define DISPATCH_LAUNCH_CASE(hidden) { \ -auto dispatch_func = dispatch