From 443bfa8692c945539bec21e2b910fc22f67f7bc0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:27:00 +0800 Subject: [PATCH 01/60] more --- csrc/deep_ep.cpp | 6 ++++++ csrc/deep_ep.hpp | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 9c90178b..8bae5e79 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -10,6 +10,12 @@ #include "kernels/api.cuh" #include "kernels/configs.cuh" +namespace shared_memory { + void get_mem_handle(bool enable_fabric, ) { + } + +} + namespace deep_ep { Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode): diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index dfa2202d..f984735c 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -20,6 +20,13 @@ #define TORCH_EXTENSION_NAME deep_ep_cpp #endif +namespace shared_memory { +typedef union { + cudaIpcMemHandle_t cuda_ipc_mem_handle; + CUmemFabricHandle cu_mem_fabric_handle; +} MemHandle; +} + namespace deep_ep { struct Buffer { From b986cce27bf39167488415fefcf0ee2dfee24e0c Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:27:50 +0800 Subject: [PATCH 02/60] more --- csrc/deep_ep.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 8bae5e79..6f71f71f 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,7 +11,12 @@ #include "kernels/configs.cuh" namespace shared_memory { - void get_mem_handle(bool enable_fabric, ) { + void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) { + if (enable_fabric) { + TODO; + } else { + CUDA_CHECK(cudaIpcGetMemHandle(handle, ptr)); + } } } @@ -52,7 +57,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles CUDA_CHECK(cudaMalloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); - CUDA_CHECK(cudaIpcGetMemHandle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); + CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals From 3ea6f58265cfccec4112bc00a195458dda5503d5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:29:26 +0800 Subject: [PATCH 03/60] more --- csrc/deep_ep.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6f71f71f..a9b72efc 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,6 +11,14 @@ #include "kernels/configs.cuh" namespace shared_memory { + void malloc(void** ptr, size_t size) { + if (enable_fabric) { + TODO; + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); + } + } + void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) { if (enable_fabric) { TODO; @@ -56,7 +64,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - CUDA_CHECK(cudaMalloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); + CUDA_CHECK(shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); buffer_ptrs_gpu 
= reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); From 5d3513bbd3fda45cbd83b40214ecaa736267f4f0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:29:37 +0800 Subject: [PATCH 04/60] more --- csrc/deep_ep.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index f984735c..3c4284f0 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -51,7 +51,7 @@ struct Buffer { int num_device_sms; int rank, rdma_rank, nvl_rank; int num_ranks, num_rdma_ranks, num_nvl_ranks; - cudaIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS]; + shared_memory::MemHandle ipc_handles[NUM_MAX_NVL_PEERS]; // Stream for communication at::cuda::CUDAStream comm_stream; From bda56951540b9ea4b7c110e27bb6f877cc69786b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:30:00 +0800 Subject: [PATCH 05/60] more --- csrc/deep_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a9b72efc..90304b09 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -23,7 +23,7 @@ namespace shared_memory { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcGetMemHandle(handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(handle->cuda_ipc_mem_handle, ptr)); } } From 3740762ccfe54edc147472c7c6af81196cfb1243 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:30:13 +0800 Subject: [PATCH 06/60] more --- csrc/deep_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 90304b09..d1a299eb 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -23,7 +23,7 @@ namespace shared_memory { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcGetMemHandle(handle->cuda_ipc_mem_handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(&handle->cuda_ipc_mem_handle, ptr)); } } From ad4aee8bfcaa3b6c02c0eda2ec5affb637574722 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:31:16 +0800 Subject: [PATCH 07/60] more --- csrc/deep_ep.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index d1a299eb..ab04202a 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -161,7 +161,8 @@ int Buffer::get_local_device_id() const { } pybind11::bytearray Buffer::get_local_ipc_handle() const { - return {ipc_handles[nvl_rank].reserved, CUDA_IPC_HANDLE_SIZE}; + const shared_memory::MemHandle& handle = ipc_handles[nvl_rank]; + return {reinterpret_cast(&handle), sizeof(handle)}; } pybind11::bytearray Buffer::get_local_nvshmem_unique_id() const { From b5e4aad6597e7d3487718a05bfd74dac1c7e3687 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:33:05 +0800 Subject: [PATCH 08/60] more --- csrc/deep_ep.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index ab04202a..3c8709a4 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -27,6 +27,13 @@ namespace shared_memory { } } + void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* handle) { + if (enable_fabric) { + TODO; + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle, cudaIpcMemLazyEnablePeerAccess)); + } + } } namespace deep_ep { @@ -198,7 +205,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == CUDA_IPC_HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE); - CUDA_CHECK(cudaIpcOpenMemHandle(&buffer_ptrs[i], ipc_handles[i], 
cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]));
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
             EP_HOST_ASSERT(std::memcmp(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE) == 0);

From 240d0582533b6b0b3864f1a9835e2596bd9fc2cd Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:41:25 +0800
Subject: [PATCH 09/60] more

---
 csrc/deep_ep.cpp | 6 +++---
 csrc/deep_ep.hpp | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 3c8709a4..7174e8a7 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -202,13 +202,13 @@ void Buffer::sync(const std::vector<int> &device_ids,
     for (int i = 0, offset = rdma_rank * num_nvl_ranks; i < num_nvl_ranks; ++ i) {
         EP_HOST_ASSERT(all_gathered_handles[offset + i].has_value());
         auto handle_str = std::string(all_gathered_handles[offset + i].value());
-        EP_HOST_ASSERT(handle_str.size() == CUDA_IPC_HANDLE_SIZE);
+        EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE);
         if (offset + i != rank) {
-            std::memcpy(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE);
+            std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
             CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]));
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
-            EP_HOST_ASSERT(std::memcmp(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE) == 0);
+            EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);
         }
     }
 
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 3c4284f0..dbb4df72 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -25,6 +25,8 @@ typedef union {
     cudaIpcMemHandle_t cuda_ipc_mem_handle;
     CUmemFabricHandle cu_mem_fabric_handle;
 } MemHandle;
+
+constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 }

From 5379d59f6889acdb2063df8177165c282348b4a1 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:41:58 +0800
Subject: [PATCH 10/60] more

---
 csrc/deep_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 7174e8a7..9667961c 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -31,7 +31,7 @@ namespace shared_memory {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle, cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
 }

From 4fc8e79646295def0d90b49859c44154a3ccbda9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:43:21 +0800
Subject: [PATCH 11/60] more

---
 csrc/deep_ep.cpp | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 9667961c..fa2e9def 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -19,6 +19,14 @@ namespace shared_memory {
         }
     }
 
+    void free(void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank]));
+        }
+    }
+
     void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) {
         if (enable_fabric) {
             TODO;
@@ -34,6 +42,14 @@ namespace shared_memory {
             CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
+
+    void close_mem_handle(bool enable_fabric, void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i])); + } + } } namespace deep_ep { @@ -71,8 +87,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - CUDA_CHECK(shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes)); - CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank])); + shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); + shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals @@ -118,11 +134,11 @@ Buffer::~Buffer() noexcept(false) { // Close remote IPC if (is_available()) { for (int i = 0; i < num_nvl_ranks; ++ i) if (i != nvl_rank) - CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i])); + shared_memory::close_mem_handle(buffer_ptrs[i]); } // Free local buffer and error flag - CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank])); + shared_memory::free(buffer_ptrs[nvl_rank])); } // Free NVSHMEM @@ -205,7 +221,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i])); + shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); From 2e90afea36cbfe613f89fd7be34268d3e0c3ed2d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:43:35 +0800 Subject: [PATCH 12/60] more --- csrc/deep_ep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index fa2e9def..453993f2 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -47,7 +47,7 @@ namespace shared_memory { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i])); + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } } From 3639a57cd511d4bb9539bc2aaef4938eaa0c6614 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:48:24 +0800 Subject: [PATCH 13/60] more --- csrc/deep_ep.cpp | 35 ++++++++++++++++++++++++++++++++++- csrc/kernels/exception.cuh | 10 ++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 453993f2..40cfff62 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -13,7 +13,40 @@ namespace shared_memory { void malloc(void** ptr, size_t size) { if (enable_fabric) { - TODO; + CUmemGenericAllocationHandle handle; + + int cudaDev; + CUDA_CHECK(cudaGetDevice(&cudaDev)); + + CUdevice currentDev; + CU_CHECK(cuDeviceGet(¤tDev, cudaDev)); + + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = currentDev; + + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size = (size + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + + 
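+            // CUDA VMM sequence: cuMemCreate() obtains physical memory,
+            // cuMemAddressReserve() reserves a VA range, cuMemMap() binds the
+            // two, and cuMemSetAccess() must grant access before first use.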
CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+
+            int device_count;
+            CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+            CUmemAccessDesc accessDesc[device_count];
+            for (int idx = 0; idx < device_count; ++idx) {
+                accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                accessDesc[idx].location.id = idx;
+                accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            }
+            CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
         } else {
             CUDA_CHECK(cudaMalloc(ptr, size));
         }
diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh
index 7db0ddb7..9eeedadd 100644
--- a/csrc/kernels/exception.cuh
+++ b/csrc/kernels/exception.cuh
@@ -31,6 +31,16 @@ do { \
 } while (0)
 #endif
 
+#ifndef CU_CHECK
+#define CU_CHECK(cmd) \
+do { \
+    CUresult e = (cmd); \
+    if (e != CUDA_SUCCESS) { \
+        throw EPException("CUDA", __FILE__, __LINE__, std::to_string(static_cast<int>(e))); \
+    } \
+} while (0)
+#endif
+
 #ifndef EP_HOST_ASSERT
 #define EP_HOST_ASSERT(cond) \
 do { \

From 4ef8f05afb241c810ee064438f98ef1a5a38f400 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:48:38 +0800
Subject: [PATCH 14/60] more

---
 csrc/deep_ep.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 40cfff62..5fbf20ac 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,8 +13,6 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-
             int cudaDev;
             CUDA_CHECK(cudaGetDevice(&cudaDev));
 
@@ -33,6 +31,7 @@ namespace shared_memory {
             size = (size + granularity - 1) & ~(granularity - 1);
             if (size == 0) size = granularity;
 
+            CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));

From 047656e98affce1903a9987f5fa0d789d1323bc8 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:49:13 +0800
Subject: [PATCH 15/60] more

---
 csrc/deep_ep.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 5fbf20ac..50b67923 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,17 +13,14 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            int cudaDev;
-            CUDA_CHECK(cudaGetDevice(&cudaDev));
-
-            CUdevice currentDev;
-            CU_CHECK(cuDeviceGet(&currentDev, cudaDev));
+            CUdevice device;
+            CURESULT_CHECK(cuCtxGetDevice(&device));
 
             CUmemAllocationProp prop = {};
             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
             prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-            prop.location.id = currentDev;
+            prop.location.id = device;
 
             size_t granularity = 0;
             CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));

From c21f36dccdd7ce0061cc3f362f65312095d9620a Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:51:01 +0800
Subject: [PATCH 16/60] more

---
 csrc/deep_ep.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 50b67923..cf5abd8f 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -50,7 +50,15 @@ namespace shared_memory {
 
     void free(void* ptr) {
         if (enable_fabric) {
-            TODO;
+            CUmemGenericAllocationHandle handle;
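+            // cuMemRetainAllocationHandle() recovers the allocation handle
+            // behind a VMM-mapped pointer and takes a reference on it; the
+            // cuMemRelease() below drops that reference again.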
CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); } else { CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank])); } From 7f3e4c088c268a700e928bf7a3edfd47f552d9c3 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:52:50 +0800 Subject: [PATCH 17/60] more --- csrc/deep_ep.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index cf5abd8f..0c7e63d4 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -64,19 +64,22 @@ namespace shared_memory { } } - void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) { + void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { if (enable_fabric) { - TODO; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); } else { - CUDA_CHECK(cudaIpcGetMemHandle(&handle->cuda_ipc_mem_handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->cuda_ipc_mem_handle, ptr)); } } - void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* handle) { + void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { TODO; } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } } From 92fb573703c5977c0c3af8adfd4f99d32aa927ba Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:54:01 +0800 Subject: [PATCH 18/60] more --- csrc/deep_ep.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0c7e63d4..e4b1d5e7 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -77,7 +77,24 @@ namespace shared_memory { void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { - TODO; + CUmemFabricHandle export_handle; + memcpy(&export_handle, output_buffer.data(), sizeof(export_handle)); + void *shm_addr = nullptr; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &export_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)&shm_addr, entry.length, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)shm_addr, entry.length, 0, handle, 0)); + + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc accessDesc[device_count]; + for (int device_id = 0; device_id < device_count; ++device_id) { + accessDesc[device_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc[device_id].location.id = device_id; + accessDesc[device_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } + CU_CHECK(cuMemSetAccess((CUdeviceptr)shm_addr, entry.length, accessDesc, device_count)); } else { CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } From 29f86f3537cf363207b5968b5f1a0db0c1b11314 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:55:06 +0800 Subject: [PATCH 19/60] more --- csrc/deep_ep.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index e4b1d5e7..dfbb103e 100644 --- 
a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -77,13 +77,12 @@ namespace shared_memory { void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { - CUmemFabricHandle export_handle; - memcpy(&export_handle, output_buffer.data(), sizeof(export_handle)); - void *shm_addr = nullptr; + TODO_size; + CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &export_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)&shm_addr, entry.length, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)shm_addr, entry.length, 0, handle, 0)); + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); int device_count; CUDA_CHECK(cudaGetDeviceCount(&device_count)); From 5557e70cb3562d31fd87c811d0c640e24aba6c18 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:56:07 +0800 Subject: [PATCH 20/60] more --- csrc/deep_ep.cpp | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index dfbb103e..6b530f4a 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,6 +11,20 @@ #include "kernels/configs.cuh" namespace shared_memory { + void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc accessDesc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc[idx].location.id = idx; + accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } + + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); + } + void malloc(void** ptr, size_t size) { if (enable_fabric) { CUdevice device; @@ -33,16 +47,7 @@ namespace shared_memory { CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc accessDesc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc[idx].location.id = idx; - accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); + cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaMalloc(ptr, size)); } @@ -87,13 +92,7 @@ namespace shared_memory { int device_count; CUDA_CHECK(cudaGetDeviceCount(&device_count)); - CUmemAccessDesc accessDesc[device_count]; - for (int device_id = 0; device_id < device_count; ++device_id) { - accessDesc[device_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc[device_id].location.id = device_id; - accessDesc[device_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - CU_CHECK(cuMemSetAccess((CUdeviceptr)shm_addr, entry.length, accessDesc, device_count)); + cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } From 9fd34e757bc5c35aee509a173d7d5d008a789d2b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:57:53 +0800 Subject: [PATCH 21/60] more --- csrc/deep_ep.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6b530f4a..a994f918 
100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -86,6 +86,7 @@ namespace shared_memory { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); From 64173931ff7887bd3b40dcca5100e83798348f3d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 15:58:39 +0800 Subject: [PATCH 22/60] more --- csrc/deep_ep.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a994f918..bf576ded 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -101,7 +101,15 @@ namespace shared_memory { void close_mem_handle(bool enable_fabric, void* ptr) { if (enable_fabric) { - TODO; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); } else { CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } From faaeaadfdaf741f482eb5b3292782fe85c2feeb0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:00:13 +0800 Subject: [PATCH 23/60] more --- csrc/deep_ep.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index bf576ded..ef4e940e 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -25,7 +25,7 @@ namespace shared_memory { CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); } - void malloc(void** ptr, size_t size) { + void malloc(bool enable_fabric, void** ptr, size_t size) { if (enable_fabric) { CUdevice device; CURESULT_CHECK(cuCtxGetDevice(&device)); @@ -53,7 +53,7 @@ namespace shared_memory { } } - void free(void* ptr) { + void free(bool enable_fabric, void* ptr) { if (enable_fabric) { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); @@ -101,15 +101,7 @@ namespace shared_memory { void close_mem_handle(bool enable_fabric, void* ptr) { if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); + free(true, ptr); } else { CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } From c38dbeddd903279cf6235c76e524e134ff250a59 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:00:53 +0800 Subject: [PATCH 24/60] more --- csrc/deep_ep.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index ef4e940e..e0d6e047 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -25,6 +25,18 @@ namespace shared_memory { CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); } + void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); + } + void malloc(bool enable_fabric, void** ptr, 
size_t size) { if (enable_fabric) { CUdevice device; @@ -55,17 +67,9 @@ namespace shared_memory { void free(bool enable_fabric, void* ptr) { if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); + cu_mem_free(ptr); } else { - CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank])); + CUDA_CHECK(cudaFree(ptr)); } } @@ -101,7 +105,7 @@ namespace shared_memory { void close_mem_handle(bool enable_fabric, void* ptr) { if (enable_fabric) { - free(true, ptr); + cu_mem_free(ptr); } else { CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } From dc74c0a9f3428c52201ad14bcbbcd64e304d8669 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:01:24 +0800 Subject: [PATCH 25/60] more --- csrc/deep_ep.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index e0d6e047..a48a7e15 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -56,9 +56,9 @@ namespace shared_memory { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaMalloc(ptr, size)); @@ -93,10 +93,6 @@ namespace shared_memory { CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - cu_mem_set_access_all(*ptr, size); } else { CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); From 61dea30b60ac87409148213a96e3bce0d5c2ab26 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:06:00 +0800 Subject: [PATCH 26/60] more --- csrc/deep_ep.cpp | 12 +++++++----- csrc/deep_ep.hpp | 10 ++++++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a48a7e15..54b3a941 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -74,28 +74,30 @@ namespace shared_memory { } void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { + mem_handle->size = TODO; + if (enable_fabric) { CUmemGenericAllocationHandle handle; CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->cuda_ipc_mem_handle, ptr)); + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } } void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { if (enable_fabric) { - TODO_size; + size_t size = mem_handle->size; CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); cu_mem_set_access_all(*ptr, size); } else { - 
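            // Legacy path: cudaIpcOpenMemHandle() maps a peer allocation that
            // was exported with cudaIpcGetMemHandle(); it only works for
            // cudaMalloc()-backed memory, so cuMemCreate() allocations must
            // use the fabric branch above.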
CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } } diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index dbb4df72..1a015f5a 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -21,10 +21,16 @@ #endif namespace shared_memory { -typedef union { + +union MemHandleInner { cudaIpcMemHandle_t cuda_ipc_mem_handle; CUmemFabricHandle cu_mem_fabric_handle; -} MemHandle; +}; + +struct MemHandle { + MemHandleInner inner; + size_t size; +}; constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); } From 7d4bc93e2d204cc1f16c10a1b67e594e8cb65491 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:07:00 +0800 Subject: [PATCH 27/60] more --- csrc/deep_ep.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 54b3a941..cf350ce3 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -74,7 +74,10 @@ namespace shared_memory { } void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { - mem_handle->size = TODO; + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + + mem_handle->size = size; if (enable_fabric) { CUmemGenericAllocationHandle handle; From 5b78f2243341a6bd314bfd98faefdf8cbad3e31e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:07:18 +0800 Subject: [PATCH 28/60] more --- csrc/deep_ep.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index cf350ce3..a16583b8 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -15,14 +15,14 @@ namespace shared_memory { int device_count; CUDA_CHECK(cudaGetDeviceCount(&device_count)); - CUmemAccessDesc accessDesc[device_count]; + CUmemAccessDesc access_desc[device_count]; for (int idx = 0; idx < device_count; ++idx) { - accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc[idx].location.id = idx; - accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; } - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count)); + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); } void cu_mem_free(void* ptr) { From 75351cd380ba45cff664683ee3312740a01968ad Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:08:34 +0800 Subject: [PATCH 29/60] more --- csrc/deep_ep.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a16583b8..991c94b0 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -37,7 +37,16 @@ namespace shared_memory { CU_CHECK(cuMemRelease(handle)); } - void malloc(bool enable_fabric, void** ptr, size_t size) { + void get_size_align_to_granularity(size_t size_raw) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; + } + + void malloc(bool enable_fabric, void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; CURESULT_CHECK(cuCtxGetDevice(&device)); @@ -47,12 +56,8 @@ namespace shared_memory { prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.requestedHandleTypes = 
CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - - size = (size + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; + + size_t size = get_size_align_to_granularity(size_raw); CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); From 7bb12d4e66569af1f8d0a6c37840e139cd8570b0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:08:49 +0800 Subject: [PATCH 30/60] more --- csrc/deep_ep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 991c94b0..fe099af2 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -37,7 +37,7 @@ namespace shared_memory { CU_CHECK(cuMemRelease(handle)); } - void get_size_align_to_granularity(size_t size_raw) { + void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { size_t granularity = 0; CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); @@ -57,7 +57,7 @@ namespace shared_memory { prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - size_t size = get_size_align_to_granularity(size_raw); + size_t size = get_size_align_to_granularity(size_raw, prop); CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); From 0e5a15509dceb76f82feea6aac775111cc28b4c5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:11:56 +0800 Subject: [PATCH 31/60] more --- csrc/deep_ep.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index fe099af2..42537a2a 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -46,6 +46,21 @@ namespace shared_memory { return size; } + bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } + } + + return true; + } + void malloc(bool enable_fabric, void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; @@ -56,7 +71,7 @@ namespace shared_memory { prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - + size_t size = get_size_align_to_granularity(size_raw, prop); CUmemGenericAllocationHandle handle; From 87b398034dee42cde66fa362e70f992f2ca08ffa Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:13:52 +0800 Subject: [PATCH 32/60] more --- csrc/deep_ep.cpp | 10 +++++----- csrc/deep_ep.hpp | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 42537a2a..0b2cbbe1 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -61,7 +61,7 @@ namespace shared_memory { return true; } - void malloc(bool enable_fabric, void** ptr, size_t size_raw) { + void malloc(void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; CURESULT_CHECK(cuCtxGetDevice(&device)); @@ -85,7 +85,7 @@ namespace shared_memory { } } - void free(bool enable_fabric, void* ptr) { + void free(void* ptr) { if (enable_fabric) { cu_mem_free(ptr); } else { @@ -93,7 +93,7 @@ namespace shared_memory { } } - void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) { + void 
get_mem_handle(MemHandle* mem_handle, void* ptr) { size_t size = 0; CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); @@ -109,7 +109,7 @@ namespace shared_memory { } } - void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) { + void open_mem_handle(void** ptr, MemHandle* mem_handle) { if (enable_fabric) { size_t size = mem_handle->size; @@ -124,7 +124,7 @@ namespace shared_memory { } } - void close_mem_handle(bool enable_fabric, void* ptr) { + void close_mem_handle(void* ptr) { if (enable_fabric) { cu_mem_free(ptr); } else { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 1a015f5a..017c465a 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -33,6 +33,11 @@ struct MemHandle { }; constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); + +class SharedMemoryAllocator { +private: + bool enable_fabric; +}; } namespace deep_ep { From 4398b5ce9737b0f2e2d92f9897a981f5567e5a6d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:14:21 +0800 Subject: [PATCH 33/60] more --- csrc/deep_ep.cpp | 189 ++++++++++++++++++++++++----------------------- csrc/deep_ep.hpp | 4 - 2 files changed, 98 insertions(+), 95 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0b2cbbe1..bae8a622 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,127 +11,134 @@ #include "kernels/configs.cuh" namespace shared_memory { - void cu_mem_set_access_all(void* ptr, size_t size) { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc access_desc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc[idx].location.id = idx; - access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc access_desc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; } - void cu_mem_free(void* ptr) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +} - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); - } + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); +} - size_t size = (size_raw + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; - return size; - } +void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); 
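    // Align-up idiom: the granularity returned above is in practice a power
    // of two, so (size_raw + granularity - 1) & ~(granularity - 1) rounds
    // size_raw up to the next multiple, and a zero-sized request is bumped
    // to one full granule.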
- bool support_fabric() { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; +} - for (int device = 0; device < device_count; ++device) { - int support = 0; - CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if (!support) { - return false; - } - } +bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - return true; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - void malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + return true; +} - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; +class SharedMemoryAllocator { +public: + void malloc(void** ptr, size_t size_raw); +private: + bool enable_fabric; +}; - size_t size = get_size_align_to_granularity(size_raw, prop); +void malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); - } + size_t size = get_size_align_to_granularity(size_raw, prop); + + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); } +} - void free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); - } +void free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaFree(ptr)); } +} - void get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); - } + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + 
CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } +} - void open_mem_handle(void** ptr, MemHandle* mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; +void open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); - } + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } +} - void close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); - } +void close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } +} namespace deep_ep { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 017c465a..14bbaf7a 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -34,10 +34,6 @@ struct MemHandle { constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); -class SharedMemoryAllocator { -private: - bool enable_fabric; -}; } namespace deep_ep { From d7e9ce380c689a5ca42de187a6cdf6633fdf5d5a Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:16:08 +0800 Subject: [PATCH 34/60] more --- csrc/deep_ep.cpp | 189 +++++++++++++++++++++++------------------------ csrc/deep_ep.hpp | 10 +++ 2 files changed, 101 insertions(+), 98 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index bae8a622..a2650617 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,134 +11,127 @@ #include "kernels/configs.cuh" namespace shared_memory { -void cu_mem_set_access_all(void* ptr, size_t size) { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc access_desc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc[idx].location.id = idx; - access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } + void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc access_desc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); -} + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); + } -void cu_mem_free(void* ptr) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - size_t size = 0; - 
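    // cuMemGetAddressRange() reports the base address and size of the VA
    // range backing ptr, so the unmap / address-free below cover the full
    // mapping.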
CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); -} + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); + } -void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - size_t size = (size_raw + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; - return size; -} + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; + } -bool support_fabric() { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); + bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - for (int device = 0; device < device_count; ++device) { - int support = 0; - CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if (!support) { - return false; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - } - - return true; -} -class SharedMemoryAllocator { -public: - void malloc(void** ptr, size_t size_raw); -private: - bool enable_fabric; -}; + return true; + } -void malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - size_t size = get_size_align_to_granularity(size_raw, prop); + size_t size = get_size_align_to_granularity(size_raw, prop); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); + } } -} -void free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); + void SharedMemoryAllocator::free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + 
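        // Non-fabric memory was obtained with plain cudaMalloc(), so
        // cudaFree() is the matching release call.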
CUDA_CHECK(cudaFree(ptr)); + } } -} -void get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); + } } -} -void open_mem_handle(void** ptr, MemHandle* mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; + void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); + } } -} -void close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); + void SharedMemoryAllocator::close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); + } } } -} namespace deep_ep { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 14bbaf7a..5df87429 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -34,6 +34,16 @@ struct MemHandle { constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); +class SharedMemoryAllocator { +public: + void malloc(void** ptr, size_t size); + void free(void* ptr); + void get_mem_handle(MemHandle* mem_handle, void* ptr); + void open_mem_handle(void** ptr, MemHandle* mem_handle); + void close_mem_handle(void* ptr); +private: + bool enable_fabric; +}; } namespace deep_ep { From 5b83cb85878f39b897432f9b38184bcd17a36f9d Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:16:41 +0800 Subject: [PATCH 35/60] more --- csrc/deep_ep.cpp | 2 ++ csrc/deep_ep.hpp | 1 + 2 files changed, 3 insertions(+) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index a2650617..6e2c084f 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -60,6 +60,8 @@ namespace shared_memory { return true; } + + SharedMemoryAllocator::SharedMemoryAllocator() : 
enable_fabric(support_fabric()) {} void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { if (enable_fabric) { diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 5df87429..a8c73f75 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -36,6 +36,7 @@ constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); class SharedMemoryAllocator { public: + SharedMemoryAllocator(); void malloc(void** ptr, size_t size); void free(void* ptr); void get_mem_handle(MemHandle* mem_handle, void* ptr); From f024df5938cb2f81060974cd4ef238b2e596f1bf Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:16:45 +0800 Subject: [PATCH 36/60] more --- csrc/deep_ep.cpp | 186 +++++++++++++++++++++++------------------------ 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6e2c084f..6d6deda4 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -11,129 +11,129 @@ #include "kernels/configs.cuh" namespace shared_memory { - void cu_mem_set_access_all(void* ptr, size_t size) { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); - - CUmemAccessDesc access_desc[device_count]; - for (int idx = 0; idx < device_count; ++idx) { - access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access_desc[idx].location.id = idx; - access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - } - - CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +void cu_mem_set_access_all(void* ptr, size_t size) { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + + CUmemAccessDesc access_desc[device_count]; + for (int idx = 0; idx < device_count; ++idx) { + access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc[idx].location.id = idx; + access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; } - void cu_mem_free(void* ptr) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count)); +} - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void cu_mem_free(void* ptr) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - CU_CHECK(cuMemRelease(handle)); - } + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + CU_CHECK(cuMemRelease(handle)); +} - size_t size = (size_raw + granularity - 1) & ~(granularity - 1); - if (size == 0) size = granularity; - return size; - } +void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - bool support_fabric() { - int device_count; - CUDA_CHECK(cudaGetDeviceCount(&device_count)); + size_t size = (size_raw + granularity - 1) & ~(granularity - 1); + if (size == 0) size = granularity; + return size; +} - for (int device = 0; device < device_count; ++device) { - int support = 0; - CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if 
(!support) { - return false; - } - } +bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - return true; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - - SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {} - void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + return true; +} - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; +SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {} - size_t size = get_size_align_to_granularity(size_raw, prop); +void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); - } + size_t size = get_size_align_to_granularity(size_raw, prop); + + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); } +} - void SharedMemoryAllocator::free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); - } +void SharedMemoryAllocator::free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaFree(ptr)); } +} - void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); - } + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } +} - void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; +void 
SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); - } + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } +} - void SharedMemoryAllocator::close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); - } +void SharedMemoryAllocator::close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } +} namespace deep_ep { From 5a7b2f2ab15ceb2492d9bd13eb6300a5fc71325e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:20:10 +0800 Subject: [PATCH 37/60] more --- csrc/deep_ep.cpp | 10 +++++----- csrc/deep_ep.hpp | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6d6deda4..8bdfc3e9 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -170,8 +170,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); - shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); + shared_memory_allocator.malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); + shared_memory_allocator.get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals @@ -217,11 +217,11 @@ Buffer::~Buffer() noexcept(false) { // Close remote IPC if (is_available()) { for (int i = 0; i < num_nvl_ranks; ++ i) if (i != nvl_rank) - shared_memory::close_mem_handle(buffer_ptrs[i]); + shared_memory_allocator.close_mem_handle(buffer_ptrs[i]); } // Free local buffer and error flag - shared_memory::free(buffer_ptrs[nvl_rank])); + shared_memory_allocator.free(buffer_ptrs[nvl_rank])); } // Free NVSHMEM @@ -304,7 +304,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); + shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); diff --git 
a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index a8c73f75..8ceee158 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -98,6 +98,8 @@ struct Buffer { volatile int* moe_recv_rdma_counter = nullptr; int* moe_recv_rdma_counter_mapped = nullptr; + SharedMemoryAllocator shared_memory_allocator; + public: Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode); From 60523797d95a353b61754e885654115ff1123858 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:22:32 +0800 Subject: [PATCH 38/60] more --- csrc/kernels/exception.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh index 9eeedadd..4be59122 100644 --- a/csrc/kernels/exception.cuh +++ b/csrc/kernels/exception.cuh @@ -36,7 +36,7 @@ do { \ do { \ CUresult e = (cmd); \ if (e != CUDA_SUCCESS) { \ - throw EPException("CUDA", __FILE__, __LINE__, std::string(e)); \ + throw EPException("CU", __FILE__, __LINE__, cuGetErrorName(e)); \ } \ } while (0) #endif From befcd27066f1524737892446900113344dd22dbd Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:22:50 +0800 Subject: [PATCH 39/60] more --- csrc/kernels/exception.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh index 4be59122..80aae935 100644 --- a/csrc/kernels/exception.cuh +++ b/csrc/kernels/exception.cuh @@ -36,7 +36,7 @@ do { \ do { \ CUresult e = (cmd); \ if (e != CUDA_SUCCESS) { \ - throw EPException("CU", __FILE__, __LINE__, cuGetErrorName(e)); \ + throw EPException("CU", __FILE__, __LINE__, cuGetErrorString(e)); \ } \ } while (0) #endif From df598ea7ac8306ab8a80a5130133eab3650d6fd5 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:24:54 +0800 Subject: [PATCH 40/60] more --- csrc/kernels/exception.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh index 80aae935..3026374b 100644 --- a/csrc/kernels/exception.cuh +++ b/csrc/kernels/exception.cuh @@ -36,7 +36,9 @@ do { \ do { \ CUresult e = (cmd); \ if (e != CUDA_SUCCESS) { \ - throw EPException("CU", __FILE__, __LINE__, cuGetErrorString(e)); \ + const char *error_str = NULL; \ + cuGetErrorString(e, &error_str); \ + throw EPException("CU", __FILE__, __LINE__, std::string(error_str)); \ } \ } while (0) #endif From 5b23a8ad2190514697523f375d8e18b2571aff4b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:25:15 +0800 Subject: [PATCH 41/60] more --- csrc/deep_ep.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 8ceee158..9b99d5e8 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -32,7 +32,7 @@ struct MemHandle { size_t size; }; -constexpr usize_t HANDLE_SIZE = sizeof(MemHandle); +constexpr size_t HANDLE_SIZE = sizeof(MemHandle); class SharedMemoryAllocator { public: From 210e4997026bd372b69632f3589551a454aea81f Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:26:28 +0800 Subject: [PATCH 42/60] more --- csrc/deep_ep.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index 9b99d5e8..185fd33b 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -98,7 +98,7 @@ struct Buffer { volatile int* moe_recv_rdma_counter = nullptr; int* moe_recv_rdma_counter_mapped = nullptr; - SharedMemoryAllocator shared_memory_allocator; + shared_memory::SharedMemoryAllocator shared_memory_allocator; 
public: Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode); From 379ac2447d8415679c493edd8d78855d8e742d5e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:28:11 +0800 Subject: [PATCH 43/60] more --- csrc/deep_ep.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 8bdfc3e9..0e725f88 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -37,10 +37,7 @@ void cu_mem_free(void* ptr) { CU_CHECK(cuMemRelease(handle)); } -void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) { - size_t granularity = 0; - CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - +size_t get_size_align_to_granularity(size_t size_raw, size_t granularity) { size_t size = (size_raw + granularity - 1) & ~(granularity - 1); if (size == 0) size = granularity; return size; @@ -66,7 +63,7 @@ SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { if (enable_fabric) { CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + CU_CHECK(cuCtxGetDevice(&device)); CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; @@ -74,7 +71,10 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.location.id = device; - size_t size = get_size_align_to_granularity(size_raw, prop); + size_t granularity = 0; + CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t size = get_size_align_to_granularity(size_raw, granularity); CUmemGenericAllocationHandle handle; CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); From 43999dc0d16aaf1436b168ec10f803c4eeaa3142 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:29:02 +0800 Subject: [PATCH 44/60] more --- csrc/deep_ep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0e725f88..40b3cd1e 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -303,11 +303,11 @@ void Buffer::sync(const std::vector &device_ids, auto handle_str = std::string(all_gathered_handles[offset + i].value()); EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { - std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); + std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { - EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); + EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); } } From 791601101bdfb82881b6e7446c17b6bbf9c28815 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:30:16 +0800 Subject: [PATCH 45/60] more --- csrc/deep_ep.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 40b3cd1e..d872c037 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -83,7 +83,7 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); cu_mem_set_access_all(*ptr, size); } else { - CUDA_CHECK(cudaMalloc(ptr, size)); + 
CUDA_CHECK(cudaMalloc(ptr, size_raw)); } } @@ -221,7 +221,7 @@ Buffer::~Buffer() noexcept(false) { } // Free local buffer and error flag - shared_memory_allocator.free(buffer_ptrs[nvl_rank])); + shared_memory_allocator.free(buffer_ptrs[nvl_rank]); } // Free NVSHMEM @@ -304,7 +304,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); + shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0); From 0525f8f79e8b96b8e934130d01bd853c3386b5c2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:44:00 +0800 Subject: [PATCH 46/60] more --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b16310a7..93294f74 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ include_dirs = ['csrc/'] library_dirs = [] nvcc_dlink = [] - extra_link_args = [] + extra_link_args = ['-lcuda'] # NVSHMEM flags if disable_nvshmem: From 2bf764cfd442b2423a13d423bd72135a03187896 Mon Sep 17 00:00:00 2001 From: shifangx Date: Thu, 24 Jul 2025 22:14:17 -0700 Subject: [PATCH 47/60] support NVFP4 data format in low latency dispatch --- csrc/deep_ep.cpp | 26 +++- csrc/deep_ep.hpp | 4 +- csrc/kernels/api.cuh | 4 +- csrc/kernels/internode_ll.cu | 237 ++++++++++++++++++++++++++++++++--- deep_ep/buffer.py | 19 ++- tests/test_low_latency.py | 52 ++++++-- tests/utils.py | 62 ++++++++- 7 files changed, 367 insertions(+), 37 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 0789cd58..67393fbc 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -1087,12 +1087,14 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int #endif } -std::tuple, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> +std::tuple, std::optional, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx, const std::optional& cumulative_local_expert_recv_stats, const std::optional& dispatch_wait_recv_cost_stats, + const std::optional& x_sf_scale, int num_max_dispatch_tokens_per_rank, int num_experts, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, bool async, bool return_recv_hook) { #ifndef DISABLE_NVSHMEM EP_HOST_ASSERT(low_latency_mode); @@ -1137,8 +1139,9 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i stream_wait(launch_stream, compute_stream); // Allocate packed tensors - auto packed_recv_x = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden}, - x.options().dtype(use_fp8 ? torch::kFloat8_e4m3fn: torch::kBFloat16)); + constexpr int NUM_ELEMS_PER_PACK = 8; + auto packed_recv_x = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, use_nvfp4 ? hidden / NUM_ELEMS_PER_PACK : hidden}, + x.options().dtype(use_nvfp4 ? torch::kInt32 : (use_fp8 ? 
torch::kFloat8_e4m3fn: torch::kBFloat16))); auto packed_recv_src_info = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank}, torch::dtype(torch::kInt32).device(torch::kCUDA)); auto packed_recv_layout_range = torch::empty({num_local_experts, num_ranks}, torch::dtype(torch::kInt64).device(torch::kCUDA)); auto packed_recv_count = torch::empty({num_local_experts}, torch::dtype(torch::kInt32).device(torch::kCUDA)); @@ -1146,6 +1149,8 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i // Allocate column-majored scales auto packed_recv_x_scales = std::optional(); void* packed_recv_x_scales_ptr = nullptr; + auto packed_recv_x_sf_scale = std::optional(); + void* packed_recv_x_sf_scale_ptr = nullptr; EP_HOST_ASSERT((num_ranks * num_max_dispatch_tokens_per_rank) % 4 == 0 and "TMA requires the number of tokens to be multiple of 4"); if (use_fp8) { @@ -1161,16 +1166,26 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i } packed_recv_x_scales = torch::transpose(packed_recv_x_scales.value(), 1, 2); packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr(); + }else if (use_nvfp4) { + constexpr int SF_VEC_SIZE = 16; + constexpr int NUM_SF_ELEMS_PER_PACK = 4; + packed_recv_x_scales = torch::empty({num_local_experts, hidden / (SF_VEC_SIZE * NUM_SF_ELEMS_PER_PACK), num_ranks * num_max_dispatch_tokens_per_rank}, + torch::dtype(torch::kInt).device(torch::kCUDA)); + packed_recv_x_scales = torch::transpose(packed_recv_x_scales.value(), 1, 2); + packed_recv_x_scales_ptr = packed_recv_x_scales->data_ptr(); + packed_recv_x_sf_scale = torch::empty({num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank}, torch::dtype(torch::kFloat32).device(torch::kCUDA)); + packed_recv_x_sf_scale_ptr = packed_recv_x_sf_scale->data_ptr(); } // Kernel launch auto next_clean_meta = next_buffer.clean_meta(); auto launcher = [=](int phases) { - internode_ll::dispatch(packed_recv_x.data_ptr(), packed_recv_x_scales_ptr, + internode_ll::dispatch(packed_recv_x.data_ptr(), packed_recv_x_scales_ptr, packed_recv_x_sf_scale_ptr, packed_recv_src_info.data_ptr(), packed_recv_layout_range.data_ptr(), packed_recv_count.data_ptr(), cumulative_local_expert_recv_stats.has_value() ? cumulative_local_expert_recv_stats->data_ptr() : nullptr, dispatch_wait_recv_cost_stats.has_value() ? dispatch_wait_recv_cost_stats->data_ptr() : nullptr, + x_sf_scale.has_value() ? 
x_sf_scale->data_ptr() : nullptr, buffer.dispatch_rdma_recv_data_buffer, buffer.dispatch_rdma_recv_count_buffer, buffer.dispatch_rdma_send_buffer, x.data_ptr(), topk_idx.data_ptr(), @@ -1178,6 +1193,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i num_tokens, hidden, num_max_dispatch_tokens_per_rank, num_topk, num_experts, rank, num_ranks, use_fp8, round_scale, use_ue8m0, + use_nvfp4, use_ue8m0_for_nvfp4_sf, workspace, num_device_sms, launch_stream, phases); }; @@ -1199,7 +1215,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i recv_hook = [=]() { launcher(LOW_LATENCY_RECV_PHASE); }; // Return values - return {packed_recv_x, packed_recv_x_scales, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, recv_hook}; + return {packed_recv_x, packed_recv_x_scales, packed_recv_x_sf_scale, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, recv_hook}; #else EP_HOST_ASSERT(false and "NVSHMEM is disabled during compilation"); return {}; diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp index aa62ccb0..27ff4951 100644 --- a/csrc/deep_ep.hpp +++ b/csrc/deep_ep.hpp @@ -143,12 +143,14 @@ struct Buffer { void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts); - std::tuple, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> + std::tuple, std::optional, torch::Tensor, torch::Tensor, torch::Tensor, std::optional, std::optional>> low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx, const std::optional& cumulative_local_expert_recv_stats, const std::optional& dispatch_wait_recv_cost_stats, + const std::optional& x_sf_scale, int num_max_dispatch_tokens_per_rank, int num_experts, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, bool async, bool return_recv_hook); std::tuple, std::optional>> diff --git a/csrc/kernels/api.cuh b/csrc/kernels/api.cuh index d34775fd..6540443c 100644 --- a/csrc/kernels/api.cuh +++ b/csrc/kernels/api.cuh @@ -139,17 +139,19 @@ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0, int* clean_1, int num_clean_int_1, cudaStream_t stream); -void dispatch(void* packed_recv_x, void* packed_recv_x_scales, +void dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf_scale, int* packed_recv_src_info, int64_t* packed_recv_layout_range, int* packed_recv_count, int* cumulative_local_expert_recv_stats, int64_t* dispatch_wait_recv_cost_stats, + const float* x_sf_scale, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, const void* x, const int64_t* topk_idx, int* next_clean, int num_next_clean_int, int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, int rank, int num_ranks, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, void* workspace, int num_device_sms, cudaStream_t stream, int phases); diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu index 391a4b3d..f13880f7 100644 --- a/csrc/kernels/internode_ll.cu +++ b/csrc/kernels/internode_ll.cu @@ -36,13 +36,158 @@ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0, clean_0, num_clean_int_0, clean_1, num_clean_int_1); } -template +constexpr int CVT_ELTS_PER_THREAD = 8; +constexpr int SF_VEC_SIZE = 16; + +struct PackedVec { + __nv_bfloat162 elts[4]; +}; + +using Type = __nv_bfloat16; + +__device__ __forceinline__ float exp2f_rcp(uint8_t exp) { + 
constexpr uint32_t FP32_EXPONENT_BIAS = 127; + return (exp == 0) ? 1 : exp2f(FP32_EXPONENT_BIAS - static_cast(exp)); +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), "f"(array[2].x), + "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; + #else + // static_assert(false, "not supported."); + return 0; + #endif +} + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), "f"(array[4]), "f"(array[5]), + "f"(array[6]), "f"(array[7])); + return val; + #else + // static_assert(false, "not supported."); + return 0; + #endif +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, uint8_t* SFout) { + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(vec.elts[0]); + +// Local maximum value. +#pragma unroll + for (int i = 1; i < CVT_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(vec.elts[i])); + } + + constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD; + EP_STATIC_ASSERT(CVT_NUM_THREADS_PER_SF == 2 or CVT_NUM_THREADS_PER_SF == 4, "Invalid number of threads per SF"); + // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32). + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + if constexpr (CVT_NUM_THREADS_PER_SF == 4) { + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 2), localMax); + } + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // 8 bits representation of the SF. + uint8_t fp8SFVal; + float outputScale; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + __nv_fp8_e8m0 tmp; + // Scale the max value to the range of E2m1. + vecMax *= reciprocal_approximate_ftz(6.0f); + tmp.__x = __nv_cvt_float_to_e8m0(vecMax, __NV_SATFINITE, cudaRoundPosInf); + fp8SFVal = tmp.__x; + outputScale = exp2f_rcp(fp8SFVal); + } else { + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. 
+ auto SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + fp8SFVal = tmp.__x; + SFValue = static_cast(tmp); + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal)) * reciprocal(SFScaleVal)) + outputScale = SFValue != 0 + ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + } + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. + float2 fp2Vals[CVT_ELTS_PER_THREAD / 2]; + +#pragma unroll + for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) { + fp2Vals[i] = __bfloat1622float2(vec.elts[i]); + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +} + +template __global__ __launch_bounds__(1024, 1) void -dispatch(void* packed_recv_x, void* packed_recv_x_scales, +dispatch(void* packed_recv_x, void* packed_recv_x_scales, void* packed_recv_x_sf_scale, int* packed_recv_src_info, int64_t* packed_recv_layout_range, int* packed_recv_count, int* cumulative_local_expert_recv_stats, int64_t* dispatch_wait_recv_cost_stats, + const float* x_sf_scale, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, const void* x, const int64_t* topk_idx, int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, @@ -62,20 +207,28 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id; // May extract UE8M0 from the scales - using scale_t = std::conditional_t; - using packed_t = std::conditional_t; + using scale_t = std::conditional_t; + using packed_t = std::conditional_t; EP_STATIC_ASSERT(sizeof(packed_t) % sizeof(scale_t) == 0, "Invalid vector length"); + EP_STATIC_ASSERT(!(kUseFP8 && kUseNVFP4), "FP8 and NVFP4 cannot be used together"); // FP8 staffs - constexpr int kNumPerChannels = 128; + constexpr int kNumPerChannels = kUseNVFP4 ? 16 : 128; const int num_scales = kHidden / kNumPerChannels; - const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16)); + constexpr size_t hidden_bytes = + kUseNVFP4 + ? kHidden * sizeof(__nv_fp8_storage_t) / 2 + : kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16)); const size_t hidden_int4 = hidden_bytes / sizeof(int4); // Message package: index at source (int), 3 reserved int fields, hidden data, FP8 scales // NOTES: currently we have 3 reserved int fields for future use - using vec_t = std::conditional_t; - const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16))); + using vec_t = std::conditional_t< + kUseNVFP4, + int32_t, + std::conditional_t>; + using rdma_x_scale_t = std::conditional_t; + const size_t num_bytes_per_msg = sizeof(int4) + ((kUseFP8 || kUseNVFP4) ? 
(hidden_bytes + num_scales * sizeof(rdma_x_scale_t)) : hidden_bytes); const size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4); EP_DEVICE_ASSERT(num_bytes_per_msg % sizeof(int4) == 0); @@ -100,12 +253,24 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) { const auto x_int4 = static_cast(x) + token_idx * hidden_bf16_int4; const auto rdma_x_src_idx = reinterpret_cast(static_cast(rdma_x) + token_idx * num_bytes_per_msg); + const auto rdma_x_sf_scale = reinterpret_cast(reinterpret_cast(rdma_x_src_idx) + sizeof(int)); const auto rdma_x_vec = reinterpret_cast(reinterpret_cast(rdma_x_src_idx) + sizeof(int4)); - const auto rdma_x_scales = reinterpret_cast(reinterpret_cast(rdma_x_vec) + hidden_bytes); + const auto rdma_x_scales = reinterpret_cast(reinterpret_cast(rdma_x_vec) + hidden_bytes); // Overlap top-k index read and source token index writes auto dst_expert_idx = warp_id < num_topk ? static_cast(__ldg(topk_idx + token_idx * num_topk + warp_id)) : -1; thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0; + float SFScaleVal = 1.0f; + if constexpr (kUseNVFP4) { + // Get scaling value: if x_sf_scale is nullptr, use 1.0f; otherwise, read value at token_idx + if (x_sf_scale != nullptr) { + SFScaleVal = *(static_cast(x_sf_scale) + token_idx); + } + // Only thread 0 writes scaling value to rdma_x_sf_scale + if (thread_id == 0) { + *rdma_x_sf_scale = SFScaleVal; + } + } // FP8 cast EP_STATIC_ASSERT(hidden_bf16_int4 % 32 == 0, "Must use the full warp to reduce"); @@ -141,6 +306,20 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, fp8x2_values[j / 2] = __nv_cvt_float2_to_fp8x2(fp32x2, __NV_SATFINITE, __NV_E4M3); } rdma_x_vec[i] = int2_value; + } else if constexpr (kUseNVFP4) { + // Convert to NVFP4 + uint8_t sf_val; + PackedVec vec = *reinterpret_cast(&int4_value); + uint32_t result = cvt_warp_fp16_to_fp4(vec, SFScaleVal, &sf_val); + + // Write scale to send buffer + if (lane_id % 2 == 0){ + EP_DEVICE_ASSERT((i * kNumElemsPerRead) % 16 == 0); + int rdma_x_scale_idx = i * kNumElemsPerRead / 16; + rdma_x_scales[rdma_x_scale_idx] = sf_val; + } + // Cast into send buffer + rdma_x_vec[i] = *reinterpret_cast(&result); } else { // Reinterpret-cast is for C++14 compatibility rdma_x_vec[i] = *reinterpret_cast(&int4_value); @@ -262,6 +441,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, const auto recv_x_int4 = static_cast(packed_recv_x) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_int4; const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank; + const auto recv_sf_scale = static_cast(packed_recv_x_sf_scale) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank; const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks; const auto num_aligned_scales = align(num_scales, sizeof(float) / sizeof(scale_t)); const auto recv_x_scales = static_cast(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales; @@ -294,12 +474,17 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, recv_token_begin_idx = shared_recv_token_begin_idx[warp_group_id]; // Copy tokens - EP_DEVICE_ASSERT(num_scales <= 64); for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) { // Copy source info const auto src_src_idx = reinterpret_cast(rdma_recv_x_uint8 + i * num_bytes_per_msg); if (lane_id == 0) recv_src_info[recv_token_begin_idx + i] 
= ld_nc_global(src_src_idx); + if constexpr (kUseNVFP4) { + const auto src_sf_scale_for_nvfp4 = reinterpret_cast(rdma_recv_x_uint8 + i * num_bytes_per_msg + sizeof(int)); + if (lane_id == 0) + recv_sf_scale[recv_token_begin_idx + i] = ld_nc_global(src_sf_scale_for_nvfp4); + } + __syncwarp(); // Copy data @@ -310,6 +495,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, // Copy scales if constexpr (kUseFP8) { + EP_DEVICE_ASSERT(num_scales <= 64); // Equivalent CuTe layout: // (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1)) const auto src_scales = reinterpret_cast(reinterpret_cast(src_data) + hidden_bytes); @@ -329,22 +515,40 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, auto scale = extract_required_scale_format(ld_nc_global(src_scales + lane_id + 32)); recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale; } + } else if constexpr (kUseNVFP4) { + // Equivalent CuTe layout: + // (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1)) + const auto src_scales = reinterpret_cast(reinterpret_cast(src_data) + hidden_bytes); + const auto num_elems_per_pack = static_cast(sizeof(packed_t) / sizeof(scale_t)); + const auto token_idx = recv_token_begin_idx + i; + const auto token_stride = num_elems_per_pack; + const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack; + #pragma unroll + for (int j = lane_id; j < num_scales; j += 32) { + const auto pack_idx = j / num_elems_per_pack; + const auto elem_idx = j % num_elems_per_pack; + auto scale = ld_nc_global(src_scales + j); + recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale; + } } } } } void dispatch(void* packed_recv_x, void* packed_recv_x_scales, + void* packed_recv_x_sf_scale, int* packed_recv_src_info, int64_t* packed_recv_layout_range, int* packed_recv_count, int* cumulative_local_expert_recv_stats, int64_t* dispatch_wait_recv_cost_stats, + const float* x_sf_scale, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, const void* x, const int64_t* topk_idx, int* next_clean, int num_next_clean_int, int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, int rank, int num_ranks, bool use_fp8, bool round_scale, bool use_ue8m0, + bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf, void* workspace, int num_device_sms, cudaStream_t stream, int phases) { constexpr int kNumMaxTopK = 9; @@ -367,17 +571,22 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`"); #define DISPATCH_LAUNCH_CASE(hidden) { \ -auto dispatch_func = dispatch