From ce054aeaa21b0b52d67a749ab4b8073eb316e093 Mon Sep 17 00:00:00 2001
From: Pierangelo Di Pilato
Date: Tue, 28 Oct 2025 18:09:18 +0100
Subject: [PATCH] Add barrier before nvshmem_team_split_strided to ensure proper initialization

Move nvshmem_barrier_all() to execute before nvshmem_team_split_strided
rather than after. This is required because team split is a collective
operation that must be called by all PEs in the parent team, and all PEs
must reach this call in a synchronized manner.

Without the barrier after nvshmemx_init_attr(), ranks may complete
initialization at different times, leading to race conditions where some
PEs attempt to split teams before others have finished NVSHMEM
initialization. This can cause undefined behavior and incorrect team
formation.

The barrier after team split was unnecessary per NVSHMEM documentation:
teams are immediately usable after creation without intervening
synchronization.

Signed-off-by: Pierangelo Di Pilato
---
 csrc/kernels/runtime.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/csrc/kernels/runtime.cu b/csrc/kernels/runtime.cu
index c4fbb8ed..410348fd 100644
--- a/csrc/kernels/runtime.cu
+++ b/csrc/kernels/runtime.cu
@@ -53,6 +53,9 @@ int init(const std::vector<uint8_t>& root_unique_id_val, int rank, int num_ranks
     nvshmemx_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
     nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
 
+    // Initialize before nvshmem_team_split_strided
+    nvshmem_barrier_all();
+
     // Create sub-RDMA teams
     // NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
     if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
@@ -68,7 +71,6 @@ int init(const std::vector<uint8_t>& root_unique_id_val, int rank, int num_ranks
         EP_HOST_ASSERT(cpu_rdma_team != NVSHMEM_TEAM_INVALID);
     }
 
-    nvshmem_barrier_all();
     return nvshmem_my_pe();
 }
 