From ce054aeaa21b0b52d67a749ab4b8073eb316e093 Mon Sep 17 00:00:00 2001
From: Pierangelo Di Pilato <pierdipi@redhat.com>
Date: Tue, 28 Oct 2025 18:09:18 +0100
Subject: [PATCH] Move barrier before nvshmem_team_split_strided to ensure
 proper initialization

Move nvshmem_barrier_all() to execute before nvshmem_team_split_strided
rather than after. This is required because team split is a collective
operation that must be called by all PEs in the parent team, and all
PEs must reach this call in a synchronized manner.

Without the barrier after nvshmemx_init_attr(), ranks may complete
initialization at different times, leading to race conditions where
some PEs attempt to split teams before others have finished NVSHMEM
initialization. This can cause undefined behavior and incorrect team
formation.

According to the NVSHMEM documentation, the barrier after the team split
was unnecessary: teams are usable immediately after creation, without
any intervening synchronization.

Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com>
---
 csrc/kernels/runtime.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/csrc/kernels/runtime.cu b/csrc/kernels/runtime.cu
index c4fbb8ed..410348fd 100644
--- a/csrc/kernels/runtime.cu
+++ b/csrc/kernels/runtime.cu
@@ -53,6 +53,9 @@ int init(const std::vector<uint8_t>& root_unique_id_val, int rank, int num_ranks
     nvshmemx_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
     nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
 
+    // Ensure every PE has finished NVSHMEM init before the collective team split
+    nvshmem_barrier_all();
+
     // Create sub-RDMA teams
     // NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
     if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
@@ -68,7 +71,6 @@ int init(const std::vector<uint8_t>& root_unique_id_val, int rank, int num_ranks
         EP_HOST_ASSERT(cpu_rdma_team != NVSHMEM_TEAM_INVALID);
     }
 
-    nvshmem_barrier_all();
     return nvshmem_my_pe();
 }