From aa9821bfdbf6b78cb38b54505fe3f644f8b9fea8 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sun, 11 Jan 2026 13:02:18 -0800 Subject: [PATCH 01/57] Add Slurm support for rrun --- cmake/thirdparty/get_pmix.cmake | 210 +++++++++++++++++ cpp/CMakeLists.txt | 21 ++ cpp/include/rapidsmpf/bootstrap/bootstrap.hpp | 21 +- .../rapidsmpf/bootstrap/slurm_backend.hpp | 135 +++++++++++ cpp/src/bootstrap/bootstrap.cpp | 104 +++++++- cpp/src/bootstrap/slurm_backend.cpp | 222 ++++++++++++++++++ 6 files changed, 710 insertions(+), 3 deletions(-) create mode 100644 cmake/thirdparty/get_pmix.cmake create mode 100644 cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp create mode 100644 cpp/src/bootstrap/slurm_backend.cpp diff --git a/cmake/thirdparty/get_pmix.cmake b/cmake/thirdparty/get_pmix.cmake new file mode 100644 index 000000000..aaaacc272 --- /dev/null +++ b/cmake/thirdparty/get_pmix.cmake @@ -0,0 +1,210 @@ +# ============================================================================= +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# ============================================================================= + +#[=======================================================================[.rst: +get_pmix +-------- + +Find the PMIx (Process Management Interface - Exascale) library. + +This module finds the PMIx library, which is typically provided by Slurm +or OpenPMIx installations. PMIx enables scalable process coordination +without requiring a shared filesystem. + +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported targets, if found: + +``PMIx::PMIx`` + The PMIx library + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``PMIx_FOUND`` + True if the system has the PMIx library. +``PMIx_VERSION`` + The version of the PMIx library which was found. 
+``PMIx_INCLUDE_DIRS`` + Include directories needed to use PMIx. +``PMIx_LIBRARIES`` + Libraries needed to link to PMIx. + +Hints +^^^^^ + +The following variables can be set to help find PMIx: + +``PMIx_ROOT`` + Root directory of PMIx installation. +``PMIX_ROOT`` + Alternative root directory variable. +``SLURM_ROOT`` + Slurm installation directory (PMIx may be bundled with Slurm). + +#]=======================================================================] + +# Extract PMIx version from header file. Sets PMIx_VERSION in parent scope if version can be +# determined. +function(_pmix_extract_version include_dir) + if(NOT EXISTS "${include_dir}/pmix_version.h") + return() + endif() + + file(STRINGS "${include_dir}/pmix_version.h" _pmix_version_lines + REGEX "#define[ \t]+PMIX_(MAJOR|MINOR|RELEASE)_VERSION" + ) + + foreach(_line ${_pmix_version_lines}) + if(_line MATCHES "#define[ \t]+PMIX_MAJOR_VERSION[ \t]+([0-9]+)") + set(_pmix_major "${CMAKE_MATCH_1}") + elseif(_line MATCHES "#define[ \t]+PMIX_MINOR_VERSION[ \t]+([0-9]+)") + set(_pmix_minor "${CMAKE_MATCH_1}") + elseif(_line MATCHES "#define[ \t]+PMIX_RELEASE_VERSION[ \t]+([0-9]+)") + set(_pmix_release "${CMAKE_MATCH_1}") + endif() + endforeach() + + if(DEFINED _pmix_major + AND DEFINED _pmix_minor + AND DEFINED _pmix_release + ) + set(PMIx_VERSION + "${_pmix_major}.${_pmix_minor}.${_pmix_release}" + PARENT_SCOPE + ) + elseif(DEFINED _pmix_major AND DEFINED _pmix_minor) + set(PMIx_VERSION + "${_pmix_major}.${_pmix_minor}" + PARENT_SCOPE + ) + endif() +endfunction() + +# Create the PMIx::PMIx imported target and find optional dependencies. +function(_pmix_create_target library include_dir) + if(TARGET PMIx::PMIx) + return() + endif() + + add_library(PMIx::PMIx UNKNOWN IMPORTED) + set_target_properties( + PMIx::PMIx PROPERTIES IMPORTED_LOCATION "${library}" INTERFACE_INCLUDE_DIRECTORIES + "${include_dir}" + ) + + # PMIx may have dependencies on libevent or hwloc. Try to find and link them if available. 
+ find_library(EVENT_CORE_LIBRARY event_core) + find_library(EVENT_PTHREADS_LIBRARY event_pthreads) + find_library(HWLOC_LIBRARY hwloc) + + set(_pmix_extra_libs "") + foreach(_lib EVENT_CORE_LIBRARY EVENT_PTHREADS_LIBRARY HWLOC_LIBRARY) + if(${_lib}) + list(APPEND _pmix_extra_libs "${${_lib}}") + endif() + endforeach() + + if(_pmix_extra_libs) + set_property( + TARGET PMIx::PMIx + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES "${_pmix_extra_libs}" + ) + endif() + + mark_as_advanced( + PMIx_INCLUDE_DIR PMIx_LIBRARY EVENT_CORE_LIBRARY EVENT_PTHREADS_LIBRARY HWLOC_LIBRARY + ) +endfunction() + +# Find and configure the PMIx library. Sets PMIx_FOUND, PMIx_VERSION, PMIx_INCLUDE_DIRS, +# PMIx_LIBRARIES in parent scope. Creates PMIx::PMIx imported target if found. +function(find_and_configure_pmix) + # Return early if already found + if(TARGET PMIx::PMIx) + set(PMIx_FOUND + TRUE + PARENT_SCOPE + ) + return() + endif() + + # First try pkg-config (most reliable method) + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + pkg_check_modules(PC_PMIx QUIET pmix) + endif() + + # Find include directory + find_path( + PMIx_INCLUDE_DIR + NAMES pmix.h + HINTS ${PC_PMIx_INCLUDEDIR} ${PC_PMIx_INCLUDE_DIRS} ${PMIx_ROOT}/include $ENV{PMIx_ROOT}/include + $ENV{PMIX_ROOT}/include ${SLURM_ROOT}/include $ENV{SLURM_ROOT}/include + PATHS /usr/include /usr/local/include /opt/pmix/include /usr/include/slurm + /usr/local/include/slurm + ) + + # Find library + find_library( + PMIx_LIBRARY + NAMES pmix + HINTS ${PC_PMIx_LIBDIR} + ${PC_PMIx_LIBRARY_DIRS} + ${PMIx_ROOT}/lib + ${PMIx_ROOT}/lib64 + $ENV{PMIx_ROOT}/lib + $ENV{PMIx_ROOT}/lib64 + $ENV{PMIX_ROOT}/lib + $ENV{PMIX_ROOT}/lib64 + ${SLURM_ROOT}/lib + ${SLURM_ROOT}/lib64 + $ENV{SLURM_ROOT}/lib + $ENV{SLURM_ROOT}/lib64 + PATHS /usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64 /opt/pmix/lib /opt/pmix/lib64 + ) + + # Get version from header if found + if(PMIx_INCLUDE_DIR) + _pmix_extract_version("${PMIx_INCLUDE_DIR}") + endif() + + 
include(FindPackageHandleStandardArgs) + find_package_handle_standard_args( + PMIx + REQUIRED_VARS PMIx_LIBRARY PMIx_INCLUDE_DIR + VERSION_VAR PMIx_VERSION + ) + + if(PMIx_FOUND) + _pmix_create_target("${PMIx_LIBRARY}" "${PMIx_INCLUDE_DIR}") + endif() + + # Export results to parent scope + set(PMIx_FOUND + ${PMIx_FOUND} + PARENT_SCOPE + ) + if(DEFINED PMIx_VERSION) + set(PMIx_VERSION + ${PMIx_VERSION} + PARENT_SCOPE + ) + endif() + set(PMIx_INCLUDE_DIRS + ${PMIx_INCLUDE_DIR} + PARENT_SCOPE + ) + set(PMIx_LIBRARIES + ${PMIx_LIBRARY} + PARENT_SCOPE + ) +endfunction() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 115295a18..6240cf7f9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -46,6 +46,7 @@ option(BUILD_UCXX_SUPPORT "Build RapidsMPF with UCXX support" ON) option(BUILD_STREAMING_SUPPORT "Build RapidsMPF with streaming support" ON) option(BUILD_CUPTI_SUPPORT "Build RapidsMPF with CUPTI support" OFF) option(BUILD_NUMA_SUPPORT "Build RapidsMPF with NUMA support" ON) +option(BUILD_SLURM_SUPPORT "Build RapidsMPF with Slurm/PMIx bootstrap support" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build benchmarks" ON) option(BUILD_EXAMPLES "Configure CMake to build examples" ON) @@ -62,6 +63,7 @@ message(STATUS " BUILD_UCXX_SUPPORT : ${BUILD_UCXX_SUPPORT}") message(STATUS " BUILD_STREAMING_SUPPORT : ${BUILD_STREAMING_SUPPORT}") message(STATUS " BUILD_CUPTI_SUPPORT : ${BUILD_CUPTI_SUPPORT}") message(STATUS " BUILD_NUMA_SUPPORT : ${BUILD_NUMA_SUPPORT}") +message(STATUS " BUILD_SLURM_SUPPORT : ${BUILD_SLURM_SUPPORT}") message(STATUS " BUILD_TESTS : ${BUILD_TESTS}") message(STATUS " BUILD_BENCHMARKS : ${BUILD_BENCHMARKS}") message(STATUS " BUILD_EXAMPLES : ${BUILD_EXAMPLES}") @@ -77,6 +79,7 @@ set(RAPIDSMPF_HAVE_UCXX ${BUILD_UCXX_SUPPORT}) set(RAPIDSMPF_HAVE_STREAMING ${BUILD_STREAMING_SUPPORT}) set(RAPIDSMPF_HAVE_CUPTI ${BUILD_CUPTI_SUPPORT}) set(RAPIDSMPF_HAVE_NUMA ${BUILD_NUMA_SUPPORT}) 
+set(RAPIDSMPF_HAVE_SLURM OFF) # Will be set to ON if PMIx is found set(RAPIDSMPF_BUILD_TESTS ${BUILD_TESTS}) set(RAPIDSMPF_BUILD_BENCHMARKS ${BUILD_BENCHMARKS}) set(RAPIDSMPF_BUILD_EXAMPLES ${BUILD_EXAMPLES}) @@ -155,6 +158,18 @@ endif() if(RAPIDSMPF_HAVE_STREAMING) include(../cmake/thirdparty/get_libcoro.cmake) endif() +if(BUILD_SLURM_SUPPORT) + include(../cmake/thirdparty/get_pmix.cmake) + find_and_configure_pmix() + if(PMIx_FOUND) + set(RAPIDSMPF_HAVE_SLURM ON) + message(STATUS "PMIx found (version ${PMIx_VERSION}) - Slurm bootstrap backend enabled") + else() + message(WARNING "PMIx not found - Slurm bootstrap backend will be disabled. " + "Set PMIx_ROOT or PMIX_ROOT to the PMIx installation directory." + ) + endif() +endif() # ################################################################################################## # * library targets -------------------------------------------------------------------------------- @@ -230,6 +245,9 @@ endif() if(RAPIDSMPF_HAVE_CUPTI) target_sources(rapidsmpf PRIVATE src/cupti.cpp) endif() +if(RAPIDSMPF_HAVE_SLURM) + target_sources(rapidsmpf PRIVATE src/bootstrap/slurm_backend.cpp) +endif() set_target_properties( rapidsmpf @@ -291,6 +309,7 @@ target_link_libraries( PRIVATE $<$:numa> $ $<$:CUDA::cupti> + $ $ maybe_asan $ @@ -306,6 +325,7 @@ target_compile_definitions( $<$:RAPIDSMPF_HAVE_STREAMING> $<$:RAPIDSMPF_HAVE_CUPTI> $<$:RAPIDSMPF_HAVE_NUMA> + $<$:RAPIDSMPF_HAVE_SLURM> $<$:RAPIDSMPF_VERBOSE_INFO> ) @@ -430,6 +450,7 @@ string( "set(RAPIDSMPF_HAVE_UCXX [=[${RAPIDSMPF_HAVE_UCXX}]=])" "set(RAPIDSMPF_HAVE_STREAMING [=[${RAPIDSMPF_HAVE_STREAMING}]=])" "set(RAPIDSMPF_HAVE_CUPTI [=[${RAPIDSMPF_HAVE_CUPTI}]=])" + "set(RAPIDSMPF_HAVE_SLURM [=[${RAPIDSMPF_HAVE_SLURM}]=])" ) rapids_export( diff --git a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp index a8d38d390..da9884c8b 100644 --- a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp +++ 
b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -23,7 +23,8 @@ enum class Backend { * @brief Automatically detect the best backend based on environment. * * Detection order: - * 1. File-based (default fallback) + * 1. Slurm/PMIx (if PMIX_NAMESPACE or SLURM environment detected) + * 2. File-based (default fallback) */ AUTO, @@ -35,6 +36,22 @@ enum class Backend { * RAPIDSMPF_NRANKS, RAPIDSMPF_COORD_DIR environment variables. */ FILE, + + /** + * @brief Slurm-based coordination using PMIx. + * + * Uses PMIx (Process Management Interface for Exascale) for scalable process + * coordination without requiring a shared filesystem. Designed for Slurm clusters + * and supports multi-node deployments. + * + * Run with: `srun --mpi=pmix -n ./program` + * + * Environment variables (automatically set by Slurm): + * - PMIX_NAMESPACE: PMIx namespace identifier + * - SLURM_PROCID: Process rank + * - SLURM_NPROCS/SLURM_NTASKS: Total number of processes + */ + SLURM, }; /** diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp new file mode 100644 index 000000000..e03760bb0 --- /dev/null +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -0,0 +1,135 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#ifdef RAPIDSMPF_HAVE_SLURM + +#include +#include +#include + +#include + +#include + +namespace rapidsmpf::bootstrap::detail { + +/** + * @brief Slurm-based coordination backend using PMIx. 
+ * + * This class implements coordination using PMIx (Process Management Interface + * for Exascale), which provides scalable process coordination without requiring + * a shared filesystem. It is designed for Slurm clusters and supports multi-node + * deployments. + * + * PMIx operations: + * - PMIx_Put/PMIx_Get: Key-value store operations + * - PMIx_Commit: Make local puts visible + * - PMIx_Fence: Global synchronization and data exchange + * + * Usage: + * ```bash + * # Single node + * srun --mpi=pmix -n 4 ./my_program + * + * # Multi-node + * srun --mpi=pmix -N 2 -n 8 ./my_program + * ``` + */ +class SlurmBackend { + public: + /** + * @brief Construct a Slurm backend using PMIx. + * + * Initializes PMIx and retrieves process information from the runtime. + * + * @param ctx Bootstrap context containing rank information. + * @throws std::runtime_error if PMIx initialization fails. + */ + explicit SlurmBackend(Context ctx); + + /** + * @brief Destructor - finalizes PMIx. + */ + ~SlurmBackend(); + + // Non-copyable, non-movable (PMIx state is process-global) + SlurmBackend(SlurmBackend const&) = delete; + SlurmBackend& operator=(SlurmBackend const&) = delete; + SlurmBackend(SlurmBackend&&) = delete; + SlurmBackend& operator=(SlurmBackend&&) = delete; + + /** + * @brief Store a key-value pair in the PMIx KVS. + * + * The key-value pair is committed immediately and made visible to other + * ranks via a fence operation. + * + * @param key Key name. + * @param value Value to store. + * @throws std::runtime_error if PMIx operation fails. + */ + void put(std::string const& key, std::string const& value); + + /** + * @brief Retrieve a value from the PMIx KVS. + * + * Blocks until the key is available or timeout occurs. Uses polling + * with exponential backoff. + * + * @param key Key name. + * @param timeout Timeout duration. + * @return Value associated with key. + * @throws std::runtime_error if key not found within timeout. 
+ */ + std::string get(std::string const& key, Duration timeout); + + /** + * @brief Perform a barrier synchronization using PMIx_Fence. + * + * All ranks must call this before any rank proceeds. The fence also + * ensures all committed key-value pairs are visible to all ranks. + * + * @throws std::runtime_error if PMIx_Fence fails. + */ + void barrier(); + + /** + * @brief Broadcast data from root to all ranks. + * + * Root rank publishes data via put(), then all ranks synchronize + * and non-root ranks retrieve the data via get(). + * + * @param data Data buffer (input on root, output on others). + * @param size Size in bytes. + * @param root Root rank. + * @throws std::runtime_error if broadcast fails or size mismatch occurs. + */ + void broadcast(void* data, std::size_t size, Rank root); + + private: + Context ctx_; + std::size_t barrier_count_{0}; + bool pmix_initialized_{false}; + pmix_proc_t proc_{}; ///< PMIx process identifier + std::array nspace_{}; ///< PMIx namespace (job identifier) + + /** + * @brief Commit local key-value pairs to make them visible. + * + * Must be called after put() operations. The subsequent fence() + * or barrier() will make the data globally visible. + * + * @throws std::runtime_error if PMIx_Commit fails. + */ + void commit(); +}; + +} // namespace rapidsmpf::bootstrap::detail + +#endif // RAPIDSMPF_HAVE_SLURM diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 2f8c12996..05d19540d 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -9,6 +9,11 @@ #include #include +#include + +#ifdef RAPIDSMPF_HAVE_SLURM +#include +#endif // NOTE: Do not use RAPIDSMPF_EXPECTS or RAPIDSMPF_FAIL in this file. // Using these macros introduces a CUDA dependency via rapidsmpf/error.hpp. @@ -52,11 +57,22 @@ std::optional getenv_int(std::string_view name) { * @brief Detect backend from environment variables. 
*/ Backend detect_backend() { - // Check for file-based coordination + // Check for file-based coordination first (explicit configuration takes priority) if (getenv_optional("RAPIDSMPF_COORD_DIR")) { return Backend::FILE; } +#ifdef RAPIDSMPF_HAVE_SLURM + // Check for PMIx/Slurm environment + // PMIX_NAMESPACE is set by PMIx-enabled launchers (srun --mpi=pmix) + // SLURM_JOB_ID + SLURM_STEP_ID indicate a Slurm job step + if (getenv_optional("PMIX_NAMESPACE") + || (getenv_optional("SLURM_JOB_ID") && getenv_optional("SLURM_STEP_ID"))) + { + return Backend::SLURM; + } +#endif + // Default to file-based return Backend::FILE; } @@ -108,6 +124,61 @@ Context init(Backend backend) { } break; } + case Backend::SLURM: + { +#ifdef RAPIDSMPF_HAVE_SLURM + // For SLURM backend, we can get rank/nranks from multiple sources: + // 1. Explicit RAPIDSMPF_* variables (override) + // 2. PMIx environment variables (set by pmix-enabled srun) + // 3. Slurm environment variables (fallback) + auto rank_opt = getenv_int("RAPIDSMPF_RANK"); + if (!rank_opt) { + rank_opt = getenv_int("PMIX_RANK"); + } + if (!rank_opt) { + rank_opt = getenv_int("SLURM_PROCID"); + } + + auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS"); + if (!nranks_opt) { + nranks_opt = getenv_int("SLURM_NPROCS"); + } + if (!nranks_opt) { + nranks_opt = getenv_int("SLURM_NTASKS"); + } + + if (!rank_opt.has_value()) { + throw std::runtime_error( + "Could not determine rank for SLURM backend. " + "Ensure you're running with 'srun --mpi=pmix' or set RAPIDSMPF_RANK." + ); + } + + if (!nranks_opt.has_value()) { + throw std::runtime_error( + "Could not determine nranks for SLURM backend. " + "Ensure you're running with 'srun --mpi=pmix' or set " + "RAPIDSMPF_NRANKS." 
+ ); + } + + ctx.rank = static_cast(*rank_opt); + ctx.nranks = static_cast(*nranks_opt); + + if (!(ctx.rank >= 0 && ctx.rank < ctx.nranks)) { + throw std::runtime_error( + "Invalid rank: " + std::to_string(ctx.rank) + " must be in range [0, " + + std::to_string(ctx.nranks) + ")" + ); + } + break; +#else + throw std::runtime_error( + "SLURM backend requested but rapidsmpf was not built with PMIx support. " + "Rebuild with RAPIDSMPF_ENABLE_SLURM=ON and ensure PMIx is available." + ); +#endif + } case Backend::AUTO: { // Should have been resolved above @@ -125,6 +196,14 @@ void broadcast(Context const& ctx, void* data, std::size_t size, Rank root) { backend.broadcast(data, size, root); break; } +#ifdef RAPIDSMPF_HAVE_SLURM + case Backend::SLURM: + { + detail::SlurmBackend backend{ctx}; + backend.broadcast(data, size, root); + break; + } +#endif default: throw std::runtime_error("broadcast not implemented for this backend"); } @@ -138,6 +217,14 @@ void barrier(Context const& ctx) { backend.barrier(); break; } +#ifdef RAPIDSMPF_HAVE_SLURM + case Backend::SLURM: + { + detail::SlurmBackend backend{ctx}; + backend.barrier(); + break; + } +#endif default: throw std::runtime_error("barrier not implemented for this backend"); } @@ -151,6 +238,14 @@ void put(Context const& ctx, std::string const& key, std::string const& value) { backend.put(key, value); break; } +#ifdef RAPIDSMPF_HAVE_SLURM + case Backend::SLURM: + { + detail::SlurmBackend backend{ctx}; + backend.put(key, value); + break; + } +#endif default: throw std::runtime_error("put not implemented for this backend"); } @@ -163,6 +258,13 @@ std::string get(Context const& ctx, std::string const& key, Duration timeout) { detail::FileBackend backend{ctx}; return backend.get(key, timeout); } +#ifdef RAPIDSMPF_HAVE_SLURM + case Backend::SLURM: + { + detail::SlurmBackend backend{ctx}; + return backend.get(key, timeout); + } +#endif default: throw std::runtime_error("get not implemented for this backend"); } diff --git 
a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp new file mode 100644 index 000000000..e8c355c84 --- /dev/null +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -0,0 +1,222 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#ifdef RAPIDSMPF_HAVE_SLURM + +#include +#include +#include +#include +#include +#include + +#include + +// NOTE: Do not use RAPIDSMPF_EXPECTS or RAPIDSMPF_FAIL in this file. +// Using these macros introduces a CUDA dependency via rapidsmpf/error.hpp. +// Prefer throwing standard exceptions instead. + +namespace rapidsmpf::bootstrap::detail { + +namespace { + +/** + * @brief Convert PMIx status to string for error messages. + * + * @param status PMIx status code to convert. + * @return Human-readable string describing the status. + */ +std::string pmix_error_string(pmix_status_t status) { + return std::string{PMIx_Error_string(status)}; +} + +/** + * @brief Check PMIx status and throw on error. + * + * @param status PMIx status code to check. + * @param operation Description of the operation (used in error message). + * @throws std::runtime_error if status is not PMIX_SUCCESS. + */ +void check_pmix_status(pmix_status_t status, std::string const& operation) { + if (status != PMIX_SUCCESS) { + throw std::runtime_error(operation + " failed: " + pmix_error_string(status)); + } +} + +} // namespace + +SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { + pmix_status_t rc; + pmix_proc_t proc; + + rc = PMIx_Init(&proc, nullptr, 0); + if (rc != PMIX_SUCCESS) { + throw std::runtime_error( + "PMIx_Init failed: " + pmix_error_string(rc) + + ". 
Ensure you're running under Slurm with --mpi=pmix" + ); + } + pmix_initialized_ = true; + + proc_ = proc; + // Copy full nspace buffer (both are PMIX_MAX_NSLEN + 1 in size) + static_assert(sizeof(proc.nspace) == PMIX_MAX_NSLEN + 1); + std::memcpy(nspace_.data(), proc.nspace, nspace_.size()); + + // Verify rank matches what we expect (if context has a valid rank) + // Note: For SLURM backend, ctx_.rank may be set from environment variables + // before PMIx_Init, so we verify they match + if (ctx_.rank >= 0 && std::cmp_not_equal(proc.rank, ctx_.rank)) { + throw std::runtime_error( + "PMIx rank (" + std::to_string(proc.rank) + ") doesn't match context rank (" + + std::to_string(ctx_.rank) + ")" + ); + } + + // Update context rank from PMIx if not already set + if (ctx_.rank < 0) { + ctx_.rank = static_cast(proc.rank); + } +} + +SlurmBackend::~SlurmBackend() { + if (pmix_initialized_) { + pmix_status_t rc = PMIx_Finalize(nullptr, 0); + if (rc != PMIX_SUCCESS) { + // Log but don't throw from destructor + std::cerr << "Warning: PMIx_Finalize failed: " << pmix_error_string(rc) + << std::endl; + } + } +} + +void SlurmBackend::put(std::string const& key, std::string const& value) { + pmix_value_t pmix_value; + + PMIX_VALUE_CONSTRUCT(&pmix_value); + pmix_value.type = PMIX_BYTE_OBJECT; + pmix_value.data.bo.bytes = const_cast(value.data()); + pmix_value.data.bo.size = value.size(); + + pmix_status_t rc = PMIx_Put(PMIX_GLOBAL, key.c_str(), &pmix_value); + if (rc != PMIX_SUCCESS) { + throw std::runtime_error( + "PMIx_Put for key '" + key + "' failed: " + pmix_error_string(rc) + ); + } + + // Note: We don't call PMIX_VALUE_DESTRUCT here because we don't own the + // byte data (it points to value.data()). PMIX_VALUE_DESTRUCT would try + // to free that memory. 
+ + // Commit to make the data available for subsequent fence + commit(); +} + +void SlurmBackend::commit() { + pmix_status_t rc = PMIx_Commit(); + check_pmix_status(rc, "PMIx_Commit"); +} + +std::string SlurmBackend::get(std::string const& key, Duration timeout) { + auto start = std::chrono::steady_clock::now(); + auto poll_interval = std::chrono::milliseconds{10}; + + // Create proc to get from (wildcard to search all ranks in namespace) + pmix_proc_t proc; + PMIX_PROC_CONSTRUCT(&proc); + std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); + proc.rank = PMIX_RANK_WILDCARD; + + while (true) { + pmix_value_t* val = nullptr; + pmix_status_t rc = PMIx_Get(&proc, key.c_str(), nullptr, 0, &val); + + if (rc == PMIX_SUCCESS && val != nullptr) { + std::string result; + + if (val->type == PMIX_BYTE_OBJECT) { + result = std::string{ + static_cast(val->data.bo.bytes), val->data.bo.size + }; + } else if (val->type == PMIX_STRING) { + result = std::string{val->data.string}; + } else { + PMIX_VALUE_RELEASE(val); + throw std::runtime_error( + "Unexpected PMIx value type for key '" + key + + "': " + std::to_string(static_cast(val->type)) + ); + } + + PMIX_VALUE_RELEASE(val); + return result; + } + + // Check timeout + auto elapsed = std::chrono::steady_clock::now() - start; + if (elapsed >= timeout) { + throw std::runtime_error( + "Key '" + key + "' not available within " + + std::to_string( + std::chrono::duration_cast(timeout).count() + ) + + "s timeout" + ); + } + + // Sleep before retry with exponential backoff + std::this_thread::sleep_for(poll_interval); + if (poll_interval < std::chrono::milliseconds{100}) { + poll_interval = std::min(poll_interval * 2, std::chrono::milliseconds{100}); + } + } +} + +void SlurmBackend::barrier() { + // Create proc array for all ranks (wildcard) in our namespace + pmix_proc_t proc; + PMIX_PROC_CONSTRUCT(&proc); + std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); + proc.rank = PMIX_RANK_WILDCARD; + + // PMIx_Fence performs 
barrier and exchanges committed data + pmix_status_t rc = PMIx_Fence(&proc, 1, nullptr, 0); + check_pmix_status(rc, "PMIx_Fence (barrier)"); +} + +void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { + // Use unique key for each broadcast to avoid collisions + std::string bcast_key = + "bcast_" + std::to_string(root) + "_" + std::to_string(barrier_count_++); + + if (ctx_.rank == root) { + // Root publishes data + std::string bcast_data{static_cast(data), size}; + put(bcast_key, bcast_data); + } + + barrier(); + + if (ctx_.rank != root) { + // Non-root ranks retrieve data + std::string bcast_data = get(bcast_key, std::chrono::seconds{30}); + if (bcast_data.size() != size) { + throw std::runtime_error( + "Broadcast size mismatch: expected " + std::to_string(size) + ", got " + + std::to_string(bcast_data.size()) + ); + } + std::memcpy(data, bcast_data.data(), size); + } + + barrier(); +} + +} // namespace rapidsmpf::bootstrap::detail + +#endif // RAPIDSMPF_HAVE_SLURM From 9642eae587f746c1ccba609c9cf8e1f89e1b39fe Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sun, 11 Jan 2026 13:17:16 -0800 Subject: [PATCH 02/57] Add libpmix-devel dependency --- conda/environments/all_cuda-129_arch-aarch64.yaml | 1 + conda/environments/all_cuda-129_arch-x86_64.yaml | 1 + conda/environments/all_cuda-131_arch-aarch64.yaml | 1 + conda/environments/all_cuda-131_arch-x86_64.yaml | 1 + conda/recipes/librapidsmpf/recipe.yaml | 5 ++++- dependencies.yaml | 1 + 6 files changed, 9 insertions(+), 1 deletion(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0350fdca7..84ffc9e08 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -31,6 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma +- libpmix-devel >=5.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git 
a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 60109a8ea..90bbb5811 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -31,6 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma +- libpmix-devel >=5.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 3a25f9737..038a5b2ba 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -31,6 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma +- libpmix-devel >=5.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index e0584e4ee..e755513f6 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -31,6 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma +- libpmix-devel >=5.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/recipes/librapidsmpf/recipe.yaml b/conda/recipes/librapidsmpf/recipe.yaml index d4a48ac68..2c2edc7fd 100644 --- a/conda/recipes/librapidsmpf/recipe.yaml +++ b/conda/recipes/librapidsmpf/recipe.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 schema_version: 1 @@ -79,6 +79,7 @@ cache: - cuda-nvml-dev - libcudf =${{ minor_version }} - libnuma + - libpmix-devel >=5.0 - librmm =${{ minor_version }} - openmpi >=5.0 # See - ucxx ${{ ucxx_version }} @@ -115,6 +116,7 @@ outputs: - cuda-cupti-dev - cuda-nvml-dev - libcudf =${{ minor_version }} + - libpmix >=5.0 - openmpi >=5.0 - ucxx ${{ ucxx_version }} run: @@ -123,6 +125,7 @@ outputs: - cuda-cupti - librmm =${{ minor_version }} - libcudf =${{ minor_version }} + - libpmix >=5.0 - openmpi >=5.0 # See - ucxx ${{ ucxx_version }} ignore_run_exports: diff --git a/dependencies.yaml b/dependencies.yaml index 77f847cc1..94c045a1f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -176,6 +176,7 @@ dependencies: - cuda-nvcc - cxx-compiler - libnuma + - libpmix-devel >=5.0 - openmpi >=5.0 # See specific: - output_types: conda From f80ffcf57d37d7a131e70b6440d82a650b504e14 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sun, 11 Jan 2026 13:46:50 -0800 Subject: [PATCH 03/57] Add build-pmix dependency matrix --- dependencies.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dependencies.yaml b/dependencies.yaml index 94c045a1f..1eded0f0e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,6 +13,7 @@ files: - build-cpp - build-python - build-mpi + - build-pmix - checks - clang_tidy - cuda @@ -218,6 +219,11 @@ dependencies: packages: - openmpi >=5.0 # See - mpi4py + build-pmix: + common: + - output_types: [conda, pyproject, requirements] + packages: + - libpmix-devel >=5.0 build-python: common: - output_types: [conda, pyproject, requirements] From d39476c8f428516ea8d174242ac93472c5c52267 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sun, 11 Jan 2026 13:56:53 -0800 Subject: [PATCH 04/57] Fix libpmix-devel conda build dependency --- conda/recipes/librapidsmpf/recipe.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/librapidsmpf/recipe.yaml 
b/conda/recipes/librapidsmpf/recipe.yaml index 2c2edc7fd..ae026d881 100644 --- a/conda/recipes/librapidsmpf/recipe.yaml +++ b/conda/recipes/librapidsmpf/recipe.yaml @@ -116,7 +116,7 @@ outputs: - cuda-cupti-dev - cuda-nvml-dev - libcudf =${{ minor_version }} - - libpmix >=5.0 + - libpmix-devel >=5.0 - openmpi >=5.0 - ucxx ${{ ucxx_version }} run: From 90c35651bc0a01d7a90314cb5ffac2561720687d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sun, 11 Jan 2026 14:04:54 -0800 Subject: [PATCH 05/57] Pin libpmix <6.0 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-131_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-131_arch-x86_64.yaml | 2 +- conda/recipes/librapidsmpf/recipe.yaml | 6 +++--- dependencies.yaml | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 84ffc9e08..c681a5aac 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -31,7 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma -- libpmix-devel >=5.0 +- libpmix-devel >=5.0,<6.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 90bbb5811..aabac457f 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -31,7 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma -- libpmix-devel >=5.0 +- libpmix-devel >=5.0,<6.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 038a5b2ba..c0cb4bc2d 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ 
b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -31,7 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma -- libpmix-devel >=5.0 +- libpmix-devel >=5.0,<6.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index e755513f6..d2a9c00f0 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -31,7 +31,7 @@ dependencies: - ipython - libcudf==26.2.*,>=0.0.0a0 - libnuma -- libpmix-devel >=5.0 +- libpmix-devel >=5.0,<6.0 - librmm==26.2.*,>=0.0.0a0 - libucxx==0.48.*,>=0.0.0a0 - make diff --git a/conda/recipes/librapidsmpf/recipe.yaml b/conda/recipes/librapidsmpf/recipe.yaml index ae026d881..b6de7059d 100644 --- a/conda/recipes/librapidsmpf/recipe.yaml +++ b/conda/recipes/librapidsmpf/recipe.yaml @@ -79,7 +79,7 @@ cache: - cuda-nvml-dev - libcudf =${{ minor_version }} - libnuma - - libpmix-devel >=5.0 + - libpmix-devel >=5.0,<6.0 - librmm =${{ minor_version }} - openmpi >=5.0 # See - ucxx ${{ ucxx_version }} @@ -116,7 +116,7 @@ outputs: - cuda-cupti-dev - cuda-nvml-dev - libcudf =${{ minor_version }} - - libpmix-devel >=5.0 + - libpmix-devel >=5.0,<6.0 - openmpi >=5.0 - ucxx ${{ ucxx_version }} run: @@ -125,7 +125,7 @@ outputs: - cuda-cupti - librmm =${{ minor_version }} - libcudf =${{ minor_version }} - - libpmix >=5.0 + - libpmix >=5.0,<6.0 - openmpi >=5.0 # See - ucxx ${{ ucxx_version }} ignore_run_exports: diff --git a/dependencies.yaml b/dependencies.yaml index 1eded0f0e..0b705f954 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -177,7 +177,7 @@ dependencies: - cuda-nvcc - cxx-compiler - libnuma - - libpmix-devel >=5.0 + - libpmix-devel >=5.0,<6.0 - openmpi >=5.0 # See specific: - output_types: conda @@ -223,7 +223,7 @@ dependencies: common: - output_types: [conda, pyproject, requirements] packages: - - libpmix-devel >=5.0 + - libpmix-devel 
>=5.0,<6.0 build-python: common: - output_types: [conda, pyproject, requirements] From 92686f4c4e848cd134e8f42f5081825aaac33093 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 13 Jan 2026 05:36:22 -0800 Subject: [PATCH 06/57] Support `srun` (with debug) --- cpp/benchmarks/bench_comm.cpp | 8 +- cpp/benchmarks/bench_shuffle.cpp | 10 +- .../streaming/bench_streaming_shuffle.cpp | 8 +- cpp/benchmarks/streaming/ndsh/utils.cpp | 7 +- cpp/include/rapidsmpf/bootstrap/utils.hpp | 43 ++++-- cpp/src/bootstrap/slurm_backend.cpp | 133 ++++++++++++------ cpp/src/bootstrap/ucxx.cpp | 12 +- cpp/src/bootstrap/utils.cpp | 90 +++++++++--- 8 files changed, 226 insertions(+), 85 deletions(-) diff --git a/cpp/benchmarks/bench_comm.cpp b/cpp/benchmarks/bench_comm.cpp index 40db45966..dc4623871 100644 --- a/cpp/benchmarks/bench_comm.cpp +++ b/cpp/benchmarks/bench_comm.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -256,7 +256,7 @@ Duration run( } int main(int argc, char** argv) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); int provided = 0; if (!use_bootstrap) { @@ -279,14 +279,14 @@ int main(int argc, char** argv) { if (args.comm_type == "mpi") { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. " - << "Don't use with rrun or unset RAPIDSMPF_RANK." << std::endl; + << "Don't use with rrun/srun bootstrap mode." 
<< std::endl; return 1; } mpi::init(&argc, &argv); comm = std::make_shared(MPI_COMM_WORLD, options); } else if (args.comm_type == "ucxx") { if (use_bootstrap) { - // Launched with rrun - use bootstrap backend + // Launched with rrun or srun --mpi=pmix - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( rapidsmpf::bootstrap::Backend::AUTO, options ); diff --git a/cpp/benchmarks/bench_shuffle.cpp b/cpp/benchmarks/bench_shuffle.cpp index d1047f1ef..7eb23a8a6 100644 --- a/cpp/benchmarks/bench_shuffle.cpp +++ b/cpp/benchmarks/bench_shuffle.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -282,7 +282,7 @@ class ArgumentParser { }; void barrier(std::shared_ptr& comm) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); if (!use_bootstrap) { RAPIDSMPF_MPI(MPI_Barrier(MPI_COMM_WORLD)); } else { @@ -575,7 +575,7 @@ rapidsmpf::Duration run_hash_partition_with_datagen( } int main(int argc, char** argv) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); // Explicitly initialize MPI with thread support, as this is needed for both mpi // and ucxx communicators when not using bootstrap mode. @@ -599,7 +599,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. Don't use with " - "rrun or unset RAPIDSMPF_RANK." + "rrun/srun bootstrap mode." 
<< std::endl; return 1; } @@ -607,7 +607,7 @@ int main(int argc, char** argv) { comm = std::make_shared(MPI_COMM_WORLD, options); } else if (args.comm_type == "ucxx") { if (use_bootstrap) { - // Launched with rrun - use bootstrap backend + // Launched with rrun or srun --mpi=pmix - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( rapidsmpf::bootstrap::Backend::AUTO, options ); diff --git a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp index a9bedf6f2..6089a0d88 100644 --- a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp +++ b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -302,7 +302,7 @@ rapidsmpf::Duration run( } int main(int argc, char** argv) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); // Explicitly initialize MPI with thread support, as this is needed for both mpi // and ucxx communicators when not using bootstrap mode. @@ -325,7 +325,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. Don't use with " - "rrun or unset RAPIDSMPF_RANK." + "rrun/srun bootstrap mode." 
<< std::endl; return 1; } @@ -333,7 +333,7 @@ int main(int argc, char** argv) { comm = std::make_shared(MPI_COMM_WORLD, options); } else if (args.comm_type == "ucxx") { if (use_bootstrap) { - // Launched with rrun - use bootstrap backend + // Launched with rrun or srun --mpi=pmix - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( rapidsmpf::bootstrap::Backend::AUTO, options ); diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index c76671cfd..faca07522 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -162,7 +162,8 @@ std::shared_ptr create_context( switch (arguments.comm_type) { case CommType::MPI: RAPIDSMPF_EXPECTS( - !bootstrap::is_running_with_rrun(), "Can't use MPI communicator with rrun" + !bootstrap::is_running_with_bootstrap(), + "Can't use MPI communicator with rrun/srun bootstrap mode" ); mpi::init(nullptr, nullptr); @@ -172,7 +173,7 @@ std::shared_ptr create_context( comm = std::make_shared(options); break; case CommType::UCXX: - if (bootstrap::is_running_with_rrun()) { + if (bootstrap::is_running_with_bootstrap()) { comm = bootstrap::create_ucxx_comm(bootstrap::Backend::AUTO, options); } else { mpi::init(nullptr, nullptr); diff --git a/cpp/include/rapidsmpf/bootstrap/utils.hpp b/cpp/include/rapidsmpf/bootstrap/utils.hpp index ec4d2bde1..becb41610 100644 --- a/cpp/include/rapidsmpf/bootstrap/utils.hpp +++ b/cpp/include/rapidsmpf/bootstrap/utils.hpp @@ -55,24 +55,51 @@ int get_gpu_id(); bool is_running_with_rrun(); /** - * @brief Get the current `rrun` rank. + * @brief Check if the current process is running under Slurm with PMIx. * - * This helper retrieves the rank of the current process when running with `rrun`. 
- * The rank is fetched from the `RAPIDSMPF_RANK` environment variable. + * This helper detects Slurm environment by checking for PMIx namespace + * or Slurm job step environment variables. + * + * @return true if running under Slurm with PMIx, false otherwise. + */ +bool is_running_with_slurm(); + +/** + * @brief Check if the current process is running with any bootstrap launcher. + * + * This helper detects bootstrap mode by checking for either `rrun` or Slurm/PMIx + * environment. Use this function when you need to determine whether to use + * bootstrap-based initialization vs MPI-based initialization. + * + * @return true if running under any bootstrap mode (rrun or Slurm), false otherwise. + */ +bool is_running_with_bootstrap(); + +/** + * @brief Get the current bootstrap rank. + * + * This helper retrieves the rank of the current process when running with a + * bootstrap launcher (rrun or Slurm). Checks environment variables in order: + * 1. RAPIDSMPF_RANK (set by rrun) + * 2. PMIX_RANK (set by PMIx) + * 3. SLURM_PROCID (set by Slurm) * * @return Rank of the current process (>= 0) if found, -1 otherwise. */ Rank get_rank(); /** - * @brief Get the number of `rrun` ranks. + * @brief Get the number of bootstrap ranks. * - * This helper retrieves the number of ranks when running with `rrun`. - * The number of ranks is fetched from the `RAPIDSMPF_NRANKS` environment variable. + * This helper retrieves the number of ranks when running with a bootstrap + * launcher (rrun or Slurm). Checks environment variables in order: + * 1. RAPIDSMPF_NRANKS (set by rrun) + * 2. SLURM_NPROCS (set by Slurm) + * 3. SLURM_NTASKS (set by Slurm) * * @return Number of ranks. - * @throws std::runtime_error if not running with `rrun` or if `RAPIDSMPF_NRANKS` is not - * set or cannot be parsed. + * @throws std::runtime_error if not running with a bootstrap launcher or if + * the environment variable cannot be parsed. 
*/ Rank get_nranks(); diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index e8c355c84..db7c08a67 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,20 @@ namespace rapidsmpf::bootstrap::detail { namespace { +// PMIx initialization is process-global and must only happen once. +// Once initialized, PMIx stays active for the lifetime of the process. +// We track initialization state but do NOT finalize PMIx in the destructor, +// as multiple SlurmBackend instances may be created/destroyed during the +// bootstrap process. PMIx will be cleaned up when the process exits. +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::mutex g_pmix_mutex; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +bool g_pmix_initialized = false; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +pmix_proc_t g_pmix_proc{}; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::array g_pmix_nspace{}; + /** * @brief Convert PMIx status to string for error messages. * @@ -50,53 +65,64 @@ void check_pmix_status(pmix_status_t status, std::string const& operation) { } // namespace SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { - pmix_status_t rc; - pmix_proc_t proc; + std::lock_guard lock{g_pmix_mutex}; - rc = PMIx_Init(&proc, nullptr, 0); - if (rc != PMIX_SUCCESS) { - throw std::runtime_error( - "PMIx_Init failed: " + pmix_error_string(rc) - + ". Ensure you're running under Slurm with --mpi=pmix" - ); + if (!g_pmix_initialized) { + // First instance - initialize PMIx (will stay initialized for process lifetime) + pmix_proc_t proc; + pmix_status_t rc = PMIx_Init(&proc, nullptr, 0); + if (rc != PMIX_SUCCESS) { + throw std::runtime_error( + "PMIx_Init failed: " + pmix_error_string(rc) + + ". 
Ensure you're running under Slurm with --mpi=pmix" + ); + } + + g_pmix_proc = proc; + // Copy full nspace buffer (both are PMIX_MAX_NSLEN + 1 in size) + static_assert(sizeof(proc.nspace) == PMIX_MAX_NSLEN + 1); + std::memcpy(g_pmix_nspace.data(), proc.nspace, g_pmix_nspace.size()); + g_pmix_initialized = true; } + pmix_initialized_ = true; - proc_ = proc; - // Copy full nspace buffer (both are PMIX_MAX_NSLEN + 1 in size) - static_assert(sizeof(proc.nspace) == PMIX_MAX_NSLEN + 1); - std::memcpy(nspace_.data(), proc.nspace, nspace_.size()); + // Copy global state to instance members + proc_ = g_pmix_proc; + nspace_ = g_pmix_nspace; // Verify rank matches what we expect (if context has a valid rank) // Note: For SLURM backend, ctx_.rank may be set from environment variables // before PMIx_Init, so we verify they match - if (ctx_.rank >= 0 && std::cmp_not_equal(proc.rank, ctx_.rank)) { + if (ctx_.rank >= 0 && std::cmp_not_equal(g_pmix_proc.rank, ctx_.rank)) { throw std::runtime_error( - "PMIx rank (" + std::to_string(proc.rank) + ") doesn't match context rank (" - + std::to_string(ctx_.rank) + ")" + "PMIx rank (" + std::to_string(g_pmix_proc.rank) + + ") doesn't match context rank (" + std::to_string(ctx_.rank) + ")" ); } // Update context rank from PMIx if not already set if (ctx_.rank < 0) { - ctx_.rank = static_cast(proc.rank); + ctx_.rank = static_cast(g_pmix_proc.rank); } } SlurmBackend::~SlurmBackend() { - if (pmix_initialized_) { - pmix_status_t rc = PMIx_Finalize(nullptr, 0); - if (rc != PMIX_SUCCESS) { - // Log but don't throw from destructor - std::cerr << "Warning: PMIx_Finalize failed: " << pmix_error_string(rc) - << std::endl; - } - } + // Intentionally do NOT call PMIx_Finalize here. + // PMIx must stay initialized for the lifetime of the process because + // multiple SlurmBackend instances may be created and destroyed during + // bootstrap operations (put, barrier, get each create a new instance). 
+ // + // TODO: Check whether it's safe to let PMIx clean itself up when the + // process exits, and potentially come up with a better solution. Maybe + // refcounting? } void SlurmBackend::put(std::string const& key, std::string const& value) { - pmix_value_t pmix_value; + std::cerr << "[Rank " << ctx_.rank << "] PMIx_Put: key='" << key + << "', value_len=" << value.size() << std::endl; + pmix_value_t pmix_value; PMIX_VALUE_CONSTRUCT(&pmix_value); pmix_value.type = PMIX_BYTE_OBJECT; pmix_value.data.bo.bytes = const_cast(value.data()); @@ -109,12 +135,10 @@ void SlurmBackend::put(std::string const& key, std::string const& value) { ); } - // Note: We don't call PMIX_VALUE_DESTRUCT here because we don't own the - // byte data (it points to value.data()). PMIX_VALUE_DESTRUCT would try - // to free that memory. - - // Commit to make the data available for subsequent fence + // Commit to make the data available commit(); + + std::cerr << "[Rank " << ctx_.rank << "] PMIx_Put + Commit succeeded" << std::endl; } void SlurmBackend::commit() { @@ -124,18 +148,25 @@ void SlurmBackend::commit() { std::string SlurmBackend::get(std::string const& key, Duration timeout) { auto start = std::chrono::steady_clock::now(); - auto poll_interval = std::chrono::milliseconds{10}; + auto poll_interval = std::chrono::milliseconds{100}; + + std::cerr << "[Rank " << ctx_.rank << "] PMIx_Get: waiting for key='" << key << "'" + << std::endl; - // Create proc to get from (wildcard to search all ranks in namespace) + // Get from rank 0 specifically (since that's where the key is stored) + // Using PMIX_RANK_WILDCARD doesn't seem to work reliably pmix_proc_t proc; PMIX_PROC_CONSTRUCT(&proc); std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); - proc.rank = PMIX_RANK_WILDCARD; + proc.rank = 0; // Get from rank 0 specifically while (true) { pmix_value_t* val = nullptr; pmix_status_t rc = PMIx_Get(&proc, key.c_str(), nullptr, 0, &val); + std::cerr << "[Rank " << ctx_.rank + << "] PMIx_Get 
returned: " << pmix_error_string(rc) << std::endl; + if (rc == PMIX_SUCCESS && val != nullptr) { std::string result; @@ -154,6 +185,9 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { } PMIX_VALUE_RELEASE(val); + + std::cerr << "[Rank " << ctx_.rank << "] PMIx_Get succeeded: key='" << key + << "', value_len=" << result.size() << std::endl; return result; } @@ -165,15 +199,12 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { + std::to_string( std::chrono::duration_cast(timeout).count() ) - + "s timeout" + + "s timeout (last error: " + pmix_error_string(rc) + ")" ); } - // Sleep before retry with exponential backoff + // Sleep before retry std::this_thread::sleep_for(poll_interval); - if (poll_interval < std::chrono::milliseconds{100}) { - poll_interval = std::min(poll_interval * 2, std::chrono::milliseconds{100}); - } } } @@ -184,9 +215,29 @@ void SlurmBackend::barrier() { std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); proc.rank = PMIX_RANK_WILDCARD; - // PMIx_Fence performs barrier and exchanges committed data - pmix_status_t rc = PMIx_Fence(&proc, 1, nullptr, 0); - check_pmix_status(rc, "PMIx_Fence (barrier)"); + // Set up info to collect data during fence + pmix_info_t info; + bool collect = true; + PMIX_INFO_CONSTRUCT(&info); + PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); + + std::cerr << "[Rank " << ctx_.rank << "] PMIx_Fence: entering barrier" << std::endl; + + // PMIx_Fence performs synchronization barrier and data exchange + pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); + PMIX_INFO_DESTRUCT(&info); + + std::cerr << "[Rank " << ctx_.rank + << "] PMIx_Fence returned: " << pmix_error_string(rc) << std::endl; + + // Accept both SUCCESS and PARTIAL_SUCCESS for the fence + // PARTIAL_SUCCESS can occur in some PMIx implementations when not all + // processes have data to contribute, but the synchronization succeeded + if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { + 
throw std::runtime_error("PMIx_Fence (barrier) failed: " + pmix_error_string(rc)); + } + + std::cerr << "[Rank " << ctx_.rank << "] PMIx_Fence: exited barrier" << std::endl; } void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 39a07d68b..42ef069c1 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -41,7 +41,14 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op std::get>(listener_address.address) ->getString(); put(ctx, "ucxx_root_address", root_worker_address_str); - } else { + } + + // All ranks must barrier to make PMIx put() data visible. + // For file backend this is a no-op synchronization. + // For PMIx/Slurm backend this executes PMIx_Fence to exchange data. 
+ barrier(ctx); + + if (ctx.rank != 0) { // Worker ranks retrieve the root address and connect auto root_worker_address_str = get(ctx, "ucxx_root_address", std::chrono::seconds{30}); @@ -52,6 +59,7 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op ucxx::init(nullptr, ctx.nranks, root_worker_address, options); comm = std::make_shared(std::move(ucxx_initialized_rank), options); } + comm->barrier(); return comm; } diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index 259f1e610..842137dc6 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -94,9 +94,42 @@ bool is_running_with_rrun() { return std::getenv("RAPIDSMPF_RANK") != nullptr; } +bool is_running_with_slurm() { + // Check for PMIx namespace (set by PMIx-enabled launchers like srun --mpi=pmix) + if (std::getenv("PMIX_NAMESPACE") != nullptr) { + return true; + } + // Check for Slurm job step (SLURM_JOB_ID + SLURM_STEP_ID indicate a job step) + if (std::getenv("SLURM_JOB_ID") != nullptr && std::getenv("SLURM_STEP_ID") != nullptr) + { + return true; + } + return false; +} + +bool is_running_with_bootstrap() { + return is_running_with_rrun() || is_running_with_slurm(); +} + Rank get_rank() { - char* rank_env = std::getenv("RAPIDSMPF_RANK"); - if (rank_env) { + // Check rrun first (explicit configuration takes priority) + if (char* rank_env = std::getenv("RAPIDSMPF_RANK")) { + try { + return std::stoi(rank_env); + } catch (...) { + // Ignore parse errors, try next source + } + } + // Check PMIx rank + if (char* rank_env = std::getenv("PMIX_RANK")) { + try { + return std::stoi(rank_env); + } catch (...) { + // Ignore parse errors, try next source + } + } + // Check Slurm process ID + if (char* rank_env = std::getenv("SLURM_PROCID")) { try { return std::stoi(rank_env); } catch (...) 
{ @@ -107,29 +140,50 @@ Rank get_rank() { } Rank get_nranks() { - if (!is_running_with_rrun()) { + if (!is_running_with_bootstrap()) { throw std::runtime_error( - "get_nranks() can only be called when running with `rrun`. " - "Set RAPIDSMPF_RANK environment variable or use a launcher like 'rrun'." + "get_nranks() can only be called when running with a bootstrap launcher. " + "Use 'rrun' or 'srun --mpi=pmix' to launch the application." ); } - char const* nranks_str = std::getenv("RAPIDSMPF_NRANKS"); - if (nranks_str == nullptr) { - throw std::runtime_error( - "RAPIDSMPF_NRANKS environment variable not set. " - "Make sure to use a rrun launcher to call this function." - ); + // Check rrun first (explicit configuration takes priority) + if (char const* nranks_str = std::getenv("RAPIDSMPF_NRANKS")) { + try { + return std::stoi(nranks_str); + } catch (...) { + throw std::runtime_error( + "Failed to parse integer from RAPIDSMPF_NRANKS: " + + std::string(nranks_str) + ); + } } - try { - return std::stoi(nranks_str); - } catch (...) { - throw std::runtime_error( - "Failed to parse integer from RAPIDSMPF_NRANKS environment variable: " - + std::string(nranks_str) - ); + // Check Slurm environment variables + if (char const* nranks_str = std::getenv("SLURM_NPROCS")) { + try { + return std::stoi(nranks_str); + } catch (...) { + throw std::runtime_error( + "Failed to parse integer from SLURM_NPROCS: " + std::string(nranks_str) + ); + } + } + + if (char const* nranks_str = std::getenv("SLURM_NTASKS")) { + try { + return std::stoi(nranks_str); + } catch (...) { + throw std::runtime_error( + "Failed to parse integer from SLURM_NTASKS: " + std::string(nranks_str) + ); + } } + + throw std::runtime_error( + "Could not determine number of ranks. " + "Ensure RAPIDSMPF_NRANKS, SLURM_NPROCS, or SLURM_NTASKS is set." 
+ ); } std::vector parse_cpu_list(std::string const& cpulist) { From 02fd79d67aff8121bea8d253c95a6446cbb4df63 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 13 Jan 2026 14:03:11 -0800 Subject: [PATCH 07/57] Remove debugging --- cpp/src/bootstrap/slurm_backend.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index db7c08a67..d54c8810c 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -119,9 +118,6 @@ SlurmBackend::~SlurmBackend() { } void SlurmBackend::put(std::string const& key, std::string const& value) { - std::cerr << "[Rank " << ctx_.rank << "] PMIx_Put: key='" << key - << "', value_len=" << value.size() << std::endl; - pmix_value_t pmix_value; PMIX_VALUE_CONSTRUCT(&pmix_value); pmix_value.type = PMIX_BYTE_OBJECT; @@ -137,8 +133,6 @@ void SlurmBackend::put(std::string const& key, std::string const& value) { // Commit to make the data available commit(); - - std::cerr << "[Rank " << ctx_.rank << "] PMIx_Put + Commit succeeded" << std::endl; } void SlurmBackend::commit() { @@ -150,9 +144,6 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { auto start = std::chrono::steady_clock::now(); auto poll_interval = std::chrono::milliseconds{100}; - std::cerr << "[Rank " << ctx_.rank << "] PMIx_Get: waiting for key='" << key << "'" - << std::endl; - // Get from rank 0 specifically (since that's where the key is stored) // Using PMIX_RANK_WILDCARD doesn't seem to work reliably pmix_proc_t proc; @@ -164,9 +155,6 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { pmix_value_t* val = nullptr; pmix_status_t rc = PMIx_Get(&proc, key.c_str(), nullptr, 0, &val); - std::cerr << "[Rank " << ctx_.rank - << "] PMIx_Get returned: " << pmix_error_string(rc) << std::endl; - if (rc == PMIX_SUCCESS && val != 
nullptr) { std::string result; @@ -185,9 +173,6 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { } PMIX_VALUE_RELEASE(val); - - std::cerr << "[Rank " << ctx_.rank << "] PMIx_Get succeeded: key='" << key - << "', value_len=" << result.size() << std::endl; return result; } @@ -221,23 +206,16 @@ void SlurmBackend::barrier() { PMIX_INFO_CONSTRUCT(&info); PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); - std::cerr << "[Rank " << ctx_.rank << "] PMIx_Fence: entering barrier" << std::endl; - // PMIx_Fence performs synchronization barrier and data exchange pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); PMIX_INFO_DESTRUCT(&info); - std::cerr << "[Rank " << ctx_.rank - << "] PMIx_Fence returned: " << pmix_error_string(rc) << std::endl; - // Accept both SUCCESS and PARTIAL_SUCCESS for the fence // PARTIAL_SUCCESS can occur in some PMIx implementations when not all // processes have data to contribute, but the synchronization succeeded if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { throw std::runtime_error("PMIx_Fence (barrier) failed: " + pmix_error_string(rc)); } - - std::cerr << "[Rank " << ctx_.rank << "] PMIx_Fence: exited barrier" << std::endl; } void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { From 0a569f8c17e361d7e4896819fc6ce9fb272e5a77 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 13 Jan 2026 14:03:27 -0800 Subject: [PATCH 08/57] Add Slurm mode to rrun --- cpp/tools/rrun.cpp | 281 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 228 insertions(+), 53 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index f9f5c4772..fb57dc26f 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -87,6 +87,10 @@ struct Config { topology; // Discovered topology information std::map gpu_topology_map; // Map GPU ID to topology info + bool slurm_mode{false}; // Running under Slurm (--slurm or auto-detected) + int slurm_local_id{-1}; // Local rank within node (SLURM_LOCALID) + int slurm_global_rank{-1}; // Global rank (SLURM_PROCID) + int slurm_ntasks{-1}; // Total number of tasks (SLURM_NTASKS) }; /** @@ -147,12 +151,20 @@ void print_usage(std::string_view prog_name) { << "rrun - RapidsMPF Process Launcher\n\n" << "Usage: " << prog_name << " [options] [app_args...]\n\n" << "Single-Node Options:\n" - << " -n Number of ranks to launch (required)\n" + << " -n Number of ranks to launch (required in single-node " + "mode)\n" << " -g Comma-separated list of GPU IDs (e.g., 0,1,2,3)\n" << " If not specified, auto-detect available GPUs\n\n" + << "Slurm Options:\n" + << " --slurm Run in Slurm mode (apply topology bindings only)\n" + << " Auto-detected when SLURM_JOB_ID is set\n" + << " In this mode, rrun applies topology bindings and\n" + << " execs the application (no process launching)\n\n" << "Common Options:\n" << " -d Coordination directory (default: /tmp/rrun_)\n" + << " Not used in Slurm mode with PMIx backend\n" << " --tag-output Tag stdout and stderr with rank number\n" + << " Not applicable in Slurm mode\n" << " --bind-to Bind to topology resources (default: all)\n" << " Can be specified multiple times\n" << " Options: cpu, memory, network, all, none\n" @@ -175,6 +187,12 @@ void print_usage(std::string_view prog_name) { << " # Launch with custom environment variables:\n" << " rrun -n 2 -x UCX_TLS=cuda_copy,cuda_ipc,rc,tcp -x MY_VAR=value " "./bench_comm\n\n" + << "Slurm Examples:\n" + << " # Multi-node with topology binding (2 nodes, 4 GPUs per node):\n" + << " srun --mpi=pmix -N 2 --ntasks-per-node=4 --gres=gpu:4 rrun --slurm " + "./bench_shuffle -C ucxx\n\n" + << " # Auto-detect Slurm mode:\n" + << " srun --mpi=pmix 
-n 4 rrun ./bench_shuffle -C ucxx\n\n" << std::endl; } @@ -289,6 +307,105 @@ bool set_numa_memory_binding(std::vector const& memory_binding) { #endif } +/** + * @brief Check if running under Slurm and populate Slurm-related config fields. + * + * @param cfg Configuration to populate with Slurm information. + * @return true if running under Slurm with required environment variables. + */ +bool detect_slurm_environment(Config& cfg) { + // Check for required Slurm environment variables + char const* slurm_job_id = std::getenv("SLURM_JOB_ID"); + char const* slurm_local_id = std::getenv("SLURM_LOCALID"); + char const* slurm_procid = std::getenv("SLURM_PROCID"); + char const* slurm_ntasks = std::getenv("SLURM_NTASKS"); + + // Need at least job ID and local ID to be in Slurm mode + if (!slurm_job_id || !slurm_local_id) { + return false; + } + + try { + cfg.slurm_local_id = std::stoi(slurm_local_id); + + if (slurm_procid) { + cfg.slurm_global_rank = std::stoi(slurm_procid); + } + + if (slurm_ntasks) { + cfg.slurm_ntasks = std::stoi(slurm_ntasks); + } else { + // Try SLURM_NPROCS as fallback + char const* slurm_nprocs = std::getenv("SLURM_NPROCS"); + if (slurm_nprocs) { + cfg.slurm_ntasks = std::stoi(slurm_nprocs); + } + } + + return true; + } catch (...) { + return false; + } +} + +/** + * @brief Apply topology-based bindings for a specific GPU. + * + * This function sets CPU affinity, NUMA memory binding, and network device + * environment variables based on the topology information for the given GPU. + * + * @param cfg Configuration containing topology information. + * @param gpu_id GPU ID to apply bindings for. + * @param verbose Print warnings on failure. 
+ */ +void apply_topology_bindings(Config const& cfg, int gpu_id, bool verbose) { + if (!cfg.topology.has_value() || gpu_id < 0) { + return; + } + + auto it = cfg.gpu_topology_map.find(gpu_id); + if (it == cfg.gpu_topology_map.end()) { + if (verbose) { + std::cerr << "Warning: No topology information for GPU " << gpu_id + << std::endl; + } + return; + } + + auto const& gpu_info = *it->second; + + if (cfg.bind_cpu && !gpu_info.cpu_affinity_list.empty()) { + if (!set_cpu_affinity(gpu_info.cpu_affinity_list)) { + if (verbose) { + std::cerr << "Warning: Failed to set CPU affinity for GPU " << gpu_id + << std::endl; + } + } + } + + if (cfg.bind_memory && !gpu_info.memory_binding.empty()) { + if (!set_numa_memory_binding(gpu_info.memory_binding)) { +#if RAPIDSMPF_HAVE_NUMA + if (verbose) { + std::cerr << "Warning: Failed to set NUMA memory binding for GPU " + << gpu_id << std::endl; + } +#endif + } + } + + if (cfg.bind_network && !gpu_info.network_devices.empty()) { + std::string ucx_net_devices; + for (size_t i = 0; i < gpu_info.network_devices.size(); ++i) { + if (i > 0) { + ucx_net_devices += ","; + } + ucx_net_devices += gpu_info.network_devices[i]; + } + setenv("UCX_NET_DEVICES", ucx_net_devices.c_str(), 1); + } +} + /** * @brief Parse GPU list from comma-separated string. 
*/ @@ -414,6 +531,17 @@ Config parse_args(int argc, char* argv[]) { cfg.verbose = true; } else if (arg == "--no-cleanup") { cfg.cleanup = false; + } else if (arg == "--slurm") { + cfg.slurm_mode = true; + } else if (arg == "--") { + // Everything after -- is the application and its arguments + if (i + 1 < argc) { + cfg.app_binary = argv[i + 1]; + for (int j = i + 2; j < argc; ++j) { + cfg.app_args.push_back(argv[j]); + } + } + break; } else if (arg[0] == '-') { throw std::runtime_error("Unknown option: " + arg); } else { @@ -433,9 +561,39 @@ Config parse_args(int argc, char* argv[]) { throw std::runtime_error("Missing application binary"); } - // Single-node mode validation - if (cfg.nranks <= 0) { - throw std::runtime_error("Number of ranks (-n) must be specified and positive"); + // Auto-detect Slurm mode if not explicitly specified + if (!cfg.slurm_mode) { + cfg.slurm_mode = detect_slurm_environment(cfg); + } else { + // --slurm was specified, populate Slurm info + if (!detect_slurm_environment(cfg)) { + throw std::runtime_error( + "--slurm specified but required Slurm environment variables " + "(SLURM_JOB_ID, SLURM_LOCALID) are not set. " + "Ensure you're running under srun." 
+ ); + } + } + + if (cfg.slurm_mode) { + // Slurm mode validation + if (cfg.slurm_local_id < 0) { + throw std::runtime_error( + "SLURM_LOCALID environment variable not set or invalid" + ); + } + + // In Slurm mode, nranks comes from SLURM_NTASKS + if (cfg.slurm_ntasks > 0) { + cfg.nranks = cfg.slurm_ntasks; + } + } else { + // Single-node mode validation + if (cfg.nranks <= 0) { + throw std::runtime_error( + "Number of ranks (-n) must be specified and positive" + ); + } } // Auto-detect GPUs if not specified @@ -448,15 +606,17 @@ Config parse_args(int argc, char* argv[]) { } } - // Validate GPU count vs rank count - if (!cfg.gpus.empty() && cfg.nranks > static_cast(cfg.gpus.size())) { + // Validate GPU count vs rank count (only warn in single-node mode) + if (!cfg.slurm_mode && !cfg.gpus.empty() + && cfg.nranks > static_cast(cfg.gpus.size())) + { std::cerr << "Warning: Number of ranks (" << cfg.nranks << ") exceeds number of GPUs (" << cfg.gpus.size() << "). Multiple ranks will share GPUs." 
<< std::endl; } - // Generate coordination directory if not specified - if (cfg.coord_dir.empty()) { + // Generate coordination directory if not specified (not needed in Slurm mode) + if (cfg.coord_dir.empty() && !cfg.slurm_mode) { cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); } @@ -598,43 +758,7 @@ pid_t launch_rank_local( setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); } - // Apply topology-based configuration if available - if (cfg.topology.has_value() && gpu_id >= 0) { - auto it = cfg.gpu_topology_map.find(gpu_id); - if (it != cfg.gpu_topology_map.end()) { - auto const& gpu_info = *it->second; - - if (cfg.bind_cpu && !gpu_info.cpu_affinity_list.empty()) { - if (!set_cpu_affinity(gpu_info.cpu_affinity_list)) { - std::cerr << "Warning: Failed to set CPU affinity for rank " - << captured_rank << " (GPU " << gpu_id << ")" - << std::endl; - } - } - - if (cfg.bind_memory && !gpu_info.memory_binding.empty()) { - if (!set_numa_memory_binding(gpu_info.memory_binding)) { -#if RAPIDSMPF_HAVE_NUMA - std::cerr - << "Warning: Failed to set NUMA memory binding for rank " - << captured_rank << " (GPU " << gpu_id << ")" - << std::endl; -#endif - } - } - - if (cfg.bind_network && !gpu_info.network_devices.empty()) { - std::string ucx_net_devices; - for (size_t i = 0; i < gpu_info.network_devices.size(); ++i) { - if (i > 0) { - ucx_net_devices += ","; - } - ucx_net_devices += gpu_info.network_devices[i]; - } - setenv("UCX_NET_DEVICES", ucx_net_devices.c_str(), 1); - } - } - } + apply_topology_bindings(cfg, gpu_id, cfg.verbose); // Prepare arguments for execvp std::vector exec_args; @@ -702,7 +826,8 @@ int main(int argc, char* argv[]) { if (cfg.verbose) { std::cout << "rrun configuration:\n"; - std::cout << " Mode: Single-node\n" + std::cout << " Mode: " << (cfg.slurm_mode ? 
"Slurm" : "Single-node") + << "\n" << " GPUs: "; if (cfg.gpus.empty()) { std::cout << "(none)\n"; @@ -714,13 +839,19 @@ int main(int argc, char* argv[]) { } std::cout << "\n"; } - if (cfg.tag_output) { - std::cout << " Tag Output: Yes\n"; + if (cfg.slurm_mode) { + std::cout << " Slurm Local ID: " << cfg.slurm_local_id << "\n" + << " Slurm Rank: " << cfg.slurm_global_rank << "\n" + << " Slurm NTasks: " << cfg.slurm_ntasks << "\n"; + } else { + if (cfg.tag_output) { + std::cout << " Tag Output: Yes\n"; + } + std::cout << " Ranks: " << cfg.nranks << "\n" + << " Coord Dir: " << cfg.coord_dir << "\n" + << " Cleanup: " << (cfg.cleanup ? "yes" : "no") << "\n"; } - std::cout << " Ranks: " << cfg.nranks << "\n" - << " Application: " << cfg.app_binary << "\n" - << " Coord Dir: " << cfg.coord_dir << "\n" - << " Cleanup: " << (cfg.cleanup ? "yes" : "no") << "\n"; + std::cout << " Application: " << cfg.app_binary << "\n"; std::vector bind_types; if (cfg.bind_cpu) bind_types.push_back("cpu"); @@ -752,6 +883,50 @@ int main(int argc, char* argv[]) { std::cout << std::endl; } + // ===================================================================== + // Slurm Mode: Apply topology bindings and exec the application + // ===================================================================== + if (cfg.slurm_mode) { + // Set custom environment variables + for (auto const& env_pair : cfg.env_vars) { + setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); + } + + // Determine GPU for this local rank + int gpu_id = -1; + if (!cfg.gpus.empty()) { + gpu_id = + cfg.gpus[static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; + setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); + + if (cfg.verbose) { + std::cerr << "[rrun] Slurm local_id=" << cfg.slurm_local_id + << " assigned to GPU " << gpu_id << std::endl; + } + } + + apply_topology_bindings(cfg, gpu_id, cfg.verbose); + + // Prepare arguments for execvp + std::vector exec_args; + 
exec_args.push_back(const_cast(cfg.app_binary.c_str())); + for (auto const& arg : cfg.app_args) { + exec_args.push_back(const_cast(arg.c_str())); + } + exec_args.push_back(nullptr); + + // Exec the application (this replaces the current process) + execvp(cfg.app_binary.c_str(), exec_args.data()); + + // If we get here, execvp failed + std::cerr << "Failed to execute " << cfg.app_binary << ": " + << std::strerror(errno) << std::endl; + return 1; + } + + // ===================================================================== + // Single-Node Mode: Fork processes locally + // ===================================================================== std::filesystem::create_directories(cfg.coord_dir); std::vector pids; From 66da920bf098c39239d7dbfd2241f5ee5a0dfd23 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 14 Jan 2026 01:53:53 -0800 Subject: [PATCH 09/57] Fix run on Slurm check without breaking run with mpirun --- cpp/src/bootstrap/bootstrap.cpp | 11 +++++------ cpp/src/bootstrap/utils.cpp | 7 +------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 05d19540d..e58636ac2 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -63,12 +63,11 @@ Backend detect_backend() { } #ifdef RAPIDSMPF_HAVE_SLURM - // Check for PMIx/Slurm environment - // PMIX_NAMESPACE is set by PMIx-enabled launchers (srun --mpi=pmix) - // SLURM_JOB_ID + SLURM_STEP_ID indicate a Slurm job step - if (getenv_optional("PMIX_NAMESPACE") - || (getenv_optional("SLURM_JOB_ID") && getenv_optional("SLURM_STEP_ID"))) - { + // Check for Slurm-specific environment variables. + // Note: We don't check PMIX_NAMESPACE alone because OpenMPI also uses PMIx + // internally and sets PMIX_NAMESPACE when launched with mpirun. + // SLURM_JOB_ID + SLURM_PROCID is specific to Slurm srun tasks. 
+ if (getenv_optional("SLURM_JOB_ID") && getenv_optional("SLURM_PROCID")) { return Backend::SLURM; } #endif diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index 842137dc6..da715c17e 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -95,12 +95,7 @@ bool is_running_with_rrun() { } bool is_running_with_slurm() { - // Check for PMIx namespace (set by PMIx-enabled launchers like srun --mpi=pmix) - if (std::getenv("PMIX_NAMESPACE") != nullptr) { - return true; - } - // Check for Slurm job step (SLURM_JOB_ID + SLURM_STEP_ID indicate a job step) - if (std::getenv("SLURM_JOB_ID") != nullptr && std::getenv("SLURM_STEP_ID") != nullptr) + if (std::getenv("SLURM_JOB_ID") != nullptr && std::getenv("SLURM_PROCID") != nullptr) { return true; } From df1ed25b870b3a0d81811208f3f4a05bbcf0e005 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 14 Jan 2026 03:30:47 -0800 Subject: [PATCH 10/57] Support multiple ranks with single Slurm task --- cpp/tools/rrun.cpp | 190 ++++++++++++++++++++++++++++++++------------- 1 file changed, 135 insertions(+), 55 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index fb57dc26f..d2d75710b 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -156,10 +156,14 @@ void print_usage(std::string_view prog_name) { << " -g Comma-separated list of GPU IDs (e.g., 0,1,2,3)\n" << " If not specified, auto-detect available GPUs\n\n" << "Slurm Options:\n" - << " --slurm Run in Slurm mode (apply topology bindings only)\n" - << " Auto-detected when SLURM_JOB_ID is set\n" - << " In this mode, rrun applies topology bindings and\n" - << " execs the application (no process launching)\n\n" + << " --slurm Run in Slurm mode (auto-detected when SLURM_JOB_ID is " + "set)\n" + << " Two sub-modes:\n" + << " 1. Passthrough (no -n): Apply bindings and exec\n" + << " 2. 
Hybrid (with -n): Launch N ranks per Slurm task\n" + << " In hybrid mode, each Slurm task launches multiple " + "ranks\n" + << " with coordinated global rank numbering\n\n" << "Common Options:\n" << " -d Coordination directory (default: /tmp/rrun_)\n" << " Not used in Slurm mode with PMIx backend\n" @@ -188,11 +192,15 @@ void print_usage(std::string_view prog_name) { << " rrun -n 2 -x UCX_TLS=cuda_copy,cuda_ipc,rc,tcp -x MY_VAR=value " "./bench_comm\n\n" << "Slurm Examples:\n" - << " # Multi-node with topology binding (2 nodes, 4 GPUs per node):\n" - << " srun --mpi=pmix -N 2 --ntasks-per-node=4 --gres=gpu:4 rrun --slurm " + << " # Passthrough mode (1 rank per Slurm task, 8 tasks total):\n" + << " srun --mpi=pmix -N 2 --ntasks-per-node=4 --gres=gpu:4 rrun " + "./bench_shuffle -C ucxx\n\n" + << " # Hybrid mode (2 Slurm tasks × 4 ranks/task = 8 total ranks):\n" + << " srun --mpi=pmix -N 2 --ntasks-per-node=1 --gres=gpu:4 rrun -n 4 " + "./bench_shuffle -C ucxx\n\n" + << " # Hybrid mode with --gpus-per-task:\n" + << " srun --mpi=pmix --ntasks-per-node=2 --gpus-per-task=4 rrun -n 4 " "./bench_shuffle -C ucxx\n\n" - << " # Auto-detect Slurm mode:\n" - << " srun --mpi=pmix -n 4 rrun ./bench_shuffle -C ucxx\n\n" << std::endl; } @@ -583,10 +591,14 @@ Config parse_args(int argc, char* argv[]) { ); } - // In Slurm mode, nranks comes from SLURM_NTASKS - if (cfg.slurm_ntasks > 0) { - cfg.nranks = cfg.slurm_ntasks; + // In Slurm mode: + // - If -n is specified: launch N ranks per Slurm task (hybrid mode) + // - If -n is not specified: just apply bindings and exec (passthrough mode) + if (cfg.nranks <= 0) { + // Passthrough mode: one rank per Slurm task + cfg.nranks = 1; } + // else: hybrid mode with cfg.nranks children per Slurm task } else { // Single-node mode validation if (cfg.nranks <= 0) { @@ -730,31 +742,48 @@ pid_t fork_with_piped_stdio( /** * @brief Launch a single rank locally (fork-based). + * + * @param cfg Configuration. 
+ * @param global_rank Global rank number (used for RAPIDSMPF_RANK). + * @param local_rank Local rank for GPU assignment (defaults to global_rank). + * @param total_ranks Total number of ranks across all tasks (used for RAPIDSMPF_NRANKS). + * @param out_fd_stdout Output file descriptor for stdout. + * @param out_fd_stderr Output file descriptor for stderr. + * @return Child process PID. */ pid_t launch_rank_local( - Config const& cfg, int rank, int* out_fd_stdout, int* out_fd_stderr + Config const& cfg, + int global_rank, + int local_rank, + int total_ranks, + int* out_fd_stdout, + int* out_fd_stderr ) { - // Capture rank by value explicitly to avoid any potential issues - int captured_rank = rank; + // Capture ranks by value explicitly to avoid any potential issues + int captured_global_rank = global_rank; + int captured_local_rank = local_rank; + int captured_total_ranks = total_ranks; return fork_with_piped_stdio( out_fd_stdout, out_fd_stderr, /*combine_stderr*/ false, - [&cfg, captured_rank]() { + [&cfg, captured_global_rank, captured_local_rank, captured_total_ranks]() { // Set custom environment variables first (can be overridden by specific vars) for (auto const& env_pair : cfg.env_vars) { setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); } // Set environment variables - setenv("RAPIDSMPF_RANK", std::to_string(captured_rank).c_str(), 1); - setenv("RAPIDSMPF_NRANKS", std::to_string(cfg.nranks).c_str(), 1); + setenv("RAPIDSMPF_RANK", std::to_string(captured_global_rank).c_str(), 1); + setenv("RAPIDSMPF_NRANKS", std::to_string(captured_total_ranks).c_str(), 1); setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 1); // Set CUDA_VISIBLE_DEVICES if GPUs are available + // Use local_rank for GPU assignment (for Slurm hybrid mode) int gpu_id = -1; if (!cfg.gpus.empty()) { - gpu_id = cfg.gpus[static_cast(captured_rank) % cfg.gpus.size()]; + gpu_id = + cfg.gpus[static_cast(captured_local_rank) % cfg.gpus.size()]; setenv("CUDA_VISIBLE_DEVICES", 
std::to_string(gpu_id).c_str(), 1); } @@ -884,53 +913,101 @@ int main(int argc, char* argv[]) { } // ===================================================================== - // Slurm Mode: Apply topology bindings and exec the application + // Slurm Mode: Two sub-modes based on whether -n was specified // ===================================================================== if (cfg.slurm_mode) { - // Set custom environment variables - for (auto const& env_pair : cfg.env_vars) { - setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); - } + if (cfg.nranks == 1) { + // ===== Passthrough Mode: Just apply bindings and exec ===== + // Set custom environment variables + for (auto const& env_pair : cfg.env_vars) { + setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); + } - // Determine GPU for this local rank - int gpu_id = -1; - if (!cfg.gpus.empty()) { - gpu_id = - cfg.gpus[static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; - setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); + // Determine GPU for this Slurm task + int gpu_id = -1; + if (!cfg.gpus.empty()) { + gpu_id = + cfg.gpus + [static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; + setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); + + if (cfg.verbose) { + std::cerr << "[rrun] Slurm task (passthrough) local_id=" + << cfg.slurm_local_id << " assigned to GPU " << gpu_id + << std::endl; + } + } - if (cfg.verbose) { - std::cerr << "[rrun] Slurm local_id=" << cfg.slurm_local_id - << " assigned to GPU " << gpu_id << std::endl; + apply_topology_bindings(cfg, gpu_id, cfg.verbose); + + // Prepare arguments for execvp + std::vector exec_args; + exec_args.push_back(const_cast(cfg.app_binary.c_str())); + for (auto const& arg : cfg.app_args) { + exec_args.push_back(const_cast(arg.c_str())); } + exec_args.push_back(nullptr); + + // Exec the application (this replaces the current process) + execvp(cfg.app_binary.c_str(), exec_args.data()); + + // If we get here, execvp failed + 
std::cerr << "Failed to execute " << cfg.app_binary << ": " + << std::strerror(errno) << std::endl; + return 1; } - apply_topology_bindings(cfg, gpu_id, cfg.verbose); + // ===== Hybrid Mode: Fork multiple ranks within this Slurm task ===== + if (cfg.verbose) { + std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank + << " launching " << cfg.nranks << " ranks per task" + << std::endl; + } - // Prepare arguments for execvp - std::vector exec_args; - exec_args.push_back(const_cast(cfg.app_binary.c_str())); - for (auto const& arg : cfg.app_args) { - exec_args.push_back(const_cast(arg.c_str())); + // Generate coordination directory if not specified + // In hybrid mode, we need FILE backend since PMIx doesn't know about children + if (cfg.coord_dir.empty()) { + // Use Slurm job ID for coordination directory to ensure uniqueness + char const* job_id = std::getenv("SLURM_JOB_ID"); + if (job_id) { + cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; + } else { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } } - exec_args.push_back(nullptr); - // Exec the application (this replaces the current process) - execvp(cfg.app_binary.c_str(), exec_args.data()); + std::filesystem::create_directories(cfg.coord_dir); - // If we get here, execvp failed - std::cerr << "Failed to execute " << cfg.app_binary << ": " - << std::strerror(errno) << std::endl; - return 1; + // Continue to fork-based launch below (like single-node mode) + // cfg.nranks already contains ranks per task, will be adjusted below } // ===================================================================== - // Single-Node Mode: Fork processes locally + // Fork-based Launch: Single-Node or Slurm Hybrid Mode // ===================================================================== std::filesystem::create_directories(cfg.coord_dir); + // Determine rank offset and total ranks + int rank_offset = 0; + int ranks_per_task = cfg.nranks; // Ranks to launch in this process + int total_ranks = 
cfg.nranks; // Total ranks across all processes + + if (cfg.slurm_mode) { + // Hybrid mode: multiple ranks per Slurm task + int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; + rank_offset = cfg.slurm_global_rank * ranks_per_task; + total_ranks = slurm_ntasks * ranks_per_task; + + if (cfg.verbose) { + std::cout << "[rrun] Task " << cfg.slurm_global_rank + << " launching ranks " << rank_offset << "-" + << (rank_offset + ranks_per_task - 1) + << " (total: " << total_ranks << " ranks)" << std::endl; + } + } + std::vector pids; - pids.reserve(static_cast(cfg.nranks)); + pids.reserve(static_cast(ranks_per_task)); // Block SIGINT/SIGTERM in this thread; a dedicated thread will handle them. sigset_t signal_set; @@ -942,7 +1019,7 @@ int main(int argc, char* argv[]) { // Output suppression flag and forwarder threads auto suppress_output = std::make_shared>(false); std::vector forwarders; - forwarders.reserve(static_cast(cfg.nranks) * 2); + forwarders.reserve(static_cast(ranks_per_task) * 2); // Helper to start a forwarder thread for a given fd auto start_forwarder = [&](int fd, int rank, bool to_stderr) { @@ -977,19 +1054,22 @@ int main(int argc, char* argv[]) { }); }; - // Single-node local mode - for (int rank = 0; rank < cfg.nranks; ++rank) { + // Launch ranks (with offset for Slurm hybrid mode) + for (int local_rank = 0; local_rank < ranks_per_task; ++local_rank) { + int global_rank = rank_offset + local_rank; int fd_out = -1; int fd_err = -1; - pid_t pid = launch_rank_local(cfg, rank, &fd_out, &fd_err); + pid_t pid = launch_rank_local( + cfg, global_rank, local_rank, total_ranks, &fd_out, &fd_err + ); pids.push_back(pid); if (cfg.verbose) { std::ostringstream msg; - msg << "Launched rank " << rank << " (PID " << pid << ")"; + msg << "Launched rank " << global_rank << " (PID " << pid << ")"; if (!cfg.gpus.empty()) { msg << " on GPU " - << cfg.gpus[static_cast(rank) % cfg.gpus.size()]; + << cfg.gpus[static_cast(local_rank) % cfg.gpus.size()]; } msg << 
std::endl; std::string msg_str = msg.str(); @@ -998,8 +1078,8 @@ int main(int argc, char* argv[]) { std::cout.flush(); } // Parent-side forwarders for local stdout and stderr - start_forwarder(fd_out, rank, false); - start_forwarder(fd_err, rank, true); + start_forwarder(fd_out, global_rank, false); + start_forwarder(fd_err, global_rank, true); } // Start a signal-waiting thread to forward signals. From fff8d863a7b837cdca5abf3ac05482078a18392c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 14 Jan 2026 12:05:32 -0800 Subject: [PATCH 11/57] Use file backend in Slurm hybrid mode --- cpp/tools/rrun.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index d2d75710b..429db19f3 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -778,6 +778,16 @@ pid_t launch_rank_local( setenv("RAPIDSMPF_NRANKS", std::to_string(captured_total_ranks).c_str(), 1); setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 1); + // In Slurm hybrid mode, unset Slurm rank variables to avoid confusion + // Children inherit parent's SLURM_PROCID, which could interfere with + // bootstrap Since RAPIDSMPF_COORD_DIR is set, FILE backend will be used + // anyway + if (cfg.slurm_mode) { + unsetenv("SLURM_PROCID"); + unsetenv("SLURM_LOCALID"); + unsetenv("PMIX_RANK"); + } + // Set CUDA_VISIBLE_DEVICES if GPUs are available // Use local_rank for GPU assignment (for Slurm hybrid mode) int gpu_id = -1; From e87e5350694447679b2db907766104d3487c6481 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 14 Jan 2026 13:51:51 -0800 Subject: [PATCH 12/57] Use rrun parent process as PMIx coordinator --- cpp/src/bootstrap/ucxx.cpp | 105 ++++++++--- cpp/tools/CMakeLists.txt | 14 +- cpp/tools/rrun.cpp | 352 +++++++++++++++++++++++++++++++++++-- 3 files changed, 424 insertions(+), 47 deletions(-) diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 42ef069c1..bd928b338 100644 --- a/cpp/src/bootstrap/ucxx.cpp 
+++ b/cpp/src/bootstrap/ucxx.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -29,38 +30,86 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op std::shared_ptr comm; - if (ctx.rank == 0) { - // Create root UCXX communicator - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, std::nullopt, options); - comm = std::make_shared(std::move(ucxx_initialized_rank), options); - - // Get the listener address and publish - auto listener_address = comm->listener_address(); - auto root_worker_address_str = - std::get>(listener_address.address) - ->getString(); - put(ctx, "ucxx_root_address", root_worker_address_str); + // Check if root address was provided by parent process (rrun hybrid mode) + char const* precomputed_address = std::getenv("RAPIDSMPF_ROOT_ADDRESS"); + + if (precomputed_address != nullptr) { + // Parent process already coordinated the root address via PMIx + // Children skip bootstrap coordination and use the provided address directly + if (ctx.rank == 0) { + // Root child creates listener + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, std::nullopt, options); + comm = + std::make_shared(std::move(ucxx_initialized_rank), options); + } else { + // Worker children connect using provided address + auto root_worker_address = + ::ucxx::createAddressFromString(precomputed_address); + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, root_worker_address, options); + comm = + std::make_shared(std::move(ucxx_initialized_rank), options); + } + } else { + // Standard bootstrap coordination via put/get/barrier + if (ctx.rank == 0) { + // Create root UCXX communicator + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, std::nullopt, options); + comm = + std::make_shared(std::move(ucxx_initialized_rank), options); + + // Get the listener address and publish + auto listener_address = comm->listener_address(); + auto root_worker_address_str = + std::get>(listener_address.address) + 
->getString(); + put(ctx, "ucxx_root_address", root_worker_address_str); + } + + // All ranks must barrier to make PMIx put() data visible. + // For file backend this is a no-op synchronization. + // For PMIx/Slurm backend this executes PMIx_Fence to exchange data. + barrier(ctx); + + if (ctx.rank != 0) { + // Worker ranks retrieve the root address and connect + auto root_worker_address_str = + get(ctx, "ucxx_root_address", std::chrono::seconds{30}); + auto root_worker_address = + ::ucxx::createAddressFromString(root_worker_address_str); + + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, root_worker_address, options); + comm = + std::make_shared(std::move(ucxx_initialized_rank), options); + } } - // All ranks must barrier to make PMIx put() data visible. - // For file backend this is a no-op synchronization. - // For PMIx/Slurm backend this executes PMIx_Fence to exchange data. - barrier(ctx); - - if (ctx.rank != 0) { - // Worker ranks retrieve the root address and connect - auto root_worker_address_str = - get(ctx, "ucxx_root_address", std::chrono::seconds{30}); - auto root_worker_address = - ::ucxx::createAddressFromString(root_worker_address_str); - - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, root_worker_address, options); - comm = std::make_shared(std::move(ucxx_initialized_rank), options); + comm->barrier(); + + // If root rank and address file path is specified, write the address + // This is used for parent-mediated coordination in rrun hybrid mode + if (ctx.rank == 0) { + char const* address_file = std::getenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); + if (address_file != nullptr) { + auto listener_address = comm->listener_address(); + auto root_address_str = + std::get>(listener_address.address) + ->getString(); + + std::ofstream addr_file(address_file); + if (!addr_file) { + throw std::runtime_error( + "Failed to write root address to file: " + std::string{address_file} + ); + } + addr_file << root_address_str << std::endl; + 
addr_file.close(); + } } - comm->barrier(); return comm; } } // namespace rapidsmpf::bootstrap diff --git a/cpp/tools/CMakeLists.txt b/cpp/tools/CMakeLists.txt index 30976cb7e..20694d2a2 100644 --- a/cpp/tools/CMakeLists.txt +++ b/cpp/tools/CMakeLists.txt @@ -20,10 +20,18 @@ if(NVML_INCLUDE_DIR) target_include_directories(rrun PRIVATE ${NVML_INCLUDE_DIR}) endif() target_compile_options(rrun PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>") -target_compile_definitions(rrun PRIVATE $<$:RAPIDSMPF_HAVE_NUMA>) +target_compile_definitions( + rrun PRIVATE $<$:RAPIDSMPF_HAVE_NUMA> + $<$:RAPIDSMPF_HAVE_SLURM> +) target_link_libraries( - rrun PRIVATE Threads::Threads $ maybe_asan - $<$:numa> ${CMAKE_DL_LIBS} + rrun + PRIVATE Threads::Threads + $ + maybe_asan + $<$:numa> + $ + ${CMAKE_DL_LIBS} ) install( TARGETS rrun diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 429db19f3..f95db56e2 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -46,6 +46,10 @@ #include +#ifdef RAPIDSMPF_HAVE_SLURM +#include +#endif + // NOTE: Do not use RAPIDSMPF_EXPECTS or RAPIDSMPF_FAIL in this file. // Using these macros introduces a CUDA dependency via rapidsmpf/error.hpp. // Prefer throwing standard exceptions instead. @@ -740,6 +744,136 @@ pid_t fork_with_piped_stdio( return pid; } +#ifdef RAPIDSMPF_HAVE_SLURM +/** + * @brief Coordinate root address between parent processes using PMIx + * + * This function is called by parent rrun processes in Slurm hybrid mode. + * The root parent (PMIX_RANK=0) publishes the root address, and non-root + * parents retrieve it. This avoids file-based coordination. 
+ * + * @param is_root Whether this is the root parent (PMIX_RANK=0) + * @param root_address_to_publish Address to publish (only used if is_root) + * @param verbose Whether to print debug messages + * @return Root address (either published or retrieved) + * @throws std::runtime_error on PMIx errors + */ +std::string coordinate_root_address_via_pmix( + bool is_root, std::string const& root_address_to_publish, bool verbose +) { + // Initialize PMIx for parent process + pmix_proc_t proc; + pmix_status_t rc = PMIx_Init(&proc, nullptr, 0); + if (rc != PMIX_SUCCESS) { + throw std::runtime_error( + "PMIx_Init failed in rrun parent: " + std::string{PMIx_Error_string(rc)} + ); + } + + if (verbose) { + std::cout << "[rrun] Parent PMIx initialized: rank " << proc.rank + << ", namespace " << proc.nspace << std::endl; + } + + std::string root_address; + + if (is_root) { + // Root parent publishes the address + if (verbose) { + std::cout << "[rrun] Publishing root address via PMIx: " + << root_address_to_publish << std::endl; + } + + // Use PMIx_Put with GLOBAL scope + pmix_value_t value; + PMIX_VALUE_CONSTRUCT(&value); + value.type = PMIX_STRING; + value.data.string = strdup(root_address_to_publish.c_str()); + + rc = PMIx_Put(PMIX_GLOBAL, "rapidsmpf_root_address", &value); + PMIX_VALUE_DESTRUCT(&value); + + if (rc != PMIX_SUCCESS) { + PMIx_Finalize(nullptr, 0); + throw std::runtime_error( + "PMIx_Put failed: " + std::string{PMIx_Error_string(rc)} + ); + } + + // Commit the data + rc = PMIx_Commit(); + if (rc != PMIX_SUCCESS) { + PMIx_Finalize(nullptr, 0); + throw std::runtime_error( + "PMIx_Commit failed: " + std::string{PMIx_Error_string(rc)} + ); + } + + root_address = root_address_to_publish; + } + + // Barrier with PMIX_COLLECT_DATA to ensure data exchange + pmix_info_t info; + PMIX_INFO_CONSTRUCT(&info); + bool collect_data = true; + PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect_data, PMIX_BOOL); + + pmix_proc_t proc_wildcard; + PMIX_PROC_CONSTRUCT(&proc_wildcard); + 
std::memcpy(proc_wildcard.nspace, proc.nspace, PMIX_MAX_NSLEN + 1); + proc_wildcard.rank = PMIX_RANK_WILDCARD; + + rc = PMIx_Fence(&proc_wildcard, 1, &info, 1); + PMIX_INFO_DESTRUCT(&info); + + // Accept partial success (some PMIx implementations return this for fences) + if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { + PMIx_Finalize(nullptr, 0); + throw std::runtime_error( + "PMIx_Fence failed: " + std::string{PMIx_Error_string(rc)} + ); + } + + if (!is_root) { + // Non-root parents retrieve the address + pmix_proc_t source_proc; + PMIX_PROC_CONSTRUCT(&source_proc); + std::memcpy(source_proc.nspace, proc.nspace, PMIX_MAX_NSLEN + 1); + source_proc.rank = 0; // Get from rank 0 + + pmix_value_t* value = nullptr; + rc = PMIx_Get(&source_proc, "rapidsmpf_root_address", nullptr, 0, &value); + + if (rc != PMIX_SUCCESS || value == nullptr) { + PMIx_Finalize(nullptr, 0); + throw std::runtime_error( + "PMIx_Get failed: " + std::string{PMIx_Error_string(rc)} + ); + } + + if (value->type != PMIX_STRING) { + PMIX_VALUE_RELEASE(value); + PMIx_Finalize(nullptr, 0); + throw std::runtime_error("PMIx_Get returned non-string value"); + } + + root_address = value->data.string; + PMIX_VALUE_RELEASE(value); + + if (verbose) { + std::cout << "[rrun] Retrieved root address via PMIx: " << root_address + << std::endl; + } + } + + // Keep PMIx session alive - will finalize after children complete + // Note: We don't call PMIx_Finalize here because we want the session + // to stay alive while children are running + + return root_address; +} +#endif // RAPIDSMPF_HAVE_SLURM + /** * @brief Launch a single rank locally (fork-based). * @@ -747,6 +881,7 @@ pid_t fork_with_piped_stdio( * @param global_rank Global rank number (used for RAPIDSMPF_RANK). * @param local_rank Local rank for GPU assignment (defaults to global_rank). * @param total_ranks Total number of ranks across all tasks (used for RAPIDSMPF_NRANKS). 
+ * @param root_address Optional pre-coordinated root address (for hybrid mode). * @param out_fd_stdout Output file descriptor for stdout. * @param out_fd_stderr Output file descriptor for stderr. * @return Child process PID. @@ -756,18 +891,25 @@ pid_t launch_rank_local( int global_rank, int local_rank, int total_ranks, + std::string const& root_address, int* out_fd_stdout, int* out_fd_stderr ) { - // Capture ranks by value explicitly to avoid any potential issues + // Capture all parameters by value to avoid any potential issues int captured_global_rank = global_rank; int captured_local_rank = local_rank; int captured_total_ranks = total_ranks; + std::string captured_root_address = root_address; + return fork_with_piped_stdio( out_fd_stdout, out_fd_stderr, /*combine_stderr*/ false, - [&cfg, captured_global_rank, captured_local_rank, captured_total_ranks]() { + [&cfg, + captured_global_rank, + captured_local_rank, + captured_total_ranks, + captured_root_address]() { // Set custom environment variables first (can be overridden by specific vars) for (auto const& env_pair : cfg.env_vars) { setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); @@ -776,16 +918,23 @@ pid_t launch_rank_local( // Set environment variables setenv("RAPIDSMPF_RANK", std::to_string(captured_global_rank).c_str(), 1); setenv("RAPIDSMPF_NRANKS", std::to_string(captured_total_ranks).c_str(), 1); - setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 1); - // In Slurm hybrid mode, unset Slurm rank variables to avoid confusion - // Children inherit parent's SLURM_PROCID, which could interfere with - // bootstrap Since RAPIDSMPF_COORD_DIR is set, FILE backend will be used - // anyway + // If root address was pre-coordinated by parent, set it + // This allows children to skip bootstrap coordination entirely + if (!captured_root_address.empty()) { + setenv("RAPIDSMPF_ROOT_ADDRESS", captured_root_address.c_str(), 1); + } else { + // Use FILE backend coordination + setenv("RAPIDSMPF_COORD_DIR", 
cfg.coord_dir.c_str(), 1); + } + + // In Slurm hybrid mode, unset Slurm/PMIx rank variables to avoid confusion + // Children should not try to initialize PMIx themselves if (cfg.slurm_mode) { unsetenv("SLURM_PROCID"); unsetenv("SLURM_LOCALID"); unsetenv("PMIX_RANK"); + unsetenv("PMIX_NAMESPACE"); } // Set CUDA_VISIBLE_DEVICES if GPUs are available @@ -922,6 +1071,9 @@ int main(int argc, char* argv[]) { std::cout << std::endl; } + // Variable to hold pre-coordinated root address (for Slurm hybrid mode) + std::string coordinated_root_address; + // ===================================================================== // Slurm Mode: Two sub-modes based on whether -n was specified // ===================================================================== @@ -967,17 +1119,153 @@ int main(int argc, char* argv[]) { return 1; } - // ===== Hybrid Mode: Fork multiple ranks within this Slurm task ===== + // ===== Hybrid Mode: Parent-mediated coordination via PMIx ===== +#ifdef RAPIDSMPF_HAVE_SLURM + if (cfg.verbose) { + std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank + << " launching " << cfg.nranks << " ranks per task" + << std::endl; + std::cout << "[rrun] Using PMIx for parent coordination (no file I/O)" + << std::endl; + } + + // Root parent needs to launch rank 0 first to get address + bool is_root_parent = (cfg.slurm_global_rank == 0); + + if (is_root_parent) { + // Root parent: Launch ONLY rank 0 first to get UCXX address + if (cfg.verbose) { + std::cout + << "[rrun] Root parent: launching rank 0 first to get address" + << std::endl; + } + + // We need to extract the root address from rank 0's output + // This is a bit tricky - we'll launch it with a special mode + // For now, we'll rely on the fact that the root address will be + // printed or we can read it from a pipe + + // Actually, let's use a temporary file for the root to write its address + // This is cleaner than parsing stdout + std::string address_file = "/tmp/rapidsmpf_root_address_" + 
+ std::string{std::getenv("SLURM_JOB_ID")}; + setenv("RAPIDSMPF_ROOT_ADDRESS_FILE", address_file.c_str(), 1); + + // Launch rank 0 + int fd_out = -1, fd_err = -1; + int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; + int total_ranks = slurm_ntasks * cfg.nranks; + + pid_t rank0_pid = + launch_rank_local(cfg, 0, 0, total_ranks, "", &fd_out, &fd_err); + + // Start forwarders for rank 0 output + std::thread rank0_stdout_forwarder; + std::thread rank0_stderr_forwarder; + auto suppress = std::make_shared>(false); + + if (fd_out >= 0) { + rank0_stdout_forwarder = std::thread([fd_out, suppress]() { + FILE* stream = fdopen(fd_out, "r"); + if (!stream) { + close(fd_out); + return; + } + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), stream) != nullptr) { + if (suppress->load()) + continue; + std::lock_guard lock(output_mutex); + fputs(buffer, stdout); + fflush(stdout); + } + fclose(stream); + }); + } + + if (fd_err >= 0) { + rank0_stderr_forwarder = std::thread([fd_err, suppress]() { + FILE* stream = fdopen(fd_err, "r"); + if (!stream) { + close(fd_err); + return; + } + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), stream) != nullptr) { + if (suppress->load()) + continue; + std::lock_guard lock(output_mutex); + fputs(buffer, stderr); + fflush(stderr); + } + fclose(stream); + }); + } + + // Wait for rank 0 to write the address file (with timeout) + auto start = std::chrono::steady_clock::now(); + while (!std::filesystem::exists(address_file)) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + auto elapsed = std::chrono::steady_clock::now() - start; + if (elapsed > std::chrono::seconds(30)) { + suppress->store(true); + kill(rank0_pid, SIGKILL); + waitpid(rank0_pid, nullptr, 0); + if (rank0_stdout_forwarder.joinable()) + rank0_stdout_forwarder.join(); + if (rank0_stderr_forwarder.joinable()) + rank0_stderr_forwarder.join(); + throw std::runtime_error( + "Timeout waiting for rank 0 to write root address" + ); + } + } + + // 
Read the address + std::ifstream addr_stream(address_file); + std::getline(addr_stream, coordinated_root_address); + addr_stream.close(); + + if (cfg.verbose) { + std::cout << "[rrun] Got root address from rank 0: " + << coordinated_root_address << std::endl; + } + + // Coordinate with other parents via PMIx + coordinated_root_address = coordinate_root_address_via_pmix( + true, coordinated_root_address, cfg.verbose + ); + + // Rank 0 is already running - we'll track its PID separately + // It will be handled along with other children + + if (rank0_stdout_forwarder.joinable()) + rank0_stdout_forwarder.detach(); + if (rank0_stderr_forwarder.joinable()) + rank0_stderr_forwarder.detach(); + + } else { + // Non-root parent: Get address from root via PMIx + coordinated_root_address = + coordinate_root_address_via_pmix(false, "", cfg.verbose); + } + + // Now all parents have the coordinated_root_address + // Continue to fork-based launch below with this address + unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); +#else + // Fallback to FILE backend if PMIx not available if (cfg.verbose) { std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank << " launching " << cfg.nranks << " ranks per task" << std::endl; + std::cout + << "[rrun] WARNING: PMIx not available, falling back to FILE backend" + << std::endl; } // Generate coordination directory if not specified - // In hybrid mode, we need FILE backend since PMIx doesn't know about children if (cfg.coord_dir.empty()) { - // Use Slurm job ID for coordination directory to ensure uniqueness char const* job_id = std::getenv("SLURM_JOB_ID"); if (job_id) { cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; @@ -987,26 +1275,34 @@ int main(int argc, char* argv[]) { } std::filesystem::create_directories(cfg.coord_dir); - - // Continue to fork-based launch below (like single-node mode) - // cfg.nranks already contains ranks per task, will be adjusted below + coordinated_root_address = ""; // Empty means use FILE backend 
+#endif } // ===================================================================== // Fork-based Launch: Single-Node or Slurm Hybrid Mode // ===================================================================== - std::filesystem::create_directories(cfg.coord_dir); + + // For non-Slurm mode or FILE backend fallback, create coord dir + if (!cfg.slurm_mode || coordinated_root_address.empty()) { + if (cfg.coord_dir.empty()) { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } + std::filesystem::create_directories(cfg.coord_dir); + } // Determine rank offset and total ranks int rank_offset = 0; int ranks_per_task = cfg.nranks; // Ranks to launch in this process int total_ranks = cfg.nranks; // Total ranks across all processes + bool is_root_parent = false; if (cfg.slurm_mode) { // Hybrid mode: multiple ranks per Slurm task int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; rank_offset = cfg.slurm_global_rank * ranks_per_task; total_ranks = slurm_ntasks * ranks_per_task; + is_root_parent = (cfg.slurm_global_rank == 0); if (cfg.verbose) { std::cout << "[rrun] Task " << cfg.slurm_global_rank @@ -1065,12 +1361,23 @@ int main(int argc, char* argv[]) { }; // Launch ranks (with offset for Slurm hybrid mode) - for (int local_rank = 0; local_rank < ranks_per_task; ++local_rank) { + // Note: Root parent already launched rank 0 in PMIx coordination phase + int start_local_rank = + (is_root_parent && !coordinated_root_address.empty()) ? 1 : 0; + + for (int local_rank = start_local_rank; local_rank < ranks_per_task; ++local_rank) + { int global_rank = rank_offset + local_rank; int fd_out = -1; int fd_err = -1; pid_t pid = launch_rank_local( - cfg, global_rank, local_rank, total_ranks, &fd_out, &fd_err + cfg, + global_rank, + local_rank, + total_ranks, + coordinated_root_address, + &fd_out, + &fd_err ); pids.push_back(pid); @@ -1143,11 +1450,24 @@ int main(int argc, char* argv[]) { std::cout << "\nAll ranks completed successfully." 
<< std::endl; } +#ifdef RAPIDSMPF_HAVE_SLURM + if (cfg.slurm_mode && !coordinated_root_address.empty()) { + if (cfg.verbose) { + std::cout << "[rrun] Finalizing PMIx in parent" << std::endl; + } + PMIx_Finalize(nullptr, 0); + } +#endif + return exit_status; } catch (std::exception const& e) { std::cerr << "Error: " << e.what() << std::endl; std::cerr << "Run with -h or --help for usage information." << std::endl; + +#ifdef RAPIDSMPF_HAVE_SLURM + PMIx_Finalize(nullptr, 0); +#endif return 1; } } From af4c3900a541c4a9c7d45b8f0e790367a5cf6004 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 14 Jan 2026 14:04:34 -0800 Subject: [PATCH 13/57] Fix subprocess coordination --- cpp/src/bootstrap/ucxx.cpp | 139 ++++++++++++++++++++++++++----------- cpp/tools/rrun.cpp | 99 +++++++++++++++++++------- 2 files changed, 173 insertions(+), 65 deletions(-) diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index bd928b338..97217bed7 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -11,9 +11,11 @@ #include #include #include +#include #include #include +#include // for unsetenv #include #include @@ -22,6 +24,36 @@ namespace rapidsmpf::bootstrap { +namespace { +// Hex encoding for binary-safe address transmission +std::string hex_encode(std::string const& input) { + static constexpr const char* hex_chars = "0123456789abcdef"; + std::string result; + result.reserve(input.size() * 2); + for (char ch : input) { + auto c = static_cast(ch); + result.push_back(hex_chars[c >> 4]); + result.push_back(hex_chars[c & 0x0F]); + } + return result; +} + +std::string hex_decode(std::string const& input) { + std::string result; + result.reserve(input.size() / 2); + for (size_t i = 0; i < input.size(); i += 2) { + auto high = static_cast( + (input[i] >= 'a') ? (input[i] - 'a' + 10) : (input[i] - '0') + ); + auto low = static_cast( + (input[i + 1] >= 'a') ? 
(input[i + 1] - 'a' + 10) : (input[i + 1] - '0') + ); + result.push_back(static_cast((high << 4) | low)); + } + return result; +} +} // namespace + std::shared_ptr create_ucxx_comm(Backend backend, config::Options options) { auto ctx = init(backend); @@ -31,10 +63,13 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op std::shared_ptr comm; // Check if root address was provided by parent process (rrun hybrid mode) - char const* precomputed_address = std::getenv("RAPIDSMPF_ROOT_ADDRESS"); + char const* precomputed_address_encoded = std::getenv("RAPIDSMPF_ROOT_ADDRESS"); - if (precomputed_address != nullptr) { + if (precomputed_address_encoded != nullptr) { // Parent process already coordinated the root address via PMIx + // Address is hex-encoded to avoid issues with binary data in env vars + std::string precomputed_address = hex_decode(precomputed_address_encoded); + // Children skip bootstrap coordination and use the provided address directly if (ctx.rank == 0) { // Root child creates listener @@ -53,6 +88,14 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op } } else { // Standard bootstrap coordination via put/get/barrier + + // Special case: If rank 0 is asked to write address file before full bootstrap, + // it means we're in rrun hybrid parent-mediated mode where rank 0 is launched + // first to get its address, then other ranks are launched later. + // In this case, skip the put/barrier/get dance and just create the listener. 
+ char const* address_file = std::getenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); + bool early_address_mode = (ctx.rank == 0 && address_file != nullptr); + if (ctx.rank == 0) { // Create root UCXX communicator auto ucxx_initialized_rank = @@ -60,56 +103,72 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op comm = std::make_shared(std::move(ucxx_initialized_rank), options); - // Get the listener address and publish + // Get the listener address auto listener_address = comm->listener_address(); auto root_worker_address_str = std::get>(listener_address.address) ->getString(); - put(ctx, "ucxx_root_address", root_worker_address_str); - } - - // All ranks must barrier to make PMIx put() data visible. - // For file backend this is a no-op synchronization. - // For PMIx/Slurm backend this executes PMIx_Fence to exchange data. - barrier(ctx); - - if (ctx.rank != 0) { - // Worker ranks retrieve the root address and connect - auto root_worker_address_str = - get(ctx, "ucxx_root_address", std::chrono::seconds{30}); - auto root_worker_address = - ::ucxx::createAddressFromString(root_worker_address_str); - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, root_worker_address, options); - comm = - std::make_shared(std::move(ucxx_initialized_rank), options); + if (early_address_mode) { + // Write address file immediately and skip bootstrap coordination + // Parent will coordinate with other parents via PMIx + // Encode as hex to avoid issues with binary data + std::string encoded_address = hex_encode(root_worker_address_str); + std::ofstream addr_file(address_file); + if (!addr_file) { + throw std::runtime_error( + "Failed to write root address to file: " + + std::string{address_file} + ); + } + addr_file << encoded_address << std::endl; + addr_file.close(); + + char const* verbose = std::getenv("RAPIDSMPF_VERBOSE"); + if (verbose && std::string{verbose} == "1") { + std::cerr << "[rank 0] Wrote address to " << address_file + << ", skipping bootstrap 
coordination" << std::endl; + } + + // Unset the flag so rank 0 won't skip the final barrier + // (we need all ranks to synchronize at the end) + unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); + + // Skip put/barrier - other ranks will get address via + // RAPIDSMPF_ROOT_ADDRESS Return early, don't do full bootstrap + } else { + // Normal mode: publish address for other ranks + put(ctx, "ucxx_root_address", root_worker_address_str); + } } - } - - comm->barrier(); - // If root rank and address file path is specified, write the address - // This is used for parent-mediated coordination in rrun hybrid mode - if (ctx.rank == 0) { - char const* address_file = std::getenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); - if (address_file != nullptr) { - auto listener_address = comm->listener_address(); - auto root_address_str = - std::get>(listener_address.address) - ->getString(); - - std::ofstream addr_file(address_file); - if (!addr_file) { - throw std::runtime_error( - "Failed to write root address to file: " + std::string{address_file} + if (!early_address_mode) { + // All ranks must barrier to make PMIx put() data visible. + // For file backend this is a no-op synchronization. + // For PMIx/Slurm backend this executes PMIx_Fence to exchange data. 
+ barrier(ctx); + + if (ctx.rank != 0) { + // Worker ranks retrieve the root address and connect + auto root_worker_address_str = + get(ctx, "ucxx_root_address", std::chrono::seconds{30}); + auto root_worker_address = + ::ucxx::createAddressFromString(root_worker_address_str); + + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, root_worker_address, options); + comm = std::make_shared( + std::move(ucxx_initialized_rank), options ); } - addr_file << root_address_str << std::endl; - addr_file.close(); } } + // Final barrier to synchronize all ranks before returning + // Note: rank 0 in early address mode unsets RAPIDSMPF_ROOT_ADDRESS_FILE + // after writing the file, so it participates in this barrier + comm->barrier(); + return comm; } } // namespace rapidsmpf::bootstrap diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index f95db56e2..49efb7331 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -50,6 +50,36 @@ #include #endif +// Hex encoding for binary-safe address transmission +namespace { +std::string hex_encode(std::string const& input) { + static constexpr const char* hex_chars = "0123456789abcdef"; + std::string result; + result.reserve(input.size() * 2); + for (char ch : input) { + auto c = static_cast(ch); + result.push_back(hex_chars[c >> 4]); + result.push_back(hex_chars[c & 0x0F]); + } + return result; +} + +std::string hex_decode(std::string const& input) { + std::string result; + result.reserve(input.size() / 2); + for (size_t i = 0; i < input.size(); i += 2) { + auto high = static_cast( + (input[i] >= 'a') ? (input[i] - 'a' + 10) : (input[i] - '0') + ); + auto low = static_cast( + (input[i + 1] >= 'a') ? (input[i + 1] - 'a' + 10) : (input[i + 1] - '0') + ); + result.push_back(static_cast((high << 4) | low)); + } + return result; +} +} // namespace + // NOTE: Do not use RAPIDSMPF_EXPECTS or RAPIDSMPF_FAIL in this file. // Using these macros introduces a CUDA dependency via rapidsmpf/error.hpp. 
// Prefer throwing standard exceptions instead. @@ -778,17 +808,20 @@ std::string coordinate_root_address_via_pmix( std::string root_address; if (is_root) { - // Root parent publishes the address + // Root parent publishes the address (hex-encoded for binary safety) + std::string encoded_address = hex_encode(root_address_to_publish); + if (verbose) { - std::cout << "[rrun] Publishing root address via PMIx: " - << root_address_to_publish << std::endl; + std::cout << "[rrun] Publishing root address via PMIx (hex-encoded, " + << root_address_to_publish.size() << " bytes -> " + << encoded_address.size() << " chars)" << std::endl; } // Use PMIx_Put with GLOBAL scope pmix_value_t value; PMIX_VALUE_CONSTRUCT(&value); value.type = PMIX_STRING; - value.data.string = strdup(root_address_to_publish.c_str()); + value.data.string = strdup(encoded_address.c_str()); rc = PMIx_Put(PMIX_GLOBAL, "rapidsmpf_root_address", &value); PMIX_VALUE_DESTRUCT(&value); @@ -857,12 +890,15 @@ std::string coordinate_root_address_via_pmix( throw std::runtime_error("PMIx_Get returned non-string value"); } - root_address = value->data.string; + std::string encoded_address = value->data.string; PMIX_VALUE_RELEASE(value); + root_address = hex_decode(encoded_address); + if (verbose) { - std::cout << "[rrun] Retrieved root address via PMIx: " << root_address - << std::endl; + std::cout << "[rrun] Retrieved root address via PMIx (hex-encoded, " + << encoded_address.size() << " chars -> " << root_address.size() + << " bytes)" << std::endl; } } @@ -919,13 +955,17 @@ pid_t launch_rank_local( setenv("RAPIDSMPF_RANK", std::to_string(captured_global_rank).c_str(), 1); setenv("RAPIDSMPF_NRANKS", std::to_string(captured_total_ranks).c_str(), 1); - // If root address was pre-coordinated by parent, set it + // Always set coord_dir for bootstrap initialization + // (needed even if using RAPIDSMPF_ROOT_ADDRESS for coordination) + if (!cfg.coord_dir.empty()) { + setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 
1); + } + + // If root address was pre-coordinated by parent, set it (hex-encoded) // This allows children to skip bootstrap coordination entirely if (!captured_root_address.empty()) { - setenv("RAPIDSMPF_ROOT_ADDRESS", captured_root_address.c_str(), 1); - } else { - // Use FILE backend coordination - setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 1); + std::string encoded_address = hex_encode(captured_root_address); + setenv("RAPIDSMPF_ROOT_ADDRESS", encoded_address.c_str(), 1); } // In Slurm hybrid mode, unset Slurm/PMIx rank variables to avoid confusion @@ -1129,6 +1169,17 @@ int main(int argc, char* argv[]) { << std::endl; } + // Set up coordination directory (needed by all tasks for child bootstrap) + char const* job_id = std::getenv("SLURM_JOB_ID"); + if (cfg.coord_dir.empty()) { + if (job_id) { + cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; + } else { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } + } + std::filesystem::create_directories(cfg.coord_dir); + // Root parent needs to launch rank 0 first to get address bool is_root_parent = (cfg.slurm_global_rank == 0); @@ -1140,15 +1191,9 @@ int main(int argc, char* argv[]) { << std::endl; } - // We need to extract the root address from rank 0's output - // This is a bit tricky - we'll launch it with a special mode - // For now, we'll rely on the fact that the root address will be - // printed or we can read it from a pipe - - // Actually, let's use a temporary file for the root to write its address - // This is cleaner than parsing stdout + // Set up address file for rank 0 to write to std::string address_file = "/tmp/rapidsmpf_root_address_" - + std::string{std::getenv("SLURM_JOB_ID")}; + + std::string{job_id ? 
job_id : "unknown"}; setenv("RAPIDSMPF_ROOT_ADDRESS_FILE", address_file.c_str(), 1); // Launch rank 0 @@ -1221,14 +1266,19 @@ int main(int argc, char* argv[]) { } } - // Read the address + // Read the hex-encoded address, decode and remove file + std::string encoded_address; std::ifstream addr_stream(address_file); - std::getline(addr_stream, coordinated_root_address); + std::getline(addr_stream, encoded_address); addr_stream.close(); + coordinated_root_address = hex_decode(encoded_address); + std::filesystem::remove(address_file); if (cfg.verbose) { - std::cout << "[rrun] Got root address from rank 0: " - << coordinated_root_address << std::endl; + std::cout << "[rrun] Got root address from rank 0 (hex-encoded, " + << encoded_address.size() << " chars -> " + << coordinated_root_address.size() << " bytes)" + << std::endl; } // Coordinate with other parents via PMIx @@ -1238,7 +1288,6 @@ int main(int argc, char* argv[]) { // Rank 0 is already running - we'll track its PID separately // It will be handled along with other children - if (rank0_stdout_forwarder.joinable()) rank0_stdout_forwarder.detach(); if (rank0_stderr_forwarder.joinable()) From 5417c515c6b057344d4cb845fb86a3acb9e1516c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 15 Jan 2026 03:40:30 -0800 Subject: [PATCH 14/57] Move rank launching into new functions --- cpp/tools/rrun.cpp | 502 +++++++++++++++++++++++++-------------------- 1 file changed, 278 insertions(+), 224 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 49efb7331..ed872bede 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -775,6 +775,111 @@ pid_t fork_with_piped_stdio( } #ifdef RAPIDSMPF_HAVE_SLURM +/** + * @brief Launch rank 0 first to obtain its UCXX root address + * + * @param cfg Configuration + * @param address_file Path to file where rank 0 will write its address + * @param total_ranks Total number of ranks across all tasks + * @return Hex-encoded root address + * @throws 
std::runtime_error on timeout or launch failure + */ +std::string launch_rank0_and_get_address( + Config const& cfg, std::string const& address_file, int total_ranks +) { + if (cfg.verbose) { + std::cout << "[rrun] Root parent: launching rank 0 first to get address" + << std::endl; + } + + setenv("RAPIDSMPF_ROOT_ADDRESS_FILE", address_file.c_str(), 1); + + int fd_out = -1, fd_err = -1; + pid_t rank0_pid = launch_rank_local(cfg, 0, 0, total_ranks, "", &fd_out, &fd_err); + + // Start forwarders for rank 0 output + std::thread rank0_stdout_forwarder; + std::thread rank0_stderr_forwarder; + auto suppress = std::make_shared>(false); + + if (fd_out >= 0) { + rank0_stdout_forwarder = std::thread([fd_out, suppress]() { + FILE* stream = fdopen(fd_out, "r"); + if (!stream) { + close(fd_out); + return; + } + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), stream) != nullptr) { + if (suppress->load()) + continue; + std::lock_guard lock(output_mutex); + fputs(buffer, stdout); + fflush(stdout); + } + fclose(stream); + }); + } + + if (fd_err >= 0) { + rank0_stderr_forwarder = std::thread([fd_err, suppress]() { + FILE* stream = fdopen(fd_err, "r"); + if (!stream) { + close(fd_err); + return; + } + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), stream) != nullptr) { + if (suppress->load()) + continue; + std::lock_guard lock(output_mutex); + fputs(buffer, stderr); + fflush(stderr); + } + fclose(stream); + }); + } + + // Wait for rank 0 to write the address file (with timeout) + auto start = std::chrono::steady_clock::now(); + while (!std::filesystem::exists(address_file)) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + auto elapsed = std::chrono::steady_clock::now() - start; + if (elapsed > std::chrono::seconds(30)) { + suppress->store(true); + kill(rank0_pid, SIGKILL); + waitpid(rank0_pid, nullptr, 0); + if (rank0_stdout_forwarder.joinable()) + rank0_stdout_forwarder.join(); + if (rank0_stderr_forwarder.joinable()) + 
rank0_stderr_forwarder.join(); + throw std::runtime_error("Timeout waiting for rank 0 to write root address"); + } + } + + // Read the hex-encoded address, decode and remove file + std::string encoded_address; + std::ifstream addr_stream(address_file); + std::getline(addr_stream, encoded_address); + addr_stream.close(); + std::string root_address = hex_decode(encoded_address); + std::filesystem::remove(address_file); + + if (cfg.verbose) { + std::cout << "[rrun] Got root address from rank 0 (hex-encoded, " + << encoded_address.size() << " chars -> " << root_address.size() + << " bytes)" << std::endl; + } + + // Rank 0 is already running - detach forwarders + if (rank0_stdout_forwarder.joinable()) + rank0_stdout_forwarder.detach(); + if (rank0_stderr_forwarder.joinable()) + rank0_stderr_forwarder.detach(); + + return root_address; +} + /** * @brief Coordinate root address between parent processes using PMIx * @@ -910,6 +1015,161 @@ std::string coordinate_root_address_via_pmix( } #endif // RAPIDSMPF_HAVE_SLURM +/** + * @brief Launch multiple ranks locally using fork + * + * @param cfg Configuration + * @param rank_offset Starting global rank for this task + * @param ranks_per_task Number of ranks to launch + * @param total_ranks Total ranks across all tasks + * @param root_address Pre-coordinated root address (empty for FILE backend) + * @param is_root_parent Whether this is root parent (affects which ranks to launch) + * @return Exit status (0 for success) + */ +int launch_ranks_fork_based( + Config const& cfg, + int rank_offset, + int ranks_per_task, + int total_ranks, + std::string const& root_address, + bool is_root_parent +) { + std::vector pids; + pids.reserve(static_cast(ranks_per_task)); + + // Block SIGINT/SIGTERM in this thread; a dedicated thread will handle them. 
+ sigset_t signal_set; + sigemptyset(&signal_set); + sigaddset(&signal_set, SIGINT); + sigaddset(&signal_set, SIGTERM); + sigprocmask(SIG_BLOCK, &signal_set, nullptr); + + // Output suppression flag and forwarder threads + auto suppress_output = std::make_shared>(false); + std::vector forwarders; + forwarders.reserve(static_cast(ranks_per_task) * 2); + + // Helper to start a forwarder thread for a given fd + auto start_forwarder = [&](int fd, int rank, bool to_stderr) { + if (fd < 0) { + return; + } + forwarders.emplace_back([fd, rank, to_stderr, &cfg, suppress_output]() { + FILE* stream = fdopen(fd, "r"); + if (!stream) { + close(fd); + return; + } + std::string tag = + cfg.tag_output ? ("[" + std::to_string(rank) + "] ") : std::string{}; + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), stream) != nullptr) { + if (suppress_output->load(std::memory_order_relaxed)) { + continue; + } + FILE* out = to_stderr ? stderr : stdout; + { + std::lock_guard lock(output_mutex); + if (!tag.empty()) { + fputs(tag.c_str(), out); + } + fputs(buffer, out); + fflush(out); + } + } + fclose(stream); + }); + }; + + // Launch ranks (skip rank 0 if root parent already launched it) + int start_local_rank = (is_root_parent && !root_address.empty()) ? 
1 : 0; + + for (int local_rank = start_local_rank; local_rank < ranks_per_task; ++local_rank) { + int global_rank = rank_offset + local_rank; + int fd_out = -1; + int fd_err = -1; + pid_t pid = launch_rank_local( + cfg, global_rank, local_rank, total_ranks, root_address, &fd_out, &fd_err + ); + pids.push_back(pid); + + if (cfg.verbose) { + std::ostringstream msg; + msg << "Launched rank " << global_rank << " (PID " << pid << ")"; + if (!cfg.gpus.empty()) { + msg << " on GPU " + << cfg.gpus[static_cast(local_rank) % cfg.gpus.size()]; + } + msg << std::endl; + std::string msg_str = msg.str(); + + std::cout << msg_str; + std::cout.flush(); + } + start_forwarder(fd_out, global_rank, false); + start_forwarder(fd_err, global_rank, true); + } + + // Start a signal-waiting thread to forward signals. + std::thread([signal_set, &pids, suppress_output]() mutable { + for (;;) { + int sig = 0; + int rc = sigwait(&signal_set, &sig); + if (rc != 0) { + continue; + } + suppress_output->store(true, std::memory_order_relaxed); + for (pid_t pid : pids) { + kill(pid, sig); + } + return; + } + }).detach(); + + std::cout << "\nAll ranks launched. Waiting for completion...\n" << std::endl; + + // Wait for all processes + int exit_status = 0; + for (size_t i = 0; i < pids.size(); ++i) { + int status = 0; + pid_t pid = pids[i]; + if (waitpid(pid, &status, 0) < 0) { + std::cerr << "Failed to wait for rank " << i << " (PID " << pid + << "): " << std::strerror(errno) << std::endl; + exit_status = 1; + continue; + } + + if (WIFEXITED(status)) { + int code = WEXITSTATUS(status); + if (code != 0) { + std::cerr << "Rank " + << (rank_offset + + (is_root_parent && !root_address.empty() ? i + 1 : i)) + << " (PID " << pid << ") exited with code " << code + << std::endl; + exit_status = code; + } + } else if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + std::cerr << "Rank " + << (rank_offset + + (is_root_parent && !root_address.empty() ? 
i + 1 : i)) + << " (PID " << pid << ") terminated by signal " << sig << std::endl; + exit_status = 128 + sig; + } + } + + // Wait for forwarder threads to finish + for (auto& t : forwarders) { + if (t.joinable()) { + t.join(); + } + } + + return exit_status; +} + /** * @brief Launch a single rank locally (fork-based). * @@ -1183,116 +1443,19 @@ int main(int argc, char* argv[]) { // Root parent needs to launch rank 0 first to get address bool is_root_parent = (cfg.slurm_global_rank == 0); - if (is_root_parent) { - // Root parent: Launch ONLY rank 0 first to get UCXX address - if (cfg.verbose) { - std::cout - << "[rrun] Root parent: launching rank 0 first to get address" - << std::endl; - } + // Coordinate root address via PMIx + int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; + int total_ranks = slurm_ntasks * cfg.nranks; - // Set up address file for rank 0 to write to + if (is_root_parent) { + // Root parent: Launch rank 0, get address, coordinate via PMIx std::string address_file = "/tmp/rapidsmpf_root_address_" + std::string{job_id ? job_id : "unknown"}; - setenv("RAPIDSMPF_ROOT_ADDRESS_FILE", address_file.c_str(), 1); - - // Launch rank 0 - int fd_out = -1, fd_err = -1; - int slurm_ntasks = cfg.slurm_ntasks > 0 ? 
cfg.slurm_ntasks : 1; - int total_ranks = slurm_ntasks * cfg.nranks; - - pid_t rank0_pid = - launch_rank_local(cfg, 0, 0, total_ranks, "", &fd_out, &fd_err); - - // Start forwarders for rank 0 output - std::thread rank0_stdout_forwarder; - std::thread rank0_stderr_forwarder; - auto suppress = std::make_shared>(false); - - if (fd_out >= 0) { - rank0_stdout_forwarder = std::thread([fd_out, suppress]() { - FILE* stream = fdopen(fd_out, "r"); - if (!stream) { - close(fd_out); - return; - } - char buffer[4096]; - while (fgets(buffer, sizeof(buffer), stream) != nullptr) { - if (suppress->load()) - continue; - std::lock_guard lock(output_mutex); - fputs(buffer, stdout); - fflush(stdout); - } - fclose(stream); - }); - } - - if (fd_err >= 0) { - rank0_stderr_forwarder = std::thread([fd_err, suppress]() { - FILE* stream = fdopen(fd_err, "r"); - if (!stream) { - close(fd_err); - return; - } - char buffer[4096]; - while (fgets(buffer, sizeof(buffer), stream) != nullptr) { - if (suppress->load()) - continue; - std::lock_guard lock(output_mutex); - fputs(buffer, stderr); - fflush(stderr); - } - fclose(stream); - }); - } - - // Wait for rank 0 to write the address file (with timeout) - auto start = std::chrono::steady_clock::now(); - while (!std::filesystem::exists(address_file)) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - auto elapsed = std::chrono::steady_clock::now() - start; - if (elapsed > std::chrono::seconds(30)) { - suppress->store(true); - kill(rank0_pid, SIGKILL); - waitpid(rank0_pid, nullptr, 0); - if (rank0_stdout_forwarder.joinable()) - rank0_stdout_forwarder.join(); - if (rank0_stderr_forwarder.joinable()) - rank0_stderr_forwarder.join(); - throw std::runtime_error( - "Timeout waiting for rank 0 to write root address" - ); - } - } - - // Read the hex-encoded address, decode and remove file - std::string encoded_address; - std::ifstream addr_stream(address_file); - std::getline(addr_stream, encoded_address); - addr_stream.close(); - 
coordinated_root_address = hex_decode(encoded_address); - std::filesystem::remove(address_file); - - if (cfg.verbose) { - std::cout << "[rrun] Got root address from rank 0 (hex-encoded, " - << encoded_address.size() << " chars -> " - << coordinated_root_address.size() << " bytes)" - << std::endl; - } - - // Coordinate with other parents via PMIx + coordinated_root_address = + launch_rank0_and_get_address(cfg, address_file, total_ranks); coordinated_root_address = coordinate_root_address_via_pmix( true, coordinated_root_address, cfg.verbose ); - - // Rank 0 is already running - we'll track its PID separately - // It will be handled along with other children - if (rank0_stdout_forwarder.joinable()) - rank0_stdout_forwarder.detach(); - if (rank0_stderr_forwarder.joinable()) - rank0_stderr_forwarder.detach(); - } else { // Non-root parent: Get address from root via PMIx coordinated_root_address = @@ -1342,12 +1505,11 @@ int main(int argc, char* argv[]) { // Determine rank offset and total ranks int rank_offset = 0; - int ranks_per_task = cfg.nranks; // Ranks to launch in this process - int total_ranks = cfg.nranks; // Total ranks across all processes + int ranks_per_task = cfg.nranks; + int total_ranks = cfg.nranks; bool is_root_parent = false; if (cfg.slurm_mode) { - // Hybrid mode: multiple ranks per Slurm task int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; rank_offset = cfg.slurm_global_rank * ranks_per_task; total_ranks = slurm_ntasks * ranks_per_task; @@ -1361,123 +1523,15 @@ int main(int argc, char* argv[]) { } } - std::vector pids; - pids.reserve(static_cast(ranks_per_task)); - - // Block SIGINT/SIGTERM in this thread; a dedicated thread will handle them. 
- sigset_t signal_set; - sigemptyset(&signal_set); - sigaddset(&signal_set, SIGINT); - sigaddset(&signal_set, SIGTERM); - sigprocmask(SIG_BLOCK, &signal_set, nullptr); - - // Output suppression flag and forwarder threads - auto suppress_output = std::make_shared>(false); - std::vector forwarders; - forwarders.reserve(static_cast(ranks_per_task) * 2); - - // Helper to start a forwarder thread for a given fd - auto start_forwarder = [&](int fd, int rank, bool to_stderr) { - if (fd < 0) { - return; - } - forwarders.emplace_back([fd, rank, to_stderr, &cfg, suppress_output]() { - FILE* stream = fdopen(fd, "r"); - if (!stream) { - close(fd); - return; - } - std::string tag = - cfg.tag_output ? ("[" + std::to_string(rank) + "] ") : std::string{}; - char buffer[4096]; - while (fgets(buffer, sizeof(buffer), stream) != nullptr) { - if (suppress_output->load(std::memory_order_relaxed)) { - // Discard further lines after suppression - continue; - } - FILE* out = to_stderr ? stderr : stdout; - { - std::lock_guard lock(output_mutex); - if (!tag.empty()) { - fputs(tag.c_str(), out); - } - fputs(buffer, out); - fflush(out); - } - } - fclose(stream); - }); - }; - - // Launch ranks (with offset for Slurm hybrid mode) - // Note: Root parent already launched rank 0 in PMIx coordination phase - int start_local_rank = - (is_root_parent && !coordinated_root_address.empty()) ? 
1 : 0; - - for (int local_rank = start_local_rank; local_rank < ranks_per_task; ++local_rank) - { - int global_rank = rank_offset + local_rank; - int fd_out = -1; - int fd_err = -1; - pid_t pid = launch_rank_local( - cfg, - global_rank, - local_rank, - total_ranks, - coordinated_root_address, - &fd_out, - &fd_err - ); - pids.push_back(pid); - - if (cfg.verbose) { - std::ostringstream msg; - msg << "Launched rank " << global_rank << " (PID " << pid << ")"; - if (!cfg.gpus.empty()) { - msg << " on GPU " - << cfg.gpus[static_cast(local_rank) % cfg.gpus.size()]; - } - msg << std::endl; - std::string msg_str = msg.str(); - - std::cout << msg_str; - std::cout.flush(); - } - // Parent-side forwarders for local stdout and stderr - start_forwarder(fd_out, global_rank, false); - start_forwarder(fd_err, global_rank, true); - } - - // Start a signal-waiting thread to forward signals. - std::thread([signal_set, &pids, suppress_output]() mutable { - for (;;) { - int sig = 0; - int rc = sigwait(&signal_set, &sig); - if (rc != 0) { - return; - } - // Stop printing further output immediately - suppress_output->store(true, std::memory_order_relaxed); - // Forward signal to all local children - for (pid_t pid : pids) { - std::ignore = kill(pid, sig); - } - } - }).detach(); - - if (cfg.verbose) { - std::cout << "\nAll ranks launched. 
Waiting for completion...\n" << std::endl; - } - - // Wait for all ranks to complete - int exit_status = wait_for_ranks(pids); - - // Join forwarders before cleanup - for (auto& th : forwarders) { - if (th.joinable()) { - th.join(); - } - } + // Launch ranks and wait for completion + int exit_status = launch_ranks_fork_based( + cfg, + rank_offset, + ranks_per_task, + total_ranks, + coordinated_root_address, + is_root_parent + ); if (cfg.cleanup) { if (cfg.verbose) { From ceda7a8029ee5bae87428b5b2dc4f2b8750a15bd Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 15 Jan 2026 07:04:22 -0800 Subject: [PATCH 15/57] Use different execute functions for each launch condition --- cpp/tools/rrun.cpp | 420 ++++++++++++++++++++++++--------------------- 1 file changed, 223 insertions(+), 197 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index ed872bede..25a9a167a 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -775,6 +775,207 @@ pid_t fork_with_piped_stdio( } #ifdef RAPIDSMPF_HAVE_SLURM +/** + * @brief Execute application in Slurm passthrough mode (single rank per task). + * + * Applies topology bindings and executes the application directly without forking. + * + * @param cfg Configuration. + * @return Exit status. Does not return on success, only on error. 
+ */ +int execute_slurm_passthrough_mode(Config const& cfg) { + if (cfg.verbose) { + std::cout << "[rrun] Slurm passthrough mode: applying bindings and exec'ing" + << std::endl; + } + + // Set custom environment variables + for (auto const& env_pair : cfg.env_vars) { + setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); + } + + // Determine GPU for this Slurm task + int gpu_id = -1; + if (!cfg.gpus.empty()) { + gpu_id = cfg.gpus[static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; + setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); + + if (cfg.verbose) { + std::cerr << "[rrun] Slurm task (passthrough) local_id=" << cfg.slurm_local_id + << " assigned to GPU " << gpu_id << std::endl; + } + } + + apply_topology_bindings(cfg, gpu_id, cfg.verbose); + + // Prepare arguments for execvp + std::vector exec_args; + exec_args.push_back(const_cast(cfg.app_binary.c_str())); + for (auto const& arg : cfg.app_args) { + exec_args.push_back(const_cast(arg.c_str())); + } + exec_args.push_back(nullptr); + + // Exec the application (this replaces the current process) + execvp(cfg.app_binary.c_str(), exec_args.data()); + + // If we get here, execvp failed + std::cerr << "Failed to execute " << cfg.app_binary << ": " << std::strerror(errno) + << std::endl; + return 1; +} + +/** + * @brief Execute application in Slurm hybrid mode with PMIx coordination. + * + * Root parent launches rank 0 first to get address, coordinates via PMIx, then parents + * on all nodes launch their remaining ranks. Uses fork-based execution. + * + * @param cfg Configuration. 
+ * @return Exit status (0 for success) + */ +int execute_slurm_hybrid_mode(Config& cfg) { + if (cfg.verbose) { + std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank + << " launching " << cfg.nranks << " ranks per task" << std::endl; + std::cout << "[rrun] Using PMIx for parent coordination (no file I/O)" + << std::endl; + } + + // Set up coordination directory (needed by all tasks for child bootstrap) + char const* job_id = std::getenv("SLURM_JOB_ID"); + if (cfg.coord_dir.empty()) { + if (job_id) { + cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; + } else { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } + } + std::filesystem::create_directories(cfg.coord_dir); + + // Root parent needs to launch rank 0 first to get address + bool is_root_parent = (cfg.slurm_global_rank == 0); + + // Coordinate root address with other nodes via PMIx + int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; + int total_ranks = slurm_ntasks * cfg.nranks; + std::string coordinated_root_address; + + if (is_root_parent) { + // Root parent: Launch rank 0, get address, coordinate via PMIx + std::string address_file = + "/tmp/rapidsmpf_root_address_" + std::string{job_id ? 
job_id : "unknown"}; + coordinated_root_address = + launch_rank0_and_get_address(cfg, address_file, total_ranks); + coordinated_root_address = + coordinate_root_address_via_pmix(true, coordinated_root_address, cfg.verbose); + } else { + // Non-root parent: Get address from root via PMIx + coordinated_root_address = + coordinate_root_address_via_pmix(false, "", cfg.verbose); + } + + // Now all parents have the coordinated_root_address + // Continue to fork-based launch below with this address + unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); + + // Calculate rank offsets + int rank_offset = cfg.slurm_global_rank * cfg.nranks; + + if (cfg.verbose) { + std::cout << "[rrun] Task " << cfg.slurm_global_rank << " launching ranks " + << rank_offset << "-" << (rank_offset + cfg.nranks - 1) + << " (total: " << total_ranks << " ranks)" << std::endl; + } + + // Launch ranks and wait for completion + int exit_status = launch_ranks_fork_based( + cfg, + rank_offset, + cfg.nranks, + total_ranks, + coordinated_root_address, + is_root_parent + ); + + // Cleanup + if (cfg.cleanup) { + if (cfg.verbose) { + std::cout << "Cleaning up coordination directory: " << cfg.coord_dir + << std::endl; + } + std::error_code ec; + std::filesystem::remove_all(cfg.coord_dir, ec); + if (ec) { + std::cerr << "Warning: Failed to cleanup directory: " << cfg.coord_dir << ": " + << ec.message() << std::endl; + } + } else if (cfg.verbose) { + std::cout << "Coordination directory preserved: " << cfg.coord_dir << std::endl; + } + + if (cfg.verbose && exit_status == 0) { + std::cout << "\nAll ranks completed successfully." 
<< std::endl; + } + + // Finalize PMIx + if (!coordinated_root_address.empty()) { + if (cfg.verbose) { + std::cout << "[rrun] Finalizing PMIx in parent" << std::endl; + } + PMIx_Finalize(nullptr, 0); + } + + return exit_status; +} +#endif // RAPIDSMPF_HAVE_SLURM + +/** + * @brief Execute application in single-node mode with FILE backend + * + * Uses fork-based execution with file-based coordination. + * + * @param cfg Configuration + * @return Exit status (0 for success) + */ +int execute_single_node_mode(Config& cfg) { + if (cfg.verbose) { + std::cout << "[rrun] Single-node mode: launching " << cfg.nranks << " ranks" + << std::endl; + } + + // Set up coordination directory + if (cfg.coord_dir.empty()) { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } + std::filesystem::create_directories(cfg.coord_dir); + + // Launch ranks and wait for completion + int exit_status = launch_ranks_fork_based(cfg, 0, cfg.nranks, cfg.nranks, "", false); + + // Cleanup + if (cfg.cleanup) { + if (cfg.verbose) { + std::cout << "Cleaning up coordination directory: " << cfg.coord_dir + << std::endl; + } + std::error_code ec; + std::filesystem::remove_all(cfg.coord_dir, ec); + if (ec) { + std::cerr << "Warning: Failed to cleanup directory: " << cfg.coord_dir << ": " + << ec.message() << std::endl; + } + } else if (cfg.verbose) { + std::cout << "Coordination directory preserved: " << cfg.coord_dir << std::endl; + } + + if (cfg.verbose && exit_status == 0) { + std::cout << "\nAll ranks completed successfully." << std::endl; + } + + return exit_status; +} + /** * @brief Launch rank 0 first to obtain its UCXX root address * @@ -1016,15 +1217,15 @@ std::string coordinate_root_address_via_pmix( #endif // RAPIDSMPF_HAVE_SLURM /** - * @brief Launch multiple ranks locally using fork + * @brief Launch multiple ranks locally using fork. 
* - * @param cfg Configuration - * @param rank_offset Starting global rank for this task - * @param ranks_per_task Number of ranks to launch - * @param total_ranks Total ranks across all tasks - * @param root_address Pre-coordinated root address (empty for FILE backend) - * @param is_root_parent Whether this is root parent (affects which ranks to launch) - * @return Exit status (0 for success) + * @param cfg Configuration. + * @param rank_offset Starting global rank for this task. + * @param ranks_per_task Number of ranks to launch. + * @param total_ranks Total ranks across all tasks. + * @param root_address Pre-coordinated root address (empty for FILE backend). + * @param is_root_parent Whether this is root parent (affects which ranks to launch). + * @return Exit status (0 for success). */ int launch_ranks_fork_based( Config const& cfg, @@ -1371,206 +1572,31 @@ int main(int argc, char* argv[]) { std::cout << std::endl; } - // Variable to hold pre-coordinated root address (for Slurm hybrid mode) - std::string coordinated_root_address; - - // ===================================================================== - // Slurm Mode: Two sub-modes based on whether -n was specified - // ===================================================================== if (cfg.slurm_mode) { if (cfg.nranks == 1) { - // ===== Passthrough Mode: Just apply bindings and exec ===== - // Set custom environment variables - for (auto const& env_pair : cfg.env_vars) { - setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); - } - - // Determine GPU for this Slurm task - int gpu_id = -1; - if (!cfg.gpus.empty()) { - gpu_id = - cfg.gpus - [static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; - setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); - - if (cfg.verbose) { - std::cerr << "[rrun] Slurm task (passthrough) local_id=" - << cfg.slurm_local_id << " assigned to GPU " << gpu_id - << std::endl; - } - } - - apply_topology_bindings(cfg, gpu_id, cfg.verbose); - - // Prepare 
arguments for execvp - std::vector exec_args; - exec_args.push_back(const_cast(cfg.app_binary.c_str())); - for (auto const& arg : cfg.app_args) { - exec_args.push_back(const_cast(arg.c_str())); - } - exec_args.push_back(nullptr); - - // Exec the application (this replaces the current process) - execvp(cfg.app_binary.c_str(), exec_args.data()); - - // If we get here, execvp failed - std::cerr << "Failed to execute " << cfg.app_binary << ": " - << std::strerror(errno) << std::endl; - return 1; - } - - // ===== Hybrid Mode: Parent-mediated coordination via PMIx ===== -#ifdef RAPIDSMPF_HAVE_SLURM - if (cfg.verbose) { - std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank - << " launching " << cfg.nranks << " ranks per task" - << std::endl; - std::cout << "[rrun] Using PMIx for parent coordination (no file I/O)" - << std::endl; - } - - // Set up coordination directory (needed by all tasks for child bootstrap) - char const* job_id = std::getenv("SLURM_JOB_ID"); - if (cfg.coord_dir.empty()) { - if (job_id) { - cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; - } else { - cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); - } - } - std::filesystem::create_directories(cfg.coord_dir); - - // Root parent needs to launch rank 0 first to get address - bool is_root_parent = (cfg.slurm_global_rank == 0); - - // Coordinate root address via PMIx - int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; - int total_ranks = slurm_ntasks * cfg.nranks; - - if (is_root_parent) { - // Root parent: Launch rank 0, get address, coordinate via PMIx - std::string address_file = "/tmp/rapidsmpf_root_address_" - + std::string{job_id ? 
job_id : "unknown"}; - coordinated_root_address = - launch_rank0_and_get_address(cfg, address_file, total_ranks); - coordinated_root_address = coordinate_root_address_via_pmix( - true, coordinated_root_address, cfg.verbose - ); + // Slurm passthrough mode: single rank per task, no forking + return execute_slurm_passthrough_mode(cfg); } else { - // Non-root parent: Get address from root via PMIx - coordinated_root_address = - coordinate_root_address_via_pmix(false, "", cfg.verbose); - } - - // Now all parents have the coordinated_root_address - // Continue to fork-based launch below with this address - unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); + // Slurm hybrid mode: multiple ranks per task with PMIx coordination +#ifdef RAPIDSMPF_HAVE_SLURM + return execute_slurm_hybrid_mode(cfg); #else - // Fallback to FILE backend if PMIx not available - if (cfg.verbose) { - std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank - << " launching " << cfg.nranks << " ranks per task" - << std::endl; - std::cout - << "[rrun] WARNING: PMIx not available, falling back to FILE backend" - << std::endl; - } - - // Generate coordination directory if not specified - if (cfg.coord_dir.empty()) { - char const* job_id = std::getenv("SLURM_JOB_ID"); - if (job_id) { - cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; - } else { - cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); - } - } - - std::filesystem::create_directories(cfg.coord_dir); - coordinated_root_address = ""; // Empty means use FILE backend + std::cerr << "Error: Slurm hybrid mode requires PMIx support but " + << "rapidsmpf was not built with PMIx." << std::endl; + std::cerr + << "Rebuild with -DBUILD_SLURM_SUPPORT=ON or use passthrough mode " + << "(without -n flag)." 
<< std::endl; + return 1; #endif - } - - // ===================================================================== - // Fork-based Launch: Single-Node or Slurm Hybrid Mode - // ===================================================================== - - // For non-Slurm mode or FILE backend fallback, create coord dir - if (!cfg.slurm_mode || coordinated_root_address.empty()) { - if (cfg.coord_dir.empty()) { - cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); - } - std::filesystem::create_directories(cfg.coord_dir); - } - - // Determine rank offset and total ranks - int rank_offset = 0; - int ranks_per_task = cfg.nranks; - int total_ranks = cfg.nranks; - bool is_root_parent = false; - - if (cfg.slurm_mode) { - int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; - rank_offset = cfg.slurm_global_rank * ranks_per_task; - total_ranks = slurm_ntasks * ranks_per_task; - is_root_parent = (cfg.slurm_global_rank == 0); - - if (cfg.verbose) { - std::cout << "[rrun] Task " << cfg.slurm_global_rank - << " launching ranks " << rank_offset << "-" - << (rank_offset + ranks_per_task - 1) - << " (total: " << total_ranks << " ranks)" << std::endl; - } - } - - // Launch ranks and wait for completion - int exit_status = launch_ranks_fork_based( - cfg, - rank_offset, - ranks_per_task, - total_ranks, - coordinated_root_address, - is_root_parent - ); - - if (cfg.cleanup) { - if (cfg.verbose) { - std::cout << "Cleaning up coordination directory: " << cfg.coord_dir - << std::endl; } - std::error_code ec; - std::filesystem::remove_all(cfg.coord_dir, ec); - if (ec) { - std::cerr << "Warning: Failed to cleanup directory: " << cfg.coord_dir - << ": " << ec.message() << std::endl; - } - } else if (cfg.verbose) { - std::cout << "Coordination directory preserved: " << cfg.coord_dir - << std::endl; - } - - if (cfg.verbose && exit_status == 0) { - std::cout << "\nAll ranks completed successfully." 
<< std::endl; - } - -#ifdef RAPIDSMPF_HAVE_SLURM - if (cfg.slurm_mode && !coordinated_root_address.empty()) { - if (cfg.verbose) { - std::cout << "[rrun] Finalizing PMIx in parent" << std::endl; - } - PMIx_Finalize(nullptr, 0); + } else { + // Single-node mode with FILE backend + return execute_single_node_mode(cfg); } -#endif - - return exit_status; } catch (std::exception const& e) { std::cerr << "Error: " << e.what() << std::endl; std::cerr << "Run with -h or --help for usage information." << std::endl; - -#ifdef RAPIDSMPF_HAVE_SLURM - PMIx_Finalize(nullptr, 0); -#endif return 1; } } From 0876fd66ebd6cf277c804bfc53137556b8e34575 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 15 Jan 2026 07:48:43 -0800 Subject: [PATCH 16/57] Make coordinate_root_address_via_pmix cleaner with std::optional --- cpp/tools/rrun.cpp | 103 ++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 57 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 25a9a167a..f77fe55ec 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -78,6 +78,34 @@ std::string hex_decode(std::string const& input) { } return result; } + +// Forward declarations of helper functions +struct Config; +#ifdef RAPIDSMPF_HAVE_SLURM +std::string launch_rank0_and_get_address( + Config const& cfg, std::string const& address_file, int total_ranks +); +std::string coordinate_root_address_via_pmix( + std::optional const& root_address_to_publish, bool verbose +); +#endif +int launch_ranks_fork_based( + Config const& cfg, + int rank_offset, + int ranks_per_task, + int total_ranks, + std::string const& root_address, + bool is_root_parent +); +pid_t launch_rank_local( + Config const& cfg, + int global_rank, + int local_rank, + int total_ranks, + std::string const& root_address, + int* out_fd_stdout, + int* out_fd_stderr +); } // namespace // NOTE: Do not use RAPIDSMPF_EXPECTS or RAPIDSMPF_FAIL in this file. 
@@ -868,11 +896,11 @@ int execute_slurm_hybrid_mode(Config& cfg) { coordinated_root_address = launch_rank0_and_get_address(cfg, address_file, total_ranks); coordinated_root_address = - coordinate_root_address_via_pmix(true, coordinated_root_address, cfg.verbose); + coordinate_root_address_via_pmix(coordinated_root_address, cfg.verbose); } else { // Non-root parent: Get address from root via PMIx coordinated_root_address = - coordinate_root_address_via_pmix(false, "", cfg.verbose); + coordinate_root_address_via_pmix(std::nullopt, cfg.verbose); } // Now all parents have the coordinated_root_address @@ -976,6 +1004,7 @@ int execute_single_node_mode(Config& cfg) { return exit_status; } +#ifdef RAPIDSMPF_HAVE_SLURM /** * @brief Launch rank 0 first to obtain its UCXX root address * @@ -1082,20 +1111,21 @@ std::string launch_rank0_and_get_address( } /** - * @brief Coordinate root address between parent processes using PMIx + * @brief Coordinate root address between parent processes using PMIx. * * This function is called by parent rrun processes in Slurm hybrid mode. * The root parent (PMIX_RANK=0) publishes the root address, and non-root * parents retrieve it. This avoids file-based coordination. * - * @param is_root Whether this is the root parent (PMIX_RANK=0) - * @param root_address_to_publish Address to publish (only used if is_root) - * @param verbose Whether to print debug messages - * @return Root address (either published or retrieved) - * @throws std::runtime_error on PMIx errors + * @param root_address_to_publish Root address to publish. If set (has_value()), this is + * the root parent and it will publish. If empty (nullopt), + * this is a non-root parent and it will retrieve. + * @param verbose Whether to print debug messages. + * @return Root address (either published or retrieved). + * @throws std::runtime_error on PMIx errors. 
*/ std::string coordinate_root_address_via_pmix( - bool is_root, std::string const& root_address_to_publish, bool verbose + std::optional const& root_address_to_publish, bool verbose ) { // Initialize PMIx for parent process pmix_proc_t proc; @@ -1113,13 +1143,13 @@ std::string coordinate_root_address_via_pmix( std::string root_address; - if (is_root) { + if (root_address_to_publish.has_value()) { // Root parent publishes the address (hex-encoded for binary safety) - std::string encoded_address = hex_encode(root_address_to_publish); + std::string encoded_address = hex_encode(root_address_to_publish.value()); if (verbose) { std::cout << "[rrun] Publishing root address via PMIx (hex-encoded, " - << root_address_to_publish.size() << " bytes -> " + << root_address_to_publish.value().size() << " bytes -> " << encoded_address.size() << " chars)" << std::endl; } @@ -1148,7 +1178,7 @@ std::string coordinate_root_address_via_pmix( ); } - root_address = root_address_to_publish; + root_address = root_address_to_publish.value(); } // Barrier with PMIX_COLLECT_DATA to ensure data exchange @@ -1173,7 +1203,7 @@ std::string coordinate_root_address_via_pmix( ); } - if (!is_root) { + if (!root_address_to_publish.has_value()) { // Non-root parents retrieve the address pmix_proc_t source_proc; PMIX_PROC_CONSTRUCT(&source_proc); @@ -1345,7 +1375,7 @@ int launch_ranks_fork_based( int code = WEXITSTATUS(status); if (code != 0) { std::cerr << "Rank " - << (rank_offset + << (static_cast(rank_offset) + (is_root_parent && !root_address.empty() ? i + 1 : i)) << " (PID " << pid << ") exited with code " << code << std::endl; @@ -1354,7 +1384,7 @@ int launch_ranks_fork_based( } else if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); std::cerr << "Rank " - << (rank_offset + << (static_cast(rank_offset) + (is_root_parent && !root_address.empty() ? 
i + 1 : i)) << " (PID " << pid << ") terminated by signal " << sig << std::endl; exit_status = 128 + sig; @@ -1465,47 +1495,6 @@ pid_t launch_rank_local( ); } -/** - * @brief Wait for all child processes and check their exit status. - */ -int wait_for_ranks(std::vector const& pids) { - int overall_status = 0; - - for (size_t i = 0; i < pids.size(); ++i) { - int status; - while (true) { - pid_t result = waitpid(pids[i], &status, 0); - - if (result < 0) { - if (errno == EINTR) { - // Retry waitpid for the same pid - continue; - } - std::cerr << "Error waiting for rank " << i << ": " - << std::strerror(errno) << std::endl; - overall_status = 1; - break; - } - - if (WIFEXITED(status)) { - int exit_code = WEXITSTATUS(status); - if (exit_code != 0) { - std::cerr << "Rank " << i << " (PID " << pids[i] - << ") exited with code " << exit_code << std::endl; - overall_status = exit_code; - } - } else if (WIFSIGNALED(status)) { - int signal = WTERMSIG(status); - std::cerr << "Rank " << i << " (PID " << pids[i] - << ") terminated by signal " << signal << std::endl; - overall_status = 128 + signal; - } - break; - } - } - - return overall_status; -} } // namespace int main(int argc, char* argv[]) { From 72d2399a08e2af425248899d07dcebd8aa42bd46 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 15 Jan 2026 14:00:42 -0800 Subject: [PATCH 17/57] Add [rrun] prefix to cout/cerr messages --- cpp/tools/rrun.cpp | 71 ++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index f77fe55ec..882a7f3da 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -189,7 +189,8 @@ std::vector detect_gpus() { FILE* pipe = popen("nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null", "r"); if (!pipe) { - std::cerr << "Warning: Could not detect GPUs using nvidia-smi" << std::endl; + std::cerr << "[rrun] Warning: Could not detect GPUs using nvidia-smi" + << std::endl; return {}; } 
@@ -436,7 +437,7 @@ void apply_topology_bindings(Config const& cfg, int gpu_id, bool verbose) { auto it = cfg.gpu_topology_map.find(gpu_id); if (it == cfg.gpu_topology_map.end()) { if (verbose) { - std::cerr << "Warning: No topology information for GPU " << gpu_id + std::cerr << "[rrun] Warning: No topology information for GPU " << gpu_id << std::endl; } return; @@ -447,8 +448,8 @@ void apply_topology_bindings(Config const& cfg, int gpu_id, bool verbose) { if (cfg.bind_cpu && !gpu_info.cpu_affinity_list.empty()) { if (!set_cpu_affinity(gpu_info.cpu_affinity_list)) { if (verbose) { - std::cerr << "Warning: Failed to set CPU affinity for GPU " << gpu_id - << std::endl; + std::cerr << "[rrun] Warning: Failed to set CPU affinity for GPU " + << gpu_id << std::endl; } } } @@ -457,7 +458,7 @@ void apply_topology_bindings(Config const& cfg, int gpu_id, bool verbose) { if (!set_numa_memory_binding(gpu_info.memory_binding)) { #if RAPIDSMPF_HAVE_NUMA if (verbose) { - std::cerr << "Warning: Failed to set NUMA memory binding for GPU " + std::cerr << "[rrun] Warning: Failed to set NUMA memory binding for GPU " << gpu_id << std::endl; } #endif @@ -674,9 +675,9 @@ Config parse_args(int argc, char* argv[]) { if (cfg.gpus.empty()) { cfg.gpus = detect_gpus(); if (cfg.gpus.empty()) { - std::cerr - << "Warning: No GPUs detected. CUDA_VISIBLE_DEVICES will not be set." - << std::endl; + std::cerr << "[rrun] Warning: No GPUs detected. CUDA_VISIBLE_DEVICES will " + "not be set." + << std::endl; } } @@ -684,7 +685,7 @@ Config parse_args(int argc, char* argv[]) { if (!cfg.slurm_mode && !cfg.gpus.empty() && cfg.nranks > static_cast(cfg.gpus.size())) { - std::cerr << "Warning: Number of ranks (" << cfg.nranks + std::cerr << "[rrun] Warning: Number of ranks (" << cfg.nranks << ") exceeds number of GPUs (" << cfg.gpus.size() << "). Multiple ranks will share GPUs." 
<< std::endl; } @@ -711,7 +712,7 @@ Config parse_args(int argc, char* argv[]) { } } else { if (cfg.verbose) { - std::cerr << "Warning: Failed to discover system topology. " + std::cerr << "[rrun] Warning: Failed to discover system topology. " << "CPU affinity, NUMA binding, and UCX network device " << "configuration will be skipped." << std::endl; } @@ -848,8 +849,8 @@ int execute_slurm_passthrough_mode(Config const& cfg) { execvp(cfg.app_binary.c_str(), exec_args.data()); // If we get here, execvp failed - std::cerr << "Failed to execute " << cfg.app_binary << ": " << std::strerror(errno) - << std::endl; + std::cerr << "[rrun] Failed to execute " << cfg.app_binary << ": " + << std::strerror(errno) << std::endl; return 1; } @@ -929,21 +930,22 @@ int execute_slurm_hybrid_mode(Config& cfg) { // Cleanup if (cfg.cleanup) { if (cfg.verbose) { - std::cout << "Cleaning up coordination directory: " << cfg.coord_dir + std::cout << "[rrun] Cleaning up coordination directory: " << cfg.coord_dir << std::endl; } std::error_code ec; std::filesystem::remove_all(cfg.coord_dir, ec); if (ec) { - std::cerr << "Warning: Failed to cleanup directory: " << cfg.coord_dir << ": " - << ec.message() << std::endl; + std::cerr << "[rrun] Warning: Failed to cleanup directory: " << cfg.coord_dir + << ": " << ec.message() << std::endl; } } else if (cfg.verbose) { - std::cout << "Coordination directory preserved: " << cfg.coord_dir << std::endl; + std::cout << "[rrun] Coordination directory preserved: " << cfg.coord_dir + << std::endl; } if (cfg.verbose && exit_status == 0) { - std::cout << "\nAll ranks completed successfully." << std::endl; + std::cout << "\n[rrun] All ranks completed successfully." 
<< std::endl; } // Finalize PMIx @@ -984,21 +986,22 @@ int execute_single_node_mode(Config& cfg) { // Cleanup if (cfg.cleanup) { if (cfg.verbose) { - std::cout << "Cleaning up coordination directory: " << cfg.coord_dir + std::cout << "[rrun] Cleaning up coordination directory: " << cfg.coord_dir << std::endl; } std::error_code ec; std::filesystem::remove_all(cfg.coord_dir, ec); if (ec) { - std::cerr << "Warning: Failed to cleanup directory: " << cfg.coord_dir << ": " - << ec.message() << std::endl; + std::cerr << "[rrun] Warning: Failed to cleanup directory: " << cfg.coord_dir + << ": " << ec.message() << std::endl; } } else if (cfg.verbose) { - std::cout << "Coordination directory preserved: " << cfg.coord_dir << std::endl; + std::cout << "[rrun] Coordination directory preserved: " << cfg.coord_dir + << std::endl; } if (cfg.verbose && exit_status == 0) { - std::cout << "\nAll ranks completed successfully." << std::endl; + std::cout << "\n[rrun] All ranks completed successfully." << std::endl; } return exit_status; @@ -1326,7 +1329,7 @@ int launch_ranks_fork_based( if (cfg.verbose) { std::ostringstream msg; - msg << "Launched rank " << global_rank << " (PID " << pid << ")"; + msg << "[rrun] Launched rank " << global_rank << " (PID " << pid << ")"; if (!cfg.gpus.empty()) { msg << " on GPU " << cfg.gpus[static_cast(local_rank) % cfg.gpus.size()]; @@ -1357,7 +1360,7 @@ int launch_ranks_fork_based( } }).detach(); - std::cout << "\nAll ranks launched. Waiting for completion...\n" << std::endl; + std::cout << "\n[rrun] All ranks launched. 
Waiting for completion...\n" << std::endl; // Wait for all processes int exit_status = 0; @@ -1365,7 +1368,7 @@ int launch_ranks_fork_based( int status = 0; pid_t pid = pids[i]; if (waitpid(pid, &status, 0) < 0) { - std::cerr << "Failed to wait for rank " << i << " (PID " << pid + std::cerr << "[rrun] Failed to wait for rank " << i << " (PID " << pid << "): " << std::strerror(errno) << std::endl; exit_status = 1; continue; @@ -1374,7 +1377,7 @@ int launch_ranks_fork_based( if (WIFEXITED(status)) { int code = WEXITSTATUS(status); if (code != 0) { - std::cerr << "Rank " + std::cerr << "[rrun] Rank " << (static_cast(rank_offset) + (is_root_parent && !root_address.empty() ? i + 1 : i)) << " (PID " << pid << ") exited with code " << code @@ -1383,7 +1386,7 @@ int launch_ranks_fork_based( } } else if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - std::cerr << "Rank " + std::cerr << "[rrun] Rank " << (static_cast(rank_offset) + (is_root_parent && !root_address.empty() ? i + 1 : i)) << " (PID " << pid << ") terminated by signal " << sig << std::endl; @@ -1488,7 +1491,7 @@ pid_t launch_rank_local( exec_args.push_back(nullptr); execvp(cfg.app_binary.c_str(), exec_args.data()); - std::cerr << "Failed to execute " << cfg.app_binary << ": " + std::cerr << "[rrun] Failed to execute " << cfg.app_binary << ": " << std::strerror(errno) << std::endl; _exit(1); } @@ -1570,11 +1573,11 @@ int main(int argc, char* argv[]) { #ifdef RAPIDSMPF_HAVE_SLURM return execute_slurm_hybrid_mode(cfg); #else - std::cerr << "Error: Slurm hybrid mode requires PMIx support but " + std::cerr << "[rrun] Error: Slurm hybrid mode requires PMIx support but " << "rapidsmpf was not built with PMIx." << std::endl; - std::cerr - << "Rebuild with -DBUILD_SLURM_SUPPORT=ON or use passthrough mode " - << "(without -n flag)." << std::endl; + std::cerr << "[rrun] Rebuild with -DBUILD_SLURM_SUPPORT=ON or use " + "passthrough mode " + << "(without -n flag)." 
<< std::endl; return 1; #endif } @@ -1584,8 +1587,8 @@ int main(int argc, char* argv[]) { } } catch (std::exception const& e) { - std::cerr << "Error: " << e.what() << std::endl; - std::cerr << "Run with -h or --help for usage information." << std::endl; + std::cerr << "[rrun] Error: " << e.what() << std::endl; + std::cerr << "[rrun] Run with -h or --help for usage information." << std::endl; return 1; } } From 20d5ae326aabb0644570ae84ddbd84715ddfcbe4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 16 Jan 2026 13:32:46 -0800 Subject: [PATCH 18/57] Generalize into setup_launch_and_cleanup --- cpp/tools/rrun.cpp | 140 ++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 66 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 882a7f3da..2fd32e0a9 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -803,6 +803,70 @@ pid_t fork_with_piped_stdio( return pid; } +/** + * @brief Common helper to set up coordination, launch ranks, and cleanup. + * + * This function encapsulates the common workflow shared by both Slurm hybrid mode + * and single-node mode: create coordination directory, launch ranks via fork, + * cleanup, and report results. + * + * @param cfg Configuration (will modify coord_dir if empty). + * @param rank_offset Starting global rank for this task. + * @param ranks_per_task Number of ranks to launch locally. + * @param total_ranks Total ranks across all tasks. + * @param root_address Pre-coordinated root address (empty for FILE backend). + * @param is_root_parent Whether this is root parent (affects launch logic). + * @param coord_dir_hint Hint for coordination directory name (e.g., job ID). + * @return Exit status (0 for success). 
+ */ +int setup_launch_and_cleanup( + Config& cfg, + int rank_offset, + int ranks_per_task, + int total_ranks, + std::string const& root_address, + bool is_root_parent, + std::string const& coord_dir_hint = "" +) { + // Set up coordination directory + if (cfg.coord_dir.empty()) { + if (!coord_dir_hint.empty()) { + cfg.coord_dir = "/tmp/rrun_" + coord_dir_hint; + } else { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } + } + std::filesystem::create_directories(cfg.coord_dir); + + // Launch ranks and wait for completion + int exit_status = launch_ranks_fork_based( + cfg, rank_offset, ranks_per_task, total_ranks, root_address, is_root_parent + ); + + // Cleanup + if (cfg.cleanup) { + if (cfg.verbose) { + std::cout << "[rrun] Cleaning up coordination directory: " << cfg.coord_dir + << std::endl; + } + std::error_code ec; + std::filesystem::remove_all(cfg.coord_dir, ec); + if (ec) { + std::cerr << "[rrun] Warning: Failed to cleanup directory: " << cfg.coord_dir + << ": " << ec.message() << std::endl; + } + } else if (cfg.verbose) { + std::cout << "[rrun] Coordination directory preserved: " << cfg.coord_dir + << std::endl; + } + + if (cfg.verbose && exit_status == 0) { + std::cout << "\n[rrun] All ranks completed successfully." << std::endl; + } + + return exit_status; +} + #ifdef RAPIDSMPF_HAVE_SLURM /** * @brief Execute application in Slurm passthrough mode (single rank per task). 
@@ -871,17 +935,6 @@ int execute_slurm_hybrid_mode(Config& cfg) { << std::endl; } - // Set up coordination directory (needed by all tasks for child bootstrap) - char const* job_id = std::getenv("SLURM_JOB_ID"); - if (cfg.coord_dir.empty()) { - if (job_id) { - cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; - } else { - cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); - } - } - std::filesystem::create_directories(cfg.coord_dir); - // Root parent needs to launch rank 0 first to get address bool is_root_parent = (cfg.slurm_global_rank == 0); @@ -890,6 +943,7 @@ int execute_slurm_hybrid_mode(Config& cfg) { int total_ranks = slurm_ntasks * cfg.nranks; std::string coordinated_root_address; + char const* job_id = std::getenv("SLURM_JOB_ID"); if (is_root_parent) { // Root parent: Launch rank 0, get address, coordinate via PMIx std::string address_file = @@ -917,37 +971,18 @@ int execute_slurm_hybrid_mode(Config& cfg) { << " (total: " << total_ranks << " ranks)" << std::endl; } - // Launch ranks and wait for completion - int exit_status = launch_ranks_fork_based( + // Use common helper for launch and cleanup + std::string coord_hint = job_id ? ("slurm_" + std::string{job_id}) : ""; + int exit_status = setup_launch_and_cleanup( cfg, rank_offset, cfg.nranks, total_ranks, coordinated_root_address, - is_root_parent + is_root_parent, + coord_hint ); - // Cleanup - if (cfg.cleanup) { - if (cfg.verbose) { - std::cout << "[rrun] Cleaning up coordination directory: " << cfg.coord_dir - << std::endl; - } - std::error_code ec; - std::filesystem::remove_all(cfg.coord_dir, ec); - if (ec) { - std::cerr << "[rrun] Warning: Failed to cleanup directory: " << cfg.coord_dir - << ": " << ec.message() << std::endl; - } - } else if (cfg.verbose) { - std::cout << "[rrun] Coordination directory preserved: " << cfg.coord_dir - << std::endl; - } - - if (cfg.verbose && exit_status == 0) { - std::cout << "\n[rrun] All ranks completed successfully." 
<< std::endl; - } - // Finalize PMIx if (!coordinated_root_address.empty()) { if (cfg.verbose) { @@ -974,37 +1009,10 @@ int execute_single_node_mode(Config& cfg) { << std::endl; } - // Set up coordination directory - if (cfg.coord_dir.empty()) { - cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); - } - std::filesystem::create_directories(cfg.coord_dir); - - // Launch ranks and wait for completion - int exit_status = launch_ranks_fork_based(cfg, 0, cfg.nranks, cfg.nranks, "", false); - - // Cleanup - if (cfg.cleanup) { - if (cfg.verbose) { - std::cout << "[rrun] Cleaning up coordination directory: " << cfg.coord_dir - << std::endl; - } - std::error_code ec; - std::filesystem::remove_all(cfg.coord_dir, ec); - if (ec) { - std::cerr << "[rrun] Warning: Failed to cleanup directory: " << cfg.coord_dir - << ": " << ec.message() << std::endl; - } - } else if (cfg.verbose) { - std::cout << "[rrun] Coordination directory preserved: " << cfg.coord_dir - << std::endl; - } - - if (cfg.verbose && exit_status == 0) { - std::cout << "\n[rrun] All ranks completed successfully." 
<< std::endl; - } - - return exit_status; + // Use common helper for launch and cleanup + // rank_offset=0, ranks_per_task=nranks, total_ranks=nranks, no root_address, not + // root_parent + return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks, "", false); } #ifdef RAPIDSMPF_HAVE_SLURM From f3b40fbec6fa66696360c444960db66d6a7c7f05 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 22 Jan 2026 07:50:15 -0800 Subject: [PATCH 19/57] Fix `RAPIDSMPF_COORD_DIR` not set error in Slurm hybrid mode --- cpp/tools/rrun.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 2fd32e0a9..41181c083 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -935,6 +935,17 @@ int execute_slurm_hybrid_mode(Config& cfg) { << std::endl; } + // Set up coordination directory FIRST (needed by rank 0 when it's launched early) + char const* job_id = std::getenv("SLURM_JOB_ID"); + if (cfg.coord_dir.empty()) { + if (job_id) { + cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; + } else { + cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); + } + } + std::filesystem::create_directories(cfg.coord_dir); + // Root parent needs to launch rank 0 first to get address bool is_root_parent = (cfg.slurm_global_rank == 0); @@ -943,7 +954,6 @@ int execute_slurm_hybrid_mode(Config& cfg) { int total_ranks = slurm_ntasks * cfg.nranks; std::string coordinated_root_address; - char const* job_id = std::getenv("SLURM_JOB_ID"); if (is_root_parent) { // Root parent: Launch rank 0, get address, coordinate via PMIx std::string address_file = From 44290ece4831d3c430a5e55241b7f012bbfe1431 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 22 Jan 2026 14:00:48 -0800 Subject: [PATCH 20/57] Throw if attempting to recreate root --- cpp/src/bootstrap/ucxx.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 
983d34b8d..963e00c0a 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -72,11 +72,7 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op // Children skip bootstrap coordination and use the provided address directly if (ctx.rank == 0) { - // Root child creates listener - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, std::nullopt, options); - comm = - std::make_shared(std::move(ucxx_initialized_rank), options); + throw std::runtime_error("The root rank was already created.") } else { // Worker children connect using provided address auto root_worker_address = From 984dc68befd3e90cd45c6a74daa3dcd3bbb16c41 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 22 Jan 2026 23:58:48 -0800 Subject: [PATCH 21/57] Missing semicolon --- cpp/src/bootstrap/ucxx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 963e00c0a..64b236faa 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -72,7 +72,7 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op // Children skip bootstrap coordination and use the provided address directly if (ctx.rank == 0) { - throw std::runtime_error("The root rank was already created.") + throw std::runtime_error("The root rank was already created."); } else { // Worker children connect using provided address auto root_worker_address = From 951023fb2fa1bb9f36a78238bbc385fe696e8e66 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 3 Feb 2026 07:21:00 -0800 Subject: [PATCH 22/57] Fix rrun file-based coordination with Slurm --- cpp/src/bootstrap/bootstrap.cpp | 15 ++++++++++++--- cpp/src/bootstrap/utils.cpp | 6 +++++- cpp/tools/rrun.cpp | 18 +++++++++++------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 10c21eacd..705ea2dc9 100644 --- 
a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -57,16 +57,25 @@ std::optional getenv_int(std::string_view name) { * @brief Detect backend from environment variables. */ Backend detect_backend() { - // Check for file-based coordination first (explicit configuration takes priority) - if (getenv_optional("RAPIDSMPF_COORD_DIR")) { + // Check for rrun coordination first (explicit configuration takes priority) + // If RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS is set, rrun is coordinating + // and we should use FILE backend (with or without pre-coordinated address) + if (getenv_optional("RAPIDSMPF_COORD_DIR") + || getenv_optional("RAPIDSMPF_ROOT_ADDRESS")) + { return Backend::FILE; } #ifdef RAPIDSMPF_HAVE_SLURM - // Check for Slurm-specific environment variables. + // Check for Slurm-specific environment variables ONLY if rrun is NOT coordinating. + // This allows direct use of Slurm/PMIx backend when NOT launched via rrun. // Note: We don't check PMIX_NAMESPACE alone because OpenMPI also uses PMIx // internally and sets PMIX_NAMESPACE when launched with mpirun. // SLURM_JOB_ID + SLURM_PROCID is specific to Slurm srun tasks. + // + // Important: This path should only be taken by Slurm parent processes that are + // NOT launched by rrun. Child processes launched by rrun will have RAPIDSMPF_* + // variables set and will use FILE backend above. if (getenv_optional("SLURM_JOB_ID") && getenv_optional("SLURM_PROCID")) { return Backend::SLURM; } diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index adccbe00a..8b40674cd 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -103,7 +103,11 @@ bool is_running_with_slurm() { } bool is_running_with_bootstrap() { - return is_running_with_rrun() || is_running_with_slurm(); + // Only return true if rrun is coordinating (i.e., RAPIDSMPF_RANK is set). 
+ // Even if Slurm environment variables are present, the user may want to use + // MPI directly with `srun --mpi=pmix`, so we shouldn't force bootstrap mode + // unless rrun is explicitly managing the launch. + return is_running_with_rrun(); } Rank get_rank() { diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 64ec17af3..f62bb73b4 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -215,28 +215,27 @@ void print_usage(std::string_view prog_name) { << "Usage: " << prog_name << " [options] [app_args...]\n\n" << "Single-Node Options:\n" << " -n Number of ranks to launch (required in single-node " - "mode)\n" + << " mode)\n" << " -g Comma-separated list of GPU IDs (e.g., 0,1,2,3)\n" << " If not specified, auto-detect available GPUs\n\n" << "Slurm Options:\n" << " --slurm Run in Slurm mode (auto-detected when SLURM_JOB_ID is " - "set)\n" + << " set)\n" << " Two sub-modes:\n" << " 1. Passthrough (no -n): Apply bindings and exec\n" << " 2. Hybrid (with -n): Launch N ranks per Slurm task\n" - << " In hybrid mode, each Slurm task launches multiple " - "ranks\n" - << " with coordinated global rank numbering\n\n" + << " In hybrid mode, each Slurm task launches multiple\n" + << " ranks with coordinated global rank numbering\n\n" << "Common Options:\n" << " -d Coordination directory (default: /tmp/rrun_)\n" - << " Not used in Slurm mode with PMIx backend\n" + << " Not applicable in Slurm mode\n" << " --tag-output Tag stdout and stderr with rank number\n" << " Not applicable in Slurm mode\n" << " --bind-to Bind to topology resources (default: all)\n" << " Can be specified multiple times\n" << " Options: cpu, memory, network, all, none\n" << " Examples: --bind-to cpu --bind-to network\n" - << " --bind-to none (disable all bindings)\n" + << " --bind-to none (disable all bindings)\n" << " -x, --set-env \n" << " Set environment variable for all ranks\n" << " Can be specified multiple times\n" @@ -882,6 +881,11 @@ int execute_slurm_passthrough_mode(Config const& cfg) 
{ << std::endl; } + // Set rrun coordination environment variables so the application knows + // it's being launched by rrun and should use bootstrap mode + setenv("RAPIDSMPF_RANK", std::to_string(cfg.slurm_global_rank).c_str(), 1); + setenv("RAPIDSMPF_NRANKS", std::to_string(cfg.slurm_ntasks).c_str(), 1); + // Set custom environment variables for (auto const& env_pair : cfg.env_vars) { setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); From 21fa254ee73aead609677bc2967c324496deb167 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 3 Feb 2026 09:28:01 -0800 Subject: [PATCH 23/57] Fix file based bootstrapping --- .../rapidsmpf/bootstrap/file_backend.hpp | 7 ++- cpp/src/bootstrap/file_backend.cpp | 47 ++++++++++++------- cpp/src/bootstrap/ucxx.cpp | 21 ++++----- cpp/tools/rrun.cpp | 10 ++-- 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp index 3805ad798..77ea1d844 100644 --- a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp @@ -1,10 +1,11 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include <atomic> #include #include @@ -78,7 +79,9 @@ class FileBackend { std::string coord_dir_; std::string kv_dir_; std::string barrier_dir_; - std::size_t barrier_count_{0}; + // Note: barrier_count_ must be static to persist across FileBackend instances + // since FileBackend is created as a temporary for each put/get/barrier operation + static inline std::atomic<std::size_t> barrier_count_{0}; /** * @brief Get path for a key-value file.
diff --git a/cpp/src/bootstrap/file_backend.cpp b/cpp/src/bootstrap/file_backend.cpp index c91a3fbce..47446b7c0 100644 --- a/cpp/src/bootstrap/file_backend.cpp +++ b/cpp/src/bootstrap/file_backend.cpp @@ -1,8 +1,9 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ +#include <atomic> #include #include #include @@ -22,6 +23,13 @@ namespace rapidsmpf::bootstrap::detail { +namespace { +// Process-wide flag to track if alive file has been created for this process +// Since FileBackend is created/destroyed multiple times for put/get/barrier, +// we only want to create the alive file once and never remove it until process exit +std::atomic<bool> alive_file_created{false}; +} // namespace + FileBackend::FileBackend(Context ctx) : ctx_{std::move(ctx)} { if (!ctx_.coord_dir.has_value()) { throw std::runtime_error("FileBackend requires coord_dir in context"); @@ -42,25 +50,26 @@ ); } - // Create rank alive file - write_file(get_rank_alive_path(ctx_.rank), std::to_string(getpid())); + // Create rank alive file only once per process + // FileBackend is created/destroyed for each bootstrap operation (put/get/barrier), + // but the alive file should persist until process exit + bool expected = false; + if (alive_file_created.compare_exchange_strong(expected, true)) { + write_file(get_rank_alive_path(ctx_.rank), std::to_string(getpid())); + } // Note: Do not block in the constructor. Ranks only create their alive file // and continue. Synchronization occurs where needed (e.g., get/put/barrier).
} FileBackend::~FileBackend() { - // Clean up rank alive file - try { - std::error_code ec; - if (!std::filesystem::remove(get_rank_alive_path(ctx_.rank), ec) && ec) { - std::cerr << "Error removing rank alive file: " << ec.message() << std::endl; - } - } catch (const std::exception& e) { - std::cerr << "Exception during rank alive file cleanup: " << e.what() - << std::endl; - } - cleanup_coordination_directory(); + // Don't clean up in destructor since FileBackend is used as a temporary object + // for each bootstrap operation (put/get/barrier). The coordination directory + // and alive files should persist across operations and be cleaned up by the + // launcher (rrun) after all ranks complete. + // + // Note: The alive file was created in the constructor and should remain + // until the process exits or the launcher cleans it up. } void FileBackend::put(std::string const& key, std::string const& value) { @@ -103,9 +112,13 @@ void FileBackend::barrier() { } } - // Clean up our barrier file - std::error_code ec; - std::filesystem::remove(my_barrier_file, ec); + // Don't delete barrier files here - they should persist until process exit + // or until cleaned up by the launcher (rrun). Deleting them immediately + // creates a race condition where faster ranks delete their files before + // slower ranks can check for them. + // + // The barrier files are small (1 byte each) and will be cleaned up when + // rrun removes the coordination directory. 
} void FileBackend::broadcast(void* data, std::size_t size, Rank root) { diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 64b236faa..8ae91c3d5 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -65,23 +65,18 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op // Check if root address was provided by parent process (rrun hybrid mode) char const* precomputed_address_encoded = std::getenv("RAPIDSMPF_ROOT_ADDRESS"); - if (precomputed_address_encoded != nullptr) { + if (precomputed_address_encoded != nullptr && ctx.rank != 0) { // Parent process already coordinated the root address via PMIx // Address is hex-encoded to avoid issues with binary data in env vars + // Note: Only non-root ranks use this path. Rank 0 should always create the + // listener. std::string precomputed_address = hex_decode(precomputed_address_encoded); - // Children skip bootstrap coordination and use the provided address directly - if (ctx.rank == 0) { - throw std::runtime_error("The root rank was already created."); - } else { - // Worker children connect using provided address - auto root_worker_address = - ::ucxx::createAddressFromString(precomputed_address); - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, root_worker_address, options); - comm = - std::make_shared(std::move(ucxx_initialized_rank), options); - } + // Worker children connect using provided address + auto root_worker_address = ::ucxx::createAddressFromString(precomputed_address); + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, root_worker_address, options); + comm = std::make_shared(std::move(ucxx_initialized_rank), options); } else { // Standard bootstrap coordination via put/get/barrier diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index f62bb73b4..1f44a1140 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -64,6 +64,7 @@ std::string hex_encode(std::string const& input) { return result; } +#ifdef 
RAPIDSMPF_HAVE_SLURM std::string hex_decode(std::string const& input) { std::string result; result.reserve(input.size() / 2); @@ -78,10 +79,14 @@ std::string hex_decode(std::string const& input) { } return result; } +#endif -// Forward declarations of helper functions +// Forward declarations of mode execution functions (defined later, outside namespace) struct Config; +int execute_slurm_passthrough_mode(Config const& cfg); +int execute_single_node_mode(Config& cfg); #ifdef RAPIDSMPF_HAVE_SLURM +int execute_slurm_hybrid_mode(Config& cfg); std::string launch_rank0_and_get_address( Config const& cfg, std::string const& address_file, int total_ranks ); @@ -106,14 +111,11 @@ pid_t launch_rank_local( int* out_fd_stdout, int* out_fd_stderr ); -} // namespace // NOTE: Do not use RAPIDSMPF_EXPECTS or RAPIDSMPF_FAIL in this file. // Using these macros introduces a CUDA dependency via rapidsmpf/error.hpp. // Prefer throwing standard exceptions instead. -namespace { - static std::mutex output_mutex; /** From 1c74512236370c07f4e30b74d97495d5fc77ff9a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 3 Feb 2026 10:22:57 -0800 Subject: [PATCH 24/57] Simplify non-Slurm FileBackend synchronization --- .../rapidsmpf/bootstrap/file_backend.hpp | 5 +- cpp/src/bootstrap/file_backend.cpp | 47 +++++++------------ cpp/src/bootstrap/ucxx.cpp | 11 +++-- 3 files changed, 25 insertions(+), 38 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp index 77ea1d844..bb386a3c8 100644 --- a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp @@ -5,7 +5,6 @@ #pragma once -#include #include #include @@ -79,9 +78,7 @@ class FileBackend { std::string coord_dir_; std::string kv_dir_; std::string barrier_dir_; - // Note: barrier_count_ must be static to persist across FileBackend instances - // since FileBackend is created as a temporary for each put/get/barrier 
operation - static inline std::atomic barrier_count_{0}; + std::size_t barrier_count_{0}; /** * @brief Get path for a key-value file. diff --git a/cpp/src/bootstrap/file_backend.cpp b/cpp/src/bootstrap/file_backend.cpp index 47446b7c0..5e1cafd64 100644 --- a/cpp/src/bootstrap/file_backend.cpp +++ b/cpp/src/bootstrap/file_backend.cpp @@ -1,9 +1,8 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ -#include #include #include #include @@ -23,13 +22,6 @@ namespace rapidsmpf::bootstrap::detail { -namespace { -// Process-wide flag to track if alive file has been created for this process -// Since FileBackend is created/destroyed multiple times for put/get/barrier, -// we only want to create the alive file once and never remove it until process exit -std::atomic alive_file_created{false}; -} // namespace - FileBackend::FileBackend(Context ctx) : ctx_{std::move(ctx)} { if (!ctx_.coord_dir.has_value()) { throw std::runtime_error("FileBackend requires coord_dir in context"); @@ -50,26 +42,25 @@ FileBackend::FileBackend(Context ctx) : ctx_{std::move(ctx)} { ); } - // Create rank alive file only once per process - // FileBackend is created/destroyed for each bootstrap operation (put/get/barrier), - // but the alive file should persist until process exit - bool expected = false; - if (alive_file_created.compare_exchange_strong(expected, true)) { - write_file(get_rank_alive_path(ctx_.rank), std::to_string(getpid())); - } + // Create rank alive file + write_file(get_rank_alive_path(ctx_.rank), std::to_string(getpid())); // Note: Do not block in the constructor. Ranks only create their alive file // and continue. Synchronization occurs where needed (e.g., get/put/barrier). 
} FileBackend::~FileBackend() { - // Don't clean up in destructor since FileBackend is used as a temporary object - // for each bootstrap operation (put/get/barrier). The coordination directory - // and alive files should persist across operations and be cleaned up by the - // launcher (rrun) after all ranks complete. - // - // Note: The alive file was created in the constructor and should remain - // until the process exits or the launcher cleans it up. + // Clean up rank alive file + try { + std::error_code ec; + if (!std::filesystem::remove(get_rank_alive_path(ctx_.rank), ec) && ec) { + std::cerr << "Error removing rank alive file: " << ec.message() << std::endl; + } + } catch (const std::exception& e) { + std::cerr << "Exception during rank alive file cleanup: " << e.what() + << std::endl; + } + cleanup_coordination_directory(); } void FileBackend::put(std::string const& key, std::string const& value) { @@ -112,13 +103,9 @@ void FileBackend::barrier() { } } - // Don't delete barrier files here - they should persist until process exit - // or until cleaned up by the launcher (rrun). Deleting them immediately - // creates a race condition where faster ranks delete their files before - // slower ranks can check for them. - // - // The barrier files are small (1 byte each) and will be cleaned up when - // rrun removes the coordination directory. + // Clean up our barrier file + std::error_code ec; + std::filesystem::remove(my_barrier_file, ec); } void FileBackend::broadcast(void* data, std::size_t size, Rank root) { diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 8ae91c3d5..f46a95723 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -134,10 +134,13 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op } if (!early_address_mode) { - // All ranks must barrier to make PMIx put() data visible. - // For file backend this is a no-op synchronization. 
- // For PMIx/Slurm backend this executes PMIx_Fence to exchange data. - barrier(ctx); + // For PMIx/Slurm backend, barrier is needed to execute PMIx_Fence + // which makes put() data visible across nodes. + // For FILE backend, barrier is not needed since put/get already + // provide implicit synchronization via filesystem operations. + if (ctx.backend == Backend::SLURM) { + barrier(ctx); + } if (ctx.rank != 0) { // Worker ranks retrieve the root address and connect From 26858e0ac07bddbca96b0bb96bb3d2569b7a6358 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 07:35:10 -0800 Subject: [PATCH 25/57] More cleanup --- cpp/tools/rrun.cpp | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 1f44a1140..ae58a1b2f 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -657,12 +657,11 @@ Config parse_args(int argc, char* argv[]) { // In Slurm mode: // - If -n is specified: launch N ranks per Slurm task (hybrid mode) - // - If -n is not specified: just apply bindings and exec (passthrough mode) + // - If -n is not specified: just apply bindings and exec (passthrough mode, + // one rank per task) if (cfg.nranks <= 0) { - // Passthrough mode: one rank per Slurm task cfg.nranks = 1; } - // else: hybrid mode with cfg.nranks children per Slurm task } else { // Single-node mode validation if (cfg.nranks <= 0) { @@ -811,6 +810,9 @@ pid_t fork_with_piped_stdio( * and single-node mode: create coordination directory, launch ranks via fork, * cleanup, and report results. * + * A task here denotes a Slurm unit of execution, e.g., a single instance of a + * program or process, e.g., an instance of the `rrun` executable itself. + * * @param cfg Configuration (will modify coord_dir if empty). * @param rank_offset Starting global rank for this task. * @param ranks_per_task Number of ranks to launch locally. 
@@ -844,7 +846,6 @@ int setup_launch_and_cleanup( cfg, rank_offset, ranks_per_task, total_ranks, root_address, is_root_parent ); - // Cleanup if (cfg.cleanup) { if (cfg.verbose) { std::cout << "[rrun] Cleaning up coordination directory: " << cfg.coord_dir @@ -888,11 +889,6 @@ int execute_slurm_passthrough_mode(Config const& cfg) { setenv("RAPIDSMPF_RANK", std::to_string(cfg.slurm_global_rank).c_str(), 1); setenv("RAPIDSMPF_NRANKS", std::to_string(cfg.slurm_ntasks).c_str(), 1); - // Set custom environment variables - for (auto const& env_pair : cfg.env_vars) { - setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); - } - // Determine GPU for this Slurm task int gpu_id = -1; if (!cfg.gpus.empty()) { @@ -900,11 +896,16 @@ int execute_slurm_passthrough_mode(Config const& cfg) { setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); if (cfg.verbose) { - std::cerr << "[rrun] Slurm task (passthrough) local_id=" << cfg.slurm_local_id + std::cout << "[rrun] Slurm task (passthrough) local_id=" << cfg.slurm_local_id << " assigned to GPU " << gpu_id << std::endl; } } + // Set custom environment variables + for (auto const& env_pair : cfg.env_vars) { + setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); + } + apply_topology_bindings(cfg, gpu_id, cfg.verbose); // Prepare arguments for execvp @@ -931,7 +932,7 @@ int execute_slurm_passthrough_mode(Config const& cfg) { * on all nodes launch their remaining ranks. Uses fork-based execution. * * @param cfg Configuration. - * @return Exit status (0 for success) + * @return Exit status (0 for success). */ int execute_slurm_hybrid_mode(Config& cfg) { if (cfg.verbose) { @@ -1012,12 +1013,12 @@ int execute_slurm_hybrid_mode(Config& cfg) { #endif // RAPIDSMPF_HAVE_SLURM /** - * @brief Execute application in single-node mode with FILE backend + * @brief Execute application in single-node mode with FILE backend. * * Uses fork-based execution with file-based coordination. 
* - * @param cfg Configuration - * @return Exit status (0 for success) + * @param cfg Configuration. + * @return Exit status (0 for success). */ int execute_single_node_mode(Config& cfg) { if (cfg.verbose) { @@ -1033,13 +1034,14 @@ int execute_single_node_mode(Config& cfg) { #ifdef RAPIDSMPF_HAVE_SLURM /** - * @brief Launch rank 0 first to obtain its UCXX root address + * @brief Launch rank 0 first to obtain its UCXX root address. * - * @param cfg Configuration - * @param address_file Path to file where rank 0 will write its address - * @param total_ranks Total number of ranks across all tasks - * @return Hex-encoded root address - * @throws std::runtime_error on timeout or launch failure + * @param cfg Configuration. + * @param address_file Path to file where rank 0 will write its address. + * @param total_ranks Total number of ranks across all tasks. + * @return Hex-encoded root address. + * + * @throws std::runtime_error on timeout or launch failure. */ std::string launch_rank0_and_get_address( Config const& cfg, std::string const& address_file, int total_ranks @@ -1149,6 +1151,7 @@ std::string launch_rank0_and_get_address( * this is a non-root parent and it will retrieve. * @param verbose Whether to print debug messages. * @return Root address (either published or retrieved). + * * @throws std::runtime_error on PMIx errors. */ std::string coordinate_root_address_via_pmix( @@ -1276,6 +1279,9 @@ std::string coordinate_root_address_via_pmix( /** * @brief Launch multiple ranks locally using fork. * + * A task here denotes a Slurm unit of execution, e.g., a single instance of a + * program or process, e.g., an instance of the `rrun` executable itself. + * * @param cfg Configuration. * @param rank_offset Starting global rank for this task. * @param ranks_per_task Number of ranks to launch. 
From 28779c3150df7529cc940cd6238a8bd28fd266ae Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 07:45:13 -0800 Subject: [PATCH 26/57] Move duplicate logic to exec_application --- cpp/tools/rrun.cpp | 84 ++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index ae58a1b2f..4a54993b3 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -83,7 +83,7 @@ std::string hex_decode(std::string const& input) { // Forward declarations of mode execution functions (defined later, outside namespace) struct Config; -int execute_slurm_passthrough_mode(Config const& cfg); +[[noreturn]] void execute_slurm_passthrough_mode(Config const& cfg); int execute_single_node_mode(Config& cfg); #ifdef RAPIDSMPF_HAVE_SLURM int execute_slurm_hybrid_mode(Config& cfg); @@ -102,6 +102,7 @@ int launch_ranks_fork_based( std::string const& root_address, bool is_root_parent ); +[[noreturn]] void exec_application(Config const& cfg); pid_t launch_rank_local( Config const& cfg, int global_rank, @@ -869,16 +870,43 @@ int setup_launch_and_cleanup( return exit_status; } +/** + * @brief Execute application via execvp (never returns). + * + * Prepares arguments and calls execvp. On failure, prints error and exits. + * This function never returns - it either replaces the current process + * or calls _exit(1) on error. + * + * @param cfg Configuration containing application binary and arguments. 
+ */ +[[noreturn]] void exec_application(Config const& cfg) { + // Prepare arguments for execvp + std::vector exec_args; + exec_args.push_back(const_cast(cfg.app_binary.c_str())); + for (auto const& arg : cfg.app_args) { + exec_args.push_back(const_cast(arg.c_str())); + } + exec_args.push_back(nullptr); + + // Exec the application (this replaces the current process) + execvp(cfg.app_binary.c_str(), exec_args.data()); + + // If we get here, execvp failed + std::cerr << "[rrun] Failed to execute " << cfg.app_binary << ": " + << std::strerror(errno) << std::endl; + _exit(1); +} + #ifdef RAPIDSMPF_HAVE_SLURM /** * @brief Execute application in Slurm passthrough mode (single rank per task). * * Applies topology bindings and executes the application directly without forking. + * This function never returns - it either replaces the current process or exits on error. * * @param cfg Configuration. - * @return Exit status. Does not return on success, only on error. */ -int execute_slurm_passthrough_mode(Config const& cfg) { +[[noreturn]] void execute_slurm_passthrough_mode(Config const& cfg) { if (cfg.verbose) { std::cout << "[rrun] Slurm passthrough mode: applying bindings and exec'ing" << std::endl; @@ -908,21 +936,7 @@ int execute_slurm_passthrough_mode(Config const& cfg) { apply_topology_bindings(cfg, gpu_id, cfg.verbose); - // Prepare arguments for execvp - std::vector exec_args; - exec_args.push_back(const_cast(cfg.app_binary.c_str())); - for (auto const& arg : cfg.app_args) { - exec_args.push_back(const_cast(arg.c_str())); - } - exec_args.push_back(nullptr); - - // Exec the application (this replaces the current process) - execvp(cfg.app_binary.c_str(), exec_args.data()); - - // If we get here, execvp failed - std::cerr << "[rrun] Failed to execute " << cfg.app_binary << ": " - << std::strerror(errno) << std::endl; - return 1; + exec_application(cfg); } /** @@ -1512,18 +1526,7 @@ pid_t launch_rank_local( apply_topology_bindings(cfg, gpu_id, cfg.verbose); - // Prepare 
arguments for execvp - std::vector exec_args; - exec_args.push_back(const_cast(cfg.app_binary.c_str())); - for (auto const& arg : cfg.app_args) { - exec_args.push_back(const_cast(arg.c_str())); - } - exec_args.push_back(nullptr); - - execvp(cfg.app_binary.c_str(), exec_args.data()); - std::cerr << "[rrun] Failed to execute " << cfg.app_binary << ": " - << std::strerror(errno) << std::endl; - _exit(1); + exec_application(cfg); } ); } @@ -1597,20 +1600,19 @@ int main(int argc, char* argv[]) { if (cfg.slurm_mode) { if (cfg.nranks == 1) { // Slurm passthrough mode: single rank per task, no forking - return execute_slurm_passthrough_mode(cfg); - } else { - // Slurm hybrid mode: multiple ranks per task with PMIx coordination + execute_slurm_passthrough_mode(cfg); + } + // Slurm hybrid mode: multiple ranks per task with PMIx coordination #ifdef RAPIDSMPF_HAVE_SLURM - return execute_slurm_hybrid_mode(cfg); + return execute_slurm_hybrid_mode(cfg); #else - std::cerr << "[rrun] Error: Slurm hybrid mode requires PMIx support but " - << "rapidsmpf was not built with PMIx." << std::endl; - std::cerr << "[rrun] Rebuild with -DBUILD_SLURM_SUPPORT=ON or use " - "passthrough mode " - << "(without -n flag)." << std::endl; - return 1; + std::cerr << "[rrun] Error: Slurm hybrid mode requires PMIx support but " + << "rapidsmpf was not built with PMIx." << std::endl; + std::cerr << "[rrun] Rebuild with -DBUILD_SLURM_SUPPORT=ON or use " + "passthrough mode " + << "(without -n flag)." 
<< std::endl; + return 1; #endif - } } else { // Single-node mode with FILE backend return execute_single_node_mode(cfg); From cd4f9477a4a97069d11b429541fd6e88de686201 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 08:30:47 -0800 Subject: [PATCH 27/57] Fix encoded/decoding confusion, use optional string for root address --- cpp/tools/rrun.cpp | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 4a54993b3..9c22c5bca 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -99,7 +99,7 @@ int launch_ranks_fork_based( int rank_offset, int ranks_per_task, int total_ranks, - std::string const& root_address, + std::optional const& root_address, bool is_root_parent ); [[noreturn]] void exec_application(Config const& cfg); @@ -108,7 +108,7 @@ pid_t launch_rank_local( int global_rank, int local_rank, int total_ranks, - std::string const& root_address, + std::optional const& root_address, int* out_fd_stdout, int* out_fd_stderr ); @@ -828,7 +828,7 @@ int setup_launch_and_cleanup( int rank_offset, int ranks_per_task, int total_ranks, - std::string const& root_address, + std::optional const& root_address, bool is_root_parent, std::string const& coord_dir_hint = "" ) { @@ -973,16 +973,16 @@ int execute_slurm_hybrid_mode(Config& cfg) { // Coordinate root address with other nodes via PMIx int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; int total_ranks = slurm_ntasks * cfg.nranks; - std::string coordinated_root_address; + std::string encoded_root_address, coordinated_root_address; if (is_root_parent) { // Root parent: Launch rank 0, get address, coordinate via PMIx std::string address_file = "/tmp/rapidsmpf_root_address_" + std::string{job_id ? 
job_id : "unknown"}; - coordinated_root_address = + encoded_root_address = launch_rank0_and_get_address(cfg, address_file, total_ranks); coordinated_root_address = - coordinate_root_address_via_pmix(coordinated_root_address, cfg.verbose); + coordinate_root_address_via_pmix(encoded_root_address, cfg.verbose); } else { // Non-root parent: Get address from root via PMIx coordinated_root_address = @@ -1043,7 +1043,7 @@ int execute_single_node_mode(Config& cfg) { // Use common helper for launch and cleanup // rank_offset=0, ranks_per_task=nranks, total_ranks=nranks, no root_address, not // root_parent - return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks, "", false); + return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks, std::nullopt, false); } #ifdef RAPIDSMPF_HAVE_SLURM @@ -1068,7 +1068,8 @@ std::string launch_rank0_and_get_address( setenv("RAPIDSMPF_ROOT_ADDRESS_FILE", address_file.c_str(), 1); int fd_out = -1, fd_err = -1; - pid_t rank0_pid = launch_rank_local(cfg, 0, 0, total_ranks, "", &fd_out, &fd_err); + pid_t rank0_pid = + launch_rank_local(cfg, 0, 0, total_ranks, std::nullopt, &fd_out, &fd_err); // Start forwarders for rank 0 output std::thread rank0_stdout_forwarder; @@ -1150,7 +1151,7 @@ std::string launch_rank0_and_get_address( if (rank0_stderr_forwarder.joinable()) rank0_stderr_forwarder.detach(); - return root_address; + return encoded_address; } /** @@ -1189,19 +1190,19 @@ std::string coordinate_root_address_via_pmix( if (root_address_to_publish.has_value()) { // Root parent publishes the address (hex-encoded for binary safety) - std::string encoded_address = hex_encode(root_address_to_publish.value()); + std::string decoded_address = hex_encode(root_address_to_publish.value()); if (verbose) { std::cout << "[rrun] Publishing root address via PMIx (hex-encoded, " - << root_address_to_publish.value().size() << " bytes -> " - << encoded_address.size() << " chars)" << std::endl; + << decoded_address.size() << " bytes -> " + << 
root_address_to_publish.value().size() << " chars)" << std::endl; } // Use PMIx_Put with GLOBAL scope pmix_value_t value; PMIX_VALUE_CONSTRUCT(&value); value.type = PMIX_STRING; - value.data.string = strdup(encoded_address.c_str()); + value.data.string = strdup(root_address_to_publish.value().c_str()); rc = PMIx_Put(PMIX_GLOBAL, "rapidsmpf_root_address", &value); PMIX_VALUE_DESTRUCT(&value); @@ -1309,7 +1310,7 @@ int launch_ranks_fork_based( int rank_offset, int ranks_per_task, int total_ranks, - std::string const& root_address, + std::optional const& root_address, bool is_root_parent ) { std::vector pids; @@ -1360,7 +1361,7 @@ int launch_ranks_fork_based( }; // Launch ranks (skip rank 0 if root parent already launched it) - int start_local_rank = (is_root_parent && !root_address.empty()) ? 1 : 0; + int start_local_rank = (is_root_parent && root_address.has_value()) ? 1 : 0; for (int local_rank = start_local_rank; local_rank < ranks_per_task; ++local_rank) { int global_rank = rank_offset + local_rank; @@ -1423,7 +1424,7 @@ int launch_ranks_fork_based( if (code != 0) { std::cerr << "[rrun] Rank " << (static_cast(rank_offset) - + (is_root_parent && !root_address.empty() ? i + 1 : i)) + + (is_root_parent && root_address.has_value() ? i + 1 : i)) << " (PID " << pid << ") exited with code " << code << std::endl; exit_status = code; @@ -1432,7 +1433,7 @@ int launch_ranks_fork_based( int sig = WTERMSIG(status); std::cerr << "[rrun] Rank " << (static_cast(rank_offset) - + (is_root_parent && !root_address.empty() ? i + 1 : i)) + + (is_root_parent && root_address.has_value() ? 
i + 1 : i)) << " (PID " << pid << ") terminated by signal " << sig << std::endl; exit_status = 128 + sig; } @@ -1465,7 +1466,7 @@ pid_t launch_rank_local( int global_rank, int local_rank, int total_ranks, - std::string const& root_address, + std::optional const& root_address, int* out_fd_stdout, int* out_fd_stderr ) { @@ -1473,7 +1474,7 @@ pid_t launch_rank_local( int captured_global_rank = global_rank; int captured_local_rank = local_rank; int captured_total_ranks = total_ranks; - std::string captured_root_address = root_address; + std::optional captured_root_address = root_address; return fork_with_piped_stdio( out_fd_stdout, @@ -1501,8 +1502,8 @@ pid_t launch_rank_local( // If root address was pre-coordinated by parent, set it (hex-encoded) // This allows children to skip bootstrap coordination entirely - if (!captured_root_address.empty()) { - std::string encoded_address = hex_encode(captured_root_address); + if (captured_root_address.has_value()) { + std::string encoded_address = hex_encode(*captured_root_address); setenv("RAPIDSMPF_ROOT_ADDRESS", encoded_address.c_str(), 1); } From 8dccd95c9ca85e62df9e49e29fd1e4f4627dad79 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 08:56:05 -0800 Subject: [PATCH 28/57] Add error/finalizer errors for PMIx --- cpp/tools/rrun.cpp | 81 +++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 9c22c5bca..277961f9f 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -1154,6 +1154,46 @@ std::string launch_rank0_and_get_address( return encoded_address; } +/** + * @brief Helper function to handle PMIx errors consistently. + * + * Checks the PMIx status code and throws an exception with proper cleanup if it indicates + * failure. Optionally allows partial success for operations like PMIx_Fence. + * + * @param rc PMIx status code to check. 
+ * @param operation Description of the PMIx operation (e.g., "PMIx_Init"). + * @param allow_partial_success If true, PMIX_ERR_PARTIAL_SUCCESS is treated as success. + * + * @throws std::runtime_error if the operation failed (after calling PMIx_Finalize). + */ +void handle_pmix_error( + pmix_status_t rc, std::string const& operation, bool allow_partial_success = false +) { + if (rc == PMIX_SUCCESS || (allow_partial_success && rc == PMIX_ERR_PARTIAL_SUCCESS)) { + return; + } + PMIx_Finalize(nullptr, 0); + throw std::runtime_error( + operation + " failed: " + std::string{PMIx_Error_string(rc)} + ); +} + +/** + * @brief Helper function to throw an error with PMIx cleanup. + * + * Calls PMIx_Finalize and throws a runtime_error with the given message. + * Use this for validation errors or other non-PMIx-status failures that occur + * after PMIx has been initialized. + * + * @param error_message The error message to include in the exception. + * + * @throws std::runtime_error Always throws after calling PMIx_Finalize. + */ +[[noreturn]] void pmix_fatal_error(std::string const& error_message) { + PMIx_Finalize(nullptr, 0); + throw std::runtime_error(error_message); +} + /** * @brief Coordinate root address between parent processes using PMIx. 
* @@ -1175,11 +1215,7 @@ std::string coordinate_root_address_via_pmix( // Initialize PMIx for parent process pmix_proc_t proc; pmix_status_t rc = PMIx_Init(&proc, nullptr, 0); - if (rc != PMIX_SUCCESS) { - throw std::runtime_error( - "PMIx_Init failed in rrun parent: " + std::string{PMIx_Error_string(rc)} - ); - } + handle_pmix_error(rc, "PMIx_Init in rrun parent"); if (verbose) { std::cout << "[rrun] Parent PMIx initialized: rank " << proc.rank @@ -1206,22 +1242,11 @@ std::string coordinate_root_address_via_pmix( rc = PMIx_Put(PMIX_GLOBAL, "rapidsmpf_root_address", &value); PMIX_VALUE_DESTRUCT(&value); - - if (rc != PMIX_SUCCESS) { - PMIx_Finalize(nullptr, 0); - throw std::runtime_error( - "PMIx_Put failed: " + std::string{PMIx_Error_string(rc)} - ); - } + handle_pmix_error(rc, "PMIx_Put"); // Commit the data rc = PMIx_Commit(); - if (rc != PMIX_SUCCESS) { - PMIx_Finalize(nullptr, 0); - throw std::runtime_error( - "PMIx_Commit failed: " + std::string{PMIx_Error_string(rc)} - ); - } + handle_pmix_error(rc, "PMIx_Commit"); root_address = root_address_to_publish.value(); } @@ -1239,14 +1264,7 @@ std::string coordinate_root_address_via_pmix( rc = PMIx_Fence(&proc_wildcard, 1, &info, 1); PMIX_INFO_DESTRUCT(&info); - - // Accept partial success (some PMIx implementations return this for fences) - if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { - PMIx_Finalize(nullptr, 0); - throw std::runtime_error( - "PMIx_Fence failed: " + std::string{PMIx_Error_string(rc)} - ); - } + handle_pmix_error(rc, "PMIx_Fence", true); if (!root_address_to_publish.has_value()) { // Non-root parents retrieve the address @@ -1257,18 +1275,15 @@ std::string coordinate_root_address_via_pmix( pmix_value_t* value = nullptr; rc = PMIx_Get(&source_proc, "rapidsmpf_root_address", nullptr, 0, &value); + handle_pmix_error(rc, "PMIx_Get"); - if (rc != PMIX_SUCCESS || value == nullptr) { - PMIx_Finalize(nullptr, 0); - throw std::runtime_error( - "PMIx_Get failed: " + 
std::string{PMIx_Error_string(rc)} - ); + if (value == nullptr) { + pmix_fatal_error("PMIx_Get returned null value"); } if (value->type != PMIX_STRING) { PMIX_VALUE_RELEASE(value); - PMIx_Finalize(nullptr, 0); - throw std::runtime_error("PMIx_Get returned non-string value"); + pmix_fatal_error("PMIx_Get returned non-string value"); } std::string encoded_address = value->data.string; From 68e9f0e4bbb6e96b0a58373778f164123f0710f4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 09:02:17 -0800 Subject: [PATCH 29/57] More cleanup --- cpp/tools/rrun.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 277961f9f..6a4e02b4d 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -989,11 +989,8 @@ int execute_slurm_hybrid_mode(Config& cfg) { coordinate_root_address_via_pmix(std::nullopt, cfg.verbose); } - // Now all parents have the coordinated_root_address - // Continue to fork-based launch below with this address unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); - // Calculate rank offsets int rank_offset = cfg.slurm_global_rank * cfg.nranks; if (cfg.verbose) { @@ -1002,7 +999,6 @@ int execute_slurm_hybrid_mode(Config& cfg) { << " (total: " << total_ranks << " ranks)" << std::endl; } - // Use common helper for launch and cleanup std::string coord_hint = job_id ? 
("slurm_" + std::string{job_id}) : ""; int exit_status = setup_launch_and_cleanup( cfg, @@ -1014,7 +1010,6 @@ int execute_slurm_hybrid_mode(Config& cfg) { coord_hint ); - // Finalize PMIx if (!coordinated_root_address.empty()) { if (cfg.verbose) { std::cout << "[rrun] Finalizing PMIx in parent" << std::endl; @@ -1040,9 +1035,6 @@ int execute_single_node_mode(Config& cfg) { << std::endl; } - // Use common helper for launch and cleanup - // rank_offset=0, ranks_per_task=nranks, total_ranks=nranks, no root_address, not - // root_parent return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks, std::nullopt, false); } @@ -1505,7 +1497,6 @@ pid_t launch_rank_local( setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); } - // Set environment variables setenv("RAPIDSMPF_RANK", std::to_string(captured_global_rank).c_str(), 1); setenv("RAPIDSMPF_NRANKS", std::to_string(captured_total_ranks).c_str(), 1); @@ -1630,7 +1621,7 @@ int main(int argc, char* argv[]) { return 1; #endif } else { - // Single-node mode with FILE backend + // Single-node mode with file backend return execute_single_node_mode(cfg); } From c6d876148f55314d6a199aead9d49722b5d10675 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 09:29:24 -0800 Subject: [PATCH 30/57] Fix decoding --- cpp/tools/rrun.cpp | 50 +++++++--------------------------------------- 1 file changed, 7 insertions(+), 43 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 6a4e02b4d..204600608 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -50,36 +50,7 @@ #include #endif -// Hex encoding for binary-safe address transmission namespace { -std::string hex_encode(std::string const& input) { - static constexpr const char* hex_chars = "0123456789abcdef"; - std::string result; - result.reserve(input.size() * 2); - for (char ch : input) { - auto c = static_cast(ch); - result.push_back(hex_chars[c >> 4]); - result.push_back(hex_chars[c & 0x0F]); - } - return result; -} - -#ifdef 
RAPIDSMPF_HAVE_SLURM -std::string hex_decode(std::string const& input) { - std::string result; - result.reserve(input.size() / 2); - for (size_t i = 0; i < input.size(); i += 2) { - auto high = static_cast( - (input[i] >= 'a') ? (input[i] - 'a' + 10) : (input[i] - '0') - ); - auto low = static_cast( - (input[i + 1] >= 'a') ? (input[i + 1] - 'a' + 10) : (input[i + 1] - '0') - ); - result.push_back(static_cast((high << 4) | low)); - } - return result; -} -#endif // Forward declarations of mode execution functions (defined later, outside namespace) struct Config; @@ -1123,18 +1094,16 @@ std::string launch_rank0_and_get_address( } } - // Read the hex-encoded address, decode and remove file + // Read the hex-encoded address and remove file std::string encoded_address; std::ifstream addr_stream(address_file); std::getline(addr_stream, encoded_address); addr_stream.close(); - std::string root_address = hex_decode(encoded_address); std::filesystem::remove(address_file); if (cfg.verbose) { std::cout << "[rrun] Got root address from rank 0 (hex-encoded, " - << encoded_address.size() << " chars -> " << root_address.size() - << " bytes)" << std::endl; + << encoded_address.size() << " chars)" << std::endl; } // Rank 0 is already running - detach forwarders @@ -1217,12 +1186,9 @@ std::string coordinate_root_address_via_pmix( std::string root_address; if (root_address_to_publish.has_value()) { - // Root parent publishes the address (hex-encoded for binary safety) - std::string decoded_address = hex_encode(root_address_to_publish.value()); - + // Root parent publishes the address (already hex-encoded for binary safety) if (verbose) { std::cout << "[rrun] Publishing root address via PMIx (hex-encoded, " - << decoded_address.size() << " bytes -> " << root_address_to_publish.value().size() << " chars)" << std::endl; } @@ -1281,12 +1247,11 @@ std::string coordinate_root_address_via_pmix( std::string encoded_address = value->data.string; PMIX_VALUE_RELEASE(value); - root_address = 
hex_decode(encoded_address); + root_address = encoded_address; if (verbose) { std::cout << "[rrun] Retrieved root address via PMIx (hex-encoded, " - << encoded_address.size() << " chars -> " << root_address.size() - << " bytes)" << std::endl; + << encoded_address.size() << " chars)" << std::endl; } } @@ -1506,11 +1471,10 @@ pid_t launch_rank_local( setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 1); } - // If root address was pre-coordinated by parent, set it (hex-encoded) + // If root address was pre-coordinated by parent, set it (already hex-encoded) // This allows children to skip bootstrap coordination entirely if (captured_root_address.has_value()) { - std::string encoded_address = hex_encode(*captured_root_address); - setenv("RAPIDSMPF_ROOT_ADDRESS", encoded_address.c_str(), 1); + setenv("RAPIDSMPF_ROOT_ADDRESS", captured_root_address->c_str(), 1); } // In Slurm hybrid mode, unset Slurm/PMIx rank variables to avoid confusion From ae7342910e1a4f5c2025a75295c4b3c08075f610 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 09:49:06 -0800 Subject: [PATCH 31/57] Fix copyright headers --- cpp/include/rapidsmpf/bootstrap/file_backend.hpp | 2 +- cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp | 2 +- cpp/src/bootstrap/file_backend.cpp | 2 +- cpp/src/bootstrap/slurm_backend.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp index bb386a3c8..3805ad798 100644 --- a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index e03760bb0..26782cd2d 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/src/bootstrap/file_backend.cpp b/cpp/src/bootstrap/file_backend.cpp index 5e1cafd64..c91a3fbce 100644 --- a/cpp/src/bootstrap/file_backend.cpp +++ b/cpp/src/bootstrap/file_backend.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index d54c8810c..6b891a297 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ From bdae5e5981c3a61b99997b99fa26ddc69c2fb2da Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 10:12:45 -0800 Subject: [PATCH 32/57] Move getenv_int/getenv_optional to utils --- cpp/include/rapidsmpf/bootstrap/utils.hpp | 24 +++++ cpp/src/bootstrap/bootstrap.cpp | 32 +------ cpp/src/bootstrap/utils.cpp | 102 ++++++++++------------ 3 files changed, 69 insertions(+), 89 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/utils.hpp b/cpp/include/rapidsmpf/bootstrap/utils.hpp index 7a3aa8813..c6453f909 100644 --- a/cpp/include/rapidsmpf/bootstrap/utils.hpp +++ b/cpp/include/rapidsmpf/bootstrap/utils.hpp @@ -5,13 +5,37 @@ #pragma once +#include #include +#include #include #include namespace rapidsmpf::bootstrap { +/** + * @brief Get environment variable as optional string. + * + * Retrieves the value of an environment variable by name, returning it as + * std::optional. Returns std::nullopt if the variable is not set. + * + * @param name Name of the environment variable to retrieve. + * @return Value of the environment variable, or std::nullopt if not set. + */ +std::optional getenv_optional(std::string_view name); + +/** + * @brief Parse integer from environment variable. + * + * Retrieves an environment variable and parses it as an integer. + * + * @param name Name of the environment variable to retrieve. + * @return Parsed integer value, or std::nullopt if not set. + * @throws std::runtime_error if the variable is set but cannot be parsed as an integer. + */ +std::optional getenv_int(std::string_view name); + /** * @brief Get current CPU affinity as a string. 
* diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 705ea2dc9..676931bf5 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #ifdef RAPIDSMPF_HAVE_SLURM @@ -22,37 +23,6 @@ namespace rapidsmpf::bootstrap { namespace { -/** - * @brief Get environment variable as string. - */ -std::optional getenv_optional(std::string_view name) { - // std::getenv requires a null-terminated string; construct a std::string - // to ensure this even when called with a non-literal std::string_view. - char const* value = std::getenv(std::string{name}.c_str()); - if (value == nullptr) { - return std::nullopt; - } - return std::string{value}; -} - -/** - * @brief Parse integer from environment variable. - */ -std::optional getenv_int(std::string_view name) { - auto value = getenv_optional(name); - if (!value) { - return std::nullopt; - } - try { - return std::stoi(*value); - } catch (...) { - throw std::runtime_error( - std::string{"Failed to parse integer from environment variable "} - + std::string{name} + ": " + *value - ); - } -} - /** * @brief Detect backend from environment variables. */ diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index 8b40674cd..58d287b40 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -26,6 +26,31 @@ namespace rapidsmpf::bootstrap { +std::optional getenv_optional(std::string_view name) { + // std::getenv requires a null-terminated string; construct a std::string + // to ensure this even when called with a non-literal std::string_view. + char const* value = std::getenv(std::string{name}.c_str()); + if (value == nullptr) { + return std::nullopt; + } + return std::string{value}; +} + +std::optional getenv_int(std::string_view name) { + auto value = getenv_optional(name); + if (!value) { + return std::nullopt; + } + try { + return std::stoi(*value); + } catch (...) 
{ + throw std::runtime_error( + std::string{"Failed to parse integer from environment variable "} + + std::string{name} + ": " + *value + ); + } +} + std::string get_current_cpu_affinity() { cpu_set_t cpuset; CPU_ZERO(&cpuset); @@ -73,67 +98,46 @@ std::string get_current_cpu_affinity() { } std::string get_ucx_net_devices() { - char* env = std::getenv("UCX_NET_DEVICES"); - return env ? std::string(env) : std::string(); + return getenv_optional("UCX_NET_DEVICES").value_or(""); } int get_gpu_id() { - char* cuda_visible = std::getenv("CUDA_VISIBLE_DEVICES"); + auto cuda_visible = getenv_optional("CUDA_VISIBLE_DEVICES"); if (cuda_visible) { try { - return std::stoi(cuda_visible); + return std::stoi(*cuda_visible); } catch (...) { // Ignore parse errors } } - return -1; } bool is_running_with_rrun() { - return std::getenv("RAPIDSMPF_RANK") != nullptr; + return getenv_optional("RAPIDSMPF_RANK").has_value(); } bool is_running_with_slurm() { - if (std::getenv("SLURM_JOB_ID") != nullptr && std::getenv("SLURM_PROCID") != nullptr) - { - return true; - } - return false; + return getenv_optional("SLURM_JOB_ID").has_value() + && getenv_optional("SLURM_PROCID").has_value(); } bool is_running_with_bootstrap() { - // Only return true if rrun is coordinating (i.e., RAPIDSMPF_RANK is set). - // Even if Slurm environment variables are present, the user may want to use - // MPI directly with `srun --mpi=pmix`, so we shouldn't force bootstrap mode - // unless rrun is explicitly managing the launch. return is_running_with_rrun(); } Rank get_rank() { // Check rrun first (explicit configuration takes priority) - if (char* rank_env = std::getenv("RAPIDSMPF_RANK")) { - try { - return std::stoi(rank_env); - } catch (...) { - // Ignore parse errors, try next source - } + if (auto rank_opt = getenv_int("RAPIDSMPF_RANK")) { + return *rank_opt; } // Check PMIx rank - if (char* rank_env = std::getenv("PMIX_RANK")) { - try { - return std::stoi(rank_env); - } catch (...) 
{ - // Ignore parse errors, try next source - } + if (auto rank_opt = getenv_int("PMIX_RANK")) { + return *rank_opt; } // Check Slurm process ID - if (char* rank_env = std::getenv("SLURM_PROCID")) { - try { - return std::stoi(rank_env); - } catch (...) { - // Ignore parse errors - } + if (auto rank_opt = getenv_int("SLURM_PROCID")) { + return *rank_opt; } return -1; } @@ -147,36 +151,18 @@ Rank get_nranks() { } // Check rrun first (explicit configuration takes priority) - if (char const* nranks_str = std::getenv("RAPIDSMPF_NRANKS")) { - try { - return std::stoi(nranks_str); - } catch (...) { - throw std::runtime_error( - "Failed to parse integer from RAPIDSMPF_NRANKS: " - + std::string(nranks_str) - ); - } + // getenv_int will throw if the variable is set but cannot be parsed + if (auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS")) { + return *nranks_opt; } // Check Slurm environment variables - if (char const* nranks_str = std::getenv("SLURM_NPROCS")) { - try { - return std::stoi(nranks_str); - } catch (...) { - throw std::runtime_error( - "Failed to parse integer from SLURM_NPROCS: " + std::string(nranks_str) - ); - } + if (auto nranks_opt = getenv_int("SLURM_NPROCS")) { + return *nranks_opt; } - if (char const* nranks_str = std::getenv("SLURM_NTASKS")) { - try { - return std::stoi(nranks_str); - } catch (...) 
{ - throw std::runtime_error( - "Failed to parse integer from SLURM_NTASKS: " + std::string(nranks_str) - ); - } + if (auto nranks_opt = getenv_int("SLURM_NTASKS")) { + return *nranks_opt; } throw std::runtime_error( From 3c176a69b748ec9345564dc714a1336f7097ead3 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 10:23:36 -0800 Subject: [PATCH 33/57] Use is_running_with_slurm --- cpp/src/bootstrap/bootstrap.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 676931bf5..11bb1350f 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -46,7 +46,7 @@ Backend detect_backend() { // Important: This path should only be taken by Slurm parent processes that are // NOT launched by rrun. Child processes launched by rrun will have RAPIDSMPF_* // variables set and will use FILE backend above. - if (getenv_optional("SLURM_JOB_ID") && getenv_optional("SLURM_PROCID")) { + if (is_running_with_slurm()) { return Backend::SLURM; } #endif From f7b0ed21b3ca52ba2073f2306de4b5f334b4927b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 10:27:37 -0800 Subject: [PATCH 34/57] Remove srun comments --- cpp/benchmarks/bench_comm.cpp | 4 ++-- cpp/benchmarks/bench_shuffle.cpp | 4 ++-- cpp/benchmarks/streaming/bench_streaming_shuffle.cpp | 4 ++-- cpp/benchmarks/streaming/ndsh/utils.cpp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/bench_comm.cpp b/cpp/benchmarks/bench_comm.cpp index b47ce0d94..2f276506f 100644 --- a/cpp/benchmarks/bench_comm.cpp +++ b/cpp/benchmarks/bench_comm.cpp @@ -281,14 +281,14 @@ int main(int argc, char** argv) { if (args.comm_type == "mpi") { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. " - << "Don't use with rrun/srun bootstrap mode." << std::endl; + << "Don't use with rrun bootstrap mode." 
<< std::endl; return 1; } mpi::init(&argc, &argv); comm = std::make_shared(MPI_COMM_WORLD, options); } else if (args.comm_type == "ucxx") { if (use_bootstrap) { - // Launched with rrun or srun --mpi=pmix - use bootstrap backend + // Launched with rrun - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( rapidsmpf::bootstrap::Backend::AUTO, options ); diff --git a/cpp/benchmarks/bench_shuffle.cpp b/cpp/benchmarks/bench_shuffle.cpp index c92cc6b29..2cce4a911 100644 --- a/cpp/benchmarks/bench_shuffle.cpp +++ b/cpp/benchmarks/bench_shuffle.cpp @@ -604,7 +604,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. Don't use with " - "rrun/srun bootstrap mode." + "rrun bootstrap mode." << std::endl; return 1; } @@ -612,7 +612,7 @@ int main(int argc, char** argv) { comm = std::make_shared(MPI_COMM_WORLD, options); } else if (args.comm_type == "ucxx") { if (use_bootstrap) { - // Launched with rrun or srun --mpi=pmix - use bootstrap backend + // Launched with rrun - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( rapidsmpf::bootstrap::Backend::AUTO, options ); diff --git a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp index c8e562577..6c757bfb2 100644 --- a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp +++ b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp @@ -325,7 +325,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. Don't use with " - "rrun/srun bootstrap mode." + "rrun bootstrap mode." 
<< std::endl; return 1; } @@ -333,7 +333,7 @@ int main(int argc, char** argv) { comm = std::make_shared(MPI_COMM_WORLD, options); } else if (args.comm_type == "ucxx") { if (use_bootstrap) { - // Launched with rrun or srun --mpi=pmix - use bootstrap backend + // Launched with rrun - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( rapidsmpf::bootstrap::Backend::AUTO, options ); diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 461c109ea..42df946fb 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -187,7 +187,7 @@ std::shared_ptr create_context( case CommType::MPI: RAPIDSMPF_EXPECTS( !bootstrap::is_running_with_bootstrap(), - "Can't use MPI communicator with rrun/srun bootstrap mode" + "Can't use MPI communicator with rrun" ); mpi::init(nullptr, nullptr); From caa918207d9c6b23992bd9a2effe824b26fe653e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 10:33:28 -0800 Subject: [PATCH 35/57] Revert comments to originals --- cpp/benchmarks/bench_comm.cpp | 2 +- cpp/benchmarks/bench_shuffle.cpp | 2 +- cpp/benchmarks/streaming/bench_streaming_shuffle.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/bench_comm.cpp b/cpp/benchmarks/bench_comm.cpp index 2f276506f..6d220d1e2 100644 --- a/cpp/benchmarks/bench_comm.cpp +++ b/cpp/benchmarks/bench_comm.cpp @@ -281,7 +281,7 @@ int main(int argc, char** argv) { if (args.comm_type == "mpi") { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. " - << "Don't use with rrun bootstrap mode." << std::endl; + << "Don't use with rrun or unset RAPIDSMPF_RANK." 
<< std::endl; return 1; } mpi::init(&argc, &argv); diff --git a/cpp/benchmarks/bench_shuffle.cpp b/cpp/benchmarks/bench_shuffle.cpp index 2cce4a911..329e8b962 100644 --- a/cpp/benchmarks/bench_shuffle.cpp +++ b/cpp/benchmarks/bench_shuffle.cpp @@ -604,7 +604,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. Don't use with " - "rrun bootstrap mode." + "rrun or unset RAPIDSMPF_RANK." << std::endl; return 1; } diff --git a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp index 6c757bfb2..68d87ec8e 100644 --- a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp +++ b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp @@ -325,7 +325,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { std::cerr << "Error: MPI communicator requires MPI initialization. Don't use with " - "rrun bootstrap mode." + "rrun or unset RAPIDSMPF_RANK." << std::endl; return 1; } From 3f149ae1312f758f22fef0e6ea5993af5b555bc6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 10:38:00 -0800 Subject: [PATCH 36/57] Improve get_rank/get_nranks implementations --- cpp/src/bootstrap/utils.cpp | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index 58d287b40..7cf352df7 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -127,19 +127,15 @@ bool is_running_with_bootstrap() { } Rank get_rank() { - // Check rrun first (explicit configuration takes priority) if (auto rank_opt = getenv_int("RAPIDSMPF_RANK")) { return *rank_opt; - } - // Check PMIx rank - if (auto rank_opt = getenv_int("PMIX_RANK")) { + } else if (auto rank_opt = getenv_int("PMIX_RANK")) { return *rank_opt; - } - // Check Slurm process ID - if (auto rank_opt = getenv_int("SLURM_PROCID")) { + } else if (auto rank_opt = 
getenv_int("SLURM_PROCID")) { return *rank_opt; + } else { + return -1; } - return -1; } Rank get_nranks() { @@ -148,27 +144,18 @@ Rank get_nranks() { "get_nranks() can only be called when running with a bootstrap launcher. " "Use 'rrun' or 'srun --mpi=pmix' to launch the application." ); - } - - // Check rrun first (explicit configuration takes priority) - // getenv_int will throw if the variable is set but cannot be parsed - if (auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS")) { + } else if (auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS")) { return *nranks_opt; - } - - // Check Slurm environment variables - if (auto nranks_opt = getenv_int("SLURM_NPROCS")) { + } else if (auto nranks_opt = getenv_int("SLURM_NPROCS")) { return *nranks_opt; - } - - if (auto nranks_opt = getenv_int("SLURM_NTASKS")) { + } else if (auto nranks_opt = getenv_int("SLURM_NTASKS")) { return *nranks_opt; + } else { + throw std::runtime_error( + "Could not determine number of ranks. " + "Ensure RAPIDSMPF_NRANKS, SLURM_NPROCS, or SLURM_NTASKS is set." + ); } - - throw std::runtime_error( - "Could not determine number of ranks. " - "Ensure RAPIDSMPF_NRANKS, SLURM_NPROCS, or SLURM_NTASKS is set." 
- ); } std::vector parse_cpu_list(std::string const& cpulist) { From 1bc3e68b48e05d0b1ab6c2e362e39970e3b81984 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 12:00:30 -0800 Subject: [PATCH 37/57] Restructure create_ucxx_comm --- cpp/src/bootstrap/ucxx.cpp | 170 ++++++++++++++++++------------------- 1 file changed, 81 insertions(+), 89 deletions(-) diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index f46a95723..5bad871d2 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -62,105 +63,96 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op std::shared_ptr comm; - // Check if root address was provided by parent process (rrun hybrid mode) - char const* precomputed_address_encoded = std::getenv("RAPIDSMPF_ROOT_ADDRESS"); + auto precomputed_address_encoded = getenv_optional("RAPIDSMPF_ROOT_ADDRESS"); + auto address_file = getenv_optional("RAPIDSMPF_ROOT_ADDRESS_FILE"); - if (precomputed_address_encoded != nullptr && ctx.rank != 0) { - // Parent process already coordinated the root address via PMIx - // Address is hex-encoded to avoid issues with binary data in env vars - // Note: Only non-root ranks use this path. Rank 0 should always create the - // listener. - std::string precomputed_address = hex_decode(precomputed_address_encoded); + // Path 1: Early address mode for root rank in Slurm hybrid mode. + // Rank 0 is launched first to create its address and write it to a file. + // Parent will coordinate with other parents via PMIx, then launch worker ranks + // with RAPIDSMPF_ROOT_ADDRESS set. No PMIx put/barrier/get bootstrap coordination. 
+ if (ctx.rank == 0 && address_file.has_value()) { + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, std::nullopt, options); + comm = std::make_shared(std::move(ucxx_initialized_rank), options); + + auto listener_address = comm->listener_address(); + auto root_worker_address_str = + std::get>(listener_address.address) + ->getString(); + + std::string encoded_address = hex_encode(root_worker_address_str); + std::ofstream addr_file(*address_file); + if (!addr_file) { + throw std::runtime_error( + "Failed to write root address to file: " + *address_file + ); + } + addr_file << encoded_address << std::endl; + addr_file.close(); + + auto verbose = getenv_optional("RAPIDSMPF_VERBOSE"); + if (verbose && *verbose == "1") { + std::cerr << "[rank 0] Wrote address to " << *address_file + << ", skipping bootstrap coordination" << std::endl; + } - // Worker children connect using provided address + // Unset the flag so rank 0 participates in the final barrier + unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); + } + // Path 2: Slurm hybrid mode for non-root ranks. + // Parent process already coordinated the root address via PMIx and provided it + // via RAPIDSMPF_ROOT_ADDRESS environment variable (hex-encoded). + else if (precomputed_address_encoded.has_value() && ctx.rank != 0) + { + std::string precomputed_address = hex_decode(*precomputed_address_encoded); auto root_worker_address = ::ucxx::createAddressFromString(precomputed_address); auto ucxx_initialized_rank = ucxx::init(nullptr, ctx.nranks, root_worker_address, options); comm = std::make_shared(std::move(ucxx_initialized_rank), options); - } else { - // Standard bootstrap coordination via put/get/barrier - - // Special case: If rank 0 is asked to write address file before full bootstrap, - // it means we're in rrun hybrid parent-mediated mode where rank 0 is launched - // first to get its address, then other ranks are launched later. - // In this case, skip the put/barrier/get dance and just create the listener. 
- char const* address_file = std::getenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); - bool early_address_mode = (ctx.rank == 0 && address_file != nullptr); - - if (ctx.rank == 0) { - // Create root UCXX communicator - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, std::nullopt, options); - comm = - std::make_shared(std::move(ucxx_initialized_rank), options); - - // Get the listener address - auto listener_address = comm->listener_address(); - auto root_worker_address_str = - std::get>(listener_address.address) - ->getString(); - - if (early_address_mode) { - // Write address file immediately and skip bootstrap coordination - // Parent will coordinate with other parents via PMIx - // Encode as hex to avoid issues with binary data - std::string encoded_address = hex_encode(root_worker_address_str); - std::ofstream addr_file(address_file); - if (!addr_file) { - throw std::runtime_error( - "Failed to write root address to file: " - + std::string{address_file} - ); - } - addr_file << encoded_address << std::endl; - addr_file.close(); - - char const* verbose = std::getenv("RAPIDSMPF_VERBOSE"); - if (verbose && std::string{verbose} == "1") { - std::cerr << "[rank 0] Wrote address to " << address_file - << ", skipping bootstrap coordination" << std::endl; - } - - // Unset the flag so rank 0 won't skip the final barrier - // (we need all ranks to synchronize at the end) - unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); - - // Skip put/barrier - other ranks will get address via - // RAPIDSMPF_ROOT_ADDRESS Return early, don't do full bootstrap - } else { - // Normal mode: publish address for other ranks - put(ctx, "ucxx_root_address", root_worker_address_str); - } - } + } + // Path 3: Normal bootstrap mode for root rank. + // Create listener and publish address via PMIx put() for non-root ranks to retrieve. 
+ else if (ctx.rank == 0) + { + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, std::nullopt, options); + comm = std::make_shared(std::move(ucxx_initialized_rank), options); + + auto listener_address = comm->listener_address(); + auto root_worker_address_str = + std::get>(listener_address.address) + ->getString(); + + put(ctx, "ucxx_root_address", root_worker_address_str); - if (!early_address_mode) { - // For PMIx/Slurm backend, barrier is needed to execute PMIx_Fence - // which makes put() data visible across nodes. - // For FILE backend, barrier is not needed since put/get already - // provide implicit synchronization via filesystem operations. - if (ctx.backend == Backend::SLURM) { - barrier(ctx); - } - - if (ctx.rank != 0) { - // Worker ranks retrieve the root address and connect - auto root_worker_address_str = - get(ctx, "ucxx_root_address", std::chrono::seconds{30}); - auto root_worker_address = - ::ucxx::createAddressFromString(root_worker_address_str); - - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, root_worker_address, options); - comm = std::make_shared( - std::move(ucxx_initialized_rank), options - ); - } + // For PMIx/Slurm backend, barrier is needed to execute PMIx_Fence + // which makes put() data visible across nodes. + // For FILE backend, barrier is not needed since put/get already + // provide implicit synchronization via filesystem operations. + if (ctx.backend == Backend::SLURM) { + barrier(ctx); + } + } + // Path 4: Normal bootstrap mode for non-root ranks. + // Retrieve root address via get() and connect. + else + { + // For PMIx/Slurm backend, barrier is needed to execute PMIx_Fence + // which makes put() data visible across nodes. 
+ if (ctx.backend == Backend::SLURM) { + barrier(ctx); } + + auto root_worker_address_str = + get(ctx, "ucxx_root_address", std::chrono::seconds{30}); + auto root_worker_address = + ::ucxx::createAddressFromString(root_worker_address_str); + + auto ucxx_initialized_rank = + ucxx::init(nullptr, ctx.nranks, root_worker_address, options); + comm = std::make_shared(std::move(ucxx_initialized_rank), options); } - // Final barrier to synchronize all ranks before returning - // Note: rank 0 in early address mode unsets RAPIDSMPF_ROOT_ADDRESS_FILE - // after writing the file, so it participates in this barrier comm->barrier(); return comm; From 51b7a169990f9496854b1cf89223dbafe200d35f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 12:45:33 -0800 Subject: [PATCH 38/57] Add sync() to ensure put() data visibilty --- cpp/include/rapidsmpf/bootstrap/bootstrap.hpp | 14 +++++++++++++ .../rapidsmpf/bootstrap/file_backend.hpp | 11 +++++++++- .../rapidsmpf/bootstrap/slurm_backend.hpp | 12 +++++++++++ cpp/src/bootstrap/bootstrap.cpp | 21 +++++++++++++++++++ cpp/src/bootstrap/file_backend.cpp | 4 +++- cpp/src/bootstrap/slurm_backend.cpp | 19 +++++++++++++++++ cpp/src/bootstrap/ucxx.cpp | 17 +++------------ 7 files changed, 82 insertions(+), 16 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp index a23293741..b375bdf20 100644 --- a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp +++ b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp @@ -118,6 +118,20 @@ void broadcast(Context const& ctx, void* data, std::size_t size, Rank root = 0); */ void barrier(Context const& ctx); +/** + * @brief Ensure all previous put() operations are globally visible. + * + * Different backends have different visibility semantics for put() operations: + * - Slurm/PMIx: Requires explicit fence (PMIx_Fence) to make data visible across nodes. 
+ * - FILE: put() operations are immediately visible via atomic filesystem operations. + * + * This function abstracts these differences. Call sync() after put() operations + * to ensure data is visible to other ranks before they attempt get(). + * + * @param ctx Bootstrap context. + */ +void sync(Context const& ctx); + /** * @brief Store a key-value pair in the coordination backend. * diff --git a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp index 3805ad798..144af97ac 100644 --- a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -64,6 +64,15 @@ class FileBackend { */ void barrier(); + /** + * @brief Ensure all previous put() operations are globally visible. + * + * For FileBackend, this is a no-op since put() operations use atomic + * file writes that are immediately visible to all processes via the + * shared filesystem. + */ + void sync(); + /** * @brief Broadcast data from root to all ranks. * diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index 26782cd2d..b7a678ef7 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -99,6 +99,18 @@ class SlurmBackend { */ void barrier(); + /** + * @brief Ensure all previous put() operations are globally visible. + * + * For Slurm/PMIx backend, this executes PMIx_Fence to make all committed + * key-value pairs visible across all nodes. This is required because + * PMIx_Put + PMIx_Commit only makes data locally visible; PMIx_Fence + * performs the global synchronization and data exchange. 
+ * + * @throws std::runtime_error if PMIx_Fence fails. + */ + void sync(); + /** * @brief Broadcast data from root to all ranks. * diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 11bb1350f..448c95616 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -208,6 +208,27 @@ void barrier(Context const& ctx) { } } +void sync(Context const& ctx) { + switch (ctx.backend) { + case Backend::FILE: + { + detail::FileBackend backend{ctx}; + backend.sync(); + break; + } +#ifdef RAPIDSMPF_HAVE_SLURM + case Backend::SLURM: + { + detail::SlurmBackend backend{ctx}; + backend.sync(); + break; + } +#endif + default: + throw std::runtime_error("sync not implemented for this backend"); + } +} + void put(Context const& ctx, std::string const& key, std::string const& value) { switch (ctx.backend) { case Backend::FILE: diff --git a/cpp/src/bootstrap/file_backend.cpp b/cpp/src/bootstrap/file_backend.cpp index c91a3fbce..c267606ae 100644 --- a/cpp/src/bootstrap/file_backend.cpp +++ b/cpp/src/bootstrap/file_backend.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -108,6 +108,8 @@ void FileBackend::barrier() { std::filesystem::remove(my_barrier_file, ec); } +void FileBackend::sync() {} + void FileBackend::broadcast(void* data, std::size_t size, Rank root) { if (ctx_.rank == root) { // Root writes data diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index 6b891a297..b0e4dd2f8 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -218,6 +218,25 @@ void SlurmBackend::barrier() { } } +void SlurmBackend::sync() { + pmix_proc_t proc; + PMIX_PROC_CONSTRUCT(&proc); + std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); + proc.rank = PMIX_RANK_WILDCARD; + + pmix_info_t info; + bool collect = true; + PMIX_INFO_CONSTRUCT(&info); + PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); + + pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); + PMIX_INFO_DESTRUCT(&info); + + if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { + throw std::runtime_error("PMIx_Fence (sync) failed: " + pmix_error_string(rc)); + } +} + void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { // Use unique key for each broadcast to avoid collisions std::string bcast_key = diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 5bad871d2..e4e1c2bd9 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -111,7 +111,7 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op comm = std::make_shared(std::move(ucxx_initialized_rank), options); } // Path 3: Normal bootstrap mode for root rank. - // Create listener and publish address via PMIx put() for non-root ranks to retrieve. + // Create listener and publish address via put() for non-root ranks to retrieve. 
else if (ctx.rank == 0) { auto ucxx_initialized_rank = @@ -124,24 +124,13 @@ std::shared_ptr create_ucxx_comm(Backend backend, config::Options op ->getString(); put(ctx, "ucxx_root_address", root_worker_address_str); - - // For PMIx/Slurm backend, barrier is needed to execute PMIx_Fence - // which makes put() data visible across nodes. - // For FILE backend, barrier is not needed since put/get already - // provide implicit synchronization via filesystem operations. - if (ctx.backend == Backend::SLURM) { - barrier(ctx); - } + sync(ctx); } // Path 4: Normal bootstrap mode for non-root ranks. // Retrieve root address via get() and connect. else { - // For PMIx/Slurm backend, barrier is needed to execute PMIx_Fence - // which makes put() data visible across nodes. - if (ctx.backend == Backend::SLURM) { - barrier(ctx); - } + sync(ctx); auto root_worker_address_str = get(ctx, "ucxx_root_address", std::chrono::seconds{30}); From 54b8a2461c519df03987446679a67db5a000ab0b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 13:15:49 -0800 Subject: [PATCH 39/57] Improve slurm rank/nranks parsing during initialization --- cpp/include/rapidsmpf/bootstrap/utils.hpp | 6 +++- cpp/src/bootstrap/bootstrap.cpp | 44 +++++++---------------- cpp/src/bootstrap/utils.cpp | 12 +++---- 3 files changed, 22 insertions(+), 40 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/utils.hpp b/cpp/include/rapidsmpf/bootstrap/utils.hpp index c6453f909..4660944bc 100644 --- a/cpp/include/rapidsmpf/bootstrap/utils.hpp +++ b/cpp/include/rapidsmpf/bootstrap/utils.hpp @@ -108,7 +108,10 @@ bool is_running_with_bootstrap(); * 2. PMIX_RANK (set by PMIx) * 3. SLURM_PROCID (set by Slurm) * - * @return Rank of the current process (>= 0) if found, -1 otherwise. + * @return Rank of the current process. + * + * @throws std::runtime_error if not running with a bootstrap launcher or if + * the environment variable cannot be parsed. 
*/ Rank get_rank(); @@ -122,6 +125,7 @@ Rank get_rank(); * 3. SLURM_NTASKS (set by Slurm) * * @return Number of ranks. + * * @throws std::runtime_error if not running with a bootstrap launcher or if * the environment variable cannot be parsed. */ diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 448c95616..bf77b0e60 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -27,9 +27,9 @@ namespace { * @brief Detect backend from environment variables. */ Backend detect_backend() { - // Check for rrun coordination first (explicit configuration takes priority) + // Check for rrun coordination first (explicit configuration takes priority). // If RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS is set, rrun is coordinating - // and we should use FILE backend (with or without pre-coordinated address) + // and we should use FILE backend (with or without pre-coordinated address). if (getenv_optional("RAPIDSMPF_COORD_DIR") || getenv_optional("RAPIDSMPF_ROOT_ADDRESS")) { @@ -105,44 +105,24 @@ Context init(Backend backend) { case Backend::SLURM: { #ifdef RAPIDSMPF_HAVE_SLURM - // For SLURM backend, we can get rank/nranks from multiple sources: - // 1. Explicit RAPIDSMPF_* variables (override) - // 2. PMIx environment variables (set by pmix-enabled srun) - // 3. Slurm environment variables (fallback) - auto rank_opt = getenv_int("RAPIDSMPF_RANK"); - if (!rank_opt) { - rank_opt = getenv_int("PMIX_RANK"); - } - if (!rank_opt) { - rank_opt = getenv_int("SLURM_PROCID"); - } - - auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS"); - if (!nranks_opt) { - nranks_opt = getenv_int("SLURM_NPROCS"); - } - if (!nranks_opt) { - nranks_opt = getenv_int("SLURM_NTASKS"); - } - - if (!rank_opt.has_value()) { + try { + ctx.rank = get_rank(); + } catch (const std::runtime_error& e) { throw std::runtime_error( - "Could not determine rank for SLURM backend. " - "Ensure you're running with 'srun --mpi=pmix' or set RAPIDSMPF_RANK." 
+ "Could not determine rank for Slurm backend. " + "Ensure you're running with 'srun --mpi=pmix'." ); } - if (!nranks_opt.has_value()) { + try { + ctx.nranks = get_nranks(); + } catch (const std::runtime_error& e) { throw std::runtime_error( - "Could not determine nranks for SLURM backend. " - "Ensure you're running with 'srun --mpi=pmix' or set " - "RAPIDSMPF_NRANKS." + "Could not determine nranks for Slurm backend. " + "Ensure you're running with 'srun --mpi=pmix'." ); } - ctx.rank = static_cast(*rank_opt); - ctx.nranks = static_cast(*nranks_opt); - if (!(ctx.rank >= 0 && ctx.rank < ctx.nranks)) { throw std::runtime_error( "Invalid rank: " + std::to_string(ctx.rank) + " must be in range [0, " diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index 7cf352df7..94ac83afc 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -134,17 +134,15 @@ Rank get_rank() { } else if (auto rank_opt = getenv_int("SLURM_PROCID")) { return *rank_opt; } else { - return -1; + throw std::runtime_error( + "Could not determine number of ranks. " + "Ensure RAPIDSMPF_RANK, PMIX_RANK, or SLURM_PROCID is set." + ); } } Rank get_nranks() { - if (!is_running_with_bootstrap()) { - throw std::runtime_error( - "get_nranks() can only be called when running with a bootstrap launcher. " - "Use 'rrun' or 'srun --mpi=pmix' to launch the application." 
- ); - } else if (auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS")) { + if (auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS")) { return *nranks_opt; } else if (auto nranks_opt = getenv_int("SLURM_NPROCS")) { return *nranks_opt; From e489cd120835d97d9e6ba678ca0f2f079a90dc7d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 13:29:55 -0800 Subject: [PATCH 40/57] Clarify example --- .../rapidsmpf/bootstrap/slurm_backend.hpp | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index b7a678ef7..7915e7398 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -27,18 +27,27 @@ namespace rapidsmpf::bootstrap::detail { * a shared filesystem. It is designed for Slurm clusters and supports multi-node * deployments. * - * PMIx operations: - * - PMIx_Put/PMIx_Get: Key-value store operations - * - PMIx_Commit: Make local puts visible - * - PMIx_Fence: Global synchronization and data exchange - * * Usage: * ```bash - * # Single node - * srun --mpi=pmix -n 4 ./my_program + * # Passthrough: multiple (4) tasks per node, one task per GPU, two nodes. + * srun \ + * --mpi=pmix \ + * --nodes=2 \ + * --ntasks-per-node=4 \ + * --cpus-per-task=36 \ + * --gpus-per-task=1 \ + * --gres=gpu:4 \ + * rrun ./benchmarks/bench_shuffle -C ucxx * - * # Multi-node - * srun --mpi=pmix -N 2 -n 8 ./my_program + * # Hybrid mode: one task per node, 4 GPUs per task, two nodes. 
+ * srun \ + * --mpi=pmix \ + * --nodes=2 \ + * --ntasks-per-node=1 \ + * --cpus-per-task=144 \ + * --gpus-per-task=4 \ + * --gres=gpu:4 \ + * rrun -n 4 ./benchmarks/bench_shuffle -C ucxx * ``` */ class SlurmBackend { From 3ab0819db433d5779965f668926f3520829d12d6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 13:39:42 -0800 Subject: [PATCH 41/57] More cleanups --- cpp/src/bootstrap/slurm_backend.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index b0e4dd2f8..d8768fa49 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -67,7 +67,6 @@ SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { std::lock_guard lock{g_pmix_mutex}; if (!g_pmix_initialized) { - // First instance - initialize PMIx (will stay initialized for process lifetime) pmix_proc_t proc; pmix_status_t rc = PMIx_Init(&proc, nullptr, 0); if (rc != PMIX_SUCCESS) { @@ -91,7 +90,7 @@ SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { nspace_ = g_pmix_nspace; // Verify rank matches what we expect (if context has a valid rank) - // Note: For SLURM backend, ctx_.rank may be set from environment variables + // Note: For Slurm backend, ctx_.rank may be set from environment variables // before PMIx_Init, so we verify they match if (ctx_.rank >= 0 && std::cmp_not_equal(g_pmix_proc.rank, ctx_.rank)) { throw std::runtime_error( From 6d60d761f5a606673dfe2b27c93eba1bb2b952d0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 13:49:18 -0800 Subject: [PATCH 42/57] Fix PMIx finalizer --- cpp/src/bootstrap/slurm_backend.cpp | 33 ++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index d8768fa49..17b657d94 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp 
@@ -26,9 +26,8 @@ namespace { // PMIx initialization is process-global and must only happen once. // Once initialized, PMIx stays active for the lifetime of the process. -// We track initialization state but do NOT finalize PMIx in the destructor, -// as multiple SlurmBackend instances may be created/destroyed during the -// bootstrap process. PMIx will be cleaned up when the process exits. +// We register an atexit handler to finalize PMIx when the process exits, +// ensuring proper cleanup without breaking ongoing collective operations. // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::mutex g_pmix_mutex; // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) @@ -38,6 +37,22 @@ pmix_proc_t g_pmix_proc{}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::array g_pmix_nspace{}; +/** + * @brief Finalize PMIx at process exit. + * + * This function is registered via atexit() to ensure PMIx is properly + * finalized when the process terminates. It runs after all destructors, + * so PMIx remains available for the entire process lifetime. + */ +void finalize_pmix_at_exit() { + // Note: This runs at process exit after all destructors. + // No need for mutex as other threads should be done by now. + if (g_pmix_initialized) { + PMIx_Finalize(nullptr, 0); + g_pmix_initialized = false; + } +} + /** * @brief Convert PMIx status to string for error messages. * @@ -81,6 +96,9 @@ SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { static_assert(sizeof(proc.nspace) == PMIX_MAX_NSLEN + 1); std::memcpy(g_pmix_nspace.data(), proc.nspace, g_pmix_nspace.size()); g_pmix_initialized = true; + + // Register cleanup handler to finalize PMIx at process exit + std::atexit(finalize_pmix_at_exit); } pmix_initialized_ = true; @@ -106,14 +124,13 @@ SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { } SlurmBackend::~SlurmBackend() { - // Intentionally do NOT call PMIx_Finalize here. 
// PMIx must stay initialized for the lifetime of the process because // multiple SlurmBackend instances may be created and destroyed during - // bootstrap operations (put, barrier, get each create a new instance). + // bootstrap operations, and finalizing PMIx while other processes are + // still in collective operations (fence/barrier) will cause errors. // - // TODO: Check whether it's safe to let PMIx clean itself up when the - // process exits, and potentially come up with a better solution. Maybe - // refcounting? + // PMIx_Finalize is called via the atexit handler registered during + // initialization, ensuring proper cleanup when the process exits. } void SlurmBackend::put(std::string const& key, std::string const& value) { From 0d573ee27b92e8d458889bd88e23f4e3c92d2e00 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 13:53:17 -0800 Subject: [PATCH 43/57] Use a class for PMIx global state --- cpp/src/bootstrap/slurm_backend.cpp | 93 +++++++++++++++++------------ 1 file changed, 56 insertions(+), 37 deletions(-) diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index 17b657d94..2b96f7f80 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -24,34 +24,55 @@ namespace rapidsmpf::bootstrap::detail { namespace { -// PMIx initialization is process-global and must only happen once. -// Once initialized, PMIx stays active for the lifetime of the process. -// We register an atexit handler to finalize PMIx when the process exits, -// ensuring proper cleanup without breaking ongoing collective operations. 
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -std::mutex g_pmix_mutex; -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -bool g_pmix_initialized = false; -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -pmix_proc_t g_pmix_proc{}; -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -std::array g_pmix_nspace{}; - /** - * @brief Finalize PMIx at process exit. + * @brief Process-global PMIx state and lifecycle management. + * + * PMIx initialization is process-global and must only happen once. + * This singleton encapsulates all PMIx state and ensures proper cleanup + * when the process exits, without breaking ongoing collective operations. * - * This function is registered via atexit() to ensure PMIx is properly - * finalized when the process terminates. It runs after all destructors, - * so PMIx remains available for the entire process lifetime. + * Uses Meyer's Singleton pattern (function-local static) which provides: + * - Thread-safe initialization (guaranteed by C++11) + * - Automatic destruction at program exit (after all other destructors) + * - Clean encapsulation of related state */ -void finalize_pmix_at_exit() { - // Note: This runs at process exit after all destructors. - // No need for mutex as other threads should be done by now. - if (g_pmix_initialized) { - PMIx_Finalize(nullptr, 0); - g_pmix_initialized = false; +class PmixGlobalState { + public: + std::mutex mutex; + bool initialized{false}; + pmix_proc_t proc{}; + std::array nspace{}; + + /** + * @brief Get the singleton instance. + * + * Thread-safe initialization guaranteed by C++11 (function-local static). + * The instance is destroyed automatically at program exit. 
+ */ + static PmixGlobalState& instance() { + static PmixGlobalState state; + return state; } -} + + // Non-copyable, non-movable + PmixGlobalState(PmixGlobalState const&) = delete; + PmixGlobalState& operator=(PmixGlobalState const&) = delete; + PmixGlobalState(PmixGlobalState&&) = delete; + PmixGlobalState& operator=(PmixGlobalState&&) = delete; + + private: + PmixGlobalState() = default; + + ~PmixGlobalState() { + // Finalize PMIx when the singleton is destroyed at program exit. + // This happens after all other destructors, so PMIx remains available + // for the entire process lifetime. + if (initialized) { + PMIx_Finalize(nullptr, 0); + initialized = false; + } + } +}; /** * @brief Convert PMIx status to string for error messages. @@ -79,9 +100,10 @@ void check_pmix_status(pmix_status_t status, std::string const& operation) { } // namespace SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { - std::lock_guard lock{g_pmix_mutex}; + auto& pmix_state = PmixGlobalState::instance(); + std::lock_guard lock{pmix_state.mutex}; - if (!g_pmix_initialized) { + if (!pmix_state.initialized) { pmix_proc_t proc; pmix_status_t rc = PMIx_Init(&proc, nullptr, 0); if (rc != PMIX_SUCCESS) { @@ -91,35 +113,32 @@ SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { ); } - g_pmix_proc = proc; + pmix_state.proc = proc; // Copy full nspace buffer (both are PMIX_MAX_NSLEN + 1 in size) static_assert(sizeof(proc.nspace) == PMIX_MAX_NSLEN + 1); - std::memcpy(g_pmix_nspace.data(), proc.nspace, g_pmix_nspace.size()); - g_pmix_initialized = true; - - // Register cleanup handler to finalize PMIx at process exit - std::atexit(finalize_pmix_at_exit); + std::memcpy(pmix_state.nspace.data(), proc.nspace, pmix_state.nspace.size()); + pmix_state.initialized = true; } pmix_initialized_ = true; // Copy global state to instance members - proc_ = g_pmix_proc; - nspace_ = g_pmix_nspace; + proc_ = pmix_state.proc; + nspace_ = pmix_state.nspace; // Verify rank matches what we 
expect (if context has a valid rank) // Note: For Slurm backend, ctx_.rank may be set from environment variables // before PMIx_Init, so we verify they match - if (ctx_.rank >= 0 && std::cmp_not_equal(g_pmix_proc.rank, ctx_.rank)) { + if (ctx_.rank >= 0 && std::cmp_not_equal(pmix_state.proc.rank, ctx_.rank)) { throw std::runtime_error( - "PMIx rank (" + std::to_string(g_pmix_proc.rank) + "PMIx rank (" + std::to_string(pmix_state.proc.rank) + ") doesn't match context rank (" + std::to_string(ctx_.rank) + ")" ); } // Update context rank from PMIx if not already set if (ctx_.rank < 0) { - ctx_.rank = static_cast(g_pmix_proc.rank); + ctx_.rank = static_cast(pmix_state.proc.rank); } } From 17eaa2d55c2b8211984a17219411e5e0e5220861 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 14:07:30 -0800 Subject: [PATCH 44/57] More SlurmBackend cleanup --- .../rapidsmpf/bootstrap/slurm_backend.hpp | 4 +++ cpp/src/bootstrap/slurm_backend.cpp | 29 ++++--------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index 7915e7398..61aef9be1 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -58,6 +58,7 @@ class SlurmBackend { * Initializes PMIx and retrieves process information from the runtime. * * @param ctx Bootstrap context containing rank information. + * * @throws std::runtime_error if PMIx initialization fails. */ explicit SlurmBackend(Context ctx); @@ -81,6 +82,7 @@ class SlurmBackend { * * @param key Key name. * @param value Value to store. + * * @throws std::runtime_error if PMIx operation fails. */ void put(std::string const& key, std::string const& value); @@ -94,6 +96,7 @@ class SlurmBackend { * @param key Key name. * @param timeout Timeout duration. * @return Value associated with key. + * * @throws std::runtime_error if key not found within timeout. 
*/ std::string get(std::string const& key, Duration timeout); @@ -129,6 +132,7 @@ class SlurmBackend { * @param data Data buffer (input on root, output on others). * @param size Size in bytes. * @param root Root rank. + * * @throws std::runtime_error if broadcast fails or size mismatch occurs. */ void broadcast(void* data, std::size_t size, Rank root); diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index 2b96f7f80..94853a30a 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -30,11 +30,6 @@ namespace { * PMIx initialization is process-global and must only happen once. * This singleton encapsulates all PMIx state and ensures proper cleanup * when the process exits, without breaking ongoing collective operations. - * - * Uses Meyer's Singleton pattern (function-local static) which provides: - * - Thread-safe initialization (guaranteed by C++11) - * - Automatic destruction at program exit (after all other destructors) - * - Clean encapsulation of related state */ class PmixGlobalState { public: @@ -45,16 +40,12 @@ class PmixGlobalState { /** * @brief Get the singleton instance. - * - * Thread-safe initialization guaranteed by C++11 (function-local static). - * The instance is destroyed automatically at program exit. */ static PmixGlobalState& instance() { static PmixGlobalState state; return state; } - // Non-copyable, non-movable PmixGlobalState(PmixGlobalState const&) = delete; PmixGlobalState& operator=(PmixGlobalState const&) = delete; PmixGlobalState(PmixGlobalState&&) = delete; @@ -63,10 +54,10 @@ class PmixGlobalState { private: PmixGlobalState() = default; + /** + * @brief Destructor ensuring PMIx finalization only at program exit. + */ ~PmixGlobalState() { - // Finalize PMIx when the singleton is destroyed at program exit. - // This happens after all other destructors, so PMIx remains available - // for the entire process lifetime. 
if (initialized) { PMIx_Finalize(nullptr, 0); initialized = false; @@ -147,9 +138,6 @@ SlurmBackend::~SlurmBackend() { // multiple SlurmBackend instances may be created and destroyed during // bootstrap operations, and finalizing PMIx while other processes are // still in collective operations (fence/barrier) will cause errors. - // - // PMIx_Finalize is called via the atexit handler registered during - // initialization, ensuring proper cleanup when the process exits. } void SlurmBackend::put(std::string const& key, std::string const& value) { @@ -184,7 +172,7 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { pmix_proc_t proc; PMIX_PROC_CONSTRUCT(&proc); std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); - proc.rank = 0; // Get from rank 0 specifically + proc.rank = 0; while (true) { pmix_value_t* val = nullptr; @@ -211,7 +199,6 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { return result; } - // Check timeout auto elapsed = std::chrono::steady_clock::now() - start; if (elapsed >= timeout) { throw std::runtime_error( @@ -223,31 +210,27 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { ); } - // Sleep before retry std::this_thread::sleep_for(poll_interval); } } void SlurmBackend::barrier() { - // Create proc array for all ranks (wildcard) in our namespace pmix_proc_t proc; PMIX_PROC_CONSTRUCT(&proc); std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); proc.rank = PMIX_RANK_WILDCARD; - // Set up info to collect data during fence pmix_info_t info; bool collect = true; PMIX_INFO_CONSTRUCT(&info); PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); - // PMIx_Fence performs synchronization barrier and data exchange pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); PMIX_INFO_DESTRUCT(&info); - // Accept both SUCCESS and PARTIAL_SUCCESS for the fence + // Accept both SUCCESS and PARTIAL_SUCCESS for the fence. 
// PARTIAL_SUCCESS can occur in some PMIx implementations when not all - // processes have data to contribute, but the synchronization succeeded + // processes have data to contribute, but the synchronization succeeded. if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { throw std::runtime_error("PMIx_Fence (barrier) failed: " + pmix_error_string(rc)); } From 741e1a93c16e7beeaf338c0f547e0ba1dac04d07 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 14:09:54 -0800 Subject: [PATCH 45/57] Use same implementation function for SlurmBackend barrier/sync --- cpp/src/bootstrap/slurm_backend.cpp | 74 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index 94853a30a..720d2943d 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -88,6 +88,43 @@ void check_pmix_status(pmix_status_t status, std::string const& operation) { } } +/** + * @brief Perform PMIx fence operation across all ranks. + * + * Executes PMIx_Fence with PMIX_COLLECT_DATA to synchronize all ranks + * in the namespace and exchange data. Accepts both PMIX_SUCCESS and + * PMIX_ERR_PARTIAL_SUCCESS as success conditions, since PARTIAL_SUCCESS + * can occur in some PMIx implementations when not all processes have + * data to contribute, but the synchronization succeeded. + * + * @param nspace The PMIx namespace to fence across. + * @param operation_name Name of the operation for error messages (e.g., "barrier", + * "sync"). + * @throws std::runtime_error if the fence operation fails. 
+ */ +void pmix_fence_all( + std::array const& nspace, std::string const& operation_name +) { + pmix_proc_t proc; + PMIX_PROC_CONSTRUCT(&proc); + std::memcpy(proc.nspace, nspace.data(), nspace.size()); + proc.rank = PMIX_RANK_WILDCARD; + + pmix_info_t info; + bool collect = true; + PMIX_INFO_CONSTRUCT(&info); + PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); + + pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); + PMIX_INFO_DESTRUCT(&info); + + if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { + throw std::runtime_error( + "PMIx_Fence (" + operation_name + ") failed: " + pmix_error_string(rc) + ); + } +} + } // namespace SlurmBackend::SlurmBackend(Context ctx) : ctx_{std::move(ctx)} { @@ -215,44 +252,11 @@ std::string SlurmBackend::get(std::string const& key, Duration timeout) { } void SlurmBackend::barrier() { - pmix_proc_t proc; - PMIX_PROC_CONSTRUCT(&proc); - std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); - proc.rank = PMIX_RANK_WILDCARD; - - pmix_info_t info; - bool collect = true; - PMIX_INFO_CONSTRUCT(&info); - PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); - - pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); - PMIX_INFO_DESTRUCT(&info); - - // Accept both SUCCESS and PARTIAL_SUCCESS for the fence. - // PARTIAL_SUCCESS can occur in some PMIx implementations when not all - // processes have data to contribute, but the synchronization succeeded. 
- if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { - throw std::runtime_error("PMIx_Fence (barrier) failed: " + pmix_error_string(rc)); - } + pmix_fence_all(nspace_, "barrier"); } void SlurmBackend::sync() { - pmix_proc_t proc; - PMIX_PROC_CONSTRUCT(&proc); - std::memcpy(proc.nspace, nspace_.data(), nspace_.size()); - proc.rank = PMIX_RANK_WILDCARD; - - pmix_info_t info; - bool collect = true; - PMIX_INFO_CONSTRUCT(&info); - PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL); - - pmix_status_t rc = PMIx_Fence(&proc, 1, &info, 1); - PMIX_INFO_DESTRUCT(&info); - - if (rc != PMIX_SUCCESS && rc != PMIX_ERR_PARTIAL_SUCCESS) { - throw std::runtime_error("PMIx_Fence (sync) failed: " + pmix_error_string(rc)); - } + pmix_fence_all(nspace_, "sync"); } void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { From 71fe0e8cc894c2f96dbdf77afd87d8c327f5e805 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 14:20:10 -0800 Subject: [PATCH 46/57] Remove unnecessary is_running_with_bootstrap --- cpp/benchmarks/bench_comm.cpp | 2 +- cpp/benchmarks/bench_shuffle.cpp | 4 ++-- cpp/benchmarks/streaming/bench_streaming_shuffle.cpp | 2 +- cpp/benchmarks/streaming/ndsh/utils.cpp | 5 ++--- cpp/include/rapidsmpf/bootstrap/utils.hpp | 11 ----------- cpp/src/bootstrap/utils.cpp | 4 ---- 6 files changed, 6 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/bench_comm.cpp b/cpp/benchmarks/bench_comm.cpp index 6d220d1e2..ce5166d88 100644 --- a/cpp/benchmarks/bench_comm.cpp +++ b/cpp/benchmarks/bench_comm.cpp @@ -258,7 +258,7 @@ Duration run( } int main(int argc, char** argv) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); int provided = 0; if (!use_bootstrap) { diff --git a/cpp/benchmarks/bench_shuffle.cpp b/cpp/benchmarks/bench_shuffle.cpp index 329e8b962..126d1a32a 100644 --- a/cpp/benchmarks/bench_shuffle.cpp +++ 
b/cpp/benchmarks/bench_shuffle.cpp @@ -284,7 +284,7 @@ class ArgumentParser { }; void barrier(std::shared_ptr& comm) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); if (!use_bootstrap) { RAPIDSMPF_MPI(MPI_Barrier(MPI_COMM_WORLD)); } else { @@ -580,7 +580,7 @@ rapidsmpf::Duration run_hash_partition_with_datagen( } int main(int argc, char** argv) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); // Explicitly initialize MPI with thread support, as this is needed for both mpi // and ucxx communicators when not using bootstrap mode. diff --git a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp index 68d87ec8e..0b73fd763 100644 --- a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp +++ b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp @@ -302,7 +302,7 @@ rapidsmpf::Duration run( } int main(int argc, char** argv) { - bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_bootstrap(); + bool use_bootstrap = rapidsmpf::bootstrap::is_running_with_rrun(); // Explicitly initialize MPI with thread support, as this is needed for both mpi // and ucxx communicators when not using bootstrap mode. 
diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 42df946fb..72c9f0994 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -186,8 +186,7 @@ std::shared_ptr create_context( switch (arguments.comm_type) { case CommType::MPI: RAPIDSMPF_EXPECTS( - !bootstrap::is_running_with_bootstrap(), - "Can't use MPI communicator with rrun" + !bootstrap::is_running_with_rrun(), "Can't use MPI communicator with rrun" ); mpi::init(nullptr, nullptr); @@ -197,7 +196,7 @@ std::shared_ptr create_context( comm = std::make_shared(options); break; case CommType::UCXX: - if (bootstrap::is_running_with_bootstrap()) { + if (bootstrap::is_running_with_rrun()) { comm = bootstrap::create_ucxx_comm(bootstrap::Backend::AUTO, options); } else { mpi::init(nullptr, nullptr); diff --git a/cpp/include/rapidsmpf/bootstrap/utils.hpp b/cpp/include/rapidsmpf/bootstrap/utils.hpp index 4660944bc..a42460914 100644 --- a/cpp/include/rapidsmpf/bootstrap/utils.hpp +++ b/cpp/include/rapidsmpf/bootstrap/utils.hpp @@ -88,17 +88,6 @@ bool is_running_with_rrun(); */ bool is_running_with_slurm(); -/** - * @brief Check if the current process is running with any bootstrap launcher. - * - * This helper detects bootstrap mode by checking for either `rrun` or Slurm/PMIx - * environment. Use this function when you need to determine whether to use - * bootstrap-based initialization vs MPI-based initialization. - * - * @return true if running under any bootstrap mode (rrun or Slurm), false otherwise. - */ -bool is_running_with_bootstrap(); - /** * @brief Get the current bootstrap rank. 
* diff --git a/cpp/src/bootstrap/utils.cpp b/cpp/src/bootstrap/utils.cpp index 94ac83afc..6203b8118 100644 --- a/cpp/src/bootstrap/utils.cpp +++ b/cpp/src/bootstrap/utils.cpp @@ -122,10 +122,6 @@ bool is_running_with_slurm() { && getenv_optional("SLURM_PROCID").has_value(); } -bool is_running_with_bootstrap() { - return is_running_with_rrun(); -} - Rank get_rank() { if (auto rank_opt = getenv_int("RAPIDSMPF_RANK")) { return *rank_opt; From 0763300a0e45d677a5439686cd8064916a807518 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 14:34:21 -0800 Subject: [PATCH 47/57] Update rrun usage instructions --- cpp/tools/rrun.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 204600608..e00bc13c3 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -228,15 +228,14 @@ void print_usage(std::string_view prog_name) { << " rrun -n 2 -x UCX_TLS=cuda_copy,cuda_ipc,rc,tcp -x MY_VAR=value " "./bench_comm\n\n" << "Slurm Examples:\n" - << " # Passthrough mode (1 rank per Slurm task, 8 tasks total):\n" - << " srun --mpi=pmix -N 2 --ntasks-per-node=4 --gres=gpu:4 rrun " - "./bench_shuffle -C ucxx\n\n" - << " # Hybrid mode (2 Slurm tasks × 4 ranks/task = 8 total ranks):\n" - << " srun --mpi=pmix -N 2 --ntasks-per-node=1 --gres=gpu:4 rrun -n 4 " - "./bench_shuffle -C ucxx\n\n" - << " # Hybrid mode with --gpus-per-task:\n" - << " srun --mpi=pmix --ntasks-per-node=2 --gpus-per-task=4 rrun -n 4 " - "./bench_shuffle -C ucxx\n\n" + << " # Passthrough: multiple (4) tasks per node, one task per GPU, two nodes.\n" + << " srun --mpi=pmix --nodes=2 --ntasks-per-node=4 --cpus-per-task=36 \\\n" + << " --gpus-per-task=1 --gres=gpu:4 \\\n" + << " rrun ./benchmarks/bench_shuffle -C ucxx\n\n" + << " # Hybrid mode: one task per node, 4 GPUs per task, two nodes.\n" + << " srun --mpi=pmix --nodes=2 --ntasks-per-node=1 --cpus-per-task=144 \\\n" + << " --gpus-per-task=4 --gres=gpu:4 \\\n" + << " 
rrun -n 4 ./benchmarks/bench_shuffle -C ucxx\n\n" << std::endl; } From 76d2f298845ea41f5deccac1289ae5886d05a404 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 14:52:52 -0800 Subject: [PATCH 48/57] Move rrun PMIx complexity to SlurmBackend --- .../rapidsmpf/bootstrap/slurm_backend.hpp | 14 ++ cpp/src/bootstrap/slurm_backend.cpp | 17 ++ cpp/tools/CMakeLists.txt | 7 +- cpp/tools/rrun.cpp | 145 +++++------------- 4 files changed, 75 insertions(+), 108 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index 61aef9be1..41be72b43 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -137,6 +137,20 @@ class SlurmBackend { */ void broadcast(void* data, std::size_t size, Rank root); + /** + * @brief Explicitly finalize the global PMIx session. + * + * This is useful for scenarios like rrun parent coordination where PMIx + * needs to be finalized before process exit (e.g., after child processes + * complete). If not called explicitly, PMIx will be finalized when the + * process exits via the PmixGlobalState destructor. + * + * This function is safe to call multiple times, subsequent calls are no-ops. + * + * @throws std::runtime_error if PMIx_Finalize fails. + */ + static void finalize_pmix(); + private: Context ctx_; std::size_t barrier_count_{0}; diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index 720d2943d..36dc37101 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -46,6 +46,19 @@ class PmixGlobalState { return state; } + /** + * @brief Explicitly finalize PMIx session. + * + * Safe to call multiple times, subsequent calls are no-ops. 
+ */ + void finalize() { + std::lock_guard lock{mutex}; + if (initialized) { + PMIx_Finalize(nullptr, 0); + initialized = false; + } + } + PmixGlobalState(PmixGlobalState const&) = delete; PmixGlobalState& operator=(PmixGlobalState const&) = delete; PmixGlobalState(PmixGlobalState&&) = delete; @@ -287,6 +300,10 @@ void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { barrier(); } +void SlurmBackend::finalize_pmix() { + PmixGlobalState::instance().finalize(); +} + } // namespace rapidsmpf::bootstrap::detail #endif // RAPIDSMPF_HAVE_SLURM diff --git a/cpp/tools/CMakeLists.txt b/cpp/tools/CMakeLists.txt index eb6487790..7df00008f 100644 --- a/cpp/tools/CMakeLists.txt +++ b/cpp/tools/CMakeLists.txt @@ -5,7 +5,12 @@ # cmake-format: on # ================================================================================= -add_executable(rrun "rrun.cpp" "$") +add_executable( + rrun + "rrun.cpp" + "$" + "$<$:$>" +) set_target_properties( rrun PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${RAPIDSMPF_BINARY_DIR}/tools" diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index e00bc13c3..8eafa19c2 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -48,6 +48,8 @@ #ifdef RAPIDSMPF_HAVE_SLURM #include + +#include #endif namespace { @@ -980,11 +982,12 @@ int execute_slurm_hybrid_mode(Config& cfg) { coord_hint ); + // Finalize PMIx session used for parent coordination if (!coordinated_root_address.empty()) { if (cfg.verbose) { std::cout << "[rrun] Finalizing PMIx in parent" << std::endl; } - PMIx_Finalize(nullptr, 0); + rapidsmpf::bootstrap::detail::SlurmBackend::finalize_pmix(); } return exit_status; @@ -1115,51 +1118,11 @@ std::string launch_rank0_and_get_address( } /** - * @brief Helper function to handle PMIx errors consistently. - * - * Checks the PMIx status code and throws an exception with proper cleanup if it indicates - * failure. Optionally allows partial success for operations like PMIx_Fence. - * - * @param rc PMIx status code to check. 
- * @param operation Description of the PMIx operation (e.g., "PMIx_Init"). - * @param allow_partial_success If true, PMIX_ERR_PARTIAL_SUCCESS is treated as success. - * - * @throws std::runtime_error if the operation failed (after calling PMIx_Finalize). - */ -void handle_pmix_error( - pmix_status_t rc, std::string const& operation, bool allow_partial_success = false -) { - if (rc == PMIX_SUCCESS || (allow_partial_success && rc == PMIX_ERR_PARTIAL_SUCCESS)) { - return; - } - PMIx_Finalize(nullptr, 0); - throw std::runtime_error( - operation + " failed: " + std::string{PMIx_Error_string(rc)} - ); -} - -/** - * @brief Helper function to throw an error with PMIx cleanup. - * - * Calls PMIx_Finalize and throws a runtime_error with the given message. - * Use this for validation errors or other non-PMIx-status failures that occur - * after PMIx has been initialized. - * - * @param error_message The error message to include in the exception. - * - * @throws std::runtime_error Always throws after calling PMIx_Finalize. - */ -[[noreturn]] void pmix_fatal_error(std::string const& error_message) { - PMIx_Finalize(nullptr, 0); - throw std::runtime_error(error_message); -} - -/** - * @brief Coordinate root address between parent processes using PMIx. + * @brief Coordinate root address between parent processes using SlurmBackend. * * This function is called by parent rrun processes in Slurm hybrid mode. - * The root parent (PMIX_RANK=0) publishes the root address, and non-root - * parents retrieve it. This avoids file-based coordination. + * The root parent (SLURM_PROCID=0) publishes the root address, and non-root + * parents retrieve it. * * @param root_address_to_publish Root address to publish. If set (has_value()), this is * the root parent and it will publish. If empty (nullopt), @@ -1167,19 +1130,34 @@ void handle_pmix_error( * @param verbose Whether to print debug messages. * @return Root address (either published or retrieved). 
* - * @throws std::runtime_error on PMIx errors. + * @throws std::runtime_error on coordination errors. */ std::string coordinate_root_address_via_pmix( std::optional const& root_address_to_publish, bool verbose ) { - // Initialize PMIx for parent process - pmix_proc_t proc; - pmix_status_t rc = PMIx_Init(&proc, nullptr, 0); - handle_pmix_error(rc, "PMIx_Init in rrun parent"); + // Get Slurm rank information for parent coordination + char const* slurm_procid = std::getenv("SLURM_PROCID"); + char const* slurm_ntasks = std::getenv("SLURM_NTASKS"); + + if (!slurm_procid || !slurm_ntasks) { + throw std::runtime_error( + "SLURM_PROCID and SLURM_NTASKS must be set for parent coordination" + ); + } + + int parent_rank = std::stoi(slurm_procid); + int parent_nranks = std::stoi(slurm_ntasks); + + // Create SlurmBackend for parent-level coordination + rapidsmpf::bootstrap::Context parent_ctx{ + parent_rank, parent_nranks, rapidsmpf::bootstrap::Backend::SLURM, std::nullopt + }; + + rapidsmpf::bootstrap::detail::SlurmBackend backend{parent_ctx}; if (verbose) { - std::cout << "[rrun] Parent PMIx initialized: rank " << proc.rank - << ", namespace " << proc.nspace << std::endl; + std::cout << "[rrun] Parent coordination initialized: rank " << parent_rank + << " of " << parent_nranks << std::endl; } std::string root_address; @@ -1187,76 +1165,29 @@ std::string coordinate_root_address_via_pmix( if (root_address_to_publish.has_value()) { // Root parent publishes the address (already hex-encoded for binary safety) if (verbose) { - std::cout << "[rrun] Publishing root address via PMIx (hex-encoded, " + std::cout << "[rrun] Publishing root address via SlurmBackend (hex-encoded, " << root_address_to_publish.value().size() << " chars)" << std::endl; } - // Use PMIx_Put with GLOBAL scope - pmix_value_t value; - PMIX_VALUE_CONSTRUCT(&value); - value.type = PMIX_STRING; - value.data.string = strdup(root_address_to_publish.value().c_str()); - - rc = PMIx_Put(PMIX_GLOBAL, 
"rapidsmpf_root_address", &value); - PMIX_VALUE_DESTRUCT(&value); - handle_pmix_error(rc, "PMIx_Put"); - - // Commit the data - rc = PMIx_Commit(); - handle_pmix_error(rc, "PMIx_Commit"); - + backend.put("rapidsmpf_root_address", root_address_to_publish.value()); root_address = root_address_to_publish.value(); } - // Barrier with PMIX_COLLECT_DATA to ensure data exchange - pmix_info_t info; - PMIX_INFO_CONSTRUCT(&info); - bool collect_data = true; - PMIX_INFO_LOAD(&info, PMIX_COLLECT_DATA, &collect_data, PMIX_BOOL); - - pmix_proc_t proc_wildcard; - PMIX_PROC_CONSTRUCT(&proc_wildcard); - std::memcpy(proc_wildcard.nspace, proc.nspace, PMIX_MAX_NSLEN + 1); - proc_wildcard.rank = PMIX_RANK_WILDCARD; - - rc = PMIx_Fence(&proc_wildcard, 1, &info, 1); - PMIX_INFO_DESTRUCT(&info); - handle_pmix_error(rc, "PMIx_Fence", true); + // Barrier to ensure data exchange + backend.barrier(); if (!root_address_to_publish.has_value()) { // Non-root parents retrieve the address - pmix_proc_t source_proc; - PMIX_PROC_CONSTRUCT(&source_proc); - std::memcpy(source_proc.nspace, proc.nspace, PMIX_MAX_NSLEN + 1); - source_proc.rank = 0; // Get from rank 0 - - pmix_value_t* value = nullptr; - rc = PMIx_Get(&source_proc, "rapidsmpf_root_address", nullptr, 0, &value); - handle_pmix_error(rc, "PMIx_Get"); - - if (value == nullptr) { - pmix_fatal_error("PMIx_Get returned null value"); - } - - if (value->type != PMIX_STRING) { - PMIX_VALUE_RELEASE(value); - pmix_fatal_error("PMIx_Get returned non-string value"); - } - - std::string encoded_address = value->data.string; - PMIX_VALUE_RELEASE(value); - - root_address = encoded_address; + root_address = backend.get("rapidsmpf_root_address", std::chrono::seconds{30}); if (verbose) { - std::cout << "[rrun] Retrieved root address via PMIx (hex-encoded, " - << encoded_address.size() << " chars)" << std::endl; + std::cout << "[rrun] Retrieved root address via SlurmBackend (hex-encoded, " + << root_address.size() << " chars)" << std::endl; } } - // Keep 
PMIx session alive - will finalize after children complete - // Note: We don't call PMIx_Finalize here because we want the session - // to stay alive while children are running + // Note: PMIx session will be explicitly finalized after children complete + // (see execute_slurm_hybrid_mode where finalize_pmix() is called) return root_address; } From 530adab92825fbe6fafcb3adb23dae27bd927537 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Feb 2026 15:10:59 -0800 Subject: [PATCH 49/57] Fix build for the case PMIx is unavailable --- cpp/tools/rrun.cpp | 82 +++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 8eafa19c2..7e886124f 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -870,47 +870,6 @@ int setup_launch_and_cleanup( } #ifdef RAPIDSMPF_HAVE_SLURM -/** - * @brief Execute application in Slurm passthrough mode (single rank per task). - * - * Applies topology bindings and executes the application directly without forking. - * This function never returns - it either replaces the current process or exits on error. - * - * @param cfg Configuration. 
- */ -[[noreturn]] void execute_slurm_passthrough_mode(Config const& cfg) { - if (cfg.verbose) { - std::cout << "[rrun] Slurm passthrough mode: applying bindings and exec'ing" - << std::endl; - } - - // Set rrun coordination environment variables so the application knows - // it's being launched by rrun and should use bootstrap mode - setenv("RAPIDSMPF_RANK", std::to_string(cfg.slurm_global_rank).c_str(), 1); - setenv("RAPIDSMPF_NRANKS", std::to_string(cfg.slurm_ntasks).c_str(), 1); - - // Determine GPU for this Slurm task - int gpu_id = -1; - if (!cfg.gpus.empty()) { - gpu_id = cfg.gpus[static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; - setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); - - if (cfg.verbose) { - std::cout << "[rrun] Slurm task (passthrough) local_id=" << cfg.slurm_local_id - << " assigned to GPU " << gpu_id << std::endl; - } - } - - // Set custom environment variables - for (auto const& env_pair : cfg.env_vars) { - setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); - } - - apply_topology_bindings(cfg, gpu_id, cfg.verbose); - - exec_application(cfg); -} - /** * @brief Execute application in Slurm hybrid mode with PMIx coordination. * @@ -1011,6 +970,47 @@ int execute_single_node_mode(Config& cfg) { return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks, std::nullopt, false); } +/** + * @brief Execute application in Slurm passthrough mode (single rank per task). + * + * Applies topology bindings and executes the application directly without forking. + * This function never returns - it either replaces the current process or exits on error. + * + * @param cfg Configuration. 
+ */ +[[noreturn]] void execute_slurm_passthrough_mode(Config const& cfg) { + if (cfg.verbose) { + std::cout << "[rrun] Slurm passthrough mode: applying bindings and exec'ing" + << std::endl; + } + + // Set rrun coordination environment variables so the application knows + // it's being launched by rrun and should use bootstrap mode + setenv("RAPIDSMPF_RANK", std::to_string(cfg.slurm_global_rank).c_str(), 1); + setenv("RAPIDSMPF_NRANKS", std::to_string(cfg.slurm_ntasks).c_str(), 1); + + // Determine GPU for this Slurm task + int gpu_id = -1; + if (!cfg.gpus.empty()) { + gpu_id = cfg.gpus[static_cast(cfg.slurm_local_id) % cfg.gpus.size()]; + setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); + + if (cfg.verbose) { + std::cout << "[rrun] Slurm task (passthrough) local_id=" << cfg.slurm_local_id + << " assigned to GPU " << gpu_id << std::endl; + } + } + + // Set custom environment variables + for (auto const& env_pair : cfg.env_vars) { + setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); + } + + apply_topology_bindings(cfg, gpu_id, cfg.verbose); + + exec_application(cfg); +} + #ifdef RAPIDSMPF_HAVE_SLURM /** * @brief Launch rank 0 first to obtain its UCXX root address. From a779cdafc21f1a48b0d26586219b21ae966660d3 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 00:18:24 -0800 Subject: [PATCH 50/57] Fix detection order documentation --- cpp/include/rapidsmpf/bootstrap/bootstrap.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp index b375bdf20..3aeb2612f 100644 --- a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp +++ b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp @@ -23,8 +23,9 @@ enum class Backend { * @brief Automatically detect the best backend based on environment. * * Detection order: - * 1. Slurm/PMIx (if PMIX_NAMESPACE or SLURM environment detected) - * 2. File-based (default fallback) + * 1. 
File-based (if RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS set by rrun) + * 2. Slurm/PMIx (if SLURM environment detected) + * 3. File-based (default fallback) */ AUTO, From 3841f65eb595ab2dd2cf58286f3e3dbcc140c36b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 00:23:15 -0800 Subject: [PATCH 51/57] Use specific initializer functions --- cpp/src/bootstrap/bootstrap.cpp | 178 +++++++++++++++++--------------- 1 file changed, 97 insertions(+), 81 deletions(-) diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index bf77b0e60..751b1992f 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -54,96 +54,112 @@ Backend detect_backend() { // Default to file-based return Backend::FILE; } + +/** + * @brief Initialize context for FILE backend. + */ +Context file_backend_init() { + Context ctx; + ctx.backend = Backend::FILE; + + // Require explicit RAPIDSMPF_RANK and RAPIDSMPF_NRANKS + auto rank_opt = getenv_int("RAPIDSMPF_RANK"); + auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS"); + auto coord_dir_opt = getenv_optional("RAPIDSMPF_COORD_DIR"); + + if (!rank_opt.has_value()) { + throw std::runtime_error( + "RAPIDSMPF_RANK environment variable not set. " + "Set it or use a launcher like 'rrun'." + ); + } + + if (!nranks_opt.has_value()) { + throw std::runtime_error( + "RAPIDSMPF_NRANKS environment variable not set. " + "Set it or use a launcher like 'rrun'." + ); + } + + if (!coord_dir_opt.has_value()) { + throw std::runtime_error( + "RAPIDSMPF_COORD_DIR environment variable not set. " + "Set it or use a launcher like 'rrun'." 
+ ); + } + + ctx.rank = static_cast(*rank_opt); + ctx.nranks = static_cast(*nranks_opt); + ctx.coord_dir = *coord_dir_opt; + + if (!(ctx.rank >= 0 && ctx.rank < ctx.nranks)) { + throw std::runtime_error( + "Invalid rank: RAPIDSMPF_RANK=" + std::to_string(ctx.rank) + + " must be in range [0, " + std::to_string(ctx.nranks) + ")" + ); + } + + return ctx; +} + +/** + * @brief Initialize context for SLURM backend. + */ +Context slurm_backend_init() { +#ifdef RAPIDSMPF_HAVE_SLURM + Context ctx; + ctx.backend = Backend::SLURM; + + try { + ctx.rank = get_rank(); + } catch (const std::runtime_error& e) { + throw std::runtime_error( + "Could not determine rank for Slurm backend. " + "Ensure you're running with 'srun --mpi=pmix'." + ); + } + + try { + ctx.nranks = get_nranks(); + } catch (const std::runtime_error& e) { + throw std::runtime_error( + "Could not determine nranks for Slurm backend. " + "Ensure you're running with 'srun --mpi=pmix'." + ); + } + + if (!(ctx.rank >= 0 && ctx.rank < ctx.nranks)) { + throw std::runtime_error( + "Invalid rank: " + std::to_string(ctx.rank) + " must be in range [0, " + + std::to_string(ctx.nranks) + ")" + ); + } + + return ctx; +#else + throw std::runtime_error( + "SLURM backend requested but rapidsmpf was not built with PMIx support. " + "Rebuild with RAPIDSMPF_ENABLE_SLURM=ON and ensure PMIx is available." + ); +#endif +} } // namespace Context init(Backend backend) { - Context ctx; - ctx.backend = (backend == Backend::AUTO) ? 
detect_backend() : backend; + if (backend == Backend::AUTO) { + backend = detect_backend(); + } // Get rank and nranks based on backend - switch (ctx.backend) { + switch (backend) { case Backend::FILE: - { - // Require explicit RAPIDSMPF_RANK and RAPIDSMPF_NRANKS - auto rank_opt = getenv_int("RAPIDSMPF_RANK"); - auto nranks_opt = getenv_int("RAPIDSMPF_NRANKS"); - auto coord_dir_opt = getenv_optional("RAPIDSMPF_COORD_DIR"); - - if (!rank_opt.has_value()) { - throw std::runtime_error( - "RAPIDSMPF_RANK environment variable not set. " - "Set it or use a launcher like 'rrun'." - ); - } - - if (!nranks_opt.has_value()) { - throw std::runtime_error( - "RAPIDSMPF_NRANKS environment variable not set. " - "Set it or use a launcher like 'rrun'." - ); - } - - if (!coord_dir_opt.has_value()) { - throw std::runtime_error( - "RAPIDSMPF_COORD_DIR environment variable not set. " - "Set it or use a launcher like 'rrun'." - ); - } - - ctx.rank = static_cast(*rank_opt); - ctx.nranks = static_cast(*nranks_opt); - ctx.coord_dir = *coord_dir_opt; - - if (!(ctx.rank >= 0 && ctx.rank < ctx.nranks)) { - throw std::runtime_error( - "Invalid rank: RAPIDSMPF_RANK=" + std::to_string(ctx.rank) - + " must be in range [0, " + std::to_string(ctx.nranks) + ")" - ); - } - break; - } + return file_backend_init(); case Backend::SLURM: - { -#ifdef RAPIDSMPF_HAVE_SLURM - try { - ctx.rank = get_rank(); - } catch (const std::runtime_error& e) { - throw std::runtime_error( - "Could not determine rank for Slurm backend. " - "Ensure you're running with 'srun --mpi=pmix'." - ); - } - - try { - ctx.nranks = get_nranks(); - } catch (const std::runtime_error& e) { - throw std::runtime_error( - "Could not determine nranks for Slurm backend. " - "Ensure you're running with 'srun --mpi=pmix'." 
- ); - } - - if (!(ctx.rank >= 0 && ctx.rank < ctx.nranks)) { - throw std::runtime_error( - "Invalid rank: " + std::to_string(ctx.rank) + " must be in range [0, " - + std::to_string(ctx.nranks) + ")" - ); - } - break; -#else - throw std::runtime_error( - "SLURM backend requested but rapidsmpf was not built with PMIx support. " - "Rebuild with RAPIDSMPF_ENABLE_SLURM=ON and ensure PMIx is available." - ); -#endif - } + return slurm_backend_init(); case Backend::AUTO: - { - // Should have been resolved above - throw std::logic_error("Backend::AUTO should have been resolved"); - } + // Should have been resolved above + throw std::logic_error("Backend::AUTO should have been resolved"); } - return ctx; } void broadcast(Context const& ctx, void* data, std::size_t size, Rank root) { From 82441ef12d9c4124ae8a5271760e210405cdc758 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 01:09:38 -0800 Subject: [PATCH 52/57] Add backend interface, use a pointer to the interface in Context --- cpp/benchmarks/bench_comm.cpp | 2 +- cpp/benchmarks/bench_shuffle.cpp | 2 +- .../streaming/bench_streaming_shuffle.cpp | 2 +- cpp/benchmarks/streaming/ndsh/utils.cpp | 2 +- cpp/include/rapidsmpf/bootstrap/backend.hpp | 117 +++++++++++++ cpp/include/rapidsmpf/bootstrap/bootstrap.hpp | 52 +----- .../rapidsmpf/bootstrap/file_backend.hpp | 15 +- .../rapidsmpf/bootstrap/slurm_backend.hpp | 17 +- cpp/include/rapidsmpf/bootstrap/ucxx.hpp | 4 +- cpp/src/bootstrap/bootstrap.cpp | 157 ++++++------------ cpp/src/bootstrap/ucxx.cpp | 4 +- cpp/tools/rrun.cpp | 15 +- .../rapidsmpf/bootstrap/bootstrap.pyx | 14 +- 13 files changed, 217 insertions(+), 186 deletions(-) create mode 100644 cpp/include/rapidsmpf/bootstrap/backend.hpp diff --git a/cpp/benchmarks/bench_comm.cpp b/cpp/benchmarks/bench_comm.cpp index ce5166d88..3924d6820 100644 --- a/cpp/benchmarks/bench_comm.cpp +++ b/cpp/benchmarks/bench_comm.cpp @@ -290,7 +290,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { // 
Launched with rrun - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( - rapidsmpf::bootstrap::Backend::AUTO, options + rapidsmpf::bootstrap::BackendType::AUTO, options ); } else { // Launched with mpirun - use MPI bootstrap diff --git a/cpp/benchmarks/bench_shuffle.cpp b/cpp/benchmarks/bench_shuffle.cpp index 126d1a32a..224d3c5a5 100644 --- a/cpp/benchmarks/bench_shuffle.cpp +++ b/cpp/benchmarks/bench_shuffle.cpp @@ -614,7 +614,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { // Launched with rrun - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( - rapidsmpf::bootstrap::Backend::AUTO, options + rapidsmpf::bootstrap::BackendType::AUTO, options ); } else { // Launched with mpirun - use MPI bootstrap diff --git a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp index 0b73fd763..5b2c4932d 100644 --- a/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp +++ b/cpp/benchmarks/streaming/bench_streaming_shuffle.cpp @@ -335,7 +335,7 @@ int main(int argc, char** argv) { if (use_bootstrap) { // Launched with rrun - use bootstrap backend comm = rapidsmpf::bootstrap::create_ucxx_comm( - rapidsmpf::bootstrap::Backend::AUTO, options + rapidsmpf::bootstrap::BackendType::AUTO, options ); } else { // Launched with mpirun - use MPI bootstrap diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 72c9f0994..a8a5ffdd8 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -197,7 +197,7 @@ std::shared_ptr create_context( break; case CommType::UCXX: if (bootstrap::is_running_with_rrun()) { - comm = bootstrap::create_ucxx_comm(bootstrap::Backend::AUTO, options); + comm = bootstrap::create_ucxx_comm(bootstrap::BackendType::AUTO, options); } else { mpi::init(nullptr, nullptr); comm = ucxx::init_using_mpi(MPI_COMM_WORLD, options); diff --git a/cpp/include/rapidsmpf/bootstrap/backend.hpp 
b/cpp/include/rapidsmpf/bootstrap/backend.hpp new file mode 100644 index 000000000..190dfbebf --- /dev/null +++ b/cpp/include/rapidsmpf/bootstrap/backend.hpp @@ -0,0 +1,117 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include + +namespace rapidsmpf::bootstrap { + +/** + * @brief Backend types for process coordination and bootstrapping. + */ +enum class BackendType { + /** + * @brief Automatically detect the best backend based on environment. + * + * Detection order: + * 1. File-based (if RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS set by rrun) + * 2. Slurm/PMIx (if SLURM environment detected) + * 3. File-based (default fallback) + */ + AUTO, + + /** + * @brief File-based coordination using a shared directory. + * + * Uses filesystem for rank coordination and address exchange. Works on single-node + * and multi-node with shared storage (e.g., NFS) via SSH. Requires RAPIDSMPF_RANK, + * RAPIDSMPF_NRANKS, RAPIDSMPF_COORD_DIR environment variables. + */ + FILE, + + /** + * @brief Slurm-based coordination using PMIx. + * + * Uses PMIx (Process Management Interface for Exascale) for scalable process + * coordination without requiring a shared filesystem. Designed for Slurm clusters + * and supports multi-node deployments. + * + * Run with: `srun --mpi=pmix -n ./program` + * + * Environment variables (automatically set by Slurm): + * - PMIX_NAMESPACE: PMIx namespace identifier + * - SLURM_PROCID: Process rank + * - SLURM_NPROCS/SLURM_NTASKS: Total number of processes + */ + SLURM, +}; + +namespace detail { + +/** + * @brief Abstract interface for bootstrap coordination backends. + * + * This interface defines the common operations that all backend implementations + * must support. Backend instances are stored in Context and reused across + * multiple operations to preserve state. 
+ */ +class Backend { + public: + virtual ~Backend() = default; + + /** + * @brief Store a key-value pair. + * + * @param key Key name. + * @param value Value to store. + */ + virtual void put(std::string const& key, std::string const& value) = 0; + + /** + * @brief Retrieve a value, blocking until available or timeout occurs. + * + * @param key Key name. + * @param timeout Timeout duration. + * @return Value associated with key. + */ + virtual std::string get(std::string const& key, Duration timeout) = 0; + + /** + * @brief Perform a barrier synchronization. + * + * All ranks must call this before any rank proceeds. + */ + virtual void barrier() = 0; + + /** + * @brief Ensure all previous put() operations are globally visible. + */ + virtual void sync() = 0; + + /** + * @brief Broadcast data from root to all ranks. + * + * @param data Data buffer. + * @param size Size in bytes. + * @param root Root rank. + */ + virtual void broadcast(void* data, std::size_t size, Rank root) = 0; + + // Non-copyable, non-movable (backends manage resources) + Backend(Backend const&) = delete; + Backend& operator=(Backend const&) = delete; + Backend(Backend&&) = delete; + Backend& operator=(Backend&&) = delete; + + protected: + Backend() = default; +}; + +} // namespace detail +} // namespace rapidsmpf::bootstrap diff --git a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp index 3aeb2612f..5bba00a17 100644 --- a/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp +++ b/cpp/include/rapidsmpf/bootstrap/bootstrap.hpp @@ -11,50 +11,11 @@ #include #include +#include #include namespace rapidsmpf::bootstrap { -/** - * @brief Backend types for process coordination and bootstrapping. - */ -enum class Backend { - /** - * @brief Automatically detect the best backend based on environment. - * - * Detection order: - * 1. File-based (if RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS set by rrun) - * 2. Slurm/PMIx (if SLURM environment detected) - * 3. 
File-based (default fallback) - */ - AUTO, - - /** - * @brief File-based coordination using a shared directory. - * - * Uses filesystem for rank coordination and address exchange. Works on single-node - * and multi-node with shared storage (e.g., NFS) via SSH. Requires RAPIDSMPF_RANK, - * RAPIDSMPF_NRANKS, RAPIDSMPF_COORD_DIR environment variables. - */ - FILE, - - /** - * @brief Slurm-based coordination using PMIx. - * - * Uses PMIx (Process Management Interface for Exascale) for scalable process - * coordination without requiring a shared filesystem. Designed for Slurm clusters - * and supports multi-node deployments. - * - * Run with: `srun --mpi=pmix -n ./program` - * - * Environment variables (automatically set by Slurm): - * - PMIX_NAMESPACE: PMIx namespace identifier - * - SLURM_PROCID: Process rank - * - SLURM_NPROCS/SLURM_NTASKS: Total number of processes - */ - SLURM, -}; - /** * @brief Context information for the current process/rank. * @@ -68,11 +29,14 @@ struct Context { /** @brief Total number of ranks in the job. */ Rank nranks; - /** @brief Backend used for coordination. */ - Backend backend; + /** @brief Backend type used for coordination. */ + BackendType type; /** @brief Coordination directory (for FILE backend). */ std::optional coord_dir; + + /** @brief Backend implementation (internal, do not access directly). */ + std::shared_ptr backend; }; /** @@ -86,7 +50,7 @@ struct Context { * - RAPIDSMPF_NRANKS: Explicitly set total rank count * - RAPIDSMPF_COORD_DIR: File-based coordination directory * - * @param backend Backend to use (default: AUTO for auto-detection). + * @param type Backend type to use (default: AUTO for auto-detection). * @return Context object containing rank and coordination information. * @throws std::runtime_error if environment is not properly configured. 
* @@ -95,7 +59,7 @@ struct Context { * std::cout << "I am rank " << ctx.rank << " of " << ctx.nranks << std::endl; * @endcode */ -Context init(Backend backend = Backend::AUTO); +Context init(BackendType type = BackendType::AUTO); /** * @brief Broadcast data from root rank to all other ranks. diff --git a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp index 144af97ac..10c4f80d4 100644 --- a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp @@ -8,6 +8,7 @@ #include #include +#include #include namespace rapidsmpf::bootstrap::detail { @@ -29,7 +30,7 @@ namespace rapidsmpf::bootstrap::detail { * └── barrier_ # Barrier synchronization * ``` */ -class FileBackend { +class FileBackend : public Backend { public: /** * @brief Construct a file backend. @@ -38,7 +39,7 @@ class FileBackend { */ explicit FileBackend(Context ctx); - ~FileBackend(); + ~FileBackend() override; /** * @brief Store a key-value pair. @@ -46,7 +47,7 @@ class FileBackend { * @param key Key name. * @param value Value to store. */ - void put(std::string const& key, std::string const& value); + void put(std::string const& key, std::string const& value) override; /** * @brief Retrieve a value, blocking until available or timeout occurs. @@ -55,14 +56,14 @@ class FileBackend { * @param timeout Timeout duration. * @return Value associated with key. */ - std::string get(std::string const& key, Duration timeout); + std::string get(std::string const& key, Duration timeout) override; /** * @brief Perform a barrier synchronization. * * All ranks must call this before any rank proceeds. */ - void barrier(); + void barrier() override; /** * @brief Ensure all previous put() operations are globally visible. @@ -71,7 +72,7 @@ class FileBackend { * file writes that are immediately visible to all processes via the * shared filesystem. 
*/ - void sync(); + void sync() override; /** * @brief Broadcast data from root to all ranks. @@ -80,7 +81,7 @@ class FileBackend { * @param size Size in bytes. * @param root Root rank. */ - void broadcast(void* data, std::size_t size, Rank root); + void broadcast(void* data, std::size_t size, Rank root) override; private: Context ctx_; diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index 41be72b43..a07c251c1 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -15,6 +15,7 @@ #include +#include #include namespace rapidsmpf::bootstrap::detail { @@ -50,7 +51,7 @@ namespace rapidsmpf::bootstrap::detail { * rrun -n 4 ./benchmarks/bench_shuffle -C ucxx * ``` */ -class SlurmBackend { +class SlurmBackend : public Backend { public: /** * @brief Construct a Slurm backend using PMIx. @@ -64,9 +65,9 @@ class SlurmBackend { explicit SlurmBackend(Context ctx); /** - * @brief Destructor - finalizes PMIx. + * @brief Destructor that finalizes PMIx. */ - ~SlurmBackend(); + ~SlurmBackend() override; // Non-copyable, non-movable (PMIx state is process-global) SlurmBackend(SlurmBackend const&) = delete; @@ -85,7 +86,7 @@ class SlurmBackend { * * @throws std::runtime_error if PMIx operation fails. */ - void put(std::string const& key, std::string const& value); + void put(std::string const& key, std::string const& value) override; /** * @brief Retrieve a value from the PMIx KVS. @@ -99,7 +100,7 @@ class SlurmBackend { * * @throws std::runtime_error if key not found within timeout. */ - std::string get(std::string const& key, Duration timeout); + std::string get(std::string const& key, Duration timeout) override; /** * @brief Perform a barrier synchronization using PMIx_Fence. @@ -109,7 +110,7 @@ class SlurmBackend { * * @throws std::runtime_error if PMIx_Fence fails. 
*/ - void barrier(); + void barrier() override; /** * @brief Ensure all previous put() operations are globally visible. @@ -121,7 +122,7 @@ class SlurmBackend { * * @throws std::runtime_error if PMIx_Fence fails. */ - void sync(); + void sync() override; /** * @brief Broadcast data from root to all ranks. @@ -135,7 +136,7 @@ class SlurmBackend { * * @throws std::runtime_error if broadcast fails or size mismatch occurs. */ - void broadcast(void* data, std::size_t size, Rank root); + void broadcast(void* data, std::size_t size, Rank root) override; /** * @brief Explicitly finalize the global PMIx session. diff --git a/cpp/include/rapidsmpf/bootstrap/ucxx.hpp b/cpp/include/rapidsmpf/bootstrap/ucxx.hpp index 636748947..18f7faac5 100644 --- a/cpp/include/rapidsmpf/bootstrap/ucxx.hpp +++ b/cpp/include/rapidsmpf/bootstrap/ucxx.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -44,7 +44,7 @@ namespace bootstrap { * @endcode */ std::shared_ptr create_ucxx_comm( - Backend backend = Backend::AUTO, config::Options options = config::Options{} + BackendType type = BackendType::AUTO, config::Options options = config::Options{} ); } // namespace bootstrap diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 751b1992f..85a62ed16 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -26,14 +26,14 @@ namespace { /** * @brief Detect backend from environment variables. */ -Backend detect_backend() { +BackendType detect_backend() { // Check for rrun coordination first (explicit configuration takes priority). // If RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS is set, rrun is coordinating // and we should use FILE backend (with or without pre-coordinated address). 
if (getenv_optional("RAPIDSMPF_COORD_DIR") || getenv_optional("RAPIDSMPF_ROOT_ADDRESS")) { - return Backend::FILE; + return BackendType::FILE; } #ifdef RAPIDSMPF_HAVE_SLURM @@ -47,12 +47,12 @@ Backend detect_backend() { // NOT launched by rrun. Child processes launched by rrun will have RAPIDSMPF_* // variables set and will use FILE backend above. if (is_running_with_slurm()) { - return Backend::SLURM; + return BackendType::SLURM; } #endif // Default to file-based - return Backend::FILE; + return BackendType::FILE; } /** @@ -60,7 +60,7 @@ Backend detect_backend() { */ Context file_backend_init() { Context ctx; - ctx.backend = Backend::FILE; + ctx.type = BackendType::FILE; // Require explicit RAPIDSMPF_RANK and RAPIDSMPF_NRANKS auto rank_opt = getenv_int("RAPIDSMPF_RANK"); @@ -102,13 +102,13 @@ Context file_backend_init() { return ctx; } +#ifdef RAPIDSMPF_HAVE_SLURM /** * @brief Initialize context for SLURM backend. */ Context slurm_backend_init() { -#ifdef RAPIDSMPF_HAVE_SLURM Context ctx; - ctx.backend = Backend::SLURM; + ctx.type = BackendType::SLURM; try { ctx.rank = get_rank(); @@ -136,133 +136,76 @@ Context slurm_backend_init() { } return ctx; -#else - throw std::runtime_error( - "SLURM backend requested but rapidsmpf was not built with PMIx support. " - "Rebuild with RAPIDSMPF_ENABLE_SLURM=ON and ensure PMIx is available." 
- ); -#endif } +#endif } // namespace -Context init(Backend backend) { - if (backend == Backend::AUTO) { - backend = detect_backend(); +Context init(BackendType type) { + if (type == BackendType::AUTO) { + type = detect_backend(); } - // Get rank and nranks based on backend - switch (backend) { - case Backend::FILE: - return file_backend_init(); - case Backend::SLURM: - return slurm_backend_init(); - case Backend::AUTO: + Context ctx; + + // Get rank and nranks based on backend, then create backend instance + switch (type) { + case BackendType::FILE: + ctx = file_backend_init(); + ctx.backend = std::make_shared(ctx); + break; +#ifdef RAPIDSMPF_HAVE_SLURM + case BackendType::SLURM: + ctx = slurm_backend_init(); + ctx.backend = std::make_shared(ctx); + break; +#else + case BackendType::SLURM: + throw std::runtime_error( + "SLURM backend requested but rapidsmpf was not built with PMIx support. " + "Rebuild with RAPIDSMPF_ENABLE_SLURM=ON and ensure PMIx is available." + ); +#endif + case BackendType::AUTO: // Should have been resolved above - throw std::logic_error("Backend::AUTO should have been resolved"); + throw std::logic_error("BackendType::AUTO should have been resolved"); } + + return ctx; } void broadcast(Context const& ctx, void* data, std::size_t size, Rank root) { - switch (ctx.backend) { - case Backend::FILE: - { - detail::FileBackend backend{ctx}; - backend.broadcast(data, size, root); - break; - } -#ifdef RAPIDSMPF_HAVE_SLURM - case Backend::SLURM: - { - detail::SlurmBackend backend{ctx}; - backend.broadcast(data, size, root); - break; - } -#endif - default: - throw std::runtime_error("broadcast not implemented for this backend"); + if (!ctx.backend) { + throw std::runtime_error("Context not properly initialized - backend is null"); } + ctx.backend->broadcast(data, size, root); } void barrier(Context const& ctx) { - switch (ctx.backend) { - case Backend::FILE: - { - detail::FileBackend backend{ctx}; - backend.barrier(); - break; - } -#ifdef 
RAPIDSMPF_HAVE_SLURM - case Backend::SLURM: - { - detail::SlurmBackend backend{ctx}; - backend.barrier(); - break; - } -#endif - default: - throw std::runtime_error("barrier not implemented for this backend"); + if (!ctx.backend) { + throw std::runtime_error("Context not properly initialized - backend is null"); } + ctx.backend->barrier(); } void sync(Context const& ctx) { - switch (ctx.backend) { - case Backend::FILE: - { - detail::FileBackend backend{ctx}; - backend.sync(); - break; - } -#ifdef RAPIDSMPF_HAVE_SLURM - case Backend::SLURM: - { - detail::SlurmBackend backend{ctx}; - backend.sync(); - break; - } -#endif - default: - throw std::runtime_error("sync not implemented for this backend"); + if (!ctx.backend) { + throw std::runtime_error("Context not properly initialized - backend is null"); } + ctx.backend->sync(); } void put(Context const& ctx, std::string const& key, std::string const& value) { - switch (ctx.backend) { - case Backend::FILE: - { - detail::FileBackend backend{ctx}; - backend.put(key, value); - break; - } -#ifdef RAPIDSMPF_HAVE_SLURM - case Backend::SLURM: - { - detail::SlurmBackend backend{ctx}; - backend.put(key, value); - break; - } -#endif - default: - throw std::runtime_error("put not implemented for this backend"); + if (!ctx.backend) { + throw std::runtime_error("Context not properly initialized - backend is null"); } + ctx.backend->put(key, value); } std::string get(Context const& ctx, std::string const& key, Duration timeout) { - switch (ctx.backend) { - case Backend::FILE: - { - detail::FileBackend backend{ctx}; - return backend.get(key, timeout); - } -#ifdef RAPIDSMPF_HAVE_SLURM - case Backend::SLURM: - { - detail::SlurmBackend backend{ctx}; - return backend.get(key, timeout); - } -#endif - default: - throw std::runtime_error("get not implemented for this backend"); + if (!ctx.backend) { + throw std::runtime_error("Context not properly initialized - backend is null"); } + return ctx.backend->get(key, timeout); } } // namespace 
rapidsmpf::bootstrap diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index e4e1c2bd9..8fc788d03 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -55,8 +55,8 @@ std::string hex_decode(std::string const& input) { } } // namespace -std::shared_ptr create_ucxx_comm(Backend backend, config::Options options) { - auto ctx = init(backend); +std::shared_ptr create_ucxx_comm(BackendType type, config::Options options) { + auto ctx = init(type); // Ensure CUDA context is created before UCX is initialized cudaFree(nullptr); diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 7e886124f..ab3fe1a86 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -1150,10 +1150,15 @@ std::string coordinate_root_address_via_pmix( // Create SlurmBackend for parent-level coordination rapidsmpf::bootstrap::Context parent_ctx{ - parent_rank, parent_nranks, rapidsmpf::bootstrap::Backend::SLURM, std::nullopt + parent_rank, + parent_nranks, + rapidsmpf::bootstrap::BackendType::SLURM, + std::nullopt, + nullptr }; - rapidsmpf::bootstrap::detail::SlurmBackend backend{parent_ctx}; + auto backend = + std::make_shared(parent_ctx); if (verbose) { std::cout << "[rrun] Parent coordination initialized: rank " << parent_rank @@ -1169,16 +1174,16 @@ std::string coordinate_root_address_via_pmix( << root_address_to_publish.value().size() << " chars)" << std::endl; } - backend.put("rapidsmpf_root_address", root_address_to_publish.value()); + backend->put("rapidsmpf_root_address", root_address_to_publish.value()); root_address = root_address_to_publish.value(); } // Barrier to ensure data exchange - backend.barrier(); + backend->barrier(); if (!root_address_to_publish.has_value()) { // Non-root parents retrieve the address - root_address = backend.get("rapidsmpf_root_address", std::chrono::seconds{30}); + root_address = backend->get("rapidsmpf_root_address", std::chrono::seconds{30}); if (verbose) { std::cout << "[rrun] Retrieved root address via 
SlurmBackend (hex-encoded, " diff --git a/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyx b/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyx index e85b1b60b..b43511ce5 100644 --- a/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyx +++ b/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyx @@ -8,9 +8,9 @@ from rapidsmpf.communicator.communicator cimport Communicator, cpp_Communicator from rapidsmpf.config cimport Options, cpp_Options -cdef extern from "" namespace \ +cdef extern from "" namespace \ "rapidsmpf::bootstrap" nogil: - cpdef enum class Backend(int): + cpdef enum class BackendType(int): AUTO FILE @@ -31,12 +31,12 @@ cdef extern from "" nogil: cdef extern from "" nogil: shared_ptr[cpp_UCXX_Communicator] cpp_create_ucxx_comm \ "rapidsmpf::bootstrap::create_ucxx_comm"( - Backend backend, + BackendType type, cpp_Options options, ) except +ex_handler -def create_ucxx_comm(Backend backend = Backend.AUTO, options = None): +def create_ucxx_comm(BackendType type = BackendType.AUTO, options = None): """ Create a UCXX communicator using the bootstrap backend. @@ -47,8 +47,8 @@ def create_ucxx_comm(Backend backend = Backend.AUTO, options = None): Parameters ---------- - backend - Backend to use for coordination. By default, ``Backend.AUTO`` is used, + type + Backend type to use for coordination. By default, ``BackendType.AUTO`` is used, which currently resolves to the file-based backend. options Configuration options for the UCXX communicator. 
If ``None``, a default @@ -73,7 +73,7 @@ def create_ucxx_comm(Backend backend = Backend.AUTO, options = None): cpp_options = options with nogil: - ucxx_comm = cpp_create_ucxx_comm(backend, cpp_options._handle) + ucxx_comm = cpp_create_ucxx_comm(type, cpp_options._handle) base_comm = dynamic_pointer_cast[cpp_Communicator, cpp_UCXX_Communicator]( ucxx_comm ) From 9d42fd7d281e607284f7134ad6f809f49078ae28 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 01:43:12 -0800 Subject: [PATCH 53/57] Remove build-pmix from wheels --- dependencies.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index 2ee9efcde..0b2a4c8ee 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -221,7 +221,7 @@ dependencies: - mpi4py build-pmix: common: - - output_types: [conda, pyproject, requirements] + - output_types: conda packages: - libpmix-devel >=5.0,<6.0 build-python: From 7d1c478dc064429cdb59fa8b413c71f5627e8002 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 02:15:28 -0800 Subject: [PATCH 54/57] Generalized Backend docstrings, moved implementation details to cpp --- cpp/include/rapidsmpf/bootstrap/backend.hpp | 9 +++- .../rapidsmpf/bootstrap/file_backend.hpp | 27 +++--------- .../rapidsmpf/bootstrap/slurm_backend.hpp | 42 +++---------------- cpp/src/bootstrap/file_backend.cpp | 6 ++- cpp/src/bootstrap/slurm_backend.cpp | 4 ++ 5 files changed, 28 insertions(+), 60 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/backend.hpp b/cpp/include/rapidsmpf/bootstrap/backend.hpp index 190dfbebf..dd7c842be 100644 --- a/cpp/include/rapidsmpf/bootstrap/backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/backend.hpp @@ -68,6 +68,9 @@ class Backend { /** * @brief Store a key-value pair. * + * The key-value pair is committed immediately and made visible to other + * ranks after a collective `sync()`. + * * @param key Key name. * @param value Value to store. 
*/ @@ -79,6 +82,8 @@ class Backend { * @param key Key name. * @param timeout Timeout duration. * @return Value associated with key. + * + * @throws std::runtime_error if key not found within timeout. */ virtual std::string get(std::string const& key, Duration timeout) = 0; @@ -97,9 +102,11 @@ class Backend { /** * @brief Broadcast data from root to all ranks. * - * @param data Data buffer. + * @param data Data buffer (input on root, output on other ranks). * @param size Size in bytes. * @param root Root rank. + * + * @throws std::runtime_error if broadcast fails or size mismatch occurs. */ virtual void broadcast(void* data, std::size_t size, Rank root) = 0; diff --git a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp index 10c4f80d4..1c25ba4dc 100644 --- a/cpp/include/rapidsmpf/bootstrap/file_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/file_backend.hpp @@ -42,44 +42,27 @@ class FileBackend : public Backend { ~FileBackend() override; /** - * @brief Store a key-value pair. - * - * @param key Key name. - * @param value Value to store. + * @copydoc Backend::put */ void put(std::string const& key, std::string const& value) override; /** - * @brief Retrieve a value, blocking until available or timeout occurs. - * - * @param key Key name. - * @param timeout Timeout duration. - * @return Value associated with key. + * @copydoc Backend::get */ std::string get(std::string const& key, Duration timeout) override; /** - * @brief Perform a barrier synchronization. - * - * All ranks must call this before any rank proceeds. + * @copydoc Backend::barrier */ void barrier() override; /** - * @brief Ensure all previous put() operations are globally visible. - * - * For FileBackend, this is a no-op since put() operations use atomic - * file writes that are immediately visible to all processes via the - * shared filesystem. + * @copydoc Backend::sync */ void sync() override; /** - * @brief Broadcast data from root to all ranks. 
- * - * @param data Data buffer. - * @param size Size in bytes. - * @param root Root rank. + * @copydoc Backend::broadcast */ void broadcast(void* data, std::size_t size, Rank root) override; diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index a07c251c1..6550b90c0 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -76,65 +76,35 @@ class SlurmBackend : public Backend { SlurmBackend& operator=(SlurmBackend&&) = delete; /** - * @brief Store a key-value pair in the PMIx KVS. - * - * The key-value pair is committed immediately and made visible to other - * ranks via a fence operation. - * - * @param key Key name. - * @param value Value to store. + * @copydoc Backend::put() * * @throws std::runtime_error if PMIx operation fails. */ void put(std::string const& key, std::string const& value) override; /** - * @brief Retrieve a value from the PMIx KVS. - * - * Blocks until the key is available or timeout occurs. Uses polling - * with exponential backoff. - * - * @param key Key name. - * @param timeout Timeout duration. - * @return Value associated with key. - * - * @throws std::runtime_error if key not found within timeout. + * @copydoc Backend::get() */ std::string get(std::string const& key, Duration timeout) override; /** - * @brief Perform a barrier synchronization using PMIx_Fence. - * - * All ranks must call this before any rank proceeds. The fence also - * ensures all committed key-value pairs are visible to all ranks. + * @copydoc Backend::barrier() * * @throws std::runtime_error if PMIx_Fence fails. */ void barrier() override; /** - * @brief Ensure all previous put() operations are globally visible. - * - * For Slurm/PMIx backend, this executes PMIx_Fence to make all committed - * key-value pairs visible across all nodes. 
This is required because - * PMIx_Put + PMIx_Commit only makes data locally visible; PMIx_Fence - * performs the global synchronization and data exchange. + * @copydoc Backend::sync() * * @throws std::runtime_error if PMIx_Fence fails. */ void sync() override; /** - * @brief Broadcast data from root to all ranks. - * - * Root rank publishes data via put(), then all ranks synchronize - * and non-root ranks retrieve the data via get(). + * @copydoc Backend::broadcast() * - * @param data Data buffer (input on root, output on others). - * @param size Size in bytes. - * @param root Root rank. - * - * @throws std::runtime_error if broadcast fails or size mismatch occurs. + * @throws std::runtime_error if PMIx operation fails. */ void broadcast(void* data, std::size_t size, Rank root) override; diff --git a/cpp/src/bootstrap/file_backend.cpp b/cpp/src/bootstrap/file_backend.cpp index c267606ae..5525162cf 100644 --- a/cpp/src/bootstrap/file_backend.cpp +++ b/cpp/src/bootstrap/file_backend.cpp @@ -108,7 +108,11 @@ void FileBackend::barrier() { std::filesystem::remove(my_barrier_file, ec); } -void FileBackend::sync() {} +void FileBackend::sync() { + // For FileBackend, this is a no-op since put() operations use atomic + // file writes that are immediately visible to all processes via the + // shared filesystem. +} void FileBackend::broadcast(void* data, std::size_t size, Rank root) { if (ctx_.rank == root) { diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index 36dc37101..b94b1fb6a 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -269,6 +269,10 @@ void SlurmBackend::barrier() { } void SlurmBackend::sync() { + // For Slurm/PMIx backend, this executes PMIx_Fence to make all committed + // key-value pairs visible across all nodes. This is required because + // PMIx_Put + PMIx_Commit only makes data locally visible; PMIx_Fence + // performs the global synchronization and data exchange. 
pmix_fence_all(nspace_, "sync"); } From acc85fc6958093826d90b0dc2c027f7da00e9a6e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 02:27:02 -0800 Subject: [PATCH 55/57] Use sync instead of barrier --- cpp/tools/rrun.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index ab3fe1a86..8f61eb837 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -1178,8 +1178,7 @@ std::string coordinate_root_address_via_pmix( root_address = root_address_to_publish.value(); } - // Barrier to ensure data exchange - backend->barrier(); + backend->sync(); if (!root_address_to_publish.has_value()) { // Non-root parents retrieve the address From 2891959ff13b4d61348e924f827c14a3ab35308e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 03:39:46 -0800 Subject: [PATCH 56/57] Fix BackendType in Python --- python/rapidsmpf/rapidsmpf/bootstrap/__init__.py | 6 +++--- python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyi | 4 ++-- python/rapidsmpf/rapidsmpf/examples/bulk_mpi_shuffle.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/rapidsmpf/rapidsmpf/bootstrap/__init__.py b/python/rapidsmpf/rapidsmpf/bootstrap/__init__.py index d1e3e5a54..bc92910c0 100644 --- a/python/rapidsmpf/rapidsmpf/bootstrap/__init__.py +++ b/python/rapidsmpf/rapidsmpf/bootstrap/__init__.py @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 """Bootstrap utilities for communicator creation.""" from __future__ import annotations from rapidsmpf.bootstrap.bootstrap import ( - Backend, + BackendType, create_ucxx_comm, is_running_with_rrun, ) -__all__ = ["Backend", "create_ucxx_comm", "is_running_with_rrun"] +__all__ = ["BackendType", "create_ucxx_comm", "is_running_with_rrun"] diff --git a/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyi b/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyi index bf3c0477d..fc5d39b4f 100644 --- a/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyi +++ b/python/rapidsmpf/rapidsmpf/bootstrap/bootstrap.pyi @@ -6,12 +6,12 @@ from enum import IntEnum from rapidsmpf.communicator.communicator import Communicator from rapidsmpf.config import Options -class Backend(IntEnum): +class BackendType(IntEnum): AUTO = ... FILE = ... def create_ucxx_comm( - backend: Backend = ..., + type: BackendType = ..., options: Options | None = ..., ) -> Communicator: ... def is_running_with_rrun() -> bool: ... diff --git a/python/rapidsmpf/rapidsmpf/examples/bulk_mpi_shuffle.py b/python/rapidsmpf/rapidsmpf/examples/bulk_mpi_shuffle.py index 7e6b271f4..efd9193d8 100644 --- a/python/rapidsmpf/rapidsmpf/examples/bulk_mpi_shuffle.py +++ b/python/rapidsmpf/rapidsmpf/examples/bulk_mpi_shuffle.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 """Bulk-synchronous MPI shuffle.""" @@ -299,7 +299,7 @@ def setup_and_run(args: argparse.Namespace) -> None: elif args.cluster_type == "ucxx": if rapidsmpf.bootstrap.is_running_with_rrun(): comm = rapidsmpf.bootstrap.create_ucxx_comm( - backend=rapidsmpf.bootstrap.Backend.AUTO, options=options + type=rapidsmpf.bootstrap.BackendType.AUTO, options=options ) else: comm = ucxx_mpi_setup(options) From 5b9da5c41f67f8454a7fd9dba5e2395fbb4d9e03 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 5 Feb 2026 03:20:27 -0800 Subject: [PATCH 57/57] Remove Slurm hybrid mode --- cpp/include/rapidsmpf/bootstrap/backend.hpp | 2 +- .../rapidsmpf/bootstrap/slurm_backend.hpp | 26 +- cpp/src/bootstrap/bootstrap.cpp | 7 +- cpp/src/bootstrap/slurm_backend.cpp | 4 - cpp/src/bootstrap/ucxx.cpp | 90 +--- cpp/tools/rrun.cpp | 411 ++---------------- 6 files changed, 33 insertions(+), 507 deletions(-) diff --git a/cpp/include/rapidsmpf/bootstrap/backend.hpp b/cpp/include/rapidsmpf/bootstrap/backend.hpp index dd7c842be..540bd64fe 100644 --- a/cpp/include/rapidsmpf/bootstrap/backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/backend.hpp @@ -20,7 +20,7 @@ enum class BackendType { * @brief Automatically detect the best backend based on environment. * * Detection order: - * 1. File-based (if RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS set by rrun) + * 1. File-based (if RAPIDSMPF_COORD_DIR set by rrun) * 2. Slurm/PMIx (if SLURM environment detected) * 3. File-based (default fallback) */ diff --git a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp index 6550b90c0..d232d85eb 100644 --- a/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp +++ b/cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp @@ -30,7 +30,7 @@ namespace rapidsmpf::bootstrap::detail { * * Usage: * ```bash - * # Passthrough: multiple (4) tasks per node, one task per GPU, two nodes. 
+ * # Multiple (4) tasks per node, one task per GPU, two nodes. * srun \ * --mpi=pmix \ * --nodes=2 \ @@ -39,16 +39,6 @@ namespace rapidsmpf::bootstrap::detail { * --gpus-per-task=1 \ * --gres=gpu:4 \ * rrun ./benchmarks/bench_shuffle -C ucxx - * - * # Hybrid mode: one task per node, 4 GPUs per task, two nodes. - * srun \ - * --mpi=pmix \ - * --nodes=2 \ - * --ntasks-per-node=1 \ - * --cpus-per-task=144 \ - * --gpus-per-task=4 \ - * --gres=gpu:4 \ - * rrun -n 4 ./benchmarks/bench_shuffle -C ucxx * ``` */ class SlurmBackend : public Backend { @@ -108,20 +98,6 @@ class SlurmBackend : public Backend { */ void broadcast(void* data, std::size_t size, Rank root) override; - /** - * @brief Explicitly finalize the global PMIx session. - * - * This is useful for scenarios like rrun parent coordination where PMIx - * needs to be finalized before process exit (e.g., after child processes - * complete). If not called explicitly, PMIx will be finalized when the - * process exits via the PmixGlobalState destructor. - * - * This function is safe to call multiple times, subsequent calls are no-ops. - * - * @throws std::runtime_error if PMIx_Finalize fails. - */ - static void finalize_pmix(); - private: Context ctx_; std::size_t barrier_count_{0}; diff --git a/cpp/src/bootstrap/bootstrap.cpp b/cpp/src/bootstrap/bootstrap.cpp index 85a62ed16..f8fa86729 100644 --- a/cpp/src/bootstrap/bootstrap.cpp +++ b/cpp/src/bootstrap/bootstrap.cpp @@ -28,11 +28,8 @@ namespace { */ BackendType detect_backend() { // Check for rrun coordination first (explicit configuration takes priority). - // If RAPIDSMPF_COORD_DIR or RAPIDSMPF_ROOT_ADDRESS is set, rrun is coordinating - // and we should use FILE backend (with or without pre-coordinated address). - if (getenv_optional("RAPIDSMPF_COORD_DIR") - || getenv_optional("RAPIDSMPF_ROOT_ADDRESS")) - { + // If RAPIDSMPF_COORD_DIR is set, rrun is coordinating and we should use FILE backend. 
+ if (getenv_optional("RAPIDSMPF_COORD_DIR")) { return BackendType::FILE; } diff --git a/cpp/src/bootstrap/slurm_backend.cpp b/cpp/src/bootstrap/slurm_backend.cpp index b94b1fb6a..9c065ff30 100644 --- a/cpp/src/bootstrap/slurm_backend.cpp +++ b/cpp/src/bootstrap/slurm_backend.cpp @@ -304,10 +304,6 @@ void SlurmBackend::broadcast(void* data, std::size_t size, Rank root) { barrier(); } -void SlurmBackend::finalize_pmix() { - PmixGlobalState::instance().finalize(); -} - } // namespace rapidsmpf::bootstrap::detail #endif // RAPIDSMPF_HAVE_SLURM diff --git a/cpp/src/bootstrap/ucxx.cpp b/cpp/src/bootstrap/ucxx.cpp index 8fc788d03..6f458a9fe 100644 --- a/cpp/src/bootstrap/ucxx.cpp +++ b/cpp/src/bootstrap/ucxx.cpp @@ -8,14 +8,10 @@ #ifdef RAPIDSMPF_HAVE_UCXX #include -#include -#include #include -#include #include #include -#include // for unsetenv #include #include @@ -25,36 +21,6 @@ namespace rapidsmpf::bootstrap { -namespace { -// Hex encoding for binary-safe address transmission -std::string hex_encode(std::string const& input) { - static constexpr const char* hex_chars = "0123456789abcdef"; - std::string result; - result.reserve(input.size() * 2); - for (char ch : input) { - auto c = static_cast(ch); - result.push_back(hex_chars[c >> 4]); - result.push_back(hex_chars[c & 0x0F]); - } - return result; -} - -std::string hex_decode(std::string const& input) { - std::string result; - result.reserve(input.size() / 2); - for (size_t i = 0; i < input.size(); i += 2) { - auto high = static_cast( - (input[i] >= 'a') ? (input[i] - 'a' + 10) : (input[i] - '0') - ); - auto low = static_cast( - (input[i + 1] >= 'a') ? 
(input[i + 1] - 'a' + 10) : (input[i + 1] - '0') - ); - result.push_back(static_cast((high << 4) | low)); - } - return result; -} -} // namespace - std::shared_ptr create_ucxx_comm(BackendType type, config::Options options) { auto ctx = init(type); @@ -63,57 +29,8 @@ std::shared_ptr create_ucxx_comm(BackendType type, config::Options o std::shared_ptr comm; - auto precomputed_address_encoded = getenv_optional("RAPIDSMPF_ROOT_ADDRESS"); - auto address_file = getenv_optional("RAPIDSMPF_ROOT_ADDRESS_FILE"); - - // Path 1: Early address mode for root rank in Slurm hybrid mode. - // Rank 0 is launched first to create its address and write it to a file. - // Parent will coordinate with other parents via PMIx, then launch worker ranks - // with RAPIDSMPF_ROOT_ADDRESS set. No PMIx put/barrier/get bootstrap coordination. - if (ctx.rank == 0 && address_file.has_value()) { - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, std::nullopt, options); - comm = std::make_shared(std::move(ucxx_initialized_rank), options); - - auto listener_address = comm->listener_address(); - auto root_worker_address_str = - std::get>(listener_address.address) - ->getString(); - - std::string encoded_address = hex_encode(root_worker_address_str); - std::ofstream addr_file(*address_file); - if (!addr_file) { - throw std::runtime_error( - "Failed to write root address to file: " + *address_file - ); - } - addr_file << encoded_address << std::endl; - addr_file.close(); - - auto verbose = getenv_optional("RAPIDSMPF_VERBOSE"); - if (verbose && *verbose == "1") { - std::cerr << "[rank 0] Wrote address to " << *address_file - << ", skipping bootstrap coordination" << std::endl; - } - - // Unset the flag so rank 0 participates in the final barrier - unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); - } - // Path 2: Slurm hybrid mode for non-root ranks. - // Parent process already coordinated the root address via PMIx and provided it - // via RAPIDSMPF_ROOT_ADDRESS environment variable (hex-encoded). 
- else if (precomputed_address_encoded.has_value() && ctx.rank != 0) - { - std::string precomputed_address = hex_decode(*precomputed_address_encoded); - auto root_worker_address = ::ucxx::createAddressFromString(precomputed_address); - auto ucxx_initialized_rank = - ucxx::init(nullptr, ctx.nranks, root_worker_address, options); - comm = std::make_shared(std::move(ucxx_initialized_rank), options); - } - // Path 3: Normal bootstrap mode for root rank. - // Create listener and publish address via put() for non-root ranks to retrieve. - else if (ctx.rank == 0) - { + // Root rank: Create listener and publish address via put() for non-root ranks. + if (ctx.rank == 0) { auto ucxx_initialized_rank = ucxx::init(nullptr, ctx.nranks, std::nullopt, options); comm = std::make_shared(std::move(ucxx_initialized_rank), options); @@ -126,8 +43,7 @@ std::shared_ptr create_ucxx_comm(BackendType type, config::Options o put(ctx, "ucxx_root_address", root_worker_address_str); sync(ctx); } - // Path 4: Normal bootstrap mode for non-root ranks. - // Retrieve root address via get() and connect. + // Non-root ranks: Retrieve root address via get() and connect. 
else { sync(ctx); diff --git a/cpp/tools/rrun.cpp b/cpp/tools/rrun.cpp index 8f61eb837..3d2ecc460 100644 --- a/cpp/tools/rrun.cpp +++ b/cpp/tools/rrun.cpp @@ -16,11 +16,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -58,22 +56,8 @@ namespace { struct Config; [[noreturn]] void execute_slurm_passthrough_mode(Config const& cfg); int execute_single_node_mode(Config& cfg); -#ifdef RAPIDSMPF_HAVE_SLURM -int execute_slurm_hybrid_mode(Config& cfg); -std::string launch_rank0_and_get_address( - Config const& cfg, std::string const& address_file, int total_ranks -); -std::string coordinate_root_address_via_pmix( - std::optional const& root_address_to_publish, bool verbose -); -#endif int launch_ranks_fork_based( - Config const& cfg, - int rank_offset, - int ranks_per_task, - int total_ranks, - std::optional const& root_address, - bool is_root_parent + Config const& cfg, int rank_offset, int ranks_per_task, int total_ranks ); [[noreturn]] void exec_application(Config const& cfg); pid_t launch_rank_local( @@ -81,7 +65,6 @@ pid_t launch_rank_local( int global_rank, int local_rank, int total_ranks, - std::optional const& root_address, int* out_fd_stdout, int* out_fd_stderr ); @@ -197,11 +180,7 @@ void print_usage(std::string_view prog_name) { << "Slurm Options:\n" << " --slurm Run in Slurm mode (auto-detected when SLURM_JOB_ID is " << " set)\n" - << " Two sub-modes:\n" - << " 1. Passthrough (no -n): Apply bindings and exec\n" - << " 2. 
Hybrid (with -n): Launch N ranks per Slurm task\n" - << " In hybrid mode, each Slurm task launches multiple\n" - << " ranks with coordinated global rank numbering\n\n" + << " Applies topology bindings and executes application\n\n" << "Common Options:\n" << " -d Coordination directory (default: /tmp/rrun_)\n" << " Not applicable in Slurm mode\n" @@ -230,14 +209,10 @@ void print_usage(std::string_view prog_name) { << " rrun -n 2 -x UCX_TLS=cuda_copy,cuda_ipc,rc,tcp -x MY_VAR=value " "./bench_comm\n\n" << "Slurm Examples:\n" - << " # Passthrough: multiple (4) tasks per node, one task per GPU, two nodes.\n" + << " # Multiple (4) tasks per node, one task per GPU, two nodes.\n" << " srun --mpi=pmix --nodes=2 --ntasks-per-node=4 --cpus-per-task=36 \\\n" << " --gpus-per-task=1 --gres=gpu:4 \\\n" << " rrun ./benchmarks/bench_shuffle -C ucxx\n\n" - << " # Hybrid mode: one task per node, 4 GPUs per task, two nodes.\n" - << " srun --mpi=pmix --nodes=2 --ntasks-per-node=1 --cpus-per-task=144 \\\n" - << " --gpus-per-task=4 --gres=gpu:4 \\\n" - << " rrun -n 4 ./benchmarks/bench_shuffle -C ucxx\n\n" << std::endl; } @@ -628,12 +603,11 @@ Config parse_args(int argc, char* argv[]) { ); } - // In Slurm mode: - // - If -n is specified: launch N ranks per Slurm task (hybrid mode) - // - If -n is not specified: just apply bindings and exec (passthrough mode, - // one rank per task) - if (cfg.nranks <= 0) { - cfg.nranks = 1; + // In Slurm mode, -n flag is not used (one rank per task) + if (cfg.nranks != 1) { + throw std::runtime_error( + "-n flag is not supported in Slurm mode. Use srun to control task count." + ); } } else { // Single-node mode validation @@ -779,19 +753,13 @@ pid_t fork_with_piped_stdio( /** * @brief Common helper to set up coordination, launch ranks, and cleanup. * - * This function encapsulates the common workflow shared by both Slurm hybrid mode - * and single-node mode: create coordination directory, launch ranks via fork, - * cleanup, and report results. 
- * - * A task here denotes a Slurm unit of execution, e.g., a single instance of a - * program or process, e.g., an instance of the `rrun` executable itself. + * This function encapsulates the common workflow for single-node mode: + * create coordination directory, launch ranks via fork, cleanup, and report results. * * @param cfg Configuration (will modify coord_dir if empty). * @param rank_offset Starting global rank for this task. * @param ranks_per_task Number of ranks to launch locally. * @param total_ranks Total ranks across all tasks. - * @param root_address Pre-coordinated root address (empty for FILE backend). - * @param is_root_parent Whether this is root parent (affects launch logic). * @param coord_dir_hint Hint for coordination directory name (e.g., job ID). * @return Exit status (0 for success). */ @@ -800,8 +768,6 @@ int setup_launch_and_cleanup( int rank_offset, int ranks_per_task, int total_ranks, - std::optional const& root_address, - bool is_root_parent, std::string const& coord_dir_hint = "" ) { // Set up coordination directory @@ -815,9 +781,8 @@ int setup_launch_and_cleanup( std::filesystem::create_directories(cfg.coord_dir); // Launch ranks and wait for completion - int exit_status = launch_ranks_fork_based( - cfg, rank_offset, ranks_per_task, total_ranks, root_address, is_root_parent - ); + int exit_status = + launch_ranks_fork_based(cfg, rank_offset, ranks_per_task, total_ranks); if (cfg.cleanup) { if (cfg.verbose) { @@ -869,90 +834,6 @@ int setup_launch_and_cleanup( _exit(1); } -#ifdef RAPIDSMPF_HAVE_SLURM -/** - * @brief Execute application in Slurm hybrid mode with PMIx coordination. - * - * Root parent launches rank 0 first to get address, coordinates via PMIx, then parents - * on all nodes launch their remaining ranks. Uses fork-based execution. - * - * @param cfg Configuration. - * @return Exit status (0 for success). 
- */ -int execute_slurm_hybrid_mode(Config& cfg) { - if (cfg.verbose) { - std::cout << "[rrun] Slurm hybrid mode: task " << cfg.slurm_global_rank - << " launching " << cfg.nranks << " ranks per task" << std::endl; - std::cout << "[rrun] Using PMIx for parent coordination (no file I/O)" - << std::endl; - } - - // Set up coordination directory FIRST (needed by rank 0 when it's launched early) - char const* job_id = std::getenv("SLURM_JOB_ID"); - if (cfg.coord_dir.empty()) { - if (job_id) { - cfg.coord_dir = "/tmp/rrun_slurm_" + std::string{job_id}; - } else { - cfg.coord_dir = "/tmp/rrun_" + generate_session_id(); - } - } - std::filesystem::create_directories(cfg.coord_dir); - - // Root parent needs to launch rank 0 first to get address - bool is_root_parent = (cfg.slurm_global_rank == 0); - - // Coordinate root address with other nodes via PMIx - int slurm_ntasks = cfg.slurm_ntasks > 0 ? cfg.slurm_ntasks : 1; - int total_ranks = slurm_ntasks * cfg.nranks; - std::string encoded_root_address, coordinated_root_address; - - if (is_root_parent) { - // Root parent: Launch rank 0, get address, coordinate via PMIx - std::string address_file = - "/tmp/rapidsmpf_root_address_" + std::string{job_id ? job_id : "unknown"}; - encoded_root_address = - launch_rank0_and_get_address(cfg, address_file, total_ranks); - coordinated_root_address = - coordinate_root_address_via_pmix(encoded_root_address, cfg.verbose); - } else { - // Non-root parent: Get address from root via PMIx - coordinated_root_address = - coordinate_root_address_via_pmix(std::nullopt, cfg.verbose); - } - - unsetenv("RAPIDSMPF_ROOT_ADDRESS_FILE"); - - int rank_offset = cfg.slurm_global_rank * cfg.nranks; - - if (cfg.verbose) { - std::cout << "[rrun] Task " << cfg.slurm_global_rank << " launching ranks " - << rank_offset << "-" << (rank_offset + cfg.nranks - 1) - << " (total: " << total_ranks << " ranks)" << std::endl; - } - - std::string coord_hint = job_id ? 
("slurm_" + std::string{job_id}) : ""; - int exit_status = setup_launch_and_cleanup( - cfg, - rank_offset, - cfg.nranks, - total_ranks, - coordinated_root_address, - is_root_parent, - coord_hint - ); - - // Finalize PMIx session used for parent coordination - if (!coordinated_root_address.empty()) { - if (cfg.verbose) { - std::cout << "[rrun] Finalizing PMIx in parent" << std::endl; - } - rapidsmpf::bootstrap::detail::SlurmBackend::finalize_pmix(); - } - - return exit_status; -} -#endif // RAPIDSMPF_HAVE_SLURM - /** * @brief Execute application in single-node mode with FILE backend. * @@ -967,11 +848,11 @@ int execute_single_node_mode(Config& cfg) { << std::endl; } - return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks, std::nullopt, false); + return setup_launch_and_cleanup(cfg, 0, cfg.nranks, cfg.nranks); } /** - * @brief Execute application in Slurm passthrough mode (single rank per task). + * @brief Execute application in Slurm mode (single rank per task). * * Applies topology bindings and executes the application directly without forking. * This function never returns - it either replaces the current process or exits on error. 
@@ -980,8 +861,7 @@ int execute_single_node_mode(Config& cfg) { */ [[noreturn]] void execute_slurm_passthrough_mode(Config const& cfg) { if (cfg.verbose) { - std::cout << "[rrun] Slurm passthrough mode: applying bindings and exec'ing" - << std::endl; + std::cout << "[rrun] Slurm mode: applying bindings and exec'ing" << std::endl; } // Set rrun coordination environment variables so the application knows @@ -996,7 +876,7 @@ int execute_single_node_mode(Config& cfg) { setenv("CUDA_VISIBLE_DEVICES", std::to_string(gpu_id).c_str(), 1); if (cfg.verbose) { - std::cout << "[rrun] Slurm task (passthrough) local_id=" << cfg.slurm_local_id + std::cout << "[rrun] Slurm task local_id=" << cfg.slurm_local_id << " assigned to GPU " << gpu_id << std::endl; } } @@ -1011,213 +891,17 @@ int execute_single_node_mode(Config& cfg) { exec_application(cfg); } -#ifdef RAPIDSMPF_HAVE_SLURM -/** - * @brief Launch rank 0 first to obtain its UCXX root address. - * - * @param cfg Configuration. - * @param address_file Path to file where rank 0 will write its address. - * @param total_ranks Total number of ranks across all tasks. - * @return Hex-encoded root address. - * - * @throws std::runtime_error on timeout or launch failure. 
- */ -std::string launch_rank0_and_get_address( - Config const& cfg, std::string const& address_file, int total_ranks -) { - if (cfg.verbose) { - std::cout << "[rrun] Root parent: launching rank 0 first to get address" - << std::endl; - } - - setenv("RAPIDSMPF_ROOT_ADDRESS_FILE", address_file.c_str(), 1); - - int fd_out = -1, fd_err = -1; - pid_t rank0_pid = - launch_rank_local(cfg, 0, 0, total_ranks, std::nullopt, &fd_out, &fd_err); - - // Start forwarders for rank 0 output - std::thread rank0_stdout_forwarder; - std::thread rank0_stderr_forwarder; - auto suppress = std::make_shared>(false); - - if (fd_out >= 0) { - rank0_stdout_forwarder = std::thread([fd_out, suppress]() { - FILE* stream = fdopen(fd_out, "r"); - if (!stream) { - close(fd_out); - return; - } - char buffer[4096]; - while (fgets(buffer, sizeof(buffer), stream) != nullptr) { - if (suppress->load()) - continue; - std::lock_guard lock(output_mutex); - fputs(buffer, stdout); - fflush(stdout); - } - fclose(stream); - }); - } - - if (fd_err >= 0) { - rank0_stderr_forwarder = std::thread([fd_err, suppress]() { - FILE* stream = fdopen(fd_err, "r"); - if (!stream) { - close(fd_err); - return; - } - char buffer[4096]; - while (fgets(buffer, sizeof(buffer), stream) != nullptr) { - if (suppress->load()) - continue; - std::lock_guard lock(output_mutex); - fputs(buffer, stderr); - fflush(stderr); - } - fclose(stream); - }); - } - - // Wait for rank 0 to write the address file (with timeout) - auto start = std::chrono::steady_clock::now(); - while (!std::filesystem::exists(address_file)) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - auto elapsed = std::chrono::steady_clock::now() - start; - if (elapsed > std::chrono::seconds(30)) { - suppress->store(true); - kill(rank0_pid, SIGKILL); - waitpid(rank0_pid, nullptr, 0); - if (rank0_stdout_forwarder.joinable()) - rank0_stdout_forwarder.join(); - if (rank0_stderr_forwarder.joinable()) - rank0_stderr_forwarder.join(); - throw 
std::runtime_error("Timeout waiting for rank 0 to write root address"); - } - } - - // Read the hex-encoded address and remove file - std::string encoded_address; - std::ifstream addr_stream(address_file); - std::getline(addr_stream, encoded_address); - addr_stream.close(); - std::filesystem::remove(address_file); - - if (cfg.verbose) { - std::cout << "[rrun] Got root address from rank 0 (hex-encoded, " - << encoded_address.size() << " chars)" << std::endl; - } - - // Rank 0 is already running - detach forwarders - if (rank0_stdout_forwarder.joinable()) - rank0_stdout_forwarder.detach(); - if (rank0_stderr_forwarder.joinable()) - rank0_stderr_forwarder.detach(); - - return encoded_address; -} - -/** - * @brief Coordinate root address between parent processes using SlurmBackend. - * - * This function is called by parent rrun processes in Slurm hybrid mode. - * The root parent (SLURM_PROCID=0) publishes the root address, and non-root - * parents retrieve it. - * - * @param root_address_to_publish Root address to publish. If set (has_value()), this is - * the root parent and it will publish. If empty (nullopt), - * this is a non-root parent and it will retrieve. - * @param verbose Whether to print debug messages. - * @return Root address (either published or retrieved). - * - * @throws std::runtime_error on coordination errors. 
- */ -std::string coordinate_root_address_via_pmix( - std::optional const& root_address_to_publish, bool verbose -) { - // Get Slurm rank information for parent coordination - char const* slurm_procid = std::getenv("SLURM_PROCID"); - char const* slurm_ntasks = std::getenv("SLURM_NTASKS"); - - if (!slurm_procid || !slurm_ntasks) { - throw std::runtime_error( - "SLURM_PROCID and SLURM_NTASKS must be set for parent coordination" - ); - } - - int parent_rank = std::stoi(slurm_procid); - int parent_nranks = std::stoi(slurm_ntasks); - - // Create SlurmBackend for parent-level coordination - rapidsmpf::bootstrap::Context parent_ctx{ - parent_rank, - parent_nranks, - rapidsmpf::bootstrap::BackendType::SLURM, - std::nullopt, - nullptr - }; - - auto backend = - std::make_shared(parent_ctx); - - if (verbose) { - std::cout << "[rrun] Parent coordination initialized: rank " << parent_rank - << " of " << parent_nranks << std::endl; - } - - std::string root_address; - - if (root_address_to_publish.has_value()) { - // Root parent publishes the address (already hex-encoded for binary safety) - if (verbose) { - std::cout << "[rrun] Publishing root address via SlurmBackend (hex-encoded, " - << root_address_to_publish.value().size() << " chars)" << std::endl; - } - - backend->put("rapidsmpf_root_address", root_address_to_publish.value()); - root_address = root_address_to_publish.value(); - } - - backend->sync(); - - if (!root_address_to_publish.has_value()) { - // Non-root parents retrieve the address - root_address = backend->get("rapidsmpf_root_address", std::chrono::seconds{30}); - - if (verbose) { - std::cout << "[rrun] Retrieved root address via SlurmBackend (hex-encoded, " - << root_address.size() << " chars)" << std::endl; - } - } - - // Note: PMIx session will be explicitly finalized after children complete - // (see execute_slurm_hybrid_mode where finalize_pmix() is called) - - return root_address; -} -#endif // RAPIDSMPF_HAVE_SLURM - /** * @brief Launch multiple ranks 
locally using fork. * - * A task here denotes a Slurm unit of execution, e.g., a single instance of a - * program or process, e.g., an instance of the `rrun` executable itself. - * * @param cfg Configuration. * @param rank_offset Starting global rank for this task. * @param ranks_per_task Number of ranks to launch. * @param total_ranks Total ranks across all tasks. - * @param root_address Pre-coordinated root address (empty for FILE backend). - * @param is_root_parent Whether this is root parent (affects which ranks to launch). * @return Exit status (0 for success). */ int launch_ranks_fork_based( - Config const& cfg, - int rank_offset, - int ranks_per_task, - int total_ranks, - std::optional const& root_address, - bool is_root_parent + Config const& cfg, int rank_offset, int ranks_per_task, int total_ranks ) { std::vector pids; pids.reserve(static_cast(ranks_per_task)); @@ -1266,15 +950,13 @@ int launch_ranks_fork_based( }); }; - // Launch ranks (skip rank 0 if root parent already launched it) - int start_local_rank = (is_root_parent && root_address.has_value()) ? 1 : 0; - - for (int local_rank = start_local_rank; local_rank < ranks_per_task; ++local_rank) { + // Launch ranks + for (int local_rank = 0; local_rank < ranks_per_task; ++local_rank) { int global_rank = rank_offset + local_rank; int fd_out = -1; int fd_err = -1; pid_t pid = launch_rank_local( - cfg, global_rank, local_rank, total_ranks, root_address, &fd_out, &fd_err + cfg, global_rank, local_rank, total_ranks, &fd_out, &fd_err ); pids.push_back(pid); @@ -1328,18 +1010,14 @@ int launch_ranks_fork_based( if (WIFEXITED(status)) { int code = WEXITSTATUS(status); if (code != 0) { - std::cerr << "[rrun] Rank " - << (static_cast(rank_offset) - + (is_root_parent && root_address.has_value() ? 
i + 1 : i)) + std::cerr << "[rrun] Rank " << (static_cast(rank_offset) + i) << " (PID " << pid << ") exited with code " << code << std::endl; exit_status = code; } } else if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - std::cerr << "[rrun] Rank " - << (static_cast(rank_offset) - + (is_root_parent && root_address.has_value() ? i + 1 : i)) + std::cerr << "[rrun] Rank " << (static_cast(rank_offset) + i) << " (PID " << pid << ") terminated by signal " << sig << std::endl; exit_status = 128 + sig; } @@ -1362,7 +1040,6 @@ int launch_ranks_fork_based( * @param global_rank Global rank number (used for RAPIDSMPF_RANK). * @param local_rank Local rank for GPU assignment (defaults to global_rank). * @param total_ranks Total number of ranks across all tasks (used for RAPIDSMPF_NRANKS). - * @param root_address Optional pre-coordinated root address (for hybrid mode). * @param out_fd_stdout Output file descriptor for stdout. * @param out_fd_stderr Output file descriptor for stderr. * @return Child process PID. 
@@ -1372,7 +1049,6 @@ pid_t launch_rank_local( int global_rank, int local_rank, int total_ranks, - std::optional const& root_address, int* out_fd_stdout, int* out_fd_stderr ) { @@ -1380,17 +1056,12 @@ pid_t launch_rank_local( int captured_global_rank = global_rank; int captured_local_rank = local_rank; int captured_total_ranks = total_ranks; - std::optional captured_root_address = root_address; return fork_with_piped_stdio( out_fd_stdout, out_fd_stderr, /*combine_stderr*/ false, - [&cfg, - captured_global_rank, - captured_local_rank, - captured_total_ranks, - captured_root_address]() { + [&cfg, captured_global_rank, captured_local_rank, captured_total_ranks]() { // Set custom environment variables first (can be overridden by specific vars) for (auto const& env_pair : cfg.env_vars) { setenv(env_pair.first.c_str(), env_pair.second.c_str(), 1); @@ -1399,29 +1070,12 @@ pid_t launch_rank_local( setenv("RAPIDSMPF_RANK", std::to_string(captured_global_rank).c_str(), 1); setenv("RAPIDSMPF_NRANKS", std::to_string(captured_total_ranks).c_str(), 1); - // Always set coord_dir for bootstrap initialization - // (needed even if using RAPIDSMPF_ROOT_ADDRESS for coordination) + // Set coordination directory for bootstrap initialization if (!cfg.coord_dir.empty()) { setenv("RAPIDSMPF_COORD_DIR", cfg.coord_dir.c_str(), 1); } - // If root address was pre-coordinated by parent, set it (already hex-encoded) - // This allows children to skip bootstrap coordination entirely - if (captured_root_address.has_value()) { - setenv("RAPIDSMPF_ROOT_ADDRESS", captured_root_address->c_str(), 1); - } - - // In Slurm hybrid mode, unset Slurm/PMIx rank variables to avoid confusion - // Children should not try to initialize PMIx themselves - if (cfg.slurm_mode) { - unsetenv("SLURM_PROCID"); - unsetenv("SLURM_LOCALID"); - unsetenv("PMIX_RANK"); - unsetenv("PMIX_NAMESPACE"); - } - // Set CUDA_VISIBLE_DEVICES if GPUs are available - // Use local_rank for GPU assignment (for Slurm hybrid mode) int gpu_id = 
-1; if (!cfg.gpus.empty()) { gpu_id = @@ -1503,21 +1157,8 @@ int main(int argc, char* argv[]) { } if (cfg.slurm_mode) { - if (cfg.nranks == 1) { - // Slurm passthrough mode: single rank per task, no forking - execute_slurm_passthrough_mode(cfg); - } - // Slurm hybrid mode: multiple ranks per task with PMIx coordination -#ifdef RAPIDSMPF_HAVE_SLURM - return execute_slurm_hybrid_mode(cfg); -#else - std::cerr << "[rrun] Error: Slurm hybrid mode requires PMIx support but " - << "rapidsmpf was not built with PMIx." << std::endl; - std::cerr << "[rrun] Rebuild with -DBUILD_SLURM_SUPPORT=ON or use " - "passthrough mode " - << "(without -n flag)." << std::endl; - return 1; -#endif + // Slurm mode: single rank per task, apply bindings and exec + execute_slurm_passthrough_mode(cfg); } else { // Single-node mode with file backend return execute_single_node_mode(cfg);