diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile new file mode 100644 index 00000000..fe3e707f --- /dev/null +++ b/.cscs-ci/container/build.Containerfile @@ -0,0 +1,20 @@ +ARG DEPS_IMAGE +FROM $DEPS_IMAGE + +COPY . /oomph +WORKDIR /oomph + +ARG BACKEND +ARG NUM_PROCS +RUN spack -e ci build-env oomph -- \ + cmake -G Ninja -B build \ + -DCMAKE_BUILD_TYPE=Debug \ + -DOOMPH_WITH_TESTING=ON \ + -DOOMPH_WITH_$(echo $BACKEND | tr '[:lower:]' '[:upper:]')=ON \ + -DOOMPH_USE_BUNDLED_LIBS=ON \ + -DOOMPH_USE_BUNDLED_HWMALLOC=OFF \ + -DMPIEXEC_EXECUTABLE="" \ + -DMPIEXEC_NUMPROC_FLAG="" \ + -DMPIEXEC_PREFLAGS="" \ + -DMPIEXEC_POSTFLAGS="" && \ + spack -e ci build-env oomph -- cmake --build build -j$NUM_PROCS diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile new file mode 100644 index 00000000..f5867ac5 --- /dev/null +++ b/.cscs-ci/container/deps.Containerfile @@ -0,0 +1,24 @@ +ARG BASE_IMAGE +FROM $BASE_IMAGE + +ARG SPACK_SHA +RUN mkdir -p /opt/spack && \ + curl -fLsS "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack + +ENV PATH="/opt/spack/bin:$PATH" + +ARG SPACK_PACKAGES_SHA +RUN mkdir -p /opt/spack-packages && \ + curl -fLsS "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages + +RUN spack repo remove --scope defaults:base builtin && \ + spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin + +ARG SPACK_ENV_FILE +COPY $SPACK_ENV_FILE /spack_environment/spack.yaml + +ARG NUM_PROCS +RUN spack external find --all && \ + spack env create ci /spack_environment/spack.yaml && \ + spack -e ci concretize -f && \ + spack -e ci install --jobs $NUM_PROCS --fail-fast --only=dependencies diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml new file mode 100644 index 00000000..c88a4522 --- /dev/null +++ b/.cscs-ci/default.yaml @@ -0,0 +1,192 @@ 
+include: + - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' + +variables: + BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps4-dev + SPACK_SHA: v1.1.1 + SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 + FF_TIMESTAMPS: true + +.build_deps_template: + timeout: 1 hour + before_script: + - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true + - export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16` + - export ENV_FILE_SHA=`sha256sum ${SPACK_ENV_FILE} | head -c 16` + - export CONFIG_TAG=`echo $DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA | sha256sum - | head -c 16` + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-spack-deps-$BACKEND:$CONFIG_TAG + - echo -e "CONFIG_TAG=$CONFIG_TAG" >> base-${BACKEND}.env + - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env + variables: + DOCKERFILE: .cscs-ci/container/deps.Containerfile + DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + SPACK_ENV_FILE: .cscs-ci/spack/$BACKEND.yaml + artifacts: + reports: + dotenv: base-${BACKEND}.env + +# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 +# build_deps_nccl: +# variables: +# BACKEND: nccl +# extends: +# - .container-builder-cscs-gh200 +# - .build_deps_template + +build_deps_mpi: + variables: + BACKEND: mpi + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + +build_deps_ucx: + variables: + BACKEND: ucx + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + +build_deps_libfabric: + variables: + BACKEND: libfabric + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + +.build_template: + extends: .container-builder-cscs-gh200 + timeout: 15 minutes + before_script: + - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME 
--password-stdin || true + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-build-$BACKEND:$CI_COMMIT_SHA + - echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env + variables: + DOCKERFILE: .cscs-ci/container/build.Containerfile + DOCKER_BUILD_ARGS: '["DEPS_IMAGE", "BACKEND"]' + artifacts: + reports: + dotenv: build-${BACKEND}.env + +# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 +# build_nccl: +# variables: +# BACKEND: nccl +# extends: .build_template +# needs: +# - job: build_deps_nccl +# artifacts: true + +build_mpi: + variables: + BACKEND: mpi + extends: .build_template + needs: + - job: build_deps_mpi + artifacts: true + +build_ucx: + variables: + BACKEND: ucx + extends: .build_template + needs: + - job: build_deps_ucx + artifacts: true + +build_libfabric: + variables: + BACKEND: libfabric + extends: .build_template + needs: + - job: build_deps_libfabric + artifacts: true + +.test_template_base: + extends: .container-runner-clariden-gh200 + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_GPUS_PER_TASK: 1 + SLURM_TIMELIMIT: '5:00' + SLURM_PARTITION: normal + SLURM_MPI_TYPE: pmix + SLURM_NETWORK: disable_rdzv_get + SLURM_LABELIO: 1 + SLURM_UNBUFFEREDIO: 1 + PMIX_MCA_psec: native + PMIX_MCA_gds: "^shmem2" + USE_MPI: NO + +.test_serial_template: + extends: .test_template_base + variables: + SLURM_NTASKS: 1 + script: + - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 + +.test_parallel_template: + extends: .test_template_base + variables: + SLURM_NTASKS: 4 + script: + # All ranks write to ctest files in Testing, but this can deadlock when + # writing inside the container. 
+ - if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir -p /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; fi + - sleep 1 + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 + +# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 +# test_serial_nccl: +# extends: .test_serial_template +# needs: +# - job: build_nccl +# artifacts: true +# image: $BUILD_IMAGE + +# test_parallel_nccl: +# extends: .test_parallel_template +# needs: +# - job: build_nccl +# artifacts: true +# image: $BUILD_IMAGE + +test_serial_mpi: + extends: .test_serial_template + needs: + - job: build_mpi + artifacts: true + image: $BUILD_IMAGE + +test_parallel_mpi: + extends: .test_parallel_template + needs: + - job: build_mpi + artifacts: true + image: $BUILD_IMAGE + +test_serial_ucx: + extends: .test_serial_template + needs: + - job: build_ucx + artifacts: true + image: $BUILD_IMAGE + +test_parallel_ucx: + extends: .test_parallel_template + needs: + - job: build_ucx + artifacts: true + image: $BUILD_IMAGE + +test_serial_libfabric: + extends: .test_serial_template + needs: + - job: build_libfabric + artifacts: true + image: $BUILD_IMAGE + +test_parallel_libfabric: + extends: .test_parallel_template + needs: + - job: build_libfabric + artifacts: true + image: $BUILD_IMAGE diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml new file mode 100644 index 00000000..fac7f88f --- /dev/null +++ b/.cscs-ci/spack/libfabric.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph@main backend=libfabric +cuda + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml new file mode 100644 index 00000000..d59aab13 --- /dev/null +++ b/.cscs-ci/spack/mpi.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph@main backend=mpi +cuda + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml new file mode 100644 index 00000000..94f0dd31 --- 
/dev/null +++ b/.cscs-ci/spack/nccl.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph@main backend=nccl +cuda + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml new file mode 100644 index 00000000..51377dd8 --- /dev/null +++ b/.cscs-ci/spack/ucx.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph@main backend=ucx +cuda + view: false + concretizer: + unify: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 90a582d1..3db53422 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,6 @@ cmake_minimum_required(VERSION 3.17) # CMake version is set at 3.17 because of find_package(CUDAToolkit) -if (NOT ${CMAKE_VERSION} VERSION_LESS 3.27) - # new in 3.27: additionally use uppercase _ROOT - # environment and CMake variables for find_package - cmake_policy(SET CMP0144 NEW) -endif() - set(OOMPH_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") list(APPEND CMAKE_MODULE_PATH "${OOMPH_MODULE_PATH}") @@ -28,6 +22,7 @@ endfunction() set_policy(CMP0074 NEW) # find_package uses XXX_ROOT vars using PackageName set_policy(CMP0144 NEW) # find_package allows XXX_ROOT vars using PACKAGENAME Uppercase +set_policy(CMP0167 NEW) # find_package uses new boost config (boost 1.70 onwards) # --------------------------------------------------------------------- # CMake setup, C++ version, build type, modules, etc @@ -92,7 +87,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/oomph/config.hpp @ONLY) install(FILES ${PROJECT_BINARY_DIR}/include/oomph/config.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/oomph) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_config.inc.in +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_config.inc.in ${CMAKE_CURRENT_BINARY_DIR}/include/oomph/cmake_config.inc) # --------------------------------------------------------------------- diff --git a/cmake/config.hpp.in b/cmake/config.hpp.in index 458b038a..e9fcf5e4 100644 --- a/cmake/config.hpp.in +++ 
b/cmake/config.hpp.in @@ -26,9 +26,12 @@ #cmakedefine01 OOMPH_USE_FAST_PIMPL #cmakedefine01 OOMPH_ENABLE_BARRIER + +// clang-format off #define OOMPH_RECURSION_DEPTH @OOMPH_RECURSION_DEPTH@ #define OOMPH_VERSION @OOMPH_VERSION_NUMERIC@ #define OOMPH_VERSION_MAJOR @OOMPH_VERSION_MAJOR@ #define OOMPH_VERSION_MINOR @OOMPH_VERSION_MINOR@ #define OOMPH_VERSION_PATCH @OOMPH_VERSION_PATCH@ +// clang-format on diff --git a/cmake/oomph_defs.hpp.in b/cmake/oomph_defs.hpp.in index 70ae8732..a52a943f 100644 --- a/cmake/oomph_defs.hpp.in +++ b/cmake/oomph_defs.hpp.in @@ -15,7 +15,9 @@ namespace oomph { namespace fort { + // clang-format off using fp_type = @OOMPH_FORTRAN_FP@; + // clang-format on typedef enum { OomphBarrierGlobal=1, OomphBarrierThread=2, diff --git a/cmake/oomph_libfabric.cmake b/cmake/oomph_libfabric.cmake index 758f3f4d..1ddaf71d 100644 --- a/cmake/oomph_libfabric.cmake +++ b/cmake/oomph_libfabric.cmake @@ -1,176 +1,196 @@ # set all libfabric related options and values -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # Enable libfabric support -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ set(OOMPH_WITH_LIBFABRIC OFF CACHE BOOL "Build with LIBFABRIC backend") -if (OOMPH_WITH_LIBFABRIC) - find_package(Libfabric REQUIRED) - add_library(oomph_libfabric SHARED) - add_library(oomph::libfabric ALIAS oomph_libfabric) - oomph_shared_lib_options(oomph_libfabric) - target_link_libraries(oomph_libfabric PUBLIC libfabric::libfabric) - install(TARGETS oomph_libfabric - EXPORT oomph-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) - - # --------------------------------------------------------------------- - # Function to add config defines to a list that depends on a namespace variable - # 
#defines that match the namespace can later be written out to a file - # --------------------------------------------------------------------- - function(oomph_libfabric_add_config_define_namespace) - set(options) - set(one_value_args DEFINE NAMESPACE) - set(multi_value_args VALUE) - cmake_parse_arguments(OPTION - "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) - - set(DEF_VAR OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE}) - - # to avoid extra trailing spaces (no value), use an if check - if(OPTION_VALUE) - set_property(GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE} ${OPTION_VALUE}") - else() - set_property(GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE}") - endif() - - endfunction() - - # --------------------------------------------------------------------- - # Function to write out all the config defines for a given namespace - # into a config file - # --------------------------------------------------------------------- - function(oomph_libfabric_write_config_defines_file) - set(options) - set(one_value_args TEMPLATE NAMESPACE FILENAME) - set(multi_value_args) - cmake_parse_arguments(OPTION - "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) - - get_property(DEFINITIONS_VAR GLOBAL PROPERTY - OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE}) - - if(DEFINED DEFINITIONS_VAR) - list(SORT DEFINITIONS_VAR) - list(REMOVE_DUPLICATES DEFINITIONS_VAR) - endif() - - set(oomph_config_defines "\n") - foreach(def ${DEFINITIONS_VAR}) - set(oomph_config_defines "${oomph_config_defines}#define ${def}\n") - endforeach() - - # if the user has not specified a template, generate a proper header file - if (NOT OPTION_TEMPLATE) - string(TOUPPER ${OPTION_NAMESPACE} NAMESPACE_UPPER) - set(PREAMBLE - "\n" - "// Do not edit this file! 
It has been generated by the cmake configuration step.\n" - "\n" - "#ifndef OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n" - "#define OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n" - ) - set(TEMP_FILENAME "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${NAMESPACE_UPPER}") - file(WRITE ${TEMP_FILENAME} - ${PREAMBLE} - ${oomph_config_defines} - "#endif\n" - ) - configure_file("${TEMP_FILENAME}" "${OPTION_FILENAME}" COPYONLY) - file(REMOVE "${TEMP_FILENAME}") - else() - configure_file("${OPTION_TEMPLATE}" - "${OPTION_FILENAME}" - @ONLY) - endif() - endfunction() - - include(CMakeParseArguments) - - #------------------------------------------------------------------------------ - # Hardware device selection - #------------------------------------------------------------------------------ - set(OOMPH_LIBFABRIC_PROVIDER "tcp" CACHE - STRING "The provider (cxi(Cray Slingshot)/efa(Amazon Elastic)/gni(Cray Gemini)/psm2(Intel Omni-Path)/tcp/verbs(Infiniband))") - set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS - "cxi" "efa" "gni" "psm2" "tcp" "verbs") +if(OOMPH_WITH_LIBFABRIC) + find_package(Libfabric REQUIRED) + add_library(oomph_libfabric SHARED) + add_library(oomph::libfabric ALIAS oomph_libfabric) + oomph_shared_lib_options(oomph_libfabric) + target_link_libraries(oomph_libfabric PUBLIC libfabric::libfabric) + install(TARGETS oomph_libfabric EXPORT oomph-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + # --------------------------------------------------------------------- + # Function to add config defines to a list that depends on a namespace + # variable #defines that match the namespace can later be written out to a + # file + # --------------------------------------------------------------------- + function(oomph_libfabric_add_config_define_namespace) + set(options) + set(one_value_args DEFINE NAMESPACE) + set(multi_value_args VALUE) + cmake_parse_arguments( + OPTION "${options}" 
"${one_value_args}" "${multi_value_args}" ${ARGN} + ) - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_PROVIDER - VALUE "\"${OOMPH_LIBFABRIC_PROVIDER}\"" - NAMESPACE libfabric) - - option(OOMPH_LIBFABRIC_V1_API "Support older libfabric@1.15" OFF) - if (OOMPH_LIBFABRIC_V1_API) - oomph_libfabric_add_config_define_namespace( - DEFINE OOMPH_LIBFABRIC_V1_API - NAMESPACE libfabric) - endif() - - if(OOMPH_LIBFABRIC_PROVIDER MATCHES "verbs") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_VERBS - NAMESPACE libfabric) - elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "gni") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_GNI - NAMESPACE libfabric) - # add pmi library - set(_libfabric_libraries ${_libfabric_libraries} PMIx::libpmix) - elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "cxi") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_CXI - NAMESPACE libfabric) - elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "efa") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_EFA - NAMESPACE libfabric) - elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "tcp") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_TCP - NAMESPACE libfabric) - elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "sockets") - message(WARNING "The Sockets provider is deprecated in favor of the tcp, udp, " - "and utility providers") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_SOCKETS - NAMESPACE libfabric) - elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "psm2") - oomph_libfabric_add_config_define_namespace( - DEFINE HAVE_LIBFABRIC_PSM2 - NAMESPACE libfabric) - endif() + set(DEF_VAR OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE}) - #------------------------------------------------------------------------------ - # Performance counters - #------------------------------------------------------------------------------ - set(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS OFF BOOL - STRING "Enable libfabric 
parcelport performance counters (default: OFF)") - mark_as_advanced(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS) - - if (OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS) - oomph_libfabric_add_config_define_namespace( - DEFINE OOMPH_LIBFABRIC_HAVE_PERFORMANCE_COUNTERS - NAMESPACE libfabric) + # to avoid extra trailing spaces (no value), use an if check + if(OPTION_VALUE) + set_property( + GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE} ${OPTION_VALUE}" + ) + else() + set_property(GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE}") endif() - #------------------------------------------------------------------------------ - # used by template expansion for location of print.hpp - #------------------------------------------------------------------------------ - set(OOMPH_SRC_LIBFABRIC_DIR "${PROJECT_SOURCE_DIR}/src/libfabric") - - #------------------------------------------------------------------------------ - # Write options to file in build dir - #------------------------------------------------------------------------------ - oomph_libfabric_write_config_defines_file( - NAMESPACE libfabric - FILENAME "${PROJECT_BINARY_DIR}/src/libfabric/oomph_libfabric_defines.hpp" - TEMPLATE "${OOMPH_SRC_LIBFABRIC_DIR}/libfabric_defines_template.hpp" + endfunction() + + # --------------------------------------------------------------------- + # Function to write out all the config defines for a given namespace into a + # config file + # --------------------------------------------------------------------- + function(oomph_libfabric_write_config_defines_file) + set(options) + set(one_value_args TEMPLATE NAMESPACE FILENAME) + set(multi_value_args) + cmake_parse_arguments( + OPTION "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN} ) - target_include_directories(oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}/src/libfabric") -endif() + get_property( + DEFINITIONS_VAR GLOBAL + PROPERTY OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE} + ) + + if(DEFINED DEFINITIONS_VAR) + 
list(SORT DEFINITIONS_VAR) + list(REMOVE_DUPLICATES DEFINITIONS_VAR) + endif() + set(oomph_config_defines "\n") + foreach(def ${DEFINITIONS_VAR}) + set(oomph_config_defines "${oomph_config_defines}#define ${def}\n") + endforeach() + # if the user has not specified a template, generate a proper header file + if(NOT OPTION_TEMPLATE) + string(TOUPPER ${OPTION_NAMESPACE} NAMESPACE_UPPER) + set(PREAMBLE + "\n" + "// Do not edit this file! It has been generated by the cmake configuration step.\n" + "\n" + "#ifndef OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n" + "#define OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n" + ) + set(TEMP_FILENAME + "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${NAMESPACE_UPPER}" + ) + file(WRITE ${TEMP_FILENAME} ${PREAMBLE} ${oomph_config_defines} + "#endif\n" + ) + configure_file("${TEMP_FILENAME}" "${OPTION_FILENAME}" COPYONLY) + file(REMOVE "${TEMP_FILENAME}") + else() + configure_file("${OPTION_TEMPLATE}" "${OPTION_FILENAME}" @ONLY) + endif() + endfunction() + + include(CMakeParseArguments) + + # ------------------------------------------------------------------------------ + # Hardware device selection + # ------------------------------------------------------------------------------ + set(OOMPH_LIBFABRIC_PROVIDER + "tcp" + CACHE + STRING + "The provider cxi(Cray Slingshot)/efa(Amazon Elastic)/gni(Cray Gemini)/psm2(Intel Omni-Path)/tcp/verbs(Infiniband), shm, lnx" + ) + set_property( + CACHE OOMPH_LIBFABRIC_PROVIDER + PROPERTY STRINGS + "cxi" + "efa" + "gni" + "psm2" + "tcp" + "verbs" + "shm" + "lnx" + ) + + oomph_libfabric_add_config_define_namespace( + DEFINE HAVE_LIBFABRIC_PROVIDER VALUE "\"${OOMPH_LIBFABRIC_PROVIDER}\"" + NAMESPACE libfabric + ) + + option(OOMPH_LIBFABRIC_V1_API "Support older libfabric@1.15" OFF) + if(OOMPH_LIBFABRIC_V1_API) + oomph_libfabric_add_config_define_namespace( + DEFINE OOMPH_LIBFABRIC_V1_API NAMESPACE libfabric + ) + endif() + + # Map provider string to uppercase and create a define + string(TOUPPER 
"${OOMPH_LIBFABRIC_PROVIDER}" PROVIDER_UPPER) + oomph_libfabric_add_config_define_namespace( + DEFINE "HAVE_LIBFABRIC_${PROVIDER_UPPER}" NAMESPACE libfabric + ) + + # Special handling for deprecated or extra cases + if(OOMPH_LIBFABRIC_PROVIDER STREQUAL "sockets") + message( + WARNING + "The + Sockets + provider + is + deprecated + in + favor + of + the + tcp, + udp, + and + utility + providers" + ) + endif() + + # Special handling for gni provider needing PMIx + if(OOMPH_LIBFABRIC_PROVIDER STREQUAL "gni") + set(_libfabric_libraries ${_libfabric_libraries} PMIx::libpmix) + endif() + + # ------------------------------------------------------------------------------ + # Performance counters + # ------------------------------------------------------------------------------ + set(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS + OFF + CACHE + BOOL + "Enable libfabric performance counters (default: OFF)" + ) + mark_as_advanced(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS) + + if(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS) + oomph_libfabric_add_config_define_namespace( + DEFINE OOMPH_LIBFABRIC_HAVE_PERFORMANCE_COUNTERS NAMESPACE libfabric + ) + endif() + + # ------------------------------------------------------------------------------ + # used by template expansion for location of print.hpp + # ------------------------------------------------------------------------------ + set(OOMPH_SRC_LIBFABRIC_DIR "${PROJECT_SOURCE_DIR}/src/libfabric") + + # ------------------------------------------------------------------------------ + # Write options to file in build dir + # ------------------------------------------------------------------------------ + oomph_libfabric_write_config_defines_file( + NAMESPACE libfabric FILENAME + "${PROJECT_BINARY_DIR}/src/libfabric/oomph_libfabric_defines.hpp" TEMPLATE + "${OOMPH_SRC_LIBFABRIC_DIR}/libfabric_defines_template.hpp" + ) + target_include_directories( + oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}/src/libfabric" + ) +endif() diff --git 
a/include/oomph/detail/communicator_helper.hpp b/include/oomph/detail/communicator_helper.hpp index 6e0e97d5..8335c6eb 100644 --- a/include/oomph/detail/communicator_helper.hpp +++ b/include/oomph/detail/communicator_helper.hpp @@ -11,6 +11,7 @@ #include #include +#include #include #include //#include @@ -33,7 +34,7 @@ #define OOMPH_CHECK_CALLBACK_MSG_REF \ static_assert(std::is_same&>::value || \ - std::is_same const&>::value, \ + std::is_same const&>::value, \ "first callback argument type is not an l-value reference to a message_buffer"); #define OOMPH_CHECK_CALLBACK_MSG_CONST_REF \ @@ -41,129 +42,107 @@ "first callback argument type is not a const l-value reference to a message_buffer"); #define OOMPH_CHECK_CALLBACK(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) \ - OOMPH_CHECK_CALLBACK_MSG \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) OOMPH_CHECK_CALLBACK_MSG} #define OOMPH_CHECK_CALLBACK_MULTI(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, tag_type) \ - OOMPH_CHECK_CALLBACK_MSG \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, tag_type) OOMPH_CHECK_CALLBACK_MSG} #define OOMPH_CHECK_CALLBACK_MULTI_TAGS(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, std::vector) \ - OOMPH_CHECK_CALLBACK_MSG \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, std::vector) \ + OOMPH_CHECK_CALLBACK_MSG} #define OOMPH_CHECK_CALLBACK_REF(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) \ - OOMPH_CHECK_CALLBACK_MSG_REF \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) OOMPH_CHECK_CALLBACK_MSG_REF} #define OOMPH_CHECK_CALLBACK_MULTI_REF(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, tag_type) \ - OOMPH_CHECK_CALLBACK_MSG_REF \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, tag_type) \ + OOMPH_CHECK_CALLBACK_MSG_REF} #define OOMPH_CHECK_CALLBACK_MULTI_REF_TAGS(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, 
std::vector) \ - OOMPH_CHECK_CALLBACK_MSG_REF \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, std::vector) \ + OOMPH_CHECK_CALLBACK_MSG_REF} #define OOMPH_CHECK_CALLBACK_CONST_REF(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) \ - OOMPH_CHECK_CALLBACK_MSG_CONST_REF \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) OOMPH_CHECK_CALLBACK_MSG_CONST_REF} #define OOMPH_CHECK_CALLBACK_MULTI_CONST_REF(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, tag_type) \ - OOMPH_CHECK_CALLBACK_MSG_CONST_REF \ - } + {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, tag_type) \ + OOMPH_CHECK_CALLBACK_MSG_CONST_REF} #define OOMPH_CHECK_CALLBACK_MULTI_CONST_REF_TAGS(CALLBACK) \ - { \ - OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, std::vector) \ - OOMPH_CHECK_CALLBACK_MSG_CONST_REF \ - } - -namespace oomph -{ -class communicator_impl; - -namespace detail -{ -struct communicator_state -{ - using impl_type = communicator_impl; - impl_type* m_impl; - std::atomic* m_shared_scheduled_recvs; - util::pool_factory m_mrs_factory; - std::size_t scheduled_sends = 0; - std::size_t scheduled_recvs = 0; - - communicator_state(impl_type* impl_, std::atomic* shared_scheduled_recvs); - ~communicator_state(); - communicator_state(communicator_state const&) = delete; - communicator_state(communicator_state&&) = delete; - communicator_state& operator=(communicator_state const&) = delete; - communicator_state& operator=(communicator_state&&) = delete; - - auto make_multi_request_state(std::size_t ns) { return m_mrs_factory.make(m_impl, ns); } - - template - auto make_multi_request_state(std::vector&& neighs, - oomph::message_buffer const& msg) - { - return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::vector{}, - msg.size(), &msg); - } - - template - auto make_multi_request_state(std::vector&& neighs, std::vector&& tags, - oomph::message_buffer const& msg) - { - return m_mrs_factory.make(m_impl, neighs.size(), 
std::move(neighs), std::move(tags), - msg.size(), &msg); - } - - template - auto make_multi_request_state(std::vector&& neighs, oomph::message_buffer& msg) - { - return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::vector{}, - msg.size(), &msg); - } - - template - auto make_multi_request_state(std::vector&& neighs, std::vector&& tags, - oomph::message_buffer& msg) - { - return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags), - msg.size(), &msg); - } - - template - auto make_multi_request_state(std::vector&& neighs, oomph::message_buffer&& msg) - { - return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::vector{}, - msg.size(), nullptr, std::move(msg.m)); - } - - template - auto make_multi_request_state(std::vector&& neighs, std::vector&& tags, - oomph::message_buffer&& msg) - { - return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags), - msg.size(), nullptr, std::move(msg.m)); - } -}; - -} // namespace detail -} // namespace oomph + {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector, std::vector) \ + OOMPH_CHECK_CALLBACK_MSG_CONST_REF} + +namespace oomph { + class communicator_impl; + + namespace detail { + struct communicator_state + { + using impl_type = communicator_impl; + impl_type* m_impl; + std::atomic* m_shared_scheduled_recvs; + util::pool_factory m_mrs_factory; + std::size_t scheduled_sends = 0; + std::size_t scheduled_recvs = 0; + + communicator_state(impl_type* impl_, std::atomic* shared_scheduled_recvs); + ~communicator_state(); + communicator_state(communicator_state const&) = delete; + communicator_state(communicator_state&&) = delete; + communicator_state& operator=(communicator_state const&) = delete; + communicator_state& operator=(communicator_state&&) = delete; + + auto make_multi_request_state(std::size_t ns) { return m_mrs_factory.make(m_impl, ns); } + + template + auto make_multi_request_state( + std::vector&& neighs, oomph::message_buffer const& msg) + 
{ + return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), + std::vector{}, msg.size(), &msg); + } + + template + auto make_multi_request_state(std::vector&& neighs, + std::vector&& tags, oomph::message_buffer const& msg) + { + return m_mrs_factory.make( + m_impl, neighs.size(), std::move(neighs), std::move(tags), msg.size(), &msg); + } + + template + auto + make_multi_request_state(std::vector&& neighs, oomph::message_buffer& msg) + { + return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), + std::vector{}, msg.size(), &msg); + } + + template + auto make_multi_request_state(std::vector&& neighs, + std::vector&& tags, oomph::message_buffer& msg) + { + return m_mrs_factory.make( + m_impl, neighs.size(), std::move(neighs), std::move(tags), msg.size(), &msg); + } + + template + auto make_multi_request_state( + std::vector&& neighs, oomph::message_buffer&& msg) + { + return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), + std::vector{}, msg.size(), nullptr, std::move(msg.m)); + } + + template + auto make_multi_request_state(std::vector&& neighs, + std::vector&& tags, oomph::message_buffer&& msg) + { + return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags), + msg.size(), nullptr, std::move(msg.m)); + } + }; + + } // namespace detail +} // namespace oomph diff --git a/src/libfabric/CMakeLists.txt b/src/libfabric/CMakeLists.txt index c82e387d..92128897 100644 --- a/src/libfabric/CMakeLists.txt +++ b/src/libfabric/CMakeLists.txt @@ -19,4 +19,24 @@ list(TRANSFORM oomph_sources PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/../ target_sources(oomph_libfabric PRIVATE ${oomph_sources_libfabric}) target_sources(oomph_libfabric PRIVATE context.cpp) target_sources(oomph_libfabric PRIVATE operation_context.cpp) -target_sources(oomph_libfabric PRIVATE locality.cpp) + +# if we are using GPU, then the libfabric library was probably built with +# gpu support, and we should link to cuda to prevent link errors +if 
(HWMALLOC_ENABLE_DEVICE) + include(CheckLanguage) + check_language(CUDA) + + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "No CUDA support") + return() + endif() + + find_package(CUDAToolkit) + target_link_libraries(oomph_libfabric PRIVATE CUDA::cudart) +endif() + +add_executable(check_libfabric test/check_libfabric.cpp) +target_link_libraries(check_libfabric PUBLIC oomph_libfabric) +target_include_directories(check_libfabric PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/libfabric/communicator.hpp b/src/libfabric/communicator.hpp index ff8fc945..6bec497b 100644 --- a/src/libfabric/communicator.hpp +++ b/src/libfabric/communicator.hpp @@ -14,131 +14,133 @@ #include -#include #include +#include // paths relative to backend #include <../communicator_base.hpp> #include <../device_guard.hpp> +#include +#include #include #include -#include -#include - -namespace oomph -{ -using operation_context = libfabric::operation_context; +namespace oomph { -using tag_disp = NS_DEBUG::detail::hex<12, uintptr_t>; + using operation_context = libfabric::operation_context; -template -inline /*constexpr*/ NS_DEBUG::print_threshold com_deb("COMMUNI"); + using tag_disp = NS_DEBUG::detail::hex<12, uintptr_t>; -static NS_DEBUG::enable_print com_err("COMMUNI"); + template + inline NS_DEBUG::print_threshold com_deb("COMMUNI"); -class communicator_impl : public communicator_base -{ - using tag_type = std::uint64_t; - // - using segment_type = libfabric::memory_segment; - using region_type = segment_type::handle_type; + static NS_DEBUG::enable_print com_err("COMMUNI"); - using callback_queue = boost::lockfree::queue, boost::lockfree::allocator>>; - - public: - context_impl* m_context; - libfabric::endpoint_wrapper m_tx_endpoint; - libfabric::endpoint_wrapper m_rx_endpoint; - // - callback_queue m_send_cb_queue; - callback_queue m_recv_cb_queue; - callback_queue m_recv_cb_cancel; - - // -------------------------------------------------------------------- - 
communicator_impl(context_impl* ctxt) - : communicator_base(ctxt) - , m_context(ctxt) - , m_send_cb_queue(128) - , m_recv_cb_queue(128) - , m_recv_cb_cancel(8) + class communicator_impl : public communicator_base { - LF_DEB(com_deb<9>, debug(NS_DEBUG::str<>("MPI_comm"), NS_DEBUG::ptr(mpi_comm()))); - m_tx_endpoint = m_context->get_controller()->get_tx_endpoint(); - m_rx_endpoint = m_context->get_controller()->get_rx_endpoint(); - } + using tag_type = std::uint64_t; + // + using segment_type = libfabric::memory_segment; + using region_type = segment_type::handle_type; + + using callback_queue = boost::lockfree::queue, boost::lockfree::allocator>>; + + public: + context_impl* m_context; + libfabric::endpoint_wrapper m_tx_endpoint; + libfabric::endpoint_wrapper m_rx_endpoint; + // + callback_queue m_send_cb_queue; + callback_queue m_recv_cb_queue; + callback_queue m_recv_cb_cancel; + + // -------------------------------------------------------------------- + communicator_impl(context_impl* ctxt) + : communicator_base(ctxt) + , m_context(ctxt) + , m_send_cb_queue(128) + , m_recv_cb_queue(128) + , m_recv_cb_cancel(8) + { + LF_DEB(com_deb<9>, debug(str<>("MPI_comm"), hptr(mpi_comm()))); + m_tx_endpoint = m_context->get_controller()->get_tx_endpoint(); + m_rx_endpoint = m_context->get_controller()->get_rx_endpoint(); + } - // -------------------------------------------------------------------- - ~communicator_impl() { clear_callback_queues(); } + // -------------------------------------------------------------------- + ~communicator_impl() { clear_callback_queues(); } - // -------------------------------------------------------------------- - auto& get_heap() noexcept { return m_context->get_heap(); } + // -------------------------------------------------------------------- + auto& get_heap() noexcept { return m_context->get_heap(); } - // -------------------------------------------------------------------- - /// generate a tag with 0xRRRRRRRRtttttttt rank, tag. 
- /// original tag can be 32bits, then we add 32bits of rank info. - inline std::uint64_t make_tag64(std::uint32_t tag, /*std::uint32_t rank, */ std::uintptr_t ctxt) - { - return (((ctxt & 0x0000000000FFFFFF) << 24) | ((std::uint64_t(tag) & 0x0000000000FFFFFF))); - } + // -------------------------------------------------------------------- + /// generate a tag with 0xRRRRRRRRtttttttt rank, tag. + /// original tag can be 32bits, then we add 32bits of rank info. + inline std::uint64_t make_tag64( + std::uint32_t tag, /*std::uint32_t rank, */ std::uintptr_t ctxt) + { + return (((ctxt & 0x0000'0000'00FF'FFFF) << 24) | + ((std::uint64_t(tag) & 0x0000'0000'00FF'FFFF))); + } - // -------------------------------------------------------------------- - template - inline void execute_fi_function(Func F, const char* msg, Args&&... args) - { - bool ok = false; - while (!ok) + // -------------------------------------------------------------------- + template + inline void execute_fi_function(Func F, char const* msg, Args&&... 
args) { - ssize_t ret = F(std::forward(args)...); - if (ret == 0) { return; } - else if (ret == -FI_EAGAIN) - { - // com_deb<9>.error("Reposting", msg); - // no point stressing the system - m_context->get_controller()->poll_for_work_completions(this); - } - else if (ret == -FI_ENOENT) + bool ok = false; + while (!ok) { - // if a node has failed, we can recover - // @TODO : put something better here - com_err.error("No destination endpoint, terminating."); - std::terminate(); + ssize_t ret = F(std::forward(args)...); + if (ret == 0) { return; } + else if (ret == -FI_EAGAIN) + { + // com_deb<9>.error("Reposting", msg); + // no point stressing the system + m_context->get_controller()->poll_for_work_completions(this); + } + else if (ret == -FI_ENOENT) + { + // if a node has failed, we can recover + // @TODO : put something better here + com_err.error("No destination endpoint, terminating."); + std::terminate(); + } + else if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), msg); } } - else if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), msg); } } - } - // -------------------------------------------------------------------- - // this takes a pinned memory region and sends it - void send_tagged_region(region_type const& send_region, std::size_t size, fi_addr_t dst_addr_, - uint64_t tag_, operation_context* ctxt) - { - [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - // clang-format off + // -------------------------------------------------------------------- + // this takes a pinned memory region and sends it + void send_tagged_region(region_type const& send_region, std::size_t size, + fi_addr_t dst_addr_, uint64_t tag_, operation_context* ctxt) + { + [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + // clang-format off LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("send_tagged_region"), - "->", NS_DEBUG::dec<2>(dst_addr_), + debug(str<>("send_tagged_region"), + "->", dec<2>(dst_addr_), send_region, "tag", 
tag_disp(tag_), - "context", NS_DEBUG::ptr(ctxt), - "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep()))); - // clang-format on - execute_fi_function(fi_tsend, "fi_tsend", m_tx_endpoint.get_ep(), send_region.get_address(), - size, send_region.get_local_key(), dst_addr_, tag_, ctxt); - } + "context", hptr(ctxt), + "tx endpoint", hptr(m_tx_endpoint.get_ep()))); + // clang-format on + execute_fi_function(fi_tsend, "fi_tsend", m_tx_endpoint.get_ep(), + send_region.get_address(), size, send_region.get_local_key(), dst_addr_, tag_, + ctxt); + } - // -------------------------------------------------------------------- - // this takes a pinned memory region and sends it using inject instead of send - void inject_tagged_region(region_type const& send_region, std::size_t size, fi_addr_t dst_addr_, - uint64_t tag_) - { - [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - // clang-format on - LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("inject tagged"), "->", NS_DEBUG::dec<2>(dst_addr_), send_region, - "tag", tag_disp(tag_), "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep()))); - // clang-format off + // -------------------------------------------------------------------- + // this takes a pinned memory region and sends it using inject instead of send + void inject_tagged_region( + region_type const& send_region, std::size_t size, fi_addr_t dst_addr_, uint64_t tag_) + { + [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + // clang-format on + LF_DEB(com_deb<9>, + debug(str<>("inject tagged"), "->", dec<2>(dst_addr_), send_region, "tag", + tag_disp(tag_), "tx endpoint", hptr(m_tx_endpoint.get_ep()))); + // clang-format off execute_fi_function(fi_tinject, "fi_tinject", m_tx_endpoint.get_ep(), send_region.get_address(), size, dst_addr_, tag_); } @@ -150,285 +152,283 @@ class communicator_impl : public communicator_base void recv_tagged_region(region_type const& recv_region, std::size_t size, fi_addr_t src_addr_, uint64_t 
tag_, operation_context* ctxt) { - [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__); + [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__); // clang-format off LF_DEB(com_deb<1>, - debug(NS_DEBUG::str<>("recv_tagged_region"), - "<-", NS_DEBUG::dec<2>(src_addr_), + debug(str<>("recv_tagged_region"), + "<-", dec<2>(src_addr_), recv_region, "tag", tag_disp(tag_), - "context", NS_DEBUG::ptr(ctxt), - "rx endpoint", NS_DEBUG::ptr(m_rx_endpoint.get_ep()))); - // clang-format on - constexpr uint64_t ignore = 0; - execute_fi_function(fi_trecv, "fi_trecv", m_rx_endpoint.get_ep(), recv_region.get_address(), - size, recv_region.get_local_key(), src_addr_, tag_, ignore, ctxt); - // if (l.owns_lock()) l.unlock(); - } + "context", hptr(ctxt), + "rx endpoint", hptr(m_rx_endpoint.get_ep()))); + // clang-format on + constexpr uint64_t ignore = 0; + execute_fi_function(fi_trecv, "fi_trecv", m_rx_endpoint.get_ep(), + recv_region.get_address(), size, recv_region.get_local_key(), src_addr_, tag_, + ignore, ctxt); + // if (l.owns_lock()) l.unlock(); + } - // -------------------------------------------------------------------- - send_request send(context_impl::heap_type::pointer const& ptr, std::size_t size, rank_type dst, - oomph::tag_type tag, util::unique_function&& cb, - std::size_t* scheduled) - { - [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - std::uint64_t stag = make_tag64(tag, /*this->rank(), */ this->m_context->get_context_tag()); + // -------------------------------------------------------------------- + send_request send(context_impl::heap_type::pointer const& ptr, std::size_t size, + rank_type dst, oomph::tag_type tag, + util::unique_function&& cb, std::size_t* scheduled) + { + [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + std::uint64_t stag = + make_tag64(tag, /*this->rank(), */ this->m_context->get_context_tag()); #if OOMPH_ENABLE_DEVICE - auto const& reg = 
ptr.on_device() ? ptr.device_handle() : ptr.handle(); + auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle(); #else - auto const& reg = ptr.handle(); + auto const& reg = ptr.handle(); #endif #ifdef EXTRA_SIZE_CHECKS - if (size != reg.get_size()) - { - LF_DEB(com_err, error(NS_DEBUG::str<>("send mismatch"), "size", NS_DEBUG::hex<6>(size), - "reg size", NS_DEBUG::hex<6>(reg.get_size()))); - } -#endif - m_context->get_controller()->sends_posted_++; - - // use optimized inject if msg is very small - if (size <= m_context->get_controller()->get_tx_inject_size()) - { - inject_tagged_region(reg, size, fi_addr_t(dst), stag); - if (!has_reached_recursion_depth()) + if (size != reg.get_size()) { - auto inc = recursion(); - cb(dst, tag); - return {}; + LF_DEB(com_err, + error(str<>("send mismatch"), "size", hex<6>(size), "reg size", + hex<6>(reg.get_size()))); } - else +#endif + m_context->get_controller()->sends_posted_++; + + // use optimized inject if msg is very small + if (size <= m_context->get_controller()->get_tx_inject_size()) { - // construct request which is also an operation context - auto s = - m_req_state_factory.make(m_context, this, scheduled, dst, tag, std::move(cb)); - s->create_self_ref(); - while (!m_send_cb_queue.push(s.get())) {} - return {std::move(s)}; + inject_tagged_region(reg, size, fi_addr_t(dst), stag); + if (!has_reached_recursion_depth()) + { + auto inc = recursion(); + cb(dst, tag); + return {}; + } + else + { + // construct request which is also an operation context + auto s = m_req_state_factory.make( + m_context, this, scheduled, dst, tag, std::move(cb)); + s->create_self_ref(); + while (!m_send_cb_queue.push(s.get())) {} + return {std::move(s)}; + } } - } - // construct request which is also an operation context - auto s = m_req_state_factory.make(m_context, this, scheduled, dst, tag, std::move(cb)); - s->create_self_ref(); + // construct request which is also an operation context + auto s = 
m_req_state_factory.make(m_context, this, scheduled, dst, tag, std::move(cb)); + s->create_self_ref(); - // clang-format off + // clang-format off LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("Send"), - "thisrank", NS_DEBUG::dec<>(rank()), - "rank", NS_DEBUG::dec<>(dst), + debug(str<>("Send"), + "thisrank", dec<>(rank()), + "rank", dec<>(dst), "tag", tag_disp(std::uint64_t(tag)), //"wrapped tag", tag_disp(std::uint64_t(tag.get())), "stag", tag_disp(stag), - "addr", NS_DEBUG::ptr(reg.get_address()), - "size", NS_DEBUG::hex<6>(size), - "reg size", NS_DEBUG::hex<6>(reg.get_size()), - "op_ctx", NS_DEBUG::ptr(&(s->m_operation_context)), - "req", NS_DEBUG::ptr(s.get()))); + "addr", hptr(reg.get_address()), + "size", hex<6>(size), + "reg size", hex<6>(reg.get_size()), + "op_ctx", hptr(&(s->m_operation_context)), + "req", hptr(s.get()))); #if OOMPH_ENABLE_DEVICE if (!ptr.on_device()) { LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("send region CRC32"), - NS_DEBUG::mem_crc32(reg.get_address(), size, "CRC32"))); + debug(str<>("send region CRC32"), + mem_crc32(reg.get_address(), size, "CRC32"))); } #endif - // clang-format on + // clang-format on - send_tagged_region(reg, size, fi_addr_t(dst), stag, &(s->m_operation_context)); - return {std::move(s)}; - } + send_tagged_region(reg, size, fi_addr_t(dst), stag, &(s->m_operation_context)); + return {std::move(s)}; + } - recv_request recv(context_impl::heap_type::pointer& ptr, std::size_t size, rank_type src, - oomph::tag_type tag, util::unique_function&& cb, - std::size_t* scheduled) - { - [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - std::uint64_t stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag()); + recv_request recv(context_impl::heap_type::pointer& ptr, std::size_t size, rank_type src, + oomph::tag_type tag, util::unique_function&& cb, + std::size_t* scheduled) + { + [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + std::uint64_t stag = 
make_tag64(tag, /*src, */ this->m_context->get_context_tag()); #if OOMPH_ENABLE_DEVICE - auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle(); + auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle(); #else - auto const& reg = ptr.handle(); + auto const& reg = ptr.handle(); #endif #ifdef EXTRA_SIZE_CHECKS - if (size != reg.get_size()) - { - LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), - "reg size", NS_DEBUG::hex<6>(reg.get_size()))); - } + if (size != reg.get_size()) + { + LF_DEB(com_err, + error(str<>("recv mismatch"), "size", hex<6>(size), "reg size", + hex<6>(reg.get_size()))); + } #endif - m_context->get_controller()->recvs_posted_++; + m_context->get_controller()->recvs_posted_++; - // construct request which is also an operation context - auto s = m_req_state_factory.make(m_context, this, scheduled, src, tag, std::move(cb)); - s->create_self_ref(); + // construct request which is also an operation context + auto s = m_req_state_factory.make(m_context, this, scheduled, src, tag, std::move(cb)); + s->create_self_ref(); - // clang-format off + // clang-format off LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("recv"), - "thisrank", NS_DEBUG::dec<>(rank()), - "rank", NS_DEBUG::dec<>(src), + debug(str<>("recv"), + "thisrank", dec<>(rank()), + "rank", dec<>(src), "tag", tag_disp(std::uint64_t(tag)), //"wrapped tag", tag_disp(std::uint64_t(tag.get())), "stag", tag_disp(stag), - "addr", NS_DEBUG::ptr(reg.get_address()), - "size", NS_DEBUG::hex<6>(size), - "reg size", NS_DEBUG::hex<6>(reg.get_size()), - "op_ctx", NS_DEBUG::ptr(&(s->m_operation_context)), - "req", NS_DEBUG::ptr(s.get()))); + "addr", hptr(reg.get_address()), + "size", hex<6>(size), + "reg size", hex<6>(reg.get_size()), + "op_ctx", hptr(&(s->m_operation_context)), + "req", hptr(s.get()))); #if OOMPH_ENABLE_DEVICE if (!ptr.on_device()) { LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("recv region CRC32"), - 
NS_DEBUG::mem_crc32(reg.get_address(), size, "CRC32"))); + debug(str<>("recv region CRC32"), + mem_crc32(reg.get_address(), size, "CRC32"))); } #endif - // clang-format on + // clang-format on - recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context)); - return {std::move(s)}; - } + recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context)); + return {std::move(s)}; + } - shared_recv_request shared_recv(context_impl::heap_type::pointer& ptr, std::size_t size, - rank_type src, oomph::tag_type tag, - util::unique_function&& cb, - std::atomic* scheduled) - { - [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - std::uint64_t stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag()); + shared_recv_request shared_recv(context_impl::heap_type::pointer& ptr, std::size_t size, + rank_type src, oomph::tag_type tag, + util::unique_function&& cb, + std::atomic* scheduled) + { + [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + std::uint64_t stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag()); #if OOMPH_ENABLE_DEVICE - auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle(); + auto const& reg = ptr.on_device() ? 
ptr.device_handle() : ptr.handle(); #else - auto const& reg = ptr.handle(); + auto const& reg = ptr.handle(); #endif #ifdef EXTRA_SIZE_CHECKS - if (size != reg.get_size()) - { - LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), - "reg size", NS_DEBUG::hex<6>(reg.get_size()))); - } + if (size != reg.get_size()) + { + LF_DEB(com_err, + error(str<>("recv mismatch"), "size", hex<6>(size), "reg size", + hex<6>(reg.get_size()))); + } #endif - m_context->get_controller()->recvs_posted_++; + m_context->get_controller()->recvs_posted_++; - // construct request which is also an operation context - auto s = std::make_shared(m_context, this, scheduled, src, - tag, std::move(cb)); - s->create_self_ref(); + // construct request which is also an operation context + auto s = std::make_shared( + m_context, this, scheduled, src, tag, std::move(cb)); + s->create_self_ref(); - // clang-format off + // clang-format off LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("shared_recv"), - "thisrank", NS_DEBUG::dec<>(rank()), - "rank", NS_DEBUG::dec<>(src), + debug(str<>("shared_recv"), + "thisrank", dec<>(rank()), + "rank", dec<>(src), "tag", tag_disp(std::uint64_t(tag)), //"wrapped tag", tag_disp(std::uint64_t(tag.get())), "stag", tag_disp(stag), - "addr", NS_DEBUG::ptr(reg.get_address()), - "size", NS_DEBUG::hex<6>(size), - "reg size", NS_DEBUG::hex<6>(reg.get_size()), - "op_ctx", NS_DEBUG::ptr(&(s->m_operation_context)), - "req", NS_DEBUG::ptr(s.get()))); - // clang-format on - - recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context)); - m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this); - return {std::move(s)}; - } + "addr", hptr(reg.get_address()), + "size", hex<6>(size), + "reg size", hex<6>(reg.get_size()), + "op_ctx", hptr(&(s->m_operation_context)), + "req", hptr(s.get()))); + // clang-format on + + recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context)); + 
m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this); + return {std::move(s)}; + } - void progress() - { - m_context->get_controller()->poll_for_work_completions(this); - clear_callback_queues(); - } + void progress() + { + m_context->get_controller()->poll_for_work_completions(this); + clear_callback_queues(); + } - void clear_callback_queues() - { - // work through ready callbacks, which were pushed to the queue - // (by other threads) - m_send_cb_queue.consume_all( - [](oomph::detail::request_state* req) - { + void clear_callback_queues() + { + // work through ready callbacks, which were pushed to the queue + // (by other threads) + m_send_cb_queue.consume_all([](oomph::detail::request_state* req) { [[maybe_unused]] auto scp = - com_deb<9>.scope("m_send_cb_queue.consume_all", NS_DEBUG::ptr(req)); + com_deb<9>.scope("m_send_cb_queue.consume_all", NS_DEBUG::hptr(req)); auto ptr = req->release_self_ref(); req->invoke_cb(); }); - m_recv_cb_queue.consume_all( - [](oomph::detail::request_state* req) - { + m_recv_cb_queue.consume_all([](oomph::detail::request_state* req) { [[maybe_unused]] auto scp = - com_deb<9>.scope("m_recv_cb_queue.consume_all", NS_DEBUG::ptr(req)); + com_deb<9>.scope("m_recv_cb_queue.consume_all", NS_DEBUG::hptr(req)); auto ptr = req->release_self_ref(); req->invoke_cb(); }); - m_context->m_recv_cb_queue.consume_all( - [](detail::shared_request_state* req) - { + m_context->m_recv_cb_queue.consume_all([](detail::shared_request_state* req) { auto ptr = req->release_self_ref(); req->invoke_cb(); }); - } + } - // Cancel is a problem with libfabric because fi_cancel is asynchronous. - // The item to be cancelled will either complete with CANCELLED status - // or will complete as usual (ie before the cancel could take effect) - // - // We can only be certain if we poll until the completion happens - // or attach a callback to the cancel notification which is not supported - // by oomph. 
- bool cancel_recv(detail::request_state* s) - { - // get the original message operation context - operation_context* op_ctx = &(s->m_operation_context); + // Cancel is a problem with libfabric because fi_cancel is asynchronous. + // The item to be cancelled will either complete with CANCELLED status + // or will complete as usual (ie before the cancel could take effect) + // + // We can only be certain if we poll until the completion happens + // or attach a callback to the cancel notification which is not supported + // by oomph. + bool cancel_recv(detail::request_state* s) + { + // get the original message operation context + operation_context* op_ctx = &(s->m_operation_context); - // submit the cancellation request - bool ok = (fi_cancel(&m_rx_endpoint.get_ep()->fid, op_ctx) == 0); - LF_DEB(com_deb<9>, - debug(NS_DEBUG::str<>("Cancel"), "ok", ok, "op_ctx", NS_DEBUG::ptr(op_ctx))); + // submit the cancellation request + bool ok = (fi_cancel(&m_rx_endpoint.get_ep()->fid, op_ctx) == 0); + LF_DEB(com_deb<9>, debug(str<>("Cancel"), "ok", ok, "op_ctx", hptr(op_ctx))); - // if the cancel operation failed completely, return - if (!ok) return false; + // if the cancel operation failed completely, return + if (!ok) return false; - bool found = false; - while (!found) - { - m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this); - // otherwise, poll until we know if it worked - std::stack temp_stack; - detail::request_state* temp; - while (!found && m_recv_cb_cancel.pop(temp)) + bool found = false; + while (!found) { - if (temp == s) + m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this); + // otherwise, poll until we know if it worked + std::stack temp_stack; + detail::request_state* temp; + while (!found && m_recv_cb_cancel.pop(temp)) { - // our recv was cancelled correctly - found = true; - LF_DEB(com_deb<9>, debug(NS_DEBUG::str<>("Cancel"), "succeeded", "op_ctx", - NS_DEBUG::ptr(op_ctx))); - auto ptr = 
s->release_self_ref(); - s->set_canceled(); + if (temp == s) + { + // our recv was cancelled correctly + found = true; + LF_DEB(com_deb<9>, + debug(str<>("Cancel"), "succeeded", "op_ctx", hptr(op_ctx))); + auto ptr = s->release_self_ref(); + s->set_canceled(); + } + else + { + // a different cancel operation + temp_stack.push(temp); + } } - else + // return any weird unhandled cancels back to the queue + while (!temp_stack.empty()) { - // a different cancel operation - temp_stack.push(temp); + auto temp = temp_stack.top(); + temp_stack.pop(); + m_recv_cb_cancel.push(temp); } } - // return any weird unhandled cancels back to the queue - while (!temp_stack.empty()) - { - auto temp = temp_stack.top(); - temp_stack.pop(); - m_recv_cb_cancel.push(temp); - } + return found; } - return found; - } -}; + }; -} // namespace oomph +} // namespace oomph diff --git a/src/libfabric/context.cpp b/src/libfabric/context.cpp index 5621a83b..a1debfd7 100644 --- a/src/libfabric/context.cpp +++ b/src/libfabric/context.cpp @@ -10,88 +10,95 @@ #include // #include - -#include - // paths relative to backend -#include -#include #include #include +#include +#include -namespace oomph -{ -// cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print src_deb("__SRC__"); +namespace oomph { + // cppcheck-suppress ConfigurationNotChecked + static NS_DEBUG::enable_print src_deb("__SRC__"); -using controller_type = libfabric::controller; + using controller_type = libfabric::controller; -context_impl::context_impl(MPI_Comm comm, bool thread_safe, - hwmalloc::heap_config const& heap_config) -: context_base(comm, thread_safe) -, m_heap{this, heap_config} -, m_recv_cb_queue(128) -, m_recv_cb_cancel(8) -{ - int rank, size; - OOMPH_CHECK_MPI_RESULT(MPI_Comm_rank(comm, &rank)); - OOMPH_CHECK_MPI_RESULT(MPI_Comm_size(comm, &size)); + context_impl::context_impl( + MPI_Comm comm, bool thread_safe, hwmalloc::heap_config const& heap_config, bool debug) + : context_base(comm, thread_safe) + , 
m_heap{this, heap_config} + , m_recv_cb_queue(128) + , m_recv_cb_cancel(8) + { + int rank, size; + OOMPH_CHECK_MPI_RESULT(MPI_Comm_rank(comm, &rank)); + OOMPH_CHECK_MPI_RESULT(MPI_Comm_size(comm, &size)); - m_ctxt_tag = reinterpret_cast(this); - OOMPH_CHECK_MPI_RESULT(MPI_Bcast(&m_ctxt_tag, 1, MPI_UINT64_T, 0, comm)); - LF_DEB(src_deb, debug(NS_DEBUG::str<>("Broadcast"), "rank", debug::dec<3>(rank), "context", - debug::ptr(m_ctxt_tag))); + m_ctxt_tag = reinterpret_cast(this); + OOMPH_CHECK_MPI_RESULT(MPI_Bcast(&m_ctxt_tag, 1, MPI_UINT64_T, 0, comm)); + LF_DEB( + src_deb, debug(str<>("Broadcast"), "rank", dec<3>(rank), "context", hptr(m_ctxt_tag))); - // TODO fix the thread safety - // problem: controller is a singleton and has problems when 2 contexts are created in the - // following order: single threaded first, then multi-threaded after - //int threads = thread_safe ? std::thread::hardware_concurrency() : 1; - //int threads = std::thread::hardware_concurrency(); - int threads = boost::thread::physical_concurrency(); - m_controller = init_libfabric_controller(this, comm, rank, size, threads); - m_domain = m_controller->get_domain(); -} + // TODO fix the thread safety + // problem: controller is a singleton and has problems when 2 contexts are created + // in the following order: single threaded first, then multi-threaded after + // int threads = thread_safe ? 
std::thread::hardware_concurrency() : 1; + // int threads = std::thread::hardware_concurrency(); + // Determine the number of threads based on the CPU affinity mask + int threads = 1; +#if defined(_GNU_SOURCE) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + if (sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) + threads = CPU_COUNT(&cpuset); + else + threads = boost::thread::physical_concurrency(); +#else + threads = boost::thread::physical_concurrency(); +#endif + m_controller = init_libfabric_controller(this, comm, rank, size, threads, debug); + m_domain = m_controller->get_domain(); + } -communicator_impl* -context_impl::get_communicator() -{ - auto comm = new communicator_impl{this}; - m_comms_set.insert(comm); - return comm; -} + communicator_impl* context_impl::get_communicator() + { + auto comm = new communicator_impl{this}; + m_comms_set.insert(comm); + return comm; + } -const char* -context_impl::get_transport_option(const std::string& opt) -{ - if (opt == "name") { return "libfabric"; } - else if (opt == "progress") { return libfabric_progress_string(); } - else if (opt == "endpoint") { return libfabric_endpoint_string(); } - else if (opt == "rendezvous_threshold") + char const* context_impl::get_transport_option(std::string const& opt) { - static char buffer[32]; - std::string temp = std::to_string(m_controller->rendezvous_threshold()); - strncpy(buffer, temp.c_str(), std::min(size_t(31), std::strlen(temp.c_str()))); - return buffer; + if (opt == "name") { return "libfabric"; } + else if (opt == "progress") { return libfabric_progress_string(); } + else if (opt == "endpoint") { return libfabric_endpoint_string(); } + else if (opt == "rendezvous_threshold") + { + static char buffer[32]; + std::string temp = std::to_string(m_controller->rendezvous_threshold()); + if (temp.size() > 31) throw std::runtime_error("Bad string option check, fix please"); + strncpy(buffer, temp.c_str(), 32); + return buffer; + } + else { return "unspecified"; } } - else { return 
"unspecified"; } -} -std::shared_ptr -context_impl::init_libfabric_controller(oomph::context_impl* /*ctx*/, MPI_Comm comm, int rank, - int size, int threads) -{ - // only allow one thread to pass, make other wait - static std::mutex m_init_mutex; - std::lock_guard lock(m_init_mutex); - static std::shared_ptr instance(nullptr); - if (!instance.get()) + std::shared_ptr context_impl::init_libfabric_controller( + oomph::context_impl* /*ctx*/, MPI_Comm comm, int rank, int size, int threads, bool debug) { - LF_DEB(src_deb, debug(NS_DEBUG::str<>("New Controller"), "rank", debug::dec<3>(rank), - "size", debug::dec<3>(size), "threads", debug::dec<3>(threads))); - instance.reset(new controller_type()); - instance->initialize(HAVE_LIBFABRIC_PROVIDER, rank == 0, size, threads, comm); + // only allow one thread to pass, make other wait + static std::mutex m_init_mutex; + std::lock_guard lock(m_init_mutex); + static std::shared_ptr instance(nullptr); + if (!instance.get()) + { + LF_DEB(src_deb, + debug(NS_DEBUG::str<>("New Controller"), "rank", dec<3>(rank), "size", dec<3>(size), + "threads", dec<3>(threads))); + instance.reset(new controller_type()); + if (debug) instance->enable_debug(); + instance->initialize(HAVE_LIBFABRIC_PROVIDER, rank == 0, size, threads, comm); + } + return instance; } - return instance; -} -} // namespace oomph +} // namespace oomph diff --git a/src/libfabric/context.hpp b/src/libfabric/context.hpp index a7c0c112..76654d66 100644 --- a/src/libfabric/context.hpp +++ b/src/libfabric/context.hpp @@ -9,148 +9,152 @@ */ #pragma once -#include #include #include -#include #include #include // paths relative to backend #include <../context_base.hpp> -#include #include +#include #include -namespace oomph -{ - -static NS_DEBUG::enable_print ctx_deb("CONTEXT"); - -using controller_type = libfabric::controller; - -class context_impl : public context_base -{ - public: - using region_type = libfabric::memory_segment; - using domain_type = 
region_type::provider_domain; - using device_region_type = libfabric::memory_segment; - using heap_type = hwmalloc::heap; - using callback_queue = boost::lockfree::queue, boost::lockfree::allocator>>; - - private: - heap_type m_heap; - domain_type* m_domain; - std::shared_ptr m_controller; - std::uintptr_t m_ctxt_tag; - - public: - // -------------------------------------------------- - // create a singleton ptr to a libfabric controller that - // can be shared between oomph context objects - static std::shared_ptr init_libfabric_controller(oomph::context_impl* ctx, - MPI_Comm comm, int rank, int size, int threads); - - // queue for shared recv callbacks - callback_queue m_recv_cb_queue; - // queue for canceled shared recv requests - callback_queue m_recv_cb_cancel; - - public: - context_impl(MPI_Comm comm, bool thread_safe, hwmalloc::heap_config const& heap_config); - context_impl(context_impl const&) = delete; - context_impl(context_impl&&) = delete; - - region_type make_region(void* const ptr, std::size_t size, int device_id) +namespace oomph { + + static NS_DEBUG::enable_print ctx_deb("CONTEXT"); + + using controller_type = libfabric::controller; + + class context_impl : public context_base { - if (m_controller->get_mrbind()) + public: + using region_type = libfabric::memory_segment; + using domain_type = region_type::provider_domain; + using device_region_type = libfabric::memory_segment; + using heap_type = hwmalloc::heap; + using callback_queue = boost::lockfree::queue, boost::lockfree::allocator>>; + + private: + heap_type m_heap; + domain_type* m_domain; + std::shared_ptr m_controller; + std::uintptr_t m_ctxt_tag; + + public: + // -------------------------------------------------- + // create a singleton ptr to a libfabric controller that + // can be shared between oomph context objects + static std::shared_ptr init_libfabric_controller(oomph::context_impl* ctx, + MPI_Comm comm, int rank, int size, int threads, bool debug = false); + + // queue for shared 
recv callbacks + callback_queue m_recv_cb_queue; + // queue for canceled shared recv requests + callback_queue m_recv_cb_cancel; + + public: + context_impl(MPI_Comm comm, bool thread_safe, hwmalloc::heap_config const& heap_config, + bool debug = false); + // context_impl(MPI_Comm comm, bool thread_safe, bool message_pool_never_free, + // std::size_t message_pool_reserve, bool debug = false); + context_impl(context_impl const&) = delete; + context_impl(context_impl&&) = delete; + + region_type make_region(void* const ptr, std::size_t size, int device_id) { - void* endpoint = m_controller->get_rx_endpoint().get_ep(); - return libfabric::memory_segment(m_domain, ptr, size, true, endpoint, device_id); + if (m_controller->get_mrbind()) + { + void* endpoint = m_controller->get_rx_endpoint().get_ep(); + return libfabric::memory_segment(m_domain, ptr, size, true, endpoint, device_id); + } + else + { + return libfabric::memory_segment(m_domain, ptr, size, false, nullptr, device_id); + } } - else { return libfabric::memory_segment(m_domain, ptr, size, false, nullptr, device_id); } - } - auto& get_heap() noexcept { return m_heap; } + auto& get_heap() noexcept { return m_heap; } - communicator_impl* get_communicator(); + communicator_impl* get_communicator(); - // we must modify all tags to use 32bits of context ptr for uniqueness - inline std::uintptr_t get_context_tag() { return m_ctxt_tag; } + // we must modify all tags to use 32bits of context ptr for uniqueness + inline std::uintptr_t get_context_tag() { return m_ctxt_tag; } - inline controller_type* get_controller() /*const */ { return m_controller.get(); } - const char* get_transport_option(const std::string& opt); + inline controller_type* get_controller() /*const */ { return m_controller.get(); } + char const* get_transport_option(std::string const& opt); - void progress() { get_controller()->poll_for_work_completions(nullptr); } + void progress() { get_controller()->poll_for_work_completions(nullptr); } - bool 
cancel_recv(detail::shared_request_state* s) - { - // get the original message operation context - auto op_ctx = &(s->m_operation_context); + bool cancel_recv(detail::shared_request_state* s) + { + // get the original message operation context + auto op_ctx = &(s->m_operation_context); - // submit the cancellation request - bool ok = (fi_cancel(&(get_controller()->get_rx_endpoint().get_ep()->fid), op_ctx) == 0); + // submit the cancellation request + bool ok = + (fi_cancel(&(get_controller()->get_rx_endpoint().get_ep()->fid), op_ctx) == 0); - // if the cancel operation failed completely, return - if (!ok) return false; + // if the cancel operation failed completely, return + if (!ok) return false; - bool found = false; - while (!found) - { - get_controller()->poll_recv_queue(get_controller()->get_rx_endpoint().get_rx_cq(), - nullptr); - // otherwise, poll until we know if it worked - std::stack temp_stack; - detail::shared_request_state* temp; - while (!found && m_recv_cb_cancel.pop(temp)) + bool found = false; + while (!found) { - if (temp == s) + get_controller()->poll_recv_queue( + get_controller()->get_rx_endpoint().get_rx_cq(), nullptr); + // otherwise, poll until we know if it worked + std::stack temp_stack; + detail::shared_request_state* temp; + while (!found && m_recv_cb_cancel.pop(temp)) { - // our recv was cancelled correctly - found = true; - LF_DEB(oomph::ctx_deb, debug(NS_DEBUG::str<>("Cancel shared"), "succeeded", - "op_ctx", NS_DEBUG::ptr(op_ctx))); - auto ptr = s->release_self_ref(); - s->set_canceled(); + if (temp == s) + { + // our recv was cancelled correctly + found = true; + LF_DEB(oomph::ctx_deb, + debug(str<>("Cancel shared"), "succeeded", "op_ctx", hptr(op_ctx))); + auto ptr = s->release_self_ref(); + s->set_canceled(); + } + else + { + // a different cancel operation + temp_stack.push(temp); + } } - else + // return any weird unhandled cancels back to the queue + while (!temp_stack.empty()) { - // a different cancel operation - 
temp_stack.push(temp); + auto temp = temp_stack.top(); + temp_stack.pop(); + m_recv_cb_cancel.push(temp); } } - // return any weird unhandled cancels back to the queue - while (!temp_stack.empty()) - { - auto temp = temp_stack.top(); - temp_stack.pop(); - m_recv_cb_cancel.push(temp); - } + return found; } - return found; - } - unsigned int num_tag_bits() const noexcept { return 32; } -}; + unsigned int num_tag_bits() const noexcept { return 32; } + }; -// -------------------------------------------------------------------- -template<> -inline oomph::libfabric::memory_segment -register_memory(oomph::context_impl& c, void* const ptr, std::size_t size) -{ - return c.make_region(ptr, size, -2); -} + // -------------------------------------------------------------------- + template <> + inline oomph::libfabric::memory_segment + register_memory(oomph::context_impl& c, void* const ptr, std::size_t size) + { + return c.make_region(ptr, size, -2); + } #if OOMPH_ENABLE_DEVICE -template<> -inline oomph::libfabric::memory_segment -register_device_memory(context_impl& c, int device_id, void* ptr, std::size_t size) -{ - return c.make_region(ptr, size, device_id); -} + template <> + inline oomph::libfabric::memory_segment register_device_memory( + context_impl& c, int device_id, void* ptr, std::size_t size) + { + return c.make_region(ptr, size, device_id); + } #endif -} // namespace oomph +} // namespace oomph diff --git a/src/libfabric/controller.hpp b/src/libfabric/controller.hpp index 5becc148..f015a0c4 100644 --- a/src/libfabric/controller.hpp +++ b/src/libfabric/controller.hpp @@ -9,23 +9,13 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include #include // #include #include #include -#include // #include #include @@ -35,428 +25,440 @@ #include #include // -#include "oomph_libfabric_defines.hpp" +#include "controller_base.hpp" #include "fabric_error.hpp" #include "locality.hpp" -#include 
"memory_region.hpp" +#include "oomph_libfabric_defines.hpp" #include "operation_context.hpp" -#include "controller_base.hpp" // #include // #include -namespace NS_DEBUG -{ -// cppcheck-suppress ConfigurationNotChecked +namespace NS_DEBUG { + // cppcheck-suppress ConfigurationNotChecked -using namespace oomph::debug; -template -inline /*constexpr*/ NS_DEBUG::print_threshold cnt_deb("CONTROL"); -// -static NS_DEBUG::enable_print cnt_err("CONTROL"); -} // namespace NS_DEBUG - -namespace oomph::libfabric -{ - -class controller : public controller_base -{ - public: - // -------------------------------------------------------------------- - controller() - : controller_base() - { - } + using namespace oomph::debug; + template + inline NS_DEBUG::print_threshold cnt_deb("CONTROL"); + // + static NS_DEBUG::enable_print cnt_err("CONTROL"); +} // namespace NS_DEBUG - // -------------------------------------------------------------------- - void initialize_derived(std::string const&, bool, int, size_t, MPI_Comm mpi_comm) - { - // Broadcast address of all endpoints to all ranks - // and fill address vector with info - exchange_addresses(av_, mpi_comm); - } +namespace oomph::libfabric { - // -------------------------------------------------------------------- - constexpr fi_threading threadlevel_flags() + class controller : public controller_base { -#if defined(HAVE_LIBFABRIC_GNI) /*|| defined(HAVE_LIBFABRIC_CXI)*/ - return FI_THREAD_ENDPOINT; -#else - return FI_THREAD_SAFE; -#endif - } + public: + // -------------------------------------------------------------------- + controller() + : controller_base() + { + } - // -------------------------------------------------------------------- - constexpr uint64_t caps_flags() - { -#if OOMPH_ENABLE_DEVICE && !defined(HAVE_LIBFABRIC_TCP) - std::int64_t hmem_flags = FI_HMEM; + // -------------------------------------------------------------------- + void initialize_derived(std::string const&, bool, int, size_t, MPI_Comm mpi_comm) + { + // 
Broadcast address of all endpoints to all ranks + // and fill address vector with info + exchange_addresses(av_, mpi_comm); + } + + // -------------------------------------------------------------------- + constexpr fi_threading threadlevel_flags() + { +#if defined(HAVE_LIBFABRIC_GNI) || defined(HAVE_LIBFABRIC_LNX) + return FI_THREAD_ENDPOINT; #else - std::int64_t hmem_flags = 0; + return FI_THREAD_SAFE; #endif - return hmem_flags | FI_MSG | FI_TAGGED | FI_RMA | FI_READ | FI_WRITE | FI_RECV | FI_SEND | - FI_TRANSMIT | FI_REMOTE_READ | FI_REMOTE_WRITE; - } - - // -------------------------------------------------------------------- - // we do not need to perform any special actions on init (to contact root node) - void setup_root_node_address(struct fi_info* /*info*/) {} + } - // -------------------------------------------------------------------- - // send address to rank 0 and receive array of all localities - void MPI_exchange_localities(fid_av* av, MPI_Comm comm, int rank, int size) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - std::vector localities(size * locality_defs::array_size, 0); - // - if (rank > 0) + // -------------------------------------------------------------------- + uint64_t caps_flags(uint64_t /*available_flags*/) const { - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("sending here"), iplocality(here_), - "size", locality_defs::array_size)); - /*int err = */ MPI_Send(here_.fabric_data(), locality_defs::array_size, MPI_CHAR, - 0, // dst rank - 0, // tag - comm); - - LF_DEB(NS_DEBUG::cnt_deb<9>, - debug(debug::str<>("receiving all"), "size", locality_defs::array_size)); - - MPI_Status status; - /*err = */ MPI_Recv(localities.data(), size * locality_defs::array_size, MPI_CHAR, - 0, // src rank - 0, // tag - comm, &status); - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("received addresses"))); + uint64_t flags_required = FI_TAGGED; +#ifndef HAVE_LIBFABRIC_LNX + flags_required |= FI_MSG | 
FI_TAGGED | FI_RECV | FI_SEND | FI_RMA | FI_READ | FI_WRITE | + FI_REMOTE_READ | FI_REMOTE_WRITE; +# if OOMPH_ENABLE_DEVICE + flags_required |= FI_HMEM; +# endif +#endif + return flags_required; } - else + + // -------------------------------------------------------------------- + // we do not need to perform any special actions on init (to contact root node) + void setup_root_node_address(struct fi_info* /*info*/) {} + + // -------------------------------------------------------------------- + // send address to rank 0 and receive array of all localities + void MPI_exchange_localities(fid_av* av, MPI_Comm comm, int rank, int size) { - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("receiving addresses"))); - memcpy(&localities[0], here_.fabric_data(), locality_defs::array_size); - for (int i = 1; i < size; ++i) + [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + + // array of empty locality objects + std::vector localities(size); + // + if (rank > 0) { - LF_DEB(NS_DEBUG::cnt_deb<9>, - debug(debug::str<>("receiving address"), debug::dec<>(i))); + LF_DEB(cnt_deb<9>, + debug( + str<>("sending here"), here_.to_str(), "size", locality_defs::array_size)); + /*int err = */ MPI_Send(here_.fabric_data().data(), locality_defs::array_size, + MPI_CHAR, + 0, // dst rank + 0, // tag + comm); + + LF_DEB( + cnt_deb<9>, debug(str<>("receiving all"), "size", locality_defs::array_size)); + MPI_Status status; - /*int err = */ MPI_Recv(&localities[i * locality_defs::array_size], - size * locality_defs::array_size, MPI_CHAR, - i, // src rank - 0, // tag + /*err = */ MPI_Recv(localities.data(), size * locality_defs::array_size, MPI_CHAR, + 0, // src rank + 0, // tag comm, &status); - LF_DEB(NS_DEBUG::cnt_deb<9>, - debug(debug::str<>("received address"), debug::dec<>(i))); + LF_DEB(cnt_deb<9>, debug(str<>("received addresses"))); + } + else + { + LF_DEB(cnt_deb<9>, debug(str<>("receiving addresses"))); + memcpy(&localities[0], 
here_.fabric_data().data(), locality_defs::array_size); + for (int i = 1; i < size; ++i) + { + LF_DEB(cnt_deb<9>, debug(str<>("receiving address"), dec<>(i))); + MPI_Status status; + /*int err = */ MPI_Recv(&localities[i], size * locality_defs::array_size, + MPI_CHAR, + i, // src rank + 0, // tag + comm, &status); + LF_DEB(cnt_deb<9>, debug(str<>("received address"), dec<>(i))); + } + + LF_DEB(cnt_deb<9>, debug(str<>("sending all"))); + for (int i = 1; i < size; ++i) + { + LF_DEB(cnt_deb<9>, debug(str<>("sending to"), dec<>(i))); + /*int err = */ MPI_Send(&localities[0], size * locality_defs::array_size, + MPI_CHAR, + i, // dst rank + 0, // tag + comm); + } } - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("sending all"))); - for (int i = 1; i < size; ++i) + // all ranks should now have a full localities vector + LF_DEB(cnt_deb<9>, debug(str<>("populating vector"))); + for (int i = 0; i < size; ++i) { - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("sending to"), debug::dec<>(i))); - /*int err = */ MPI_Send(&localities[0], size * locality_defs::array_size, MPI_CHAR, - i, // dst rank - 0, // tag - comm); + locality temp(localities[i], av); + insert_address(temp); } } - // all ranks should now have a full localities vector - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("populating vector"))); - for (int i = 0; i < size; ++i) + // -------------------------------------------------------------------- + // if we did not bootstrap, then fetch the list of all localities + // and insert each one into the address vector + void exchange_addresses(fid_av* av, MPI_Comm mpi_comm) { - locality temp; - int offset = i * locality_defs::array_size; - memcpy(temp.fabric_data_writable(), &localities[offset], locality_defs::array_size); - insert_address(av, temp); - } - } - - // -------------------------------------------------------------------- - // if we did not bootstrap, then fetch the list of all localities - // and insert each one into the address vector - void 
exchange_addresses(fid_av* av, MPI_Comm mpi_comm) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::ptr(this), __func__); + [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::hptr(this), __func__); - int rank, size; - MPI_Comm_rank(mpi_comm, &rank); - MPI_Comm_size(mpi_comm, &size); + int rank, size; + MPI_Comm_rank(mpi_comm, &rank); + MPI_Comm_size(mpi_comm, &size); - LF_DEB(NS_DEBUG::cnt_deb<9>, - debug(debug::str<>("initialize_localities"), size, "localities")); + LF_DEB(cnt_deb<9>, debug(str<>("initialize_localities"), size, "localities")); - MPI_exchange_localities(av, mpi_comm, rank, size); - debug_print_av_vector(size); - LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("Done localities"))); - } + MPI_exchange_localities(av, mpi_comm, rank, size); +#ifndef HAVE_LIBFABRIC_LNX // address stuff not yet supported + debug_print_av_vector(size); +#endif + LF_DEB(cnt_deb<9>, debug(str<>("Done localities"))); + } - // -------------------------------------------------------------------- - inline constexpr bool bypass_tx_lock() - { + // -------------------------------------------------------------------- + inline constexpr bool bypass_tx_lock() + { #if defined(HAVE_LIBFABRIC_GNI) - return true; -#elif defined(HAVE_LIBFABRIC_CXI) - // @todo : cxi provider is not yet thread safe using scalable endpoints - return false; + return true; +#elif defined(HAVE_LIBFABRIC_LNX) + // @todo : cxi provider is not yet thread safe using scalable endpoints + return false; #else - return (threadlevel_flags() == FI_THREAD_SAFE || + return (threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::threadlocalTx); #endif - } + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock get_tx_lock() - { - if (bypass_tx_lock()) return unique_lock(); - return unique_lock(send_mutex_); - } + // -------------------------------------------------------------------- + inline 
controller_base::unique_lock get_tx_lock() + { + if (bypass_tx_lock()) return unique_lock(); + return unique_lock(send_mutex_); + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock try_tx_lock() - { - if (bypass_tx_lock()) return unique_lock(); - return unique_lock(send_mutex_, std::try_to_lock_t{}); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock try_tx_lock() + { + if (bypass_tx_lock()) return unique_lock(); + return unique_lock(send_mutex_, std::try_to_lock_t{}); + } - // -------------------------------------------------------------------- - inline constexpr bool bypass_rx_lock() - { + // -------------------------------------------------------------------- + inline constexpr bool bypass_rx_lock() + { #ifdef HAVE_LIBFABRIC_GNI - return true; + return true; #else - return ( - threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::scalableTxRx); + return (threadlevel_flags() == FI_THREAD_SAFE || + endpoint_type_ == endpoint_type::scalableTxRx); #endif - } + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock get_rx_lock() - { - if (bypass_rx_lock()) return unique_lock(); - return unique_lock(recv_mutex_); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock get_rx_lock() + { + if (bypass_rx_lock()) return unique_lock(); + return unique_lock(recv_mutex_); + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock try_rx_lock() - { - if (bypass_rx_lock()) return unique_lock(); - return unique_lock(recv_mutex_, std::try_to_lock_t{}); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock try_rx_lock() + { + if (bypass_rx_lock()) return unique_lock(); + return 
unique_lock(recv_mutex_, std::try_to_lock_t{}); + } - // -------------------------------------------------------------------- - int poll_send_queue(fid_cq* send_cq, void* user_data) - { + // -------------------------------------------------------------------- + int poll_send_queue(fid_cq* send_cq, void* user_data) + { #ifdef EXCESSIVE_POLLING_BACKOFF_MICRO_S - std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); - if (std::chrono::duration_cast(now - send_poll_stamp).count() < - EXCESSIVE_POLLING_BACKOFF_MICRO_S) - return 0; - send_poll_stamp = now; + std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - send_poll_stamp) + .count() < EXCESSIVE_POLLING_BACKOFF_MICRO_S) + return 0; + send_poll_stamp = now; #endif - int ret; - fi_cq_msg_entry entry[max_completions_array_limit_]; - assert(max_completions_per_poll_ <= max_completions_array_limit_); - { - auto lock = try_tx_lock(); - - // if we're not threadlocal and didn't get the lock, - // then another thread is polling now, just exit - if (!bypass_tx_lock() && !lock.owns_lock()) { return -1; } + int ret; + fi_cq_msg_entry entry[max_completions_array_limit_]; + assert(max_completions_per_poll_ <= max_completions_array_limit_); + { + auto lock = try_tx_lock(); - static auto polling = - NS_DEBUG::cnt_deb<9>.make_timer(1, debug::str<>("poll send queue")); - LF_DEB(NS_DEBUG::cnt_deb<9>, timed(polling, NS_DEBUG::ptr(send_cq))); + // if we're not threadlocal and didn't get the lock, + // then another thread is polling now, just exit + if (!bypass_tx_lock() && !lock.owns_lock()) { return -1; } - // poll for completions - { - ret = fi_cq_read(send_cq, &entry[0], max_completions_per_poll_); - } - // if there is an error, retrieve it - if (ret == -FI_EAVAIL) - { - struct fi_cq_err_entry e = {}; - int err_sz = fi_cq_readerr(send_cq, &e, 0); - (void)err_sz; + static auto polling = + NS_DEBUG::cnt_deb<9>.make_timer(1, NS_DEBUG::str<>("poll send 
queue")); + LF_DEB(cnt_deb<9>, timed(polling, hptr(send_cq))); - // flags might not be set correctly - if ((e.flags & (FI_MSG | FI_SEND | FI_TAGGED)) != 0) + // poll for completions { - NS_DEBUG::cnt_err.error("txcq Error FI_EAVAIL for " - "FI_SEND with len", - debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context), "code", - NS_DEBUG::dec<3>(e.err), "flags", debug::bin<16>(e.flags), "error", - fi_cq_strerror(send_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len)); + ret = fi_cq_read(send_cq, &entry[0], max_completions_per_poll_); } - else if ((e.flags & FI_RMA) != 0) + // if there is an error, retrieve it + if (ret == -FI_EAVAIL) { - NS_DEBUG::cnt_err.error("txcq Error FI_EAVAIL for " - "FI_RMA with len", - debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context), "code", - NS_DEBUG::dec<3>(e.err), "flags", debug::bin<16>(e.flags), "error", - fi_cq_strerror(send_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len)); + struct fi_cq_err_entry e = {}; + int err_sz = fi_cq_readerr(send_cq, &e, 0); + (void) err_sz; + + // flags might not be set correctly + if ((e.flags & (FI_MSG | FI_SEND | FI_TAGGED)) != 0) + { + LF_DEB(cnt_err, + error("txcq Error FI_EAVAIL for FI_SEND with len", hex<6>(e.len), + "context", hptr(e.op_context), "code", dec<3>(e.err), "flags", + bin<16>(e.flags), "error", + fi_cq_strerror( + send_cq, e.prov_errno, e.err_data, (char*) e.buf, e.len))); + } + else if ((e.flags & FI_RMA) != 0) + { + LF_DEB(cnt_err, + error("txcq Error FI_EAVAIL for FI_RMA with len", hex<6>(e.len), + "context", hptr(e.op_context), "code", dec<3>(e.err), "flags", + bin<16>(e.flags), "error", + fi_cq_strerror( + send_cq, e.prov_errno, e.err_data, (char*) e.buf, e.len))); + } + operation_context* handler = reinterpret_cast(e.op_context); + handler->handle_error(e); + return 0; } - operation_context* handler = reinterpret_cast(e.op_context); - handler->handle_error(e); - return 0; } - } - // - // exit possibly locked region and process each completion - // - if (ret > 
0) - { - int processed = 0; - for (int i = 0; i < ret; ++i) + // + // exit possibly locked region and process each completion + // + if (ret > 0) { - ++sends_complete; - LF_DEB(NS_DEBUG::cnt_deb<9>, - debug(debug::str<>("Completion"), i, debug::dec<2>(i), "txcq flags", - fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(", - debug::dec<>(entry[i].flags), ")", "context", - NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len))); - if ((entry[i].flags & (FI_TAGGED | FI_SEND | FI_MSG)) != 0) + std::array buf; + int processed = 0; + for (int i = 0; i < ret; ++i) { - LF_DEB(NS_DEBUG::cnt_deb<9>, - debug(debug::str<>("Completion"), "txcq tagged send completion", - NS_DEBUG::ptr(entry[i].op_context))); - - operation_context* handler = - reinterpret_cast(entry[i].op_context); - processed += handler->handle_tagged_send_completion(user_data); - } - else - { - NS_DEBUG::cnt_err.error("Received an unknown txcq completion", - debug::dec<>(entry[i].flags), debug::bin<64>(entry[i].flags)); - std::terminate(); + ++sends_complete; + LF_DEB(cnt_deb<9>, + debug(str<>("Completion"), i, dec<2>(i), "txcq flags", + fi_tostr_r( + buf.data(), buf.size(), &entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), + "(", dec<>(entry[i].flags), ")", "context", hptr(entry[i].op_context), + "length", hex<6>(entry[i].len))); + if ((entry[i].flags & (FI_TAGGED | FI_SEND | FI_MSG)) != 0) + { + LF_DEB(cnt_deb<9>, + debug(str<>("Completion"), "txcq tagged send completion", + hptr(entry[i].op_context))); + + operation_context* handler = + reinterpret_cast(entry[i].op_context); + processed += handler->handle_tagged_send_completion(user_data); + } + else + { + LF_DEB(cnt_err, + error("Received an unknown txcq completion", dec<>(entry[i].flags), + bin<64>(entry[i].flags))); + std::terminate(); + } } + return processed; } - return processed; - } - else if (ret == 0 || ret == -FI_EAGAIN) - { - // do nothing, we will try again on the next check + else if (ret == 0 || ret == -FI_EAGAIN) + { + // do 
nothing, we will try again on the next check + } + else { LF_DEB(cnt_err, error("unknown error in completion txcq read")); } + return 0; } - else { NS_DEBUG::cnt_err.error("unknown error in completion txcq read"); } - return 0; - } - // -------------------------------------------------------------------- - int poll_recv_queue(fid_cq* rx_cq, void* user_data) - { + // -------------------------------------------------------------------- + int poll_recv_queue(fid_cq* rx_cq, void* user_data) + { #ifdef EXCESSIVE_POLLING_BACKOFF_MICRO_S - std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); - if (std::chrono::duration_cast(now - recv_poll_stamp).count() < - EXCESSIVE_POLLING_BACKOFF_MICRO_S) - return 0; - recv_poll_stamp = now; + std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - recv_poll_stamp) + .count() < EXCESSIVE_POLLING_BACKOFF_MICRO_S) + return 0; + recv_poll_stamp = now; #endif - int ret; - fi_cq_msg_entry entry[max_completions_array_limit_]; - assert(max_completions_per_poll_ <= max_completions_array_limit_); - { - auto lock = get_rx_lock(); + int ret; + fi_cq_msg_entry entry[max_completions_array_limit_]; + assert(max_completions_per_poll_ <= max_completions_array_limit_); + { + auto lock = get_rx_lock(); - // if we're not threadlocal and didn't get the lock, - // then another thread is polling now, just exit - if (!bypass_rx_lock() && !lock.owns_lock()) { return -1; } + // if we're not threadlocal and didn't get the lock, + // then another thread is polling now, just exit + if (!bypass_rx_lock() && !lock.owns_lock()) { return -1; } - static auto polling = - NS_DEBUG::cnt_deb<2>.make_timer(1, debug::str<>("poll recv queue")); - LF_DEB(NS_DEBUG::cnt_deb<2>, timed(polling, NS_DEBUG::ptr(rx_cq))); + static auto polling = + NS_DEBUG::cnt_deb<2>.make_timer(1, NS_DEBUG::str<>("poll recv queue")); + LF_DEB(cnt_deb<2>, timed(polling, hptr(rx_cq))); - // poll for completions - { 
- ret = fi_cq_read(rx_cq, &entry[0], max_completions_per_poll_); - } - // if there is an error, retrieve it - if (ret == -FI_EAVAIL) - { - // read the full error status - struct fi_cq_err_entry e = {}; - int err_sz = fi_cq_readerr(rx_cq, &e, 0); - (void)err_sz; - // from the manpage 'man 3 fi_cq_readerr' - if (e.err == FI_ECANCELED) + // poll for completions { - LF_DEB(NS_DEBUG::cnt_deb<1>, - debug(debug::str<>("rxcq Cancelled"), "flags", debug::hex<6>(e.flags), - "len", debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context))); - // the request was cancelled, we can simply exit - // as the canceller will have doone any cleanup needed - operation_context* handler = reinterpret_cast(e.op_context); - handler->handle_cancelled(); - return 0; + ret = fi_cq_read(rx_cq, &entry[0], max_completions_per_poll_); } - else if (e.err != FI_SUCCESS) + // if there is an error, retrieve it + if (ret == -FI_EAVAIL) { - NS_DEBUG::cnt_err.error(debug::str<>("poll_recv_queue"), "error code", - debug::dec<>(-e.err), "flags", debug::hex<6>(e.flags), "len", - debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context), "error msg", - fi_cq_strerror(rx_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len)); + // read the full error status + struct fi_cq_err_entry e = {}; + int err_sz = fi_cq_readerr(rx_cq, &e, 0); + (void) err_sz; + // from the manpage 'man 3 fi_cq_readerr' + if (e.err == FI_ECANCELED) + { + LF_DEB(cnt_deb<1>, + debug(str<>("rxcq Cancelled"), "flags", hex<6>(e.flags), "len", + hex<6>(e.len), "context", hptr(e.op_context))); + // the request was cancelled, we can simply exit + // as the canceller will have doone any cleanup needed + operation_context* handler = + reinterpret_cast(e.op_context); + handler->handle_cancelled(); + return 0; + } + else if (e.err != FI_SUCCESS) + { + LF_DEB(cnt_err, + error(str<>("poll_recv_queue"), "error code", dec<>(-e.err), "flags", + hex<6>(e.flags), "len", hex<6>(e.len), "context", + hptr(e.op_context), "error msg", + fi_cq_strerror( + 
rx_cq, e.prov_errno, e.err_data, (char*) e.buf, e.len))); + } + operation_context* handler = reinterpret_cast(e.op_context); + if (handler) handler->handle_error(e); + return 0; } - operation_context* handler = reinterpret_cast(e.op_context); - if (handler) handler->handle_error(e); - return 0; } - } - // - // release the lock and process each completion - // - if (ret > 0) - { - int processed = 0; - for (int i = 0; i < ret; ++i) + // + // release the lock and process each completion + // + if (ret > 0) { - ++recvs_complete; - LF_DEB(NS_DEBUG::cnt_deb<2>, - debug(debug::str<>("Completion"), i, "rxcq flags", - fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(", - debug::dec<>(entry[i].flags), ")", "context", - NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len))); - if ((entry[i].flags & (FI_TAGGED | FI_RECV)) != 0) + std::array buf; + int processed = 0; + for (int i = 0; i < ret; ++i) { - LF_DEB(NS_DEBUG::cnt_deb<2>, - debug(debug::str<>("Completion"), "rxcq tagged recv completion", - NS_DEBUG::ptr(entry[i].op_context))); - - operation_context* handler = - reinterpret_cast(entry[i].op_context); - processed += handler->handle_tagged_recv_completion(user_data); - } - else - { - NS_DEBUG::cnt_err.error("Received an unknown rxcq completion", - debug::dec<>(entry[i].flags), debug::bin<64>(entry[i].flags)); - std::terminate(); + ++recvs_complete; + LF_DEB(cnt_deb<2>, + debug(str<>("Completion"), i, "rxcq flags", + fi_tostr_r( + buf.data(), buf.size(), &entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), + "(", dec<>(entry[i].flags), ")", "context", hptr(entry[i].op_context), + "length", hex<6>(entry[i].len))); + if ((entry[i].flags & (FI_TAGGED | FI_RECV)) != 0) + { + LF_DEB(cnt_deb<2>, + debug(str<>("Completion"), "rxcq tagged recv completion", + hptr(entry[i].op_context))); + + operation_context* handler = + reinterpret_cast(entry[i].op_context); + processed += handler->handle_tagged_recv_completion(user_data); + } + else + { + LF_DEB(cnt_err, + 
error("Received an unknown rxcq completion", dec<>(entry[i].flags), + bin<64>(entry[i].flags))); + std::terminate(); + } } + return processed; + } + else if (ret == 0 || ret == -FI_EAGAIN) + { + // do nothing, we will try again on the next check } - return processed; + else { LF_DEB(cnt_err, error("unknown error in completion rxcq read")); } + return 0; } - else if (ret == 0 || ret == -FI_EAGAIN) + + // Jobs started using mpi don't have this info + struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx) { - // do nothing, we will try again on the next check + (void) info; // unused variable warning + (void) tx; // unused variable warning + + LF_DEB(cnb_deb, debug(str<>("fi_dupinfo"))); + struct fi_info* hints = fi_dupinfo(info); + if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo"); + // clear any Rx address data that might be set + // free(hints->src_addr); + // hints->src_addr = nullptr; + // hints->src_addrlen = 0; + free(hints->dest_addr); + hints->dest_addr = nullptr; + hints->dest_addrlen = 0; + return hints; } - else { NS_DEBUG::cnt_err.error("unknown error in completion rxcq read"); } - return 0; - } + }; - // Jobs started using mpi don't have this info - struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx) - { - (void)info; // unused variable warning - (void)tx; // unused variable warning - - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo"))); - struct fi_info* hints = fi_dupinfo(info); - if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo"); - // clear any Rx address data that might be set - // free(hints->src_addr); - // hints->src_addr = nullptr; - // hints->src_addrlen = 0; - free(hints->dest_addr); - hints->dest_addr = nullptr; - hints->dest_addrlen = 0; - return hints; - } -}; - -} // namespace oomph::libfabric +} // namespace oomph::libfabric diff --git a/src/libfabric/controller_base.hpp b/src/libfabric/controller_base.hpp index e1ce377e..d423803b 100644 --- 
a/src/libfabric/controller_base.hpp +++ b/src/libfabric/controller_base.hpp @@ -9,18 +9,12 @@ */ #pragma once -#include -#include #include -#include -#include #include -#include #include #include #include #include -#include // #include #include @@ -45,23 +39,25 @@ #include "memory_region.hpp" #include "operation_context_base.hpp" -//#define DISABLE_FI_INJECT -//#define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50 +#if ((FI_MAJOR_VERSION == 1) && FI_MINOR_VERSION <= 12) +#define fi_tostr_r(a,b,c,d) " " +#endif + +// #define DISABLE_FI_INJECT +// #define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50 // ------------------------------------------------------------------ // ---------------------------------------- // auto progress (libfabric thread) or manual // ---------------------------------------- -static fi_progress -libfabric_progress_type() +static fi_progress libfabric_progress_type() { if (std::getenv("LIBFABRIC_AUTO_PROGRESS") == nullptr) return FI_PROGRESS_MANUAL; return FI_PROGRESS_AUTO; } -static const char* -libfabric_progress_string() +static char const* libfabric_progress_string() { if (libfabric_progress_type() == FI_PROGRESS_AUTO) return "auto"; return "manual"; @@ -93,8 +89,7 @@ enum class endpoint_type : int // ---------------------------------------- // single endpoint or separate for send/recv // ---------------------------------------- -static endpoint_type -libfabric_endpoint_type() +static endpoint_type libfabric_endpoint_type() { auto env_str = std::getenv("LIBFABRIC_ENDPOINT_TYPE"); if (env_str == nullptr) return endpoint_type::single; @@ -114,8 +109,7 @@ libfabric_endpoint_type() return endpoint_type::single; } -static const char* -libfabric_endpoint_string() +static char const* libfabric_endpoint_string() { auto lf_ep_type = libfabric_endpoint_type(); if (lf_ep_type == endpoint_type::multiple) return "multiple"; @@ -128,8 +122,7 @@ libfabric_endpoint_string() // ---------------------------------------- // number of completions to handle per poll // 
---------------------------------------- -static int -libfabric_completions_per_poll() +static int libfabric_completions_per_poll() { auto env_str = std::getenv("LIBFABRIC_POLL_SIZE"); if (env_str != nullptr) @@ -148,8 +141,7 @@ libfabric_completions_per_poll() // ---------------------------------------- // Eager/Rendezvous threshold // ---------------------------------------- -static int -libfabric_rendezvous_threshold(int def_val) +static int libfabric_rendezvous_threshold(int def_val) { auto env_str = std::getenv("LIBFABRIC_RENDEZVOUS_THRESHOLD"); if (env_str != nullptr) @@ -170,10 +162,10 @@ libfabric_rendezvous_threshold(int def_val) // Needed on Cray for GNI extensions // ------------------------------------------------ #ifdef HAVE_LIBFABRIC_GNI -#include "rdma/fi_ext_gni.h" -//#define OOMPH_GNI_REG "none" -#define OOMPH_GNI_REG "internal" -//#define OOMPH_GNI_REG "udreg" +# include "rdma/fi_ext_gni.h" +// #define OOMPH_GNI_REG "none" +# define OOMPH_GNI_REG "internal" +// #define OOMPH_GNI_REG "udreg" static std::vector> gni_strs = { {GNI_MR_CACHE, "GNI_MR_CACHE"}, @@ -209,23 +201,22 @@ static std::vector> gni_ints = { // clang-format on #endif -// the libfabric library expects us to ask for an API supported version, so if we know we support -// api 2.0, then we ask for that, but the cxi legacy library on daint only supports 1.15, -// so drop back to that version if needed +// the libfabric library expects us to ask for an API supported version, so if +// we know we support api 2.0, then we ask for that, but the cxi legacy library +// on daint only supports 1.15, so drop back to that version if needed #if defined(OOMPH_LIBFABRIC_V1_API) -#define LIBFABRIC_FI_VERSION_MAJOR 1 -#define LIBFABRIC_FI_VERSION_MINOR 15 +# define LIBFABRIC_FI_VERSION_MAJOR 1 +# define LIBFABRIC_FI_VERSION_MINOR 15 #else -#define LIBFABRIC_FI_VERSION_MAJOR 2 -#define LIBFABRIC_FI_VERSION_MINOR 0 +# define LIBFABRIC_FI_VERSION_MAJOR 2 +# define LIBFABRIC_FI_VERSION_MINOR 2 #endif 
-namespace NS_DEBUG -{ -// cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print cnb_deb("CONBASE"); -static NS_DEBUG::enable_print cnb_err("CONBASE"); -} // namespace NS_DEBUG +namespace NS_DEBUG { + // cppcheck-suppress ConfigurationNotChecked + static NS_DEBUG::enable_print cnb_deb("CONBASE"); + static NS_DEBUG::enable_print cnb_err("CONBASE"); +} // namespace NS_DEBUG /** @brief a class to return the number of progressed callbacks */ struct progress_status @@ -237,7 +228,7 @@ struct progress_status int num_sends() const noexcept { return m_num_sends; } int num_recvs() const noexcept { return m_num_recvs; } - progress_status& operator+=(const progress_status& other) noexcept + progress_status& operator+=(progress_status const& other) noexcept { m_num_sends += other.m_num_sends; m_num_recvs += other.m_num_recvs; @@ -245,1255 +236,1329 @@ struct progress_status } }; -namespace NS_LIBFABRIC -{ -/// A wrapper around fi_close that reports any error -/// Because we use so many handles, we must be careful to -/// delete them all before closing resources that use them -template -void -fidclose(Handle fid, const char* msg) -{ - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("closing"), msg)); - int ret = fi_close(fid); - if (ret == -FI_EBUSY) { throw NS_LIBFABRIC::fabric_error(ret, "fi_close EBUSY"); } - else if (ret == FI_SUCCESS) { return; } - throw NS_LIBFABRIC::fabric_error(ret, "fi_close error"); -} - -/// when using thread local endpoints, we encapsulate things that -/// are needed to manage an endpoint -struct endpoint_wrapper -{ - private: - friend class controller; - - fid_ep* ep_ = nullptr; - fid_cq* rq_ = nullptr; - fid_cq* tq_ = nullptr; - const char* name_ = nullptr; - - public: - endpoint_wrapper() {} - endpoint_wrapper(fid_ep* ep, fid_cq* rq, fid_cq* tq, const char* name) - : ep_(ep) - , rq_(rq) - , tq_(tq) - , name_(name) +namespace NS_LIBFABRIC { + /// A wrapper around fi_close that reports any error + /// Because we use so many handles, 
we must be careful to + /// delete them all before closing resources that use them + template + void fidclose(Handle fid, char const* msg) { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, name_); + LF_DEB(cnb_deb, debug(str<>("closing"), msg)); + int ret = fi_close(fid); + if (ret == -FI_EBUSY) { throw NS_LIBFABRIC::fabric_error(ret, "fi_close EBUSY"); } + else if (ret == FI_SUCCESS) { return; } + throw NS_LIBFABRIC::fabric_error(ret, "fi_close error"); } - // to keep boost::lockfree happy, we need these copy operators - endpoint_wrapper(const endpoint_wrapper& ep) = default; - endpoint_wrapper& operator=(const endpoint_wrapper& ep) = default; - - void cleanup() + /// when using thread local endpoints, we encapsulate things that + /// are needed to manage an endpoint + struct endpoint_wrapper { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, name_); - if (ep_) + private: + friend class controller; + + fid_ep* ep_ = nullptr; + fid_cq* rq_ = nullptr; + fid_cq* tq_ = nullptr; + char const* name_ = nullptr; + + public: + endpoint_wrapper() {} + endpoint_wrapper(fid_ep* ep, fid_cq* rq, fid_cq* tq, char const* name) + : ep_(ep) + , rq_(rq) + , tq_(tq) + , name_(name) { - fidclose(&ep_->fid, "endpoint"); - ep_ = nullptr; - } - if (rq_) - { - fidclose(&rq_->fid, "rq"); - rq_ = nullptr; + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, name_); } - if (tq_) + + // to keep boost::lockfree happy, we need these copy operators + endpoint_wrapper(endpoint_wrapper const& ep) = default; + endpoint_wrapper& operator=(endpoint_wrapper const& ep) = default; + + void cleanup() { - fidclose(&tq_->fid, "tq"); - tq_ = nullptr; + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, name_); + if (ep_) + { + fidclose(&ep_->fid, "endpoint"); + ep_ = nullptr; + } + if (rq_) + { + fidclose(&rq_->fid, "rq"); + rq_ = nullptr; + } + if (tq_) + { + 
fidclose(&tq_->fid, "tq"); + tq_ = nullptr; + } } - } - inline fid_ep* get_ep() { return ep_; } - inline fid_cq* get_rx_cq() { return rq_; } - inline fid_cq* get_tx_cq() { return tq_; } - inline void set_tx_cq(fid_cq* cq) { tq_ = cq; } - inline const char* get_name() { return name_; } -}; + inline fid_ep* get_ep() { return ep_; } + inline fid_cq* get_rx_cq() { return rq_; } + inline fid_cq* get_tx_cq() { return tq_; } + inline void set_tx_cq(fid_cq* cq) { tq_ = cq; } + inline char const* get_name() { return name_; } + }; -using region_type = NS_MEMORY::memory_handle; -using endpoint_context_pool = - boost::lockfree::queue>; - -struct stack_endpoint -{ - endpoint_wrapper endpoint_; - endpoint_context_pool* pool_; - // - stack_endpoint() - : endpoint_() - , pool_(nullptr) - { - } - // - stack_endpoint(fid_ep* ep, fid_cq* rq, fid_cq* tq, const char* name, - endpoint_context_pool* pool) - : endpoint_(ep, rq, tq, name) - , pool_(pool) - { - } - // - stack_endpoint& operator=(stack_endpoint&& other) - { - endpoint_ = std::move(other.endpoint_); - pool_ = std::exchange(other.pool_, nullptr); - return *this; - } + using region_type = NS_MEMORY::memory_handle; + using endpoint_context_pool = + boost::lockfree::queue>; - ~stack_endpoint() + struct stack_endpoint { - if (!pool_) return; - LF_DEB(NS_DEBUG::cnb_deb, - trace(debug::str<>("Scalable Ep"), "used push", "ep", NS_DEBUG::ptr(get_ep()), "tx cq", - NS_DEBUG::ptr(get_tx_cq()), "rx cq", NS_DEBUG::ptr(get_rx_cq()))); - pool_->push(endpoint_); - } - - inline fid_ep* get_ep() { return endpoint_.get_ep(); } - - inline fid_cq* get_rx_cq() { return endpoint_.get_rx_cq(); } - - inline fid_cq* get_tx_cq() { return endpoint_.get_tx_cq(); } -}; - -struct endpoints_lifetime_manager -{ - // threadlocal endpoints - static inline thread_local stack_endpoint tl_tx_; - static inline thread_local stack_endpoint tl_stx_; - static inline thread_local stack_endpoint tl_srx_; - // non threadlocal endpoints, tx/rx - endpoint_wrapper ep_tx_; - 
endpoint_wrapper ep_rx_; -}; - -template -class controller_base -{ - public: - typedef std::mutex mutex_type; - typedef std::lock_guard scoped_lock; - typedef std::unique_lock unique_lock; - - protected: - // For threadlocal/scalable endpoints, - // we use a dedicated threadlocal endpoint wrapper - std::unique_ptr eps_; + endpoint_wrapper endpoint_; + endpoint_context_pool* pool_; + // + stack_endpoint() + : endpoint_() + , pool_(nullptr) + { + } + // + stack_endpoint( + fid_ep* ep, fid_cq* rq, fid_cq* tq, char const* name, endpoint_context_pool* pool) + : endpoint_(ep, rq, tq, name) + , pool_(pool) + { + } + // + stack_endpoint& operator=(stack_endpoint&& other) + { + endpoint_ = std::move(other.endpoint_); + pool_ = std::exchange(other.pool_, nullptr); + return *this; + } - using endpoint_context_pool = - boost::lockfree::queue>; - endpoint_context_pool tx_endpoints_; - endpoint_context_pool rx_endpoints_; + ~stack_endpoint() + { + if (!pool_) return; + LF_DEB(cnb_deb, + trace(str<>("Scalable Ep"), "used push", "ep", hptr(get_ep()), "tx cq", + hptr(get_tx_cq()), "rx cq", hptr(get_rx_cq()))); + pool_->push(endpoint_); + } - struct fi_info* fabric_info_; - struct fid_fabric* fabric_; - struct fid_domain* fabric_domain_; - struct fid_pep* ep_passive_; + inline fid_ep* get_ep() { return endpoint_.get_ep(); } - struct fid_av* av_; - endpoint_type endpoint_type_; + inline fid_cq* get_rx_cq() { return endpoint_.get_rx_cq(); } - locality here_; - locality root_; + inline fid_cq* get_tx_cq() { return endpoint_.get_tx_cq(); } + }; - // used during queue creation setup and during polling - mutex_type controller_mutex_; + struct endpoints_lifetime_manager + { + // threadlocal endpoints + static inline thread_local stack_endpoint tl_tx_; + static inline thread_local stack_endpoint tl_stx_; + static inline thread_local stack_endpoint tl_srx_; + // non threadlocal endpoints, tx/rx + endpoint_wrapper ep_tx_; + endpoint_wrapper ep_rx_; + }; - // used to protect send/recv 
resources - alignas(64) mutex_type send_mutex_; - alignas(64) mutex_type recv_mutex_; + template + class controller_base + { + public: + typedef std::mutex mutex_type; + typedef std::lock_guard scoped_lock; + typedef std::unique_lock unique_lock; + + protected: + // For threadlocal/scalable endpoints, + // we use a dedicated threadlocal endpoint wrapper + std::unique_ptr eps_; + + using endpoint_context_pool = + boost::lockfree::queue>; + endpoint_context_pool tx_endpoints_; + endpoint_context_pool rx_endpoints_; + + bool display_fabric_info_; // for debugging purposes, show fi_info hints + struct fi_info* fabric_info_; + struct fid_fabric* fabric_; + struct fid_domain* fabric_domain_; + struct fid_pep* ep_passive_; + + struct fid_av* av_; + endpoint_type endpoint_type_; + + locality here_; + locality root_; + + // used during queue creation setup and during polling + mutex_type controller_mutex_; + + // used to protect send/recv resources + alignas(64) mutex_type send_mutex_; + alignas(64) mutex_type recv_mutex_; + + std::size_t tx_inject_size_; + std::size_t tx_attr_size_; + std::size_t rx_attr_size_; + + uint32_t max_completions_per_poll_; + uint32_t msg_rendezvous_threshold_; + inline static constexpr uint32_t max_completions_array_limit_ = 256; + + static inline thread_local std::chrono::steady_clock::time_point send_poll_stamp; + static inline thread_local std::chrono::steady_clock::time_point recv_poll_stamp; + + // set if FI_MR_LOCAL is required (local access requires binding) + bool mrlocal = false; + // set if FI_MR_ENDPOINT is required (per endpoint memory binding) + bool mrbind = false; + // set if FI_MR_HMEM provider requires heterogeneous memory registration + bool mrhmem = false; + + public: + bool get_mrbind() { return mrbind; } + + public: + NS_LIBFABRIC::simple_counter sends_posted_; + NS_LIBFABRIC::simple_counter recvs_posted_; + NS_LIBFABRIC::simple_counter sends_readied_; + NS_LIBFABRIC::simple_counter recvs_readied_; + 
NS_LIBFABRIC::simple_counter sends_complete; + NS_LIBFABRIC::simple_counter recvs_complete; + + void finvoke(char const* msg, char const* err, int ret) + { + LF_DEB(cnb_deb, trace(str<>(msg))); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, err); + } - std::size_t tx_inject_size_; - std::size_t tx_attr_size_; - std::size_t rx_attr_size_; + public: + // -------------------------------------------------------------------- + controller_base() + : eps_(nullptr) + , tx_endpoints_(1) + , rx_endpoints_(1) + , display_fabric_info_(false) + , fabric_info_(nullptr) + , fabric_(nullptr) + , fabric_domain_(nullptr) + , ep_passive_(nullptr) + , av_(nullptr) + , tx_inject_size_(0) + , tx_attr_size_(0) + , rx_attr_size_(0) + , max_completions_per_poll_(1) + , msg_rendezvous_threshold_(0x4000) + , sends_posted_(0) + , recvs_posted_(0) + , sends_readied_(0) + , recvs_readied_(0) + , sends_complete(0) + , recvs_complete(0) + { + } - uint32_t max_completions_per_poll_; - uint32_t msg_rendezvous_threshold_; - inline static constexpr uint32_t max_completions_array_limit_ = 256; + // -------------------------------------------------------------------- + // clean up all resources + ~controller_base() + { + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + unsigned int messages_handled_ = 0; + unsigned int rma_reads_ = 0; + unsigned int recv_deletes_ = 0; - static inline thread_local std::chrono::steady_clock::time_point send_poll_stamp; - static inline thread_local std::chrono::steady_clock::time_point recv_poll_stamp; + LF_DEB(cnb_deb, + debug(str<>("counters"), "Received messages", dec<>(messages_handled_), + "Total reads", dec<>(rma_reads_), "Total deletes", dec<>(recv_deletes_), + "deletes error", dec<>(messages_handled_ - recv_deletes_))); - // set if FI_MR_LOCAL is required (local access requires binding) - bool mrlocal = false; - // set if FI_MR_ENDPOINT is required (per endpoint memory binding) - bool mrbind = false; - // set if FI_MR_HRMEM 
provider requires heterogeneous memory registration - bool mrhmem = false; + tx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); }); + rx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); }); - public: - bool get_mrbind() { return mrbind; } + // No cleanup threadlocals : done by consume_all cleanup above + // eps_->tl_tx_.endpoint_.cleanup(); + // eps_->tl_stx_.endpoint_.cleanup(); + // eps_->tl_srx_.endpoint_.cleanup(); - public: - NS_LIBFABRIC::simple_counter sends_posted_; - NS_LIBFABRIC::simple_counter recvs_posted_; - NS_LIBFABRIC::simple_counter sends_readied_; - NS_LIBFABRIC::simple_counter recvs_readied_; - NS_LIBFABRIC::simple_counter sends_complete; - NS_LIBFABRIC::simple_counter recvs_complete; + // non threadlocal endpoints, tx/rx + eps_->ep_tx_.cleanup(); + eps_->ep_rx_.cleanup(); - void finvoke(const char* msg, const char* err, int ret) - { - LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>(msg))); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, err); - } + // Cleanup endpoints + eps_.reset(nullptr); - public: - // -------------------------------------------------------------------- - controller_base() - : eps_(nullptr) - , tx_endpoints_(1) - , rx_endpoints_(1) - , fabric_info_(nullptr) - , fabric_(nullptr) - , fabric_domain_(nullptr) - , ep_passive_(nullptr) - , av_(nullptr) - , tx_inject_size_(0) - , tx_attr_size_(0) - , rx_attr_size_(0) - , max_completions_per_poll_(1) - , msg_rendezvous_threshold_(0x4000) - , sends_posted_(0) - , recvs_posted_(0) - , sends_readied_(0) - , recvs_readied_(0) - , sends_complete(0) - , recvs_complete(0) - { - } + // delete address vector + fidclose(&av_->fid, "Address Vector"); - // -------------------------------------------------------------------- - // clean up all resources - ~controller_base() - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - unsigned int messages_handled_ = 0; - unsigned int rma_reads_ = 0; - unsigned int recv_deletes_ = 0; + try + { + 
fidclose(&fabric_domain_->fid, "Domain"); + } + catch (fabric_error& e) + { + std::cout << "fabric domain close failed : Ensure all RMA " + "objects are freed before program termination" + << std::endl; + } + fidclose(&fabric_->fid, "Fabric"); - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("counters"), "Received messages", debug::dec<>(messages_handled_), - "Total reads", debug::dec<>(rma_reads_), "Total deletes", - debug::dec<>(recv_deletes_), "deletes error", - debug::dec<>(messages_handled_ - recv_deletes_))); + // clean up + LF_DEB(cnb_deb, debug(str<>("freeing fabric_info"))); - tx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); }); - rx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); }); + fi_freeinfo(fabric_info_); + } - // No cleanup threadlocals : done by consume_all cleanup above - // eps_->tl_tx_.endpoint_.cleanup(); - // eps_->tl_stx_.endpoint_.cleanup(); - // eps_->tl_srx_.endpoint_.cleanup(); + // -------------------------------------------------------------------- + // only used in check_libfabric quick test for helpful output + void enable_debug() { display_fabric_info_ = true; } - // non threadlocal endpoints, tx/rx - eps_->ep_tx_.cleanup(); - eps_->ep_rx_.cleanup(); + // -------------------------------------------------------------------- + // setup an endpoint for receiving messages, + // usually an rx endpoint is shared by all threads + endpoint_wrapper create_rx_endpoint( + struct fid_domain* domain, struct fi_info* info, struct fid_av* av) + { + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + auto ep_rx = new_endpoint_active(domain, info, false); - // Cleanup endpoints - eps_.reset(nullptr); + // bind address vector + bind_address_vector_to_endpoint(ep_rx, av); - // delete adddress vector - fidclose(&av_->fid, "Address Vector"); + // create a completion queue for the rx endpoint + info->rx_attr->op_flags |= FI_COMPLETION; + auto rx_cq = create_completion_queue(domain, info->rx_attr->size, 
"rx"); - try - { - fidclose(&fabric_domain_->fid, "Domain"); + // bind CQ to endpoint + bind_queue_to_endpoint(ep_rx, rx_cq, FI_RECV, "rx"); + return endpoint_wrapper(ep_rx, rx_cq, nullptr, "rx"); } - catch (fabric_error& e) + + // -------------------------------------------------------------------- + // initialize the basic fabric/domain/name + template + void initialize( + std::string const& provider, bool rootnode, int size, size_t threads, Args&&... args) { - std::cout << "fabric domain close failed : Ensure all RMA " - "objects are freed before program termination" - << std::endl; - } - fidclose(&fabric_->fid, "Fabric"); + LF_DEB(cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); })); + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); - // clean up - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("freeing fabric_info"))); + max_completions_per_poll_ = libfabric_completions_per_poll(); + LF_DEB(cnb_err, debug(str<>("Poll completions"), dec<3>(max_completions_per_poll_))); - fi_freeinfo(fabric_info_); - } + uint32_t default_val = (threads == 1) ? 
0x400 : 0x4000; + msg_rendezvous_threshold_ = libfabric_rendezvous_threshold(default_val); + LF_DEB( + cnb_err, debug(str<>("Rendezvous threshold"), hex<4>(msg_rendezvous_threshold_))); - // -------------------------------------------------------------------- - // setup an endpoint for receiving messages, - // usually an rx endpoint is shared by all threads - endpoint_wrapper create_rx_endpoint(struct fid_domain* domain, struct fi_info* info, - struct fid_av* av) - { - auto ep_rx = new_endpoint_active(domain, info, false); + endpoint_type_ = static_cast(libfabric_endpoint_type()); + LF_DEB(cnb_err, debug(str<>("Endpoints"), libfabric_endpoint_string())); - // bind address vector - bind_address_vector_to_endpoint(ep_rx, av); + eps_ = std::make_unique(); - // create a completion queue for the rx endpoint - info->rx_attr->op_flags |= FI_COMPLETION; - auto rx_cq = create_completion_queue(domain, info->rx_attr->size, "rx"); + LF_DEB(cnb_deb, debug(str<>("Threads"), dec<3>(threads))); - // bind CQ to endpoint - bind_queue_to_endpoint(ep_rx, rx_cq, FI_RECV, "rx"); - return endpoint_wrapper(ep_rx, rx_cq, nullptr, "rx"); - } - - // -------------------------------------------------------------------- - // initialize the basic fabric/domain/name - template - void initialize(std::string const& provider, bool rootnode, int size, size_t threads, - Args&&... args) - { - LF_DEB(NS_DEBUG::cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); })); - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); + open_fabric(provider, threads, rootnode); - max_completions_per_poll_ = libfabric_completions_per_poll(); - LF_DEB(NS_DEBUG::cnb_err, - debug(debug::str<>("Poll completions"), debug::dec<3>(max_completions_per_poll_))); + // create an address vector that will be bound to (all) endpoints + av_ = create_address_vector(fabric_info_, size, threads); - uint32_t default_val = (threads == 1) ? 
0x400 : 0x4000; - msg_rendezvous_threshold_ = libfabric_rendezvous_threshold(default_val); - LF_DEB(NS_DEBUG::cnb_err, - debug(debug::str<>("Rendezvous threshold"), debug::hex<4>(msg_rendezvous_threshold_))); + // we need an rx endpoint in all cases except scalable rx + if (endpoint_type_ != endpoint_type::scalableTxRx) + { + // setup an endpoint for receiving messages + // rx endpoint is typically shared by all threads + eps_->ep_rx_ = create_rx_endpoint(fabric_domain_, fabric_info_, av_); + } - endpoint_type_ = static_cast(libfabric_endpoint_type()); - LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("Endpoints"), libfabric_endpoint_string())); + if (endpoint_type_ == endpoint_type::single) + { + // always bind a tx cq to the rx endpoint for single endpoint type + auto tx_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep()); + eps_->ep_rx_.set_tx_cq(tx_cq); + } + else if (endpoint_type_ != endpoint_type::scalableTxRx) + { +#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || \ + defined(HAVE_LIBFABRIC_SHM) || defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) || \ + defined(HAVE_LIBFABRIC_EFA) + // it appears that the rx endpoint cannot be enabled if it does not + // have a Tx CQ (at least when using sockets), so we create a dummy + // Tx CQ and bind it just to stop libfabric from triggering an error. 
+ // The tx_cq won't actually be used because the user will get the real + // tx endpoint which will have the correct cq bound to it + auto dummy_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep()); + eps_->ep_rx_.set_tx_cq(dummy_cq); +#endif + } - eps_ = std::make_unique(); + if (endpoint_type_ == endpoint_type::multiple) + { + // create a separate Tx endpoint for sending messages + // note that the CQ needs FI_RECV even though its a Tx cq to keep + // some providers happy as they trigger an error if an endpoint + // has no Rx cq attached (appears to be a progress related bug) + auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Threads"), debug::dec<3>(threads))); + // create a completion queue for tx endpoint + fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); + auto tx_cq = create_completion_queue( + fabric_domain_, fabric_info_->tx_attr->size, "tx multiple"); - open_fabric(provider, threads, rootnode); + bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx multiple"); + bind_address_vector_to_endpoint(ep_tx, av_); + enable_endpoint(ep_tx, "tx multiple"); - // create an address vector that will be bound to (all) endpoints - av_ = create_address_vector(fabric_info_, size, threads); + // combine endpoints and CQ into wrapper for convenience + eps_->ep_tx_ = endpoint_wrapper(ep_tx, nullptr, tx_cq, "tx multiple"); + } + else if (endpoint_type_ == endpoint_type::threadlocalTx) + { + // each thread creates a Tx endpoint on first call to get_tx_endpoint() + } + else if (endpoint_type_ == endpoint_type::scalableTx || + endpoint_type_ == endpoint_type::scalableTxRx) + { + // setup tx contexts for each possible thread + size_t threads_allocated = 0; + auto ep_sx = new_endpoint_scalable( + fabric_domain_, fabric_info_, true /*Tx*/, threads, threads_allocated); + + LF_DEB(cnb_deb, + trace(str<>("scalable endpoint ok"), "Contexts allocated", + 
dec<4>(threads_allocated))); + + finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind", + fi_scalable_ep_bind(ep_sx, &av_->fid, 0)); + + // prepare the stack for insertions + tx_endpoints_.reserve(threads_allocated); + // + for (unsigned int i = 0; i < threads_allocated; i++) + { + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope( + NS_DEBUG::hptr(this), "scalable", NS_DEBUG::dec<4>(i)); + + // For threadlocal/scalable endpoints, tx/rx resources + fid_ep* scalable_ep_tx; + fid_cq* scalable_cq_tx; + + // Create a Tx context, cq, bind and enable + finvoke("create tx context", "fi_tx_context", + fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL)); + scalable_cq_tx = create_completion_queue( + fabric_domain_, fabric_info_->tx_attr->size, "tx scalable"); + bind_queue_to_endpoint( + scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable"); + enable_endpoint(scalable_ep_tx, "tx scalable"); + + endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable"); + LF_DEB(cnb_deb, + trace(str<>("Scalable Ep"), "initial tx push", "ep", hptr(tx.get_ep()), + "tx cq", hptr(tx.get_tx_cq()), "rx cq", hptr(tx.get_rx_cq()))); + tx_endpoints_.push(tx); + } - // we need an rx endpoint in all cases except scalable rx - if (endpoint_type_ != endpoint_type::scalableTxRx) - { - // setup an endpoint for receiving messages - // rx endpoint is typically shared by all threads - eps_->ep_rx_ = create_rx_endpoint(fabric_domain_, fabric_info_, av_); - } + eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable"); + } - if (endpoint_type_ == endpoint_type::single) - { - // always bind a tx cq to the rx endpoint for single endpoint type - auto tx_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep()); - eps_->ep_rx_.set_tx_cq(tx_cq); - } - else if (endpoint_type_ != endpoint_type::scalableTxRx) - { -#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || \ - defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) || 
defined(HAVE_LIBFABRIC_EFA) - // it appears that the rx endpoint cannot be enabled if it does not - // have a Tx CQ (at least when using sockets), so we create a dummy - // Tx CQ and bind it just to stop libfabric from triggering an error. - // The tx_cq won't actually be used because the user will get the real - // tx endpoint which will have the correct cq bound to it - auto dummy_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep()); - eps_->ep_rx_.set_tx_cq(dummy_cq); -#endif + // once enabled we can get the address + enable_endpoint(eps_->ep_rx_.get_ep(), "rx here"); + here_ = get_endpoint_address(&eps_->ep_rx_.get_ep()->fid); + LF_DEB(cnb_deb, debug(str<>("setting 'here'"), here_.to_str())); + + // // if we are using scalable endpoints, then setup tx/rx contexts + // // we will use a single endpoint for all Tx/Rx contexts + // if (endpoint_type_ == endpoint_type::scalableTx || + // endpoint_type_ == endpoint_type::scalableTxRx) + // { + + // // thread slots might not be same as what we asked for + // size_t threads_allocated = 0; + // auto ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads, + // threads_allocated); + // if (!ep_sx) + // throw NS_LIBFABRIC::fabric_error(FI_EOTHER, "fi_scalable endpoint creation failed"); + + // LF_DEB(cnb_deb, trace(str<>("scalable endpoint ok"), + // "Contexts allocated", dec<4>(threads_allocated))); + + // // prepare the stack for insertions + // tx_endpoints_.reserve(threads_allocated); + // rx_endpoints_.reserve(threads_allocated); + // // + // for (unsigned int i = 0; i < threads_allocated; i++) + // { + // [[maybe_unused]] auto scp = + // NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), "scalable", dec<4>(i)); + + // // For threadlocal/scalable endpoints, tx/rx resources + // fid_ep* scalable_ep_tx; + // fid_cq* scalable_cq_tx; + //// fid_ep* scalable_ep_rx; + //// fid_cq* scalable_cq_rx; + + // // Tx context setup + // finvoke("create tx context", "fi_tx_context", + // 
fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL)); + + // scalable_cq_tx = create_completion_queue(fabric_domain_, + // fabric_info_->tx_attr->size, "tx scalable"); + + // bind_queue_to_endpoint(scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable"); + + // enable_endpoint(scalable_ep_tx, "tx scalable"); + + // endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable"); + // LF_DEB(cnb_deb, + // trace(str<>("Scalable Ep"), "initial tx push", "ep", + // NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq", + // NS_DEBUG::ptr(tx.get_rx_cq()))); + // tx_endpoints_.push(tx); + + // // Rx contexts + //// finvoke("create rx context", "fi_rx_context", + //// fi_rx_context(ep_sx, i, NULL, &scalable_ep_rx, NULL)); + + //// scalable_cq_rx = + //// create_completion_queue(fabric_domain_, fabric_info_->rx_attr->size, "rx"); + + //// bind_queue_to_endpoint(scalable_ep_rx, scalable_cq_rx, FI_RECV, "rx scalable"); + + //// enable_endpoint(scalable_ep_rx, "rx scalable"); + + //// endpoint_wrapper rx(scalable_ep_rx, scalable_cq_rx, nullptr, "rx scalable"); + //// LF_DEB(cnb_deb, + //// trace(str<>("Scalable Ep"), "initial rx push", "ep", + //// NS_DEBUG::ptr(rx.get_ep()), "tx cq", NS_DEBUG::ptr(rx.get_tx_cq()), "rx cq", + //// NS_DEBUG::ptr(rx.get_rx_cq()))); + //// rx_endpoints_.push(rx); + // } + + // finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind", + // fi_scalable_ep_bind(ep_sx, &av_->fid, 0)); + + // eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable"); + + return static_cast(this)->initialize_derived( + provider, rootnode, size, threads, std::forward(args)...); } - if (endpoint_type_ == endpoint_type::multiple) + // -------------------------------------------------------------------- + uint64_t caps_flags(uint64_t available_flags) const { - // create a separate Tx endpoint for sending messages - // note that the CQ needs FI_RECV even though its a Tx cq to keep - // some providers happy as they trigger 
an error if an endpoint - // has no Rx cq attached (appears to be a progress related bug) - auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true); - - // create a completion queue for tx endpoint - fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); - auto tx_cq = - create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, "tx multiple"); - - bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx multiple"); - bind_address_vector_to_endpoint(ep_tx, av_); - enable_endpoint(ep_tx, "tx multiple"); - - // combine endpoints and CQ into wrapper for convenience - eps_->ep_tx_ = endpoint_wrapper(ep_tx, nullptr, tx_cq, "tx multiple"); - } - else if (endpoint_type_ == endpoint_type::threadlocalTx) - { - // each thread creates a Tx endpoint on first call to get_tx_endpoint() - } - else if (endpoint_type_ == endpoint_type::scalableTx || - endpoint_type_ == endpoint_type::scalableTxRx) - { - // setup tx contexts for each possible thread - size_t threads_allocated = 0; - auto ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads, - threads_allocated); - - LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"), - "Contexts allocated", debug::dec<4>(threads_allocated))); - - finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind", - fi_scalable_ep_bind(ep_sx, &av_->fid, 0)); - - // prepare the stack for insertions - tx_endpoints_.reserve(threads_allocated); + char buf[1024]; + LF_DEB(cnb_err, + debug(str<>("caps available"), hex(available_flags), + fi_tostr_r(buf, 1024, &available_flags, FI_TYPE_CAPS))); + uint64_t required_flags = + static_cast(this)->caps_flags(available_flags); // - for (unsigned int i = 0; i < threads_allocated; i++) + uint64_t final_flags = required_flags; + for (uint64_t bit = 0; bit < 64; ++bit) { - [[maybe_unused]] auto scp = - NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "scalable", debug::dec<4>(i)); - - // For threadlocal/scalable endpoints, tx/rx resources 
- fid_ep* scalable_ep_tx; - fid_cq* scalable_cq_tx; - - // Create a Tx context, cq, bind and enable - finvoke("create tx context", "fi_tx_context", - fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL)); - scalable_cq_tx = create_completion_queue(fabric_domain_, - fabric_info_->tx_attr->size, "tx scalable"); - bind_queue_to_endpoint(scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable"); - enable_endpoint(scalable_ep_tx, "tx scalable"); - - endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable"); - LF_DEB(NS_DEBUG::cnb_deb, - trace(debug::str<>("Scalable Ep"), "initial tx push", "ep", - NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq", - NS_DEBUG::ptr(tx.get_rx_cq()))); - tx_endpoints_.push(tx); + uint64_t f = (1ULL << bit); + if ((required_flags & f) && ((available_flags & f) == 0)) + { + LF_DEB(cnb_err, + error(str<>("caps flags unavailable"), + fi_tostr_r(buf, 1024, &f, FI_TYPE_CAPS))); + final_flags &= ~f; + } } - - eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable"); + LF_DEB(cnb_err, + debug(str<>("caps flags requested"), hex(final_flags), + fi_tostr_r(buf, 1024, &final_flags, FI_TYPE_CAPS))); + return final_flags; } - // once enabled we can get the address - enable_endpoint(eps_->ep_rx_.get_ep(), "rx here"); - here_ = get_endpoint_address(&eps_->ep_rx_.get_ep()->fid); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting 'here'"), iplocality(here_))); - - // // if we are using scalable endpoints, then setup tx/rx contexts - // // we will us a single endpoint for all Tx/Rx contexts - // if (endpoint_type_ == endpoint_type::scalableTx || - // endpoint_type_ == endpoint_type::scalableTxRx) - // { - - // // thread slots might not be same as what we asked for - // size_t threads_allocated = 0; - // auto ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads, - // threads_allocated); - // if (!ep_sx) - // throw NS_LIBFABRIC::fabric_error(FI_EOTHER, "fi_scalable 
endpoint creation failed"); - - // LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"), - // "Contexts allocated", debug::dec<4>(threads_allocated))); - - // // prepare the stack for insertions - // tx_endpoints_.reserve(threads_allocated); - // rx_endpoints_.reserve(threads_allocated); - // // - // for (unsigned int i = 0; i < threads_allocated; i++) - // { - // [[maybe_unused]] auto scp = - // NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "scalable", debug::dec<4>(i)); - - // // For threadlocal/scalable endpoints, tx/rx resources - // fid_ep* scalable_ep_tx; - // fid_cq* scalable_cq_tx; - //// fid_ep* scalable_ep_rx; - //// fid_cq* scalable_cq_rx; - - // // Tx context setup - // finvoke("create tx context", "fi_tx_context", - // fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL)); - - // scalable_cq_tx = create_completion_queue(fabric_domain_, - // fabric_info_->tx_attr->size, "tx scalable"); - - // bind_queue_to_endpoint(scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable"); - - // enable_endpoint(scalable_ep_tx, "tx scalable"); - - // endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable"); - // LF_DEB(NS_DEBUG::cnb_deb, - // trace(debug::str<>("Scalable Ep"), "initial tx push", "ep", - // NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq", - // NS_DEBUG::ptr(tx.get_rx_cq()))); - // tx_endpoints_.push(tx); - - // // Rx contexts - //// finvoke("create rx context", "fi_rx_context", - //// fi_rx_context(ep_sx, i, NULL, &scalable_ep_rx, NULL)); - - //// scalable_cq_rx = - //// create_completion_queue(fabric_domain_, fabric_info_->rx_attr->size, "rx"); - - //// bind_queue_to_endpoint(scalable_ep_rx, scalable_cq_rx, FI_RECV, "rx scalable"); - - //// enable_endpoint(scalable_ep_rx, "rx scalable"); - - //// endpoint_wrapper rx(scalable_ep_rx, scalable_cq_rx, nullptr, "rx scalable"); - //// LF_DEB(NS_DEBUG::cnb_deb, - //// trace(debug::str<>("Scalable Ep"), "initial rx push", "ep", - //// 
NS_DEBUG::ptr(rx.get_ep()), "tx cq", NS_DEBUG::ptr(rx.get_tx_cq()), "rx cq", - //// NS_DEBUG::ptr(rx.get_rx_cq()))); - //// rx_endpoints_.push(rx); - // } - - // finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind", - // fi_scalable_ep_bind(ep_sx, &av_->fid, 0)); - - // eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable"); - - return static_cast(this)->initialize_derived(provider, rootnode, size, threads, - std::forward(args)...); - } - - // -------------------------------------------------------------------- - constexpr uint64_t caps_flags() { return static_cast(this)->caps_flags(); } - - // -------------------------------------------------------------------- - constexpr fi_threading threadlevel_flags() - { - return static_cast(this)->threadlevel_flags(); - } + // -------------------------------------------------------------------- + constexpr fi_threading threadlevel_flags() + { + return static_cast(this)->threadlevel_flags(); + } - // -------------------------------------------------------------------- - constexpr std::int64_t memory_registration_mode_flags() - { - std::int64_t base_flags = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; + // -------------------------------------------------------------------- + constexpr std::int64_t memory_registration_mode_flags() + { +#if defined(HAVE_LIBFABRIC_LNX) + return FI_MR_HMEM; +#endif + std::int64_t base_flags = FI_MR_ALLOCATED; // | FI_MR_VIRT_ADDR | FI_MR_PROV_KEY; #if OOMPH_ENABLE_DEVICE - base_flags = base_flags | FI_MR_HMEM; + base_flags = base_flags | FI_MR_HMEM; #endif - base_flags = base_flags | FI_MR_LOCAL; + base_flags = base_flags | FI_MR_LOCAL; #if defined(HAVE_LIBFABRIC_CXI) - return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT; + return base_flags | FI_MR_ENDPOINT; #elif defined(HAVE_LIBFABRIC_EFA) - return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT; + return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT; #else - return base_flags; + return base_flags; #endif - } + } - 
// -------------------------------------------------------------------- - uint32_t rendezvous_threshold() { return msg_rendezvous_threshold_; } - // -------------------------------------------------------------------- - // initialize the basic fabric/domain/name - void open_fabric(std::string const& provider, int threads, bool rootnode) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); + // -------------------------------------------------------------------- + uint32_t rendezvous_threshold() { return msg_rendezvous_threshold_; } - struct fi_info* fabric_hints_ = fi_allocinfo(); - if (!fabric_hints_) + // -------------------------------------------------------------------- + // initialize the basic fabric/domain/name + void open_fabric(std::string const& provider, int threads, bool rootnode) { - throw NS_LIBFABRIC::fabric_error(-1, "Failed to allocate fabric hints"); - } + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Here locality"), iplocality(here_))); + struct fi_info* fabric_hints_ = fi_allocinfo(); + if (!fabric_hints_) + { + throw NS_LIBFABRIC::fabric_error(-1, "Failed to allocate fabric hints"); + } -#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || defined(HAVE_LIBFABRIC_VERBS) - fabric_hints_->addr_format = FI_SOCKADDR_IN; -#elif defined(HAVE_LIBFABRIC_EFA) - fabric_hints_->addr_format = FI_ADDR_EFA; + // setup the provider we want to use before getting info + if ((provider.c_str() == std::string("tcp")) || + (provider.c_str() == std::string("verbs"))) + { + fabric_hints_->fabric_attr->prov_name = + strdup(std::string(provider + ";ofi_rxm").c_str()); + } + else { fabric_hints_->fabric_attr->prov_name = strdup(provider.c_str()); } + LF_DEB(cnb_deb, debug(str<>("fabric provider"), fabric_hints_->fabric_attr->prov_name)); + +#if defined(HAVE_LIBFABRIC_CXI) + // libfabric domain for multi-nic CXI provider + char 
const* cxi_domain = std::getenv("FI_CXI_DEVICE_NAME"); + if (cxi_domain == nullptr) + { + LF_DEB(cnb_err, error(str<>("Domain"), "FI_CXI_DEVICE_NAME not set")); + } + else { fabric_hints_->domain_attr->name = strdup(cxi_domain); } + LF_DEB( + NS_DEBUG::cnb_deb, debug(str<>("fabric domain"), fabric_hints_->domain_attr->name)); #endif - fabric_hints_->caps = caps_flags(); + fabric_hints_->domain_attr->mr_mode = memory_registration_mode_flags(); - fabric_hints_->mode = FI_CONTEXT /*| FI_MR_LOCAL*/; - if (provider.c_str() == std::string("tcp")) - { - fabric_hints_->fabric_attr->prov_name = - strdup(std::string(provider + ";ofi_rxm").c_str()); - } - else if (provider.c_str() == std::string("verbs")) - { - fabric_hints_->fabric_attr->prov_name = - strdup(std::string(provider + ";ofi_rxm").c_str()); - } - else { fabric_hints_->fabric_attr->prov_name = strdup(provider.c_str()); } - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("fabric provider"), fabric_hints_->fabric_attr->prov_name)); + // get an info object to see what might be available before we set any flags + uint64_t flags = 0; + int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR), + nullptr, nullptr, flags, fabric_hints_, &fabric_info_); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fabric info"); + if (display_fabric_info_ && fabric_info_) + { + std::array buf; + LF_DEB(cnb_err, + trace(str<>("Fabric info"), "pre-check ->", + fabric_hints_->fabric_attr->prov_name, "\n", + fi_tostr_r(buf.data(), buf.size(), fabric_info_, FI_TYPE_INFO))); + } - fabric_hints_->domain_attr->mr_mode = memory_registration_mode_flags(); + // set capabilities we want to request + uint64_t all_caps = + caps_flags(fabric_info_->rx_attr->caps | fabric_info_->tx_attr->caps); - // Enable/Disable the use of progress threads - auto progress = libfabric_progress_type(); - fabric_hints_->domain_attr->control_progress = progress; - fabric_hints_->domain_attr->data_progress = progress; - 
LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("progress"), libfabric_progress_string())); + // fabric_hints_->caps = all_caps; + fabric_hints_->tx_attr->caps = fabric_info_->tx_attr->caps & all_caps; + fabric_hints_->rx_attr->caps = fabric_info_->rx_attr->caps & all_caps; - if (threads > 1) - { - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_FID"))); - // Enable thread safe mode (Does not work with psm2 provider) - // fabric_hints_->domain_attr->threading = FI_THREAD_SAFE; - //fabric_hints_->domain_attr->threading = FI_THREAD_FID; - fabric_hints_->domain_attr->threading = threadlevel_flags(); - } - else - { - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_DOMAIN"))); - // we serialize everything - fabric_hints_->domain_attr->threading = FI_THREAD_DOMAIN; - } + if ((fabric_info_->mode & FI_CONTEXT) == 0) + { + std::array buf; + LF_DEB(cnb_err, + debug(str<>("mode FI_CONTEXT!=0"), + fi_tostr_r(buf.data(), buf.size(), &fabric_hints_->domain_attr->mode, + FI_TYPE_MODE))); + } + fabric_hints_->domain_attr->name = strdup(fabric_info_->domain_attr->name); + + // Enable/Disable the use of progress threads + auto progress = libfabric_progress_type(); + fabric_hints_->domain_attr->control_progress = progress; + fabric_hints_->domain_attr->data_progress = progress; + LF_DEB(cnb_err, debug(str<>("progress"), libfabric_progress_string())); + + if (threads > 1) + { + LF_DEB(cnb_deb, debug(str<>("Setting Threads>1 level"))); + // fabric_hints_->domain_attr->threading = FI_THREAD_SAFE; + // fabric_hints_->domain_attr->threading = FI_THREAD_FID; + fabric_hints_->domain_attr->threading = threadlevel_flags(); + } + else + { + LF_DEB(cnb_deb, debug(str<>("FI_THREAD_DOMAIN"))); + // we serialize everything + fabric_hints_->domain_attr->threading = FI_THREAD_DOMAIN; + } - // Enable resource management - fabric_hints_->domain_attr->resource_mgmt = FI_RM_ENABLED; + // Enable resource management + fabric_hints_->domain_attr->resource_mgmt = FI_RM_ENABLED; - 
LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric endpoint"), "RDM")); - fabric_hints_->ep_attr->type = FI_EP_RDM; + LF_DEB(cnb_deb, debug(str<>("fabric endpoint"), "RDM")); + fabric_hints_->ep_attr->type = FI_EP_RDM; - uint64_t flags = 0; - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("get fabric info"), "FI_VERSION", - debug::dec(LIBFABRIC_FI_VERSION_MAJOR), debug::dec(LIBFABRIC_FI_VERSION_MINOR))); + LF_DEB(cnb_deb, + debug(str<>("get fabric info"), "FI_VERSION", dec(LIBFABRIC_FI_VERSION_MAJOR), + dec(LIBFABRIC_FI_VERSION_MINOR))); - int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR), - nullptr, nullptr, flags, fabric_hints_, &fabric_info_); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fabric info"); + ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR), + nullptr, nullptr, flags, fabric_hints_, &fabric_info_); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fabric info"); - if (rootnode) - { - LF_DEB(NS_DEBUG::cnb_err, - trace(debug::str<>("Fabric info"), "\n", fi_tostr(fabric_info_, FI_TYPE_INFO))); - } + if (rootnode) + { + std::array buf; + LF_DEB(cnb_err, + trace(str<>("Fabric info"), "\n", + fi_tostr_r(buf.data(), buf.size(), fabric_info_, FI_TYPE_INFO))); + } - bool context = (fabric_hints_->mode & FI_CONTEXT) != 0; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_CONTEXT"), context)); + int mrkey = (fabric_info_->domain_attr->mr_mode & FI_MR_PROV_KEY) != 0; + LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_PROV_KEY"), mrkey)); - mrlocal = (fabric_hints_->domain_attr->mr_mode & FI_MR_LOCAL) != 0; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_LOCAL"), mrlocal)); + bool context = (fabric_info_->mode & FI_CONTEXT) != 0; + LF_DEB(cnb_deb, debug(str<>("Requires FI_CONTEXT"), context)); - mrbind = (fabric_hints_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ENDPOINT"), 
mrbind)); + mrlocal = (fabric_info_->domain_attr->mr_mode & FI_MR_LOCAL) != 0; + LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_LOCAL"), mrlocal)); - /* Check if provider requires heterogeneous memory registration */ - mrhmem = (fabric_hints_->domain_attr->mr_mode & FI_MR_HMEM) != 0; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_HMEM"), mrhmem)); + mrbind = (fabric_info_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0; + LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_ENDPOINT"), mrbind)); - bool mrhalloc = (fabric_hints_->domain_attr->mr_mode & FI_MR_ALLOCATED) != 0; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ALLOCATED"), mrhalloc)); + /* Check if provider requires heterogeneous memory registration */ + mrhmem = (fabric_info_->domain_attr->mr_mode & FI_MR_HMEM) != 0; + LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_HMEM"), mrhmem)); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating fi_fabric"))); - ret = fi_fabric(fabric_info_->fabric_attr, &fabric_, nullptr); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fi_fabric"); + bool mrhalloc = (fabric_info_->domain_attr->mr_mode & FI_MR_ALLOCATED) != 0; + LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_ALLOCATED"), mrhalloc)); +#if (FI_MAJOR_VERSION > 1) || ((FI_MAJOR_VERSION == 1) && FI_MINOR_VERSION >= 20) + int auth_key = (fabric_info_->domain_attr->max_ep_auth_key); + LF_DEB(cnb_deb, debug(str<>("Supported max_ep_auth_key"), auth_key)); + fabric_info_->domain_attr->max_ep_auth_key = 0; +#endif + LF_DEB(cnb_deb, debug(str<>("Creating fi_fabric"))); + ret = fi_fabric(fabric_info_->fabric_attr, &fabric_, nullptr); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fi_fabric"); - // Allocate a domain. - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Allocating domain"))); - ret = fi_domain(fabric_, fabric_info_, &fabric_domain_, nullptr); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_domain"); + // Allocate a domain. 
+ LF_DEB(cnb_deb, debug(str<>("Allocating domain"))); + ret = fi_domain(fabric_, fabric_info_, &fabric_domain_, nullptr); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_domain"); #if defined(HAVE_LIBFABRIC_GNI) - { - [[maybe_unused]] auto scp = - NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "GNI memory registration block"); - - LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI String values")); - // Dump out all vars for debug purposes - for (auto& gni_data : gni_strs) { - _set_check_domain_op_value(gni_data.first, 0, gni_data.second.c_str(), - false); + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), "GNI memory registration block"); + + LF_DEB(cnb_err, debug(str<>("-------"), "GNI String values")); + // Dump out all vars for debug purposes + for (auto& gni_data : gni_strs) + { + _set_check_domain_op_value( + gni_data.first, 0, gni_data.second.c_str(), false); + } + LF_DEB(cnb_err, debug(str<>("-------"), "GNI Int values")); + for (auto& gni_data : gni_ints) + { + _set_check_domain_op_value( + gni_data.first, 0, gni_data.second.c_str(), false); + } + LF_DEB(cnb_err, debug(str<>("-------"))); + + // -------------------------- + // GNI_MR_CACHE + // set GNI mem reg to be either none, internal or udreg + // + _set_check_domain_op_value( + GNI_MR_CACHE, const_cast(OOMPH_GNI_REG), "GNI_MR_CACHE"); + + // -------------------------- + // GNI_MR_UDREG_REG_LIMIT + // Experiments showed default value of 2048 too high if + // launching multiple clients on one node + // + int32_t udreg_limit = 0x0800; // 0x0400 = 1024, 0x0800 = 2048 + _set_check_domain_op_value( + GNI_MR_UDREG_REG_LIMIT, udreg_limit, "GNI_MR_UDREG_REG_LIMIT"); + + // -------------------------- + // GNI_MR_CACHE_LAZY_DEREG + // Enable lazy deregistration in MR cache + // + int32_t enable = 1; + LF_DEB(cnb_deb, debug(str<>("setting GNI_MR_CACHE_LAZY_DEREG"))); + _set_check_domain_op_value( + GNI_MR_CACHE_LAZY_DEREG, enable, "GNI_MR_CACHE_LAZY_DEREG"); + + // 
-------------------------- + // GNI_MSG_RENDEZVOUS_THRESHOLD (c.f. GNI_RMA_RDMA_THRESHOLD) + // + int32_t thresh = msg_rendezvous_threshold_; + _set_check_domain_op_value( + GNI_MSG_RENDEZVOUS_THRESHOLD, thresh, "GNI_MSG_RENDEZVOUS_THRESHOLD"); } - LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI Int values")); - for (auto& gni_data : gni_ints) +#endif + tx_inject_size_ = fabric_info_->tx_attr->inject_size; + + // the number of preposted receives, and sender queue depth + // is set by querying the tx/tx attr sizes + tx_attr_size_ = std::min(size_t(512), fabric_info_->tx_attr->size / 2); + rx_attr_size_ = std::min(size_t(512), fabric_info_->rx_attr->size / 2); + // Print fabric info to a human-readable string if available + if (display_fabric_info_ && fabric_info_) { - _set_check_domain_op_value(gni_data.first, 0, gni_data.second.c_str(), - false); + std::array buf; + std::cout << "Libfabric fabric info:\n" + << fi_tostr_r(buf.data(), buf.size(), fabric_info_, FI_TYPE_INFO) + << std::endl; } - LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"))); - - // -------------------------- - // GNI_MR_CACHE - // set GNI mem reg to be either none, internal or udreg - // - _set_check_domain_op_value(GNI_MR_CACHE, const_cast(OOMPH_GNI_REG), - "GNI_MR_CACHE"); - - // -------------------------- - // GNI_MR_UDREG_REG_LIMIT - // Experiments showed default value of 2048 too high if - // launching multiple clients on one node - // - int32_t udreg_limit = 0x0800; // 0x0400 = 1024, 0x0800 = 2048 - _set_check_domain_op_value(GNI_MR_UDREG_REG_LIMIT, udreg_limit, - "GNI_MR_UDREG_REG_LIMIT"); - - // -------------------------- - // GNI_MR_CACHE_LAZY_DEREG - // Enable lazy deregistration in MR cache - // - int32_t enable = 1; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting GNI_MR_CACHE_LAZY_DEREG"))); - _set_check_domain_op_value(GNI_MR_CACHE_LAZY_DEREG, enable, - "GNI_MR_CACHE_LAZY_DEREG"); - - // -------------------------- - // GNI_MSG_RENDEZVOUS_THRESHOLD (c.f. 
GNI_RMA_RDMA_THRESHOLD) - // - int32_t thresh = msg_rendezvous_threshold_; - _set_check_domain_op_value(GNI_MSG_RENDEZVOUS_THRESHOLD, thresh, - "GNI_MSG_RENDEZVOUS_THRESHOLD"); + fi_freeinfo(fabric_hints_); } -#endif - tx_inject_size_ = fabric_info_->tx_attr->inject_size; - - // the number of preposted receives, and sender queue depth - // is set by querying the tx/tx attr sizes - tx_attr_size_ = std::min(size_t(512), fabric_info_->tx_attr->size / 2); - rx_attr_size_ = std::min(size_t(512), fabric_info_->rx_attr->size / 2); - fi_freeinfo(fabric_hints_); - } - // -------------------------------------------------------------------- - struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx) - { - return static_cast(this)->set_src_dst_addresses(info, tx); - } + // -------------------------------------------------------------------- + struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx) + { + return static_cast(this)->set_src_dst_addresses(info, tx); + } #ifdef HAVE_LIBFABRIC_GNI - // -------------------------------------------------------------------- - // Special GNI extensions to disable memory registration cache + // -------------------------------------------------------------------- + // Special GNI extensions to disable memory registration cache - // if set is false, the old value is returned and nothing is set - template - int _set_check_domain_op_value(int op, T value, const char* info, bool set = true) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - static struct fi_gni_ops_domain* gni_domain_ops = nullptr; - int ret = 0; - - if (gni_domain_ops == nullptr) + // if set is false, the old value is returned and nothing is set + template + int _set_check_domain_op_value(int op, T value, char const* info, bool set = true) { - ret = fi_open_ops(&fabric_domain_->fid, FI_GNI_DOMAIN_OPS_1, 0, (void**)&gni_domain_ops, - nullptr); - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("gni open ops"), (ret 
== 0 ? "OK" : "FAIL"), - NS_DEBUG::ptr(gni_domain_ops))); - } + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + static struct fi_gni_ops_domain* gni_domain_ops = nullptr; + int ret = 0; - // if open was ok and set flag is present, then set value - if (ret == 0 && set) - { - ret = gni_domain_ops->set_val(&fabric_domain_->fid, (dom_ops_val_t)(op), - reinterpret_cast(&value)); + if (gni_domain_ops == nullptr) + { + ret = fi_open_ops(&fabric_domain_->fid, FI_GNI_DOMAIN_OPS_1, 0, + (void**) &gni_domain_ops, nullptr); + LF_DEB(cnb_deb, + debug(str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"), + NS_DEBUG::ptr(gni_domain_ops))); + } - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("gni set ops val"), value, (ret == 0 ? "OK" : "FAIL"))); - } + // if open was ok and set flag is present, then set value + if (ret == 0 && set) + { + ret = gni_domain_ops->set_val( + &fabric_domain_->fid, (dom_ops_val_t) (op), reinterpret_cast(&value)); - // Get the value (so we can check that the value we set is now returned) - T new_value; - ret = gni_domain_ops->get_val(&fabric_domain_->fid, (dom_ops_val_t)(op), &new_value); - if constexpr (std::is_integral::value) - { - LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), - info, debug::hex<8>(new_value))); - } - else - { - LF_DEB(NS_DEBUG::cnb_err, - debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, new_value)); - } - // - if (ret) throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info); + LF_DEB(cnb_deb, debug(str<>("gni set ops val"), value, (ret == 0 ? "OK" : "FAIL"))); + } - return ret; - } + // Get the value (so we can check that the value we set is now returned) + T new_value; + ret = gni_domain_ops->get_val(&fabric_domain_->fid, (dom_ops_val_t) (op), &new_value); + if constexpr (std::is_integral::value) + { + LF_DEB(cnb_err, + debug( + str<>("gni op val"), (ret == 0 ? 
"OK" : "FAIL"), info, hex<8>(new_value))); + } + else + { + LF_DEB(cnb_err, + debug(str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, new_value)); + } + // + if (ret) throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info); + + return ret; + } #endif - // -------------------------------------------------------------------- - struct fid_ep* new_endpoint_active(struct fid_domain* domain, struct fi_info* info, bool tx) - { - // don't allow multiple threads to call endpoint create at the same time - scoped_lock lock(controller_mutex_); + // -------------------------------------------------------------------- + struct fid_ep* new_endpoint_active(struct fid_domain* domain, struct fi_info* info, bool tx) + { + // don't allow multiple threads to call endpoint create at the same time + scoped_lock lock(controller_mutex_); - // make sure src_addr/dst_addr are set accordingly - // and we do not create two endpoint with the same src address - struct fi_info* hints = set_src_dst_addresses(info, tx); + // make sure src_addr/dst_addr are set accordingly + // and we do not create two endpoint with the same src address + struct fi_info* hints = set_src_dst_addresses(info, tx); - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("Got info mode"), (info->mode & FI_NOTIFY_FLAGS_ONLY))); + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + LF_DEB(cnb_deb, debug(str<>("Got info mode"), (info->mode & FI_NOTIFY_FLAGS_ONLY))); - struct fid_ep* ep; - int ret = fi_endpoint(domain, hints, &ep, nullptr); - if (ret) - { - throw NS_LIBFABRIC::fabric_error(ret, "fi_endpoint (too many threadlocal " - "endpoints?)"); + struct fid_ep* ep; + int ret = fi_endpoint(domain, hints, &ep, nullptr); + if (ret) + { + throw NS_LIBFABRIC::fabric_error( + ret, "fi_endpoint (too many threadlocal endpoints?)"); + } + fi_freeinfo(hints); + LF_DEB(cnb_deb, 
debug(str<>("new_endpoint_active"), hptr(ep))); + return ep; } - fi_freeinfo(hints); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_active"), NS_DEBUG::ptr(ep))); - return ep; - } - // -------------------------------------------------------------------- - struct fid_ep* new_endpoint_scalable(struct fid_domain* domain, struct fi_info* info, bool tx, - size_t threads, size_t& threads_allocated) - { - // don't allow multiple threads to call endpoint create at the same time - scoped_lock lock(controller_mutex_); + // -------------------------------------------------------------------- + struct fid_ep* new_endpoint_scalable(struct fid_domain* domain, struct fi_info* info, + bool tx, size_t threads, size_t& threads_allocated) + { + // don't allow multiple threads to call endpoint create at the same time + scoped_lock lock(controller_mutex_); - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo"))); - struct fi_info* hints = fi_dupinfo(info); - if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo"); + LF_DEB(cnb_deb, debug(str<>("fi_dupinfo"))); + struct fi_info* hints = fi_dupinfo(info); + if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo"); - int flags = 0; - struct fi_info* new_hints = nullptr; - int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR), - nullptr, nullptr, flags, hints, &new_hints); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_getinfo"); + int flags = 0; + struct fi_info* new_hints = nullptr; + int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR), + nullptr, nullptr, flags, hints, &new_hints); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_getinfo"); - // Check the optimal number of TX/RX contexts supported by the provider - size_t context_count = 0; - if (tx) { 
context_count = std::min(new_hints->domain_attr->tx_ctx_cnt, threads); } - else { context_count = std::min(new_hints->domain_attr->rx_ctx_cnt, threads); } + // Check the optimal number of TX/RX contexts supported by the provider + size_t context_count = 0; + if (tx) { context_count = std::min(new_hints->domain_attr->tx_ctx_cnt, threads); } + else { context_count = std::min(new_hints->domain_attr->rx_ctx_cnt, threads); } - // clang-format off - LF_DEB(NS_DEBUG::cnb_deb, - trace(debug::str<>("scalable endpoint"), + // clang-format off + LF_DEB(cnb_deb, + trace(str<>("scalable endpoint"), "Tx", tx, - "Threads", debug::dec<3>(threads), - "tx_ctx_cnt", debug::dec<3>(new_hints->domain_attr->tx_ctx_cnt), - "rx_ctx_cnt", debug::dec<3>(new_hints->domain_attr->rx_ctx_cnt), - "context_count", debug::dec<3>(context_count))); - // clang-format on - - threads_allocated = context_count; - new_hints->ep_attr->tx_ctx_cnt = context_count; - new_hints->ep_attr->rx_ctx_cnt = context_count; - - struct fid_ep* ep; - ret = fi_scalable_ep(domain, new_hints, &ep, nullptr); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_scalable_ep"); - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_scalable"), NS_DEBUG::ptr(ep))); - fi_freeinfo(hints); - return ep; - } - - // -------------------------------------------------------------------- - endpoint_wrapper& get_rx_endpoint() - { - static auto rx = NS_DEBUG::cnb_deb.make_timer(1, debug::str<>("get_rx_endpoint")); - LF_DEB(NS_DEBUG::cnb_deb, timed(rx)); + "Threads", dec<3>(threads), + "tx_ctx_cnt", dec<3>(new_hints->domain_attr->tx_ctx_cnt), + "rx_ctx_cnt", dec<3>(new_hints->domain_attr->rx_ctx_cnt), + "context_count", dec<3>(context_count))); + // clang-format on + + threads_allocated = context_count; + new_hints->ep_attr->tx_ctx_cnt = context_count; + new_hints->ep_attr->rx_ctx_cnt = context_count; + + struct fid_ep* ep; + ret = fi_scalable_ep(domain, new_hints, &ep, nullptr); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, 
"fi_scalable_ep"); + LF_DEB(cnb_deb, debug(str<>("new_endpoint_scalable"), hptr(ep))); + fi_freeinfo(hints); + return ep; + } - if (endpoint_type_ == endpoint_type::scalableTxRx) + // -------------------------------------------------------------------- + endpoint_wrapper& get_rx_endpoint() { - if (eps_->tl_srx_.get_ep() == nullptr) + static auto rx = NS_DEBUG::cnb_deb.make_timer(1, NS_DEBUG::str<>("get_rx_endpoint")); + LF_DEB(cnb_deb, timed(rx)); + + if (endpoint_type_ == endpoint_type::scalableTxRx) { - endpoint_wrapper ep; - bool ok = rx_endpoints_.pop(ep); - if (!ok) + if (eps_->tl_srx_.get_ep() == nullptr) { - // clang-format off - LF_DEB(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop rx", - "ep", NS_DEBUG::ptr(ep.get_ep()), - "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()), - "rx cq", NS_DEBUG::ptr(ep.get_rx_cq()))); - // clang-format on - throw std::runtime_error("rx endpoint wrapper pop fail"); + endpoint_wrapper ep; + bool ok = rx_endpoints_.pop(ep); + if (!ok) + { + // clang-format off + LF_DEB(cnb_deb, error(str<>("Scalable Ep"), "pop rx", + "ep", hptr(ep.get_ep()), + "tx cq", hptr(ep.get_tx_cq()), + "rx cq", hptr(ep.get_rx_cq()))); + // clang-format on + throw std::runtime_error("rx endpoint wrapper pop fail"); + } + eps_->tl_srx_ = stack_endpoint( + ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), ep.get_name(), &rx_endpoints_); + LF_DEB(cnb_deb, + trace(str<>("Scalable Ep"), "pop rx", "ep", hptr(eps_->tl_srx_.get_ep()), + "tx cq", hptr(eps_->tl_srx_.get_tx_cq()), "rx cq", + hptr(eps_->tl_srx_.get_rx_cq()))); } - eps_->tl_srx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), - ep.get_name(), &rx_endpoints_); - LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop rx", "ep", - NS_DEBUG::ptr(eps_->tl_srx_.get_ep()), "tx cq", - NS_DEBUG::ptr(eps_->tl_srx_.get_tx_cq()), "rx cq", - NS_DEBUG::ptr(eps_->tl_srx_.get_rx_cq()))); + return eps_->tl_srx_.endpoint_; } - return eps_->tl_srx_.endpoint_; + // otherwise just return the normal Rx 
endpoint + return eps_->ep_rx_; } - // otherwise just return the normal Rx endpoint - return eps_->ep_rx_; - } - // -------------------------------------------------------------------- - endpoint_wrapper& get_tx_endpoint() - { - if (endpoint_type_ == endpoint_type::threadlocalTx) + // -------------------------------------------------------------------- + endpoint_wrapper& get_tx_endpoint() { - if (eps_->tl_tx_.get_ep() == nullptr) + if (endpoint_type_ == endpoint_type::threadlocalTx) { - [[maybe_unused]] auto scp = - NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, "threadlocal"); - - // create a completion queue for tx endpoint - fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); - auto tx_cq = create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, - "tx threadlocal"); - - // setup an endpoint for sending messages - // note that the CQ needs FI_RECV even though its a Tx cq to keep - // some providers happy as they trigger an error if an endpoint - // has no Rx cq attached (progress bug) - auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true); - bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx threadlocal"); - bind_address_vector_to_endpoint(ep_tx, av_); - enable_endpoint(ep_tx, "tx threadlocal"); - - // set threadlocal endpoint wrapper - LF_DEB(NS_DEBUG::cnb_deb, - trace(debug::str<>("Threadlocal Ep"), "create Tx", "ep", NS_DEBUG::ptr(ep_tx), - "tx cq", NS_DEBUG::ptr(tx_cq), "rx cq", NS_DEBUG::ptr(nullptr))); - // for cleaning up at termination - endpoint_wrapper ep(ep_tx, nullptr, tx_cq, "tx threadlocal"); - tx_endpoints_.push(ep); - eps_->tl_tx_ = stack_endpoint(ep_tx, nullptr, tx_cq, "threadlocal", nullptr); + if (eps_->tl_tx_.get_ep() == nullptr) + { + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, "threadlocal"); + + // create a completion queue for tx endpoint + fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); + auto tx_cq = 
create_completion_queue( + fabric_domain_, fabric_info_->tx_attr->size, "tx threadlocal"); + + // setup an endpoint for sending messages + // note that the CQ needs FI_RECV even though its a Tx cq to keep + // some providers happy as they trigger an error if an endpoint + // has no Rx cq attached (progress bug) + auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true); + bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx threadlocal"); + bind_address_vector_to_endpoint(ep_tx, av_); + enable_endpoint(ep_tx, "tx threadlocal"); + + // set threadlocal endpoint wrapper + LF_DEB(cnb_deb, + trace(str<>("Threadlocal Ep"), "create Tx", "ep", hptr(ep_tx), "tx cq", + hptr(tx_cq), "rx cq", hptr(nullptr))); + // for cleaning up at termination + endpoint_wrapper ep(ep_tx, nullptr, tx_cq, "tx threadlocal"); + tx_endpoints_.push(ep); + eps_->tl_tx_ = stack_endpoint(ep_tx, nullptr, tx_cq, "threadlocal", nullptr); + } + return eps_->tl_tx_.endpoint_; } - return eps_->tl_tx_.endpoint_; - } - else if (endpoint_type_ == endpoint_type::scalableTx || - endpoint_type_ == endpoint_type::scalableTxRx) - { - if (eps_->tl_stx_.get_ep() == nullptr) + else if (endpoint_type_ == endpoint_type::scalableTx || + endpoint_type_ == endpoint_type::scalableTxRx) { - endpoint_wrapper ep; - bool ok = tx_endpoints_.pop(ep); - if (!ok) + if (eps_->tl_stx_.get_ep() == nullptr) { - LF_DEB(NS_DEBUG::cnb_deb, - error(debug::str<>("Scalable Ep"), "pop tx", "ep", - NS_DEBUG::ptr(ep.get_ep()), "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()), - "rx cq", NS_DEBUG::ptr(ep.get_rx_cq()))); - throw std::runtime_error("tx endpoint wrapper pop fail"); + endpoint_wrapper ep; + bool ok = tx_endpoints_.pop(ep); + if (!ok) + { + LF_DEB(cnb_deb, + error(str<>("Scalable Ep"), "pop tx", "ep", hptr(ep.get_ep()), "tx cq", + hptr(ep.get_tx_cq()), "rx cq", hptr(ep.get_rx_cq()))); + throw std::runtime_error("tx endpoint wrapper pop fail"); + } + eps_->tl_stx_ = stack_endpoint( + ep.get_ep(), ep.get_rx_cq(), 
ep.get_tx_cq(), ep.get_name(), &tx_endpoints_); + LF_DEB(cnb_deb, + trace(str<>("Scalable Ep"), "pop tx", "ep", hptr(eps_->tl_stx_.get_ep()), + "tx cq", hptr(eps_->tl_stx_.get_tx_cq()), "rx cq", + hptr(eps_->tl_stx_.get_rx_cq()))); } - eps_->tl_stx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), - ep.get_name(), &tx_endpoints_); - LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop tx", "ep", - NS_DEBUG::ptr(eps_->tl_stx_.get_ep()), "tx cq", - NS_DEBUG::ptr(eps_->tl_stx_.get_tx_cq()), "rx cq", - NS_DEBUG::ptr(eps_->tl_stx_.get_rx_cq()))); + return eps_->tl_stx_.endpoint_; } - return eps_->tl_stx_.endpoint_; + else if (endpoint_type_ == endpoint_type::multiple) { return eps_->ep_tx_; } + // single : shared tx/rx endpoint + return eps_->ep_rx_; } - else if (endpoint_type_ == endpoint_type::multiple) { return eps_->ep_tx_; } - // single : shared tx/rx endpoint - return eps_->ep_rx_; - } - - // -------------------------------------------------------------------- - void bind_address_vector_to_endpoint(struct fid_ep* endpoint, struct fid_av* av) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding AV"), "to", NS_DEBUG::ptr(endpoint))); - int ret = fi_ep_bind(endpoint, &av->fid, 0); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind address_vector"); - } - // -------------------------------------------------------------------- - void bind_queue_to_endpoint(struct fid_ep* endpoint, struct fid_cq*& cq, uint32_t cqtype, - const char* type) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type); - - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("Binding CQ"), "to", NS_DEBUG::ptr(endpoint), type)); - int ret = fi_ep_bind(endpoint, &cq->fid, cqtype); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind cq"); - } + // -------------------------------------------------------------------- + void 
bind_address_vector_to_endpoint(struct fid_ep* endpoint, struct fid_av* av) + { + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); - // -------------------------------------------------------------------- - fid_cq* bind_tx_queue_to_rx_endpoint(struct fi_info* info, struct fid_ep* ep) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - info->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); - fid_cq* tx_cq = create_completion_queue(fabric_domain_, info->tx_attr->size, "tx->rx"); - // shared send/recv endpoint - bind send cq to the recv endpoint - bind_queue_to_endpoint(ep, tx_cq, FI_TRANSMIT, "tx->rx bug fix"); - return tx_cq; - } + LF_DEB(cnb_deb, debug(str<>("Binding AV"), "to", hptr(endpoint))); + int ret = fi_ep_bind(endpoint, &av->fid, 0); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind address_vector"); + } - // -------------------------------------------------------------------- - void enable_endpoint(struct fid_ep* endpoint, const char* type) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type); + // -------------------------------------------------------------------- + void bind_queue_to_endpoint( + struct fid_ep* endpoint, struct fid_cq*& cq, uint32_t cqtype, char const* type) + { + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, type); - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("Enabling endpoint"), NS_DEBUG::ptr(endpoint))); - int ret = fi_enable(endpoint); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_enable"); - } + LF_DEB(cnb_deb, debug(str<>("Binding CQ"), "to", hptr(endpoint), type)); + int ret = fi_ep_bind(endpoint, &cq->fid, cqtype); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind cq"); + } - // -------------------------------------------------------------------- - locality get_endpoint_address(struct fid* id) - { - [[maybe_unused]] auto scp = 
NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); + // -------------------------------------------------------------------- + fid_cq* bind_tx_queue_to_rx_endpoint(struct fi_info* info, struct fid_ep* ep) + { + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + info->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); + fid_cq* tx_cq = create_completion_queue(fabric_domain_, info->tx_attr->size, "tx->rx"); + // shared send/recv endpoint - bind send cq to the recv endpoint + bind_queue_to_endpoint(ep, tx_cq, FI_TRANSMIT, "tx->rx bug fix"); + return tx_cq; + } - locality::locality_data local_addr; - std::size_t addrlen = locality_defs::array_size; - int ret = fi_getname(id, local_addr.data(), &addrlen); - if (ret || (addrlen > locality_defs::array_size)) + // -------------------------------------------------------------------- + void enable_endpoint(struct fid_ep* endpoint, char const* type) { - std::string err = - std::to_string(addrlen) + "=" + std::to_string(locality_defs::array_size); - NS_LIBFABRIC::fabric_error(ret, "fi_getname - size error or other problem " + err); + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, type); + + LF_DEB(cnb_deb, debug(str<>("Enabling endpoint"), hptr(endpoint))); + int ret = fi_enable(endpoint); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_enable"); } - // optimized out when debug logging is false - if constexpr (NS_DEBUG::cnb_deb.is_enabled()) + // -------------------------------------------------------------------- + locality get_endpoint_address(struct fid* id) { - std::stringstream temp1; - for (std::size_t i = 0; i < locality_defs::array_length; ++i) + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + + locality::locality_data local_addr; + std::size_t addrlen = locality_defs::array_size; + int ret = fi_getname(id, local_addr.data(), &addrlen); + if (ret || (addrlen > locality_defs::array_size)) { 
- temp1 << debug::ipaddr(&local_addr[i]) << " - "; + std::string err = + std::to_string(addrlen) + "=" + std::to_string(locality_defs::array_size); + NS_LIBFABRIC::fabric_error(ret, "fi_getname - error (address size ?) " + err); } - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), "size", - debug::dec<>(addrlen), " : ", temp1.str().c_str())); - std::stringstream temp2; - for (std::size_t i = 0; i < locality_defs::array_length; ++i) + // optimized out when debug logging is false + if constexpr (NS_DEBUG::cnb_deb.is_enabled()) { - temp2 << debug::hex<8>(local_addr[i]) << " - "; + LF_DEB(cnb_deb, + debug(str<>("raw address data"), "size", dec<4>(addrlen), " : ", + locality(local_addr, av_).to_str())); + + std::stringstream temp2; + for (std::size_t i = 0; i < locality_defs::array_length; ++i) + { + temp2 << NS_DEBUG::hex<8>(local_addr[i]) << " - "; + } + LF_DEB(cnb_deb, debug(str<>("raw address data"), temp2.str().c_str())); } - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), temp2.str().c_str())); + return locality(local_addr, av_); } - return locality(local_addr); - } - // -------------------------------------------------------------------- - fid_pep* create_passive_endpoint(struct fid_fabric* fabric, struct fi_info* info) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); + // -------------------------------------------------------------------- + fid_pep* create_passive_endpoint(struct fid_fabric* fabric, struct fi_info* info) + { + [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); - struct fid_pep* ep; - int ret = fi_passive_ep(fabric, info, &ep, nullptr); - if (ret) { throw NS_LIBFABRIC::fabric_error(ret, "Failed to create fi_passive_ep"); } - return ep; - } + struct fid_pep* ep; + int ret = fi_passive_ep(fabric, info, &ep, nullptr); + if (ret) { throw NS_LIBFABRIC::fabric_error(ret, "Failed to create fi_passive_ep"); } + return ep; + } - // 
-------------------------------------------------------------------- - inline const locality& here() const { return here_; } + // -------------------------------------------------------------------- + inline locality const& here() const { return here_; } - // -------------------------------------------------------------------- - inline const fi_addr_t& fi_address() const { return here_.fi_address(); } + // -------------------------------------------------------------------- + inline fi_addr_t const& fi_address() const { return here_.fi_address(); } - // -------------------------------------------------------------------- - inline void setHere(const locality& val) { here_ = val; } + // -------------------------------------------------------------------- + inline void setHere(locality const& val) { here_ = val; } - // -------------------------------------------------------------------- - inline const locality& root() const { return root_; } + // -------------------------------------------------------------------- + inline locality const& root() const { return root_; } - // -------------------------------------------------------------------- - inline struct fid_domain* get_domain() const { return fabric_domain_; } + // -------------------------------------------------------------------- + inline struct fid_domain* get_domain() const { return fabric_domain_; } - // -------------------------------------------------------------------- - inline std::size_t get_rma_protocol_size() { return 65536; } + // -------------------------------------------------------------------- + inline std::size_t get_rma_protocol_size() { return 65536; } #ifdef DISABLE_FI_INJECT - // -------------------------------------------------------------------- - inline std::size_t get_tx_inject_size() { return 0; } + // -------------------------------------------------------------------- + inline std::size_t get_tx_inject_size() { return 0; } #else - // 
-------------------------------------------------------------------- - inline std::size_t get_tx_inject_size() { return tx_inject_size_; } + // -------------------------------------------------------------------- + inline std::size_t get_tx_inject_size() { return tx_inject_size_; } #endif - // -------------------------------------------------------------------- - inline std::size_t get_tx_size() { return tx_attr_size_; } + // -------------------------------------------------------------------- + inline std::size_t get_tx_size() { return tx_attr_size_; } - // -------------------------------------------------------------------- - inline std::size_t get_rx_size() { return rx_attr_size_; } + // -------------------------------------------------------------------- + inline std::size_t get_rx_size() { return rx_attr_size_; } - // -------------------------------------------------------------------- - // returns true when all connections have been disconnected and none are active - inline bool isTerminated() - { - return false; - //return (qp_endpoint_map_.size() == 0); - } + // -------------------------------------------------------------------- + // returns true when all connections have been disconnected and none are active + inline bool isTerminated() + { + return false; + // return (qp_endpoint_map_.size() == 0); + } - // -------------------------------------------------------------------- - void debug_print_av_vector(std::size_t N) - { - locality addr; - std::size_t addrlen = locality_defs::array_size; - for (std::size_t i = 0; i < N; ++i) + // -------------------------------------------------------------------- + void debug_print_av_vector(std::size_t N) { - int ret = fi_av_lookup(av_, fi_addr_t(i), addr.fabric_data_writable(), &addrlen); - addr.set_fi_address(fi_addr_t(i)); - if ((ret == 0) && (addrlen == locality_defs::array_size)) - { - LF_DEB(NS_DEBUG::cnb_deb, - debug(debug::str<>("address vector"), debug::dec<3>(i), iplocality(addr))); - } - else + locality 
addr; + std::size_t addrlen = locality_defs::array_size; + for (std::size_t i = 0; i < N; ++i) { - LF_DEB(NS_DEBUG::cnb_err, - error(debug::str<>("address length"), debug::dec<3>(addrlen), - debug::dec<3>(locality_defs::array_size))); - throw std::runtime_error("debug_print_av_vector : address vector " - "traversal failure"); + int ret = fi_av_lookup(av_, fi_addr_t(i), addr.fabric_data_writable(), &addrlen); + addr.set_fi_address(fi_addr_t(i)); + if ((ret == 0) && (addrlen <= locality_defs::array_size)) + { + LF_DEB(cnb_deb, debug(str<>("address vector"), dec<3>(i), addr.to_str())); + } + else + { + LF_DEB(cnb_err, + error(str<>("address length"), dec<3>(addrlen), + dec<3>(locality_defs::array_size))); + throw std::runtime_error("debug_print_av_vector : address vector " + "traversal failure"); + } } } - } - // -------------------------------------------------------------------- - inline constexpr bool bypass_tx_lock() - { + // -------------------------------------------------------------------- + inline constexpr bool bypass_tx_lock() + { #if defined(HAVE_LIBFABRIC_GNI) - return true; -#elif defined(HAVE_LIBFABRIC_CXI) - // @todo : cxi provider is not yet thread safe using scalable endpoints - return false; + return true; +#elif defined(HAVE_LIBFABRIC_LNX) + // @todo : provider is not yet thread safe using scalable endpoints + return false; #else - return (threadlevel_flags() == FI_THREAD_SAFE || + return (threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::threadlocalTx); #endif - } + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock get_tx_lock() - { - if (bypass_tx_lock()) return unique_lock(); - return unique_lock(send_mutex_); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock get_tx_lock() + { + if (bypass_tx_lock()) return unique_lock(); + return unique_lock(send_mutex_); + } - // 
-------------------------------------------------------------------- - inline controller_base::unique_lock try_tx_lock() - { - if (bypass_tx_lock()) return unique_lock(); - return unique_lock(send_mutex_, std::try_to_lock_t{}); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock try_tx_lock() + { + if (bypass_tx_lock()) return unique_lock(); + return unique_lock(send_mutex_, std::try_to_lock_t{}); + } - // -------------------------------------------------------------------- - inline constexpr bool bypass_rx_lock() - { + // -------------------------------------------------------------------- + inline constexpr bool bypass_rx_lock() + { #ifdef HAVE_LIBFABRIC_GNI - return true; + return true; #else - return ( - threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::scalableTxRx); + return (threadlevel_flags() == FI_THREAD_SAFE || + endpoint_type_ == endpoint_type::scalableTxRx); #endif - } + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock get_rx_lock() - { - if (bypass_rx_lock()) return unique_lock(); - return unique_lock(recv_mutex_); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock get_rx_lock() + { + if (bypass_rx_lock()) return unique_lock(); + return unique_lock(recv_mutex_); + } - // -------------------------------------------------------------------- - inline controller_base::unique_lock try_rx_lock() - { - if (bypass_rx_lock()) return unique_lock(); - return unique_lock(recv_mutex_, std::try_to_lock_t{}); - } + // -------------------------------------------------------------------- + inline controller_base::unique_lock try_rx_lock() + { + if (bypass_rx_lock()) return unique_lock(); + return unique_lock(recv_mutex_, std::try_to_lock_t{}); + } - // -------------------------------------------------------------------- - progress_status 
poll_for_work_completions(void* user_data) - { - progress_status p{0, 0}; - bool retry = false; - do { - // sends - uint32_t nsend = static_cast(this)->poll_send_queue( - get_tx_endpoint().get_tx_cq(), user_data); - p.m_num_sends += nsend; - retry = (nsend == max_completions_per_poll_); - // recvs - uint32_t nrecv = static_cast(this)->poll_recv_queue( - get_rx_endpoint().get_rx_cq(), user_data); - p.m_num_recvs += nrecv; - retry |= (nrecv == max_completions_per_poll_); - } while (retry); - return p; - } + // -------------------------------------------------------------------- + progress_status poll_for_work_completions(void* user_data) + { + progress_status p{0, 0}; + bool retry = false; + do { + // sends + uint32_t nsend = static_cast(this)->poll_send_queue( + get_tx_endpoint().get_tx_cq(), user_data); + p.m_num_sends += nsend; + retry = (nsend == max_completions_per_poll_); + // recvs + uint32_t nrecv = static_cast(this)->poll_recv_queue( + get_rx_endpoint().get_rx_cq(), user_data); + p.m_num_recvs += nrecv; + retry |= (nrecv == max_completions_per_poll_); + } while (retry); + return p; + } - // -------------------------------------------------------------------- - inline int poll_send_queue(fid_cq* tx_cq, void* user_data) - { - return static_cast(this)->poll_send_queue(tx_cq, user_data); - } + // -------------------------------------------------------------------- + inline int poll_send_queue(fid_cq* tx_cq, void* user_data) + { + return static_cast(this)->poll_send_queue(tx_cq, user_data); + } - // -------------------------------------------------------------------- - inline int poll_recv_queue(fid_cq* rx_cq, void* user_data) - { - return static_cast(this)->poll_recv_queue(rx_cq, user_data); - } + // -------------------------------------------------------------------- + inline int poll_recv_queue(fid_cq* rx_cq, void* user_data) + { + return static_cast(this)->poll_recv_queue(rx_cq, user_data); + } - // 
-------------------------------------------------------------------- - struct fid_cq* create_completion_queue(struct fid_domain* domain, size_t size, const char* type) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type); - - struct fid_cq* cq; - fi_cq_attr cq_attr = {}; - cq_attr.format = FI_CQ_FORMAT_MSG; - cq_attr.wait_obj = FI_WAIT_NONE; - cq_attr.wait_cond = FI_CQ_COND_NONE; - cq_attr.size = size; - cq_attr.flags = 0 /*FI_COMPLETION*/; - LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("CQ size"), debug::dec<4>(size))); - // open completion queue on fabric domain and set context to null - int ret = fi_cq_open(domain, &cq_attr, &cq, nullptr); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_cq_open"); - return cq; - } + // -------------------------------------------------------------------- + struct fid_cq* create_completion_queue( + struct fid_domain* domain, size_t size, char const* type) + { + [[maybe_unused]] auto scp = + NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, type); + + struct fid_cq* cq; + fi_cq_attr cq_attr = {}; + cq_attr.format = FI_CQ_FORMAT_MSG; + cq_attr.wait_obj = FI_WAIT_NONE; + cq_attr.wait_cond = FI_CQ_COND_NONE; + cq_attr.size = size; + cq_attr.flags = 0 /*FI_COMPLETION*/; + LF_DEB(cnb_deb, trace(str<>("CQ size"), dec<4>(size))); + // open completion queue on fabric domain and set context to null + int ret = fi_cq_open(domain, &cq_attr, &cq, nullptr); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_cq_open"); + return cq; + } - // -------------------------------------------------------------------- - fid_av* create_address_vector(struct fi_info* info, int N, int num_rx_contexts) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); + // -------------------------------------------------------------------- + fid_av* create_address_vector(struct fi_info* info, int N, int num_rx_contexts) + { + [[maybe_unused]] auto scp = 
NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); - fid_av* av; - fi_av_attr av_attr = {fi_av_type(0), 0, 0, 0, nullptr, nullptr, 0}; + fid_av* av; + fi_av_attr av_attr = {fi_av_type(0), 0, 0, 0, nullptr, nullptr, 0}; - // number of addresses expected - av_attr.count = N; + // number of addresses expected + av_attr.count = N; - // number of receive contexts used - int rx_ctx_bits = 0; + // number of receive contexts used + int rx_ctx_bits = 0; #ifdef RX_CONTEXTS_SUPPORT - while (num_rx_contexts >> ++rx_ctx_bits) - ; - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("rx_ctx_bits"), rx_ctx_bits)); + while (num_rx_contexts >> ++rx_ctx_bits); + LF_DEB(cnb_deb, debug(str<>("rx_ctx_bits"), rx_ctx_bits)); #endif - av_attr.rx_ctx_bits = rx_ctx_bits; - // if contexts is nonzero, then we are using a single scalable endpoint - av_attr.ep_per_node = (num_rx_contexts > 0) ? 2 : 0; + av_attr.rx_ctx_bits = rx_ctx_bits; + // if contexts is nonzero, then we are using a single scalable endpoint + av_attr.ep_per_node = (num_rx_contexts > 0) ? 
2 : 0; - if (info->domain_attr->av_type != FI_AV_UNSPEC) - { - av_attr.type = info->domain_attr->av_type; - } - else - { - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("map FI_AV_TABLE"))); - av_attr.type = FI_AV_TABLE; - } + if (info->domain_attr->av_type != FI_AV_UNSPEC) + { + av_attr.type = info->domain_attr->av_type; + } + else + { + LF_DEB(cnb_deb, debug(str<>("map FI_AV_TABLE"))); + av_attr.type = FI_AV_TABLE; + } - LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating AV"))); - int ret = fi_av_open(fabric_domain_, &av_attr, &av, nullptr); - if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_av_open"); - return av; - } + LF_DEB(cnb_deb, debug(str<>("Creating AV"))); + int ret = fi_av_open(fabric_domain_, &av_attr, &av, nullptr); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_av_open"); + return av; + } - // -------------------------------------------------------------------- - locality insert_address(const locality& address) { return insert_address(av_, address); } + // -------------------------------------------------------------------- + locality insert_address(locality const& address) { return insert_address(av_, address); } - // -------------------------------------------------------------------- - locality insert_address(fid_av* av, const locality& address) - { - [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - - LF_DEB(NS_DEBUG::cnb_deb, - trace(debug::str<>("inserting AV"), iplocality(address), NS_DEBUG::ptr(av))); - fi_addr_t fi_addr = 0xffffffff; - int ret = fi_av_insert(av, address.fabric_data(), 1, &fi_addr, 0, nullptr); - if (ret < 0) { throw NS_LIBFABRIC::fabric_error(ret, "fi_av_insert"); } - else if (ret == 0) + // -------------------------------------------------------------------- + locality insert_address(fid_av* av, locality const& address) { - NS_DEBUG::cnb_deb.error("fi_av_insert called with existing address"); - NS_LIBFABRIC::fabric_error(ret, "fi_av_insert did not return 1"); + 
[[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__); + + LF_DEB(cnb_deb, trace(str<>("inserting AV"), address.to_str(), hptr(av))); + fi_addr_t fi_addr = 0xffff'ffff; + int ret = fi_av_insert(av, address.fabric_data().data(), 1, &fi_addr, 0, nullptr); + if (ret < 0) { throw NS_LIBFABRIC::fabric_error(ret, "fi_av_insert"); } + else if (ret == 0) + { + LF_DEB(cnb_deb, error("fi_av_insert called with existing address")); + NS_LIBFABRIC::fabric_error(ret, "fi_av_insert did not return 1"); + } + // address was generated correctly, now update the locality with the fi_addr + locality new_locality(address, fi_addr, av); + LF_DEB(cnb_deb, + trace(str<>("AV add"), "rank", dec<>(fi_addr), new_locality.to_str(), "fi_addr", + hex<4>(fi_addr))); + return new_locality; } - // address was generated correctly, now update the locality with the fi_addr - locality new_locality(address, fi_addr); - LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("AV add"), "rank", debug::dec<>(fi_addr), - iplocality(new_locality), "fi_addr", debug::hex<4>(fi_addr))); - return new_locality; - } -}; + }; -} // namespace NS_LIBFABRIC +} // namespace NS_LIBFABRIC diff --git a/src/libfabric/fabric_error.hpp b/src/libfabric/fabric_error.hpp index 0f2db4c1..84e43dd5 100644 --- a/src/libfabric/fabric_error.hpp +++ b/src/libfabric/fabric_error.hpp @@ -11,42 +11,39 @@ #include #include -#include // #include // #include "oomph_libfabric_defines.hpp" -namespace NS_DEBUG -{ -// cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print err_deb("ERROR__"); -} // namespace NS_DEBUG +namespace NS_DEBUG { + // cppcheck-suppress ConfigurationNotChecked + static NS_DEBUG::enable_print err_deb("ERROR__"); +} // namespace NS_DEBUG -namespace NS_LIBFABRIC -{ +namespace NS_LIBFABRIC { -class fabric_error : public std::runtime_error -{ - public: - // -------------------------------------------------------------------- - fabric_error(int err, const std::string& msg) - : 
std::runtime_error(std::string(fi_strerror(-err)) + msg) - , error_(err) + class fabric_error : public std::runtime_error { - NS_DEBUG::err_deb.error(msg, ":", fi_strerror(-err)); - std::terminate(); - } + public: + // -------------------------------------------------------------------- + fabric_error(int err, std::string const& msg) + : std::runtime_error(std::string(fi_strerror(-err)) + msg) + , error_(err) + { + NS_DEBUG::err_deb.error(msg, ":", fi_strerror(-err)); + std::terminate(); + } - fabric_error(int err) - : std::runtime_error(fi_strerror(-err)) - , error_(-err) - { - NS_DEBUG::err_deb.error(what()); - std::terminate(); - } + fabric_error(int err) + : std::runtime_error(fi_strerror(-err)) + , error_(-err) + { + NS_DEBUG::err_deb.error(what()); + std::terminate(); + } - int error_; -}; + int error_; + }; -} // namespace NS_LIBFABRIC +} // namespace NS_LIBFABRIC diff --git a/src/libfabric/libfabric_defines_template.hpp b/src/libfabric/libfabric_defines_template.hpp index 64c04944..ea2a105b 100644 --- a/src/libfabric/libfabric_defines_template.hpp +++ b/src/libfabric/libfabric_defines_template.hpp @@ -14,26 +14,29 @@ // some namespaces for the lib and for debugging are setup correctly #define NS_LIBFABRIC oomph::libfabric -#define NS_MEMORY oomph::libfabric -#define NS_DEBUG oomph::debug +#define NS_MEMORY oomph::libfabric +#define NS_DEBUG oomph::debug #ifndef LF_DEB -#define LF_DEB(printer, Expr) \ - if constexpr (printer.is_enabled()) { printer.Expr; }; +# define LF_DEB(printer, Expr) \ + { \ + using namespace NS_DEBUG; \ + if constexpr (printer.is_enabled()) { printer.Expr; }; \ + } #endif #define LFSOURCE_DIR "@OOMPH_SRC_LIBFABRIC_DIR@" -#define LFPRINT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/print.hpp" -#define LFCOUNT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/simple_counter.hpp" +#define LFPRINT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/print.hpp" +#define LFCOUNT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/simple_counter.hpp" // oomph has a debug print helper file in the main source tree #if 
__has_include(LFPRINT_HPP) -#include LFPRINT_HPP -#define has_debug 1 +# include LFPRINT_HPP +# define has_debug 1 #endif #if __has_include(LFCOUNT_HPP) -#include LFCOUNT_HPP +# include LFCOUNT_HPP #endif #endif diff --git a/src/libfabric/locality.cpp b/src/libfabric/locality.cpp deleted file mode 100644 index 487912f5..00000000 --- a/src/libfabric/locality.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * ghex-org - * - * Copyright (c) 2014-2023, ETH Zurich - * All rights reserved. - * - * Please, refer to the LICENSE file in the root directory. - * SPDX-License-Identifier: BSD-3-Clause - */ - -#include - -namespace oomph -{ -namespace libfabric -{ - -// ------------------------------------------------------------------ -// format as ip address, port, libfabric address -// ------------------------------------------------------------------ -iplocality::iplocality(const locality& l) -: data(l) -{ -} - -std::ostream& -operator<<(std::ostream& os, const iplocality& p) -{ - os << std::dec << NS_DEBUG::ipaddr(p.data.fabric_data()) << " - " - << NS_DEBUG::ipaddr(p.data.ip_address()) << ":" << NS_DEBUG::dec<>(p.data.port()) << " (" - << NS_DEBUG::dec<>(p.data.fi_address()) << ") "; - return os; -} - -} // namespace libfabric -} // namespace oomph diff --git a/src/libfabric/locality.hpp b/src/libfabric/locality.hpp index 74f6b290..9e91cec1 100644 --- a/src/libfabric/locality.hpp +++ b/src/libfabric/locality.hpp @@ -16,242 +16,191 @@ #include // #include -#include +#include +// #include +#include // #include "oomph_libfabric_defines.hpp" -// Different providers use different address formats that we must accommodate -// in our locality object. +// Different providers use different address formats that we must accommodate in our locality object. 
#ifdef HAVE_LIBFABRIC_GNI -#define HAVE_LIBFABRIC_LOCALITY_SIZE 48 +# define HAVE_LIBFABRIC_LOCALITY_SIZE 48 #endif #ifdef HAVE_LIBFABRIC_CXI -#ifdef HAVE_LIBFABRIC_CXI_1_15 -#define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(int) -#else -#define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(long int) -#endif +# ifdef HAVE_LIBFABRIC_CXI_1_15 +# define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(int) +# else +# define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(long int) +# endif #endif #ifdef HAVE_LIBFABRIC_EFA -#define HAVE_LIBFABRIC_LOCALITY_SIZE 32 +# define HAVE_LIBFABRIC_LOCALITY_SIZE 32 #endif #if defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_TCP) || \ defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_PSM2) -#define HAVE_LIBFABRIC_LOCALITY_SIZE 16 -#define HAVE_LIBFABRIC_LOCALITY_SOCKADDR +# define HAVE_LIBFABRIC_LOCALITY_SIZE 16 #endif -namespace oomph -{ -// cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print loc_deb("LOCALTY"); -} // namespace oomph - -namespace oomph -{ -namespace libfabric -{ - -struct locality; - -// ------------------------------------------------------------------ -// format as ip address, port, libfabric address -// ------------------------------------------------------------------ -struct iplocality -{ - const locality& data; - iplocality(const locality& a); - friend std::ostream& operator<<(std::ostream& os, const iplocality& p); -}; - -// -------------------------------------------------------------------- -// Locality, in this structure we store the information required by -// libfabric to make a connection to another node. -// With libfabric 1.4.x the array contains the fabric ip address stored -// as the second uint32_t in the array. 
For this reason we use an -// array of uint32_t rather than uint8_t/char so we can easily access -// the ip for debug/validation purposes -// -------------------------------------------------------------------- -namespace locality_defs -{ -// the number of 32bit ints stored in our array -const uint32_t array_size = HAVE_LIBFABRIC_LOCALITY_SIZE; -const uint32_t array_length = HAVE_LIBFABRIC_LOCALITY_SIZE / 4; -} // namespace locality_defs - -struct locality -{ - // array type of our locality data - typedef std::array locality_data; - - static const char* type() { return "libfabric"; } - - explicit locality(const locality_data& in_data) - { - std::memcpy(&data_[0], &in_data[0], locality_defs::array_size); - fi_address_ = 0; - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("expl constructing"), iplocality((*this)))); - } - - locality() - { - std::memset(&data_[0], 0x00, locality_defs::array_size); - fi_address_ = 0; - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("default construct"), iplocality((*this)))); - } - - locality(const locality& other) - : data_(other.data_) - , fi_address_(other.fi_address_) - { - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy construct"), iplocality((*this)))); - } - - locality(const locality& other, fi_addr_t addr) - : data_(other.data_) - , fi_address_(addr) - { - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), iplocality((*this)))); - } - - locality(locality&& other) - : data_(std::move(other.data_)) - , fi_address_(other.fi_address_) - { - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("move construct"), iplocality((*this)))); - } - - // provided to support sockets mode bootstrap - explicit locality(const std::string& address, const std::string& portnum) - { - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), address, ":", portnum)); - // - struct sockaddr_in socket_data; - memset(&socket_data, 0, sizeof(socket_data)); - socket_data.sin_family = AF_INET; - socket_data.sin_port = htons(std::stol(portnum)); - inet_pton(AF_INET, 
address.c_str(), &(socket_data.sin_addr)); - // - std::memcpy(&data_[0], &socket_data, locality_defs::array_size); - fi_address_ = 0; - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("string constructing"), iplocality((*this)))); - } - - // some condition marking this locality as valid - explicit inline operator bool() const - { - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("bool operator"), iplocality((*this)))); - return (ip_address() != 0); - } - - inline bool valid() const - { - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("valid operator"), iplocality((*this)))); - return (ip_address() != 0); - } - - locality& operator=(const locality& other) - { - data_ = other.data_; - fi_address_ = other.fi_address_; - LF_DEB(loc_deb, - trace(NS_DEBUG::str<>("copy operator"), iplocality(*this), iplocality(other))); - return *this; - } - - bool operator==(const locality& other) - { - LF_DEB(loc_deb, - trace(NS_DEBUG::str<>("equality operator"), iplocality(*this), iplocality(other))); - return std::memcmp(&data_, &other.data_, locality_defs::array_size) == 0; - } - - bool less_than(const locality& other) - { - LF_DEB(loc_deb, - trace(NS_DEBUG::str<>("less operator"), iplocality(*this), iplocality(other))); - if (ip_address() < other.ip_address()) return true; - if (ip_address() == other.ip_address()) return port() < other.port(); - return false; - } +#if defined(HAVE_LIBFABRIC_SHM) +# define HAVE_LIBFABRIC_LOCALITY_SIZE 24 +#endif - const uint32_t& ip_address() const - { -#if defined(HAVE_LIBFABRIC_LOCALITY_SOCKADDR) - return reinterpret_cast(data_.data())->sin_addr.s_addr; -#elif defined(HAVE_LIBFABRIC_GNI) - return data_[0]; -#elif defined(HAVE_LIBFABRIC_CXI) - return data_[0]; -#elif defined(HAVE_LIBFABRIC_EFA) - return data_[0]; -#else - throw fabric_error(0, "unsupported fabric provider, please fix ASAP"); +#if defined(HAVE_LIBFABRIC_LNX) +# define HAVE_LIBFABRIC_LOCALITY_SIZE 512 #endif - } - static const uint32_t& ip_address(const locality_data& data) - { -#if 
defined(HAVE_LIBFABRIC_LOCALITY_SOCKADDR) - return reinterpret_cast(&data)->sin_addr.s_addr; -#elif defined(HAVE_LIBFABRIC_GNI) - return data[0]; -#elif defined(HAVE_LIBFABRIC_CXI) - return data[0]; -#elif defined(HAVE_LIBFABRIC_EFA) - return data[0]; +namespace oomph { + // cppcheck-suppress ConfigurationNotChecked + static NS_DEBUG::enable_print loc_deb("LOCALTY"); +} // namespace oomph + +namespace oomph { namespace libfabric { + + struct locality; + + // -------------------------------------------------------------------- + // Locality, in this structure we store the information required by + // libfabric to make a connection to another node. + // With libfabric 1.4.x the array contains the fabric ip address stored + // as the second uint32_t in the array. For this reason we use an + // array of uint32_t rather than uint8_t/char so we can easily access + // the ip for debug/validation purposes + // -------------------------------------------------------------------- + namespace locality_defs { + // the number of 32bit ints stored in our array + uint32_t const array_size = HAVE_LIBFABRIC_LOCALITY_SIZE; + uint32_t const array_length = HAVE_LIBFABRIC_LOCALITY_SIZE / 4; + } // namespace locality_defs + + struct locality + { + // array type of our locality data + typedef std::array locality_data; + + static char const* type() { return "libfabric"; } + + explicit locality(locality_data const& in_data, struct fid_av* av) + { + std::memcpy(&data_[0], &in_data[0], locality_defs::array_size); + fi_address_ = 0; + av_ = av; + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), to_str())); + } + + locality() + { + std::memset(&data_[0], 0x00, locality_defs::array_size); + fi_address_ = 0; + av_ = nullptr; + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("default construct"), to_str())); + } + + locality(locality const& other) + : data_(other.data_) + , fi_address_(other.fi_address_) + , av_(other.av_) + { + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy construct"), 
to_str())); + } + + locality(locality const& other, fi_addr_t addr, struct fid_av* av) + : data_(other.data_) + , fi_address_(addr) + , av_(av) + { + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), to_str())); + } + + locality(locality&& other) + : data_(std::move(other.data_)) + , fi_address_(other.fi_address_) + , av_(other.av_) + { + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("move construct"), to_str())); + } + + // provided to support sockets mode bootstrap + explicit locality(std::string const& address, std::string const& portnum) + { + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct-2"), address, ":", portnum)); + // + struct sockaddr_in socket_data; + memset(&socket_data, 0, sizeof(socket_data)); + socket_data.sin_family = AF_INET; + socket_data.sin_port = htons(std::stol(portnum)); + inet_pton(AF_INET, address.c_str(), &(socket_data.sin_addr)); + // + std::memcpy(&data_[0], &socket_data, locality_defs::array_size); + fi_address_ = 0; + av_ = nullptr; + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("string constructing"), to_str())); + } + + locality& operator=(locality const& other) + { + data_ = other.data_; + fi_address_ = other.fi_address_; + av_ = other.av_; + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy operator"), to_str(), other.to_str())); + return *this; + } + + bool operator==(locality const& other) + { + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("equality operator"), to_str(), other.to_str())); + return std::memcmp(&data_, &other.data_, locality_defs::array_size) == 0; + } + + inline fi_addr_t const& fi_address() const { return fi_address_; } + + inline void set_fi_address(fi_addr_t fi_addr) { fi_address_ = fi_addr; } + + inline uint16_t port() const + { + uint16_t port = 256 * reinterpret_cast(data_.data())[2] + + reinterpret_cast(data_.data())[3]; + return port; + } + + inline locality_data const& fabric_data() const { return data_; } + + inline char* fabric_data_writable() { return reinterpret_cast(data_.data()); } + + std::string 
to_str() const + { + size_t buflen = 1024; + std::array buf; + if (!av_) { return "No address vector"; } + char const* straddr_ret = fi_av_straddr(av_, data_.data(), buf.data(), &buflen); +#ifdef HAVE_LIBFABRIC_LNX + return "LNX does not yet support straddr"; #else - throw fabric_error(0, "unsupported fabric provider, please fix ASAP"); + std::string result = straddr_ret ? straddr_ret : "Address formatting Error"; + return result; #endif - } - - inline const fi_addr_t& fi_address() const { return fi_address_; } - - inline void set_fi_address(fi_addr_t fi_addr) { fi_address_ = fi_addr; } - - inline uint16_t port() const - { - uint16_t port = 256 * reinterpret_cast(data_.data())[2] + - reinterpret_cast(data_.data())[3]; - return port; - } - - inline const void* fabric_data() const { return data_.data(); } - - inline char* fabric_data_writable() { return reinterpret_cast(data_.data()); } - - private: - friend bool operator==(locality const& lhs, locality const& rhs) - { - LF_DEB(loc_deb, - trace(NS_DEBUG::str<>("equality friend"), iplocality(lhs), iplocality(rhs))); - return ((lhs.data_ == rhs.data_) && (lhs.fi_address_ == rhs.fi_address_)); - } - - friend bool operator<(locality const& lhs, locality const& rhs) - { - const uint32_t& a1 = lhs.ip_address(); - const uint32_t& a2 = rhs.ip_address(); - const fi_addr_t& f1 = lhs.fi_address(); - const fi_addr_t& f2 = rhs.fi_address(); - LF_DEB(loc_deb, trace(NS_DEBUG::str<>("less friend"), iplocality(lhs), iplocality(rhs))); - return (a1 < a2) || (a1 == a2 && f1 < f2); - } - - friend std::ostream& operator<<(std::ostream& os, locality const& loc) - { - for (uint32_t i = 0; i < locality_defs::array_length; ++i) { os << loc.data_[i]; } - return os; - } - - private: - locality_data data_; - fi_addr_t fi_address_; -}; - -} // namespace libfabric -} // namespace oomph + } + + private: + friend bool operator==(locality const& lhs, locality const& rhs) + { + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("equality friend"), lhs.to_str(), 
rhs.to_str())); + return ((lhs.data_ == rhs.data_) && (lhs.fi_address_ == rhs.fi_address_)); + } + + friend std::ostream& operator<<(std::ostream& os, locality const& loc) + { + for (uint32_t i = 0; i < locality_defs::array_length; ++i) { os << loc.data_[i]; } + return os; + } + + private: + locality_data data_; + fi_addr_t fi_address_; + struct fid_av* av_; + }; + +}} // namespace oomph::libfabric diff --git a/src/libfabric/memory_region.hpp b/src/libfabric/memory_region.hpp index 0cd5c4a7..2028fc41 100644 --- a/src/libfabric/memory_region.hpp +++ b/src/libfabric/memory_region.hpp @@ -15,23 +15,21 @@ #include // #include -#include #include -#include "oomph_libfabric_defines.hpp" #include "fabric_error.hpp" +#include "oomph_libfabric_defines.hpp" #ifdef OOMPH_ENABLE_DEVICE -#include +# include #endif // ------------------------------------------------------------------ -namespace NS_MEMORY -{ +namespace NS_MEMORY { -static NS_DEBUG::enable_print mrn_deb("REGION_"); + static NS_DEBUG::enable_print mrn_deb("REGION_"); -/* + /* struct fi_mr_attr { union { const struct iovec *mr_iov; @@ -60,342 +58,352 @@ struct fi_mr_attr { */ -// This is the only part of the code that actually -// calls libfabric functions -struct region_provider -{ - // The internal memory region handle - using provider_region = struct fid_mr; - using provider_domain = struct fid_domain; - - // register region - static inline int fi_register_memory(provider_domain* pd, int device_id, const void* buf, - size_t len, uint64_t access_flags, uint64_t offset, uint64_t request_key, - struct fid_mr** mr) + // This is the only part of the code that actually + // calls libfabric functions + struct region_provider { - [[maybe_unused]] auto scp = - NS_MEMORY::mrn_deb.scope(__func__, NS_DEBUG::ptr(buf), NS_DEBUG::dec<>(len), device_id); - // - struct iovec addresses = {/*.iov_base = */ const_cast(buf), /*.iov_len = */ len}; - fi_mr_attr attr = { - /*.mr_iov = */ &addresses, - /*.iov_count = */ 1, - /*.access = 
*/ access_flags, - /*.offset = */ offset, - /*.requested_key = */ request_key, - /*.context = */ nullptr, - /*.auth_key_size = */ 0, - /*.auth_key = */ nullptr, - /*.iface = */ FI_HMEM_SYSTEM, - /*.device = */ {0}, + // The internal memory region handle + using provider_region = struct fid_mr; + using provider_domain = struct fid_domain; + + // register region + static inline int fi_register_memory(provider_domain* pd, int device_id, void const* buf, + size_t len, uint64_t access_flags, uint64_t offset, uint64_t request_key, + struct fid_mr** mr) + { + [[maybe_unused]] auto scp = NS_MEMORY::mrn_deb.scope( + __func__, NS_DEBUG::hptr(buf), NS_DEBUG::dec<>(len), device_id); + // + struct iovec addresses = {/*.iov_base = */ const_cast(buf), /*.iov_len = */ len}; + fi_mr_attr attr = { + /*.mr_iov = */ {&addresses}, + /*.iov_count = */ 1, + /*.access = */ access_flags, + /*.offset = */ offset, + /*.requested_key = */ request_key, + /*.context = */ nullptr, + /*.auth_key_size = */ 0, + /*.auth_key = */ nullptr, + /*.iface = */ FI_HMEM_SYSTEM, + /*.device = */ {0}, #if (FI_MAJOR_VERSION > 1) || ((FI_MAJOR_VERSION == 1) && FI_MINOR_VERSION > 17) - /*.hmem_data = */ nullptr, + /*.hmem_data = */ nullptr, #endif #if (FI_MAJOR_VERSION >= 2) - /*page_size = */ static_cast(sysconf(_SC_PAGESIZE)), - /*base_mr = */ nullptr, - /*sub_mr_cnt = */ 0, - }; + /*page_size = */ static_cast(sysconf(_SC_PAGESIZE)), + /*base_mr = */ nullptr, + /*sub_mr_cnt = */ 0, + }; #else - }; + }; #endif - if (device_id >= 0) - { + if (device_id >= 0) + { #ifdef OOMPH_ENABLE_DEVICE - attr.device.cuda = device_id; - int handle = hwmalloc::get_device_id(); - attr.device.cuda = handle; -#if defined(OOMPH_DEVICE_CUDA) - attr.iface = FI_HMEM_CUDA; - LF_DEB(NS_MEMORY::mrn_deb, - trace(NS_DEBUG::str<>("CUDA"), "set device id", device_id, handle)); -#elif defined(OOMPH_DEVICE_HIP) - attr.iface = FI_HMEM_ROCR; - LF_DEB(NS_MEMORY::mrn_deb, - trace(NS_DEBUG::str<>("HIP"), "set device id", device_id, handle)); 
-#endif + attr.device.cuda = device_id; + int handle = hwmalloc::get_device_id(); + attr.device.cuda = handle; +# if defined(OOMPH_DEVICE_CUDA) + attr.iface = FI_HMEM_CUDA; + LF_DEB(NS_MEMORY::mrn_deb, + trace(NS_DEBUG::str<>("CUDA"), "set device id", device_id, handle)); +# elif defined(OOMPH_DEVICE_HIP) + attr.iface = FI_HMEM_ROCR; + LF_DEB(NS_MEMORY::mrn_deb, + trace(NS_DEBUG::str<>("HIP"), "set device id", device_id, handle)); +# endif #endif + } + uint64_t flags = 0; + int ret = fi_mr_regattr(pd, &attr, flags, mr); + if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "register_memory"); } + return ret; } - uint64_t flags = 0; - int ret = fi_mr_regattr(pd, &attr, flags, mr); - if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "register_memory"); } - return ret; - } - // unregister region - static inline int unregister_memory(provider_region* region) { return fi_close(®ion->fid); } - - // Default registration flags for this provider - static inline constexpr int access_flags() - { - return FI_READ | FI_WRITE | FI_RECV | FI_SEND /*| FI_REMOTE_READ | FI_REMOTE_WRITE*/; - } + // unregister region + static inline int unregister_memory(provider_region* region) + { + return fi_close(®ion->fid); + } - // Get the local descriptor of the memory region. - static inline void* get_local_key(provider_region* const region) { return fi_mr_desc(region); } + // Default registration flags for this provider + static inline constexpr int access_flags() + { + return FI_READ | FI_WRITE | FI_RECV | FI_SEND /*| FI_REMOTE_READ | FI_REMOTE_WRITE*/; + } - // Get the remote key of the memory region. - static inline uint64_t get_remote_key(provider_region* const region) - { - return fi_mr_key(region); - } -}; + // Get the local descriptor of the memory region. 
+ static inline void* get_local_key(provider_region* const region) + { + return fi_mr_desc(region); + } -// -------------------------------------------------------------------- -// This is a handle to a small chunk of memory that has been registered -// as part of a much larger allocation (a memory_segment) -struct memory_handle -{ - // -------------------------------------------------------------------- - using provider_region = region_provider::provider_region; + // Get the remote key of the memory region. + static inline uint64_t get_remote_key(provider_region* const region) + { + return fi_mr_key(region); + } + }; // -------------------------------------------------------------------- - // Default constructor creates unusable handle(region) - memory_handle() - : address_{nullptr} - , region_{nullptr} - , size_{0} - , used_space_{0} + // This is a handle to a small chunk of memory that has been registered + // as part of a much larger allocation (a memory_segment) + struct memory_handle { - } - memory_handle(memory_handle const&) noexcept = default; - memory_handle& operator=(memory_handle const&) noexcept = default; - - memory_handle(provider_region* region, unsigned char* addr, - std::size_t size /*, uint32_t flags*/) noexcept - : address_{addr} - , region_{region} - , size_{uint32_t(size)} - , used_space_{0} - { - // LF_DEB(NS_MEMORY::mrn_deb, - // trace(NS_DEBUG::str<>("memory_handle"), *this)); - } + // -------------------------------------------------------------------- + using provider_region = region_provider::provider_region; + + // -------------------------------------------------------------------- + // Default constructor creates unusable handle(region) + memory_handle() + : address_{nullptr} + , region_{nullptr} + , size_{0} + , used_space_{0} + { + } + memory_handle(memory_handle const&) noexcept = default; + memory_handle& operator=(memory_handle const&) noexcept = default; + + memory_handle(provider_region* region, unsigned char* addr, + 
std::size_t size /*, uint32_t flags*/) noexcept + : address_{addr} + , region_{region} + , size_{uint32_t(size)} + , used_space_{0} + { + // LF_DEB(NS_MEMORY::mrn_deb, + // trace(NS_DEBUG::str<>("memory_handle"), *this)); + } - // -------------------------------------------------------------------- - // move constructor, clear other region so that it is not unregistered twice - memory_handle(memory_handle&& other) noexcept - : address_{other.address_} - , region_{std::exchange(other.region_, nullptr)} - , size_{other.size_} - , used_space_{other.used_space_} - { - } + // -------------------------------------------------------------------- + // move constructor, clear other region so that it is not unregistered twice + memory_handle(memory_handle&& other) noexcept + : address_{other.address_} + , region_{std::exchange(other.region_, nullptr)} + , size_{other.size_} + , used_space_{other.used_space_} + { + } - // -------------------------------------------------------------------- - // move assignment, clear other region so that it is not unregistered twice - memory_handle& operator=(memory_handle&& other) noexcept - { - address_ = other.address_; - region_ = std::exchange(other.region_, nullptr); - size_ = other.size_; - used_space_ = other.used_space_; - return *this; - } + // -------------------------------------------------------------------- + // move assignment, clear other region so that it is not unregistered twice + memory_handle& operator=(memory_handle&& other) noexcept + { + address_ = other.address_; + region_ = std::exchange(other.region_, nullptr); + size_ = other.size_; + used_space_ = other.used_space_; + return *this; + } - // -------------------------------------------------------------------- - // Return the address of this memory region block. - inline unsigned char* get_address(void) const { return address_; } + // -------------------------------------------------------------------- + // Return the address of this memory region block. 
+ inline unsigned char* get_address(void) const { return address_; } - // -------------------------------------------------------------------- - // Get the local descriptor of the memory region. - inline void* get_local_key(void) const { return region_provider::get_local_key(region_); } + // -------------------------------------------------------------------- + // Get the local descriptor of the memory region. + inline void* get_local_key(void) const { return region_provider::get_local_key(region_); } - // -------------------------------------------------------------------- - // Get the remote key of the memory region. - inline uint64_t get_remote_key(void) const { return region_provider::get_remote_key(region_); } + // -------------------------------------------------------------------- + // Get the remote key of the memory region. + inline uint64_t get_remote_key(void) const + { + return region_provider::get_remote_key(region_); + } - // -------------------------------------------------------------------- - // Get the size of the memory chunk usable by this memory region, - // this may be smaller than the value returned by get_length - // if the region is a sub region (partial region) within another block - inline uint64_t get_size(void) const { return size_; } + // -------------------------------------------------------------------- + // Get the size of the memory chunk usable by this memory region, + // this may be smaller than the value returned by get_length + // if the region is a sub region (partial region) within another block + inline uint64_t get_size(void) const { return size_; } - // -------------------------------------------------------------------- - // Get the size used by a message in the memory region. - inline uint32_t get_message_length(void) const { return used_space_; } + // -------------------------------------------------------------------- + // Get the size used by a message in the memory region. 
+ inline uint32_t get_message_length(void) const { return used_space_; } - // -------------------------------------------------------------------- - // Set the size used by a message in the memory region. - inline void set_message_length(uint32_t length) { used_space_ = length; } + // -------------------------------------------------------------------- + // Set the size used by a message in the memory region. + inline void set_message_length(uint32_t length) { used_space_ = length; } - // -------------------------------------------------------------------- - void release_region() noexcept { region_ = nullptr; } + // -------------------------------------------------------------------- + void release_region() noexcept { region_ = nullptr; } - // -------------------------------------------------------------------- - // return the underlying libfabric region handle - inline provider_region* get_region() const { return region_; } + // -------------------------------------------------------------------- + // return the underlying libfabric region handle + inline provider_region* get_region() const { return region_; } - // -------------------------------------------------------------------- - // Deregister the memory region. - // returns 0 when successful, -1 otherwise - int deregister(void) const - { - if (region_ /*&& !get_user_region()*/) + // -------------------------------------------------------------------- + // Deregister the memory region. 
+ // returns 0 when successful, -1 otherwise + int deregister(void) const { - LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_)); - // - if (region_provider::unregister_memory(region_)) - { - LF_DEB(NS_MEMORY::mrn_deb, error("fi_close mr failed")); - return -1; - } - else + if (region_ /*&& !get_user_region()*/) { - LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this)); + LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_)); + // + if (region_provider::unregister_memory(region_)) + { + LF_DEB(NS_MEMORY::mrn_deb, error("fi_close mr failed")); + return -1; + } + else + { + LF_DEB( + NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this)); + } + region_ = nullptr; } - region_ = nullptr; + return 0; } - return 0; - } - // -------------------------------------------------------------------- - friend std::ostream& operator<<(std::ostream& os, memory_handle const& region) - { - (void)region; + // -------------------------------------------------------------------- + friend std::ostream& operator<<(std::ostream& os, memory_handle const& region) + { + (void) region; #if 1 || has_debug - os << "region " - << NS_DEBUG::ptr(®ion) - //<< " fi_region " << NS_DEBUG::ptr(region.region_) - << " address " << NS_DEBUG::ptr(region.address_) << " size " - << NS_DEBUG::hex<6>(region.size_) - //<< " used_space " << NS_DEBUG::hex<6>(region.used_space_/*size_*/) - << " loc key " - << NS_DEBUG::ptr( - region.region_ ? region_provider::get_local_key(region.region_) : nullptr) - << " rem key " - << NS_DEBUG::ptr(region.region_ ? 
region_provider::get_remote_key(region.region_) : 0); - ///// clang-format off - ///// clang-format on + using namespace NS_DEBUG; + os << "region " + << hptr(®ion) + //<< " fi_region " << hptr(region.region_) + << " address " << hptr(region.address_) << " size " + << hex<6>(region.size_) + //<< " used_space " << hex<6>(region.used_space_/*size_*/) + << " loc key " + << hptr(region.region_ ? region_provider::get_local_key(region.region_) : nullptr) + << " rem key " + << hptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0); + ///// clang-format off + ///// clang-format on #endif - return os; - } - - protected: - // This gives the start address of this region. - // This is the address that should be used for data storage - unsigned char* address_; + return os; + } - // The hardware level handle to the region (as returned from libfabric fi_mr_reg) - mutable provider_region* region_; + protected: + // This gives the start address of this region. + // This is the address that should be used for data storage + unsigned char* address_; - // The (maximum available) size of the memory buffer - uint32_t size_; + // The hardware level handle to the region (as returned from libfabric fi_mr_reg) + mutable provider_region* region_; - // Space used by a message in the memory region. - // This may be smaller/less than the size available if more space - // was allocated than it turns out was needed - mutable uint32_t used_space_; -}; + // The (maximum available) size of the memory buffer + uint32_t size_; -// -------------------------------------------------------------------- -// a memory segment is a pinned block of memory that has been specialized -// by a particular region provider. Each provider (infiniband, libfabric, -// other) has a different definition for the object and the protection -// domain used to limit access. 
-// -------------------------------------------------------------------- -struct memory_segment : public memory_handle -{ - using provider_domain = region_provider::provider_domain; - using provider_region = region_provider::provider_region; - using handle_type = memory_handle; + // Space used by a message in the memory region. + // This may be smaller/less than the size available if more space + // was allocated than it turns out was needed + mutable uint32_t used_space_; + }; // -------------------------------------------------------------------- - memory_segment(provider_region* region, unsigned char* address, unsigned char* base_address, - uint64_t size) - : memory_handle(region, address, size) - , base_addr_(base_address) - { - } - + // a memory segment is a pinned block of memory that has been specialized + // by a particular region provider. Each provider (infiniband, libfabric, + // other) has a different definition for the object and the protection + // domain used to limit access. 
// -------------------------------------------------------------------- - // move constructor, clear other region - memory_segment(memory_segment&& other) noexcept - : memory_handle(std::move(other)) - , base_addr_{std::exchange(other.base_addr_, nullptr)} + struct memory_segment : public memory_handle { - } + using provider_domain = region_provider::provider_domain; + using provider_region = region_provider::provider_region; + using handle_type = memory_handle; + + // -------------------------------------------------------------------- + memory_segment(provider_region* region, unsigned char* address, unsigned char* base_address, + uint64_t size) + : memory_handle(region, address, size) + , base_addr_(base_address) + { + } - // -------------------------------------------------------------------- - // move assignment, clear other region - memory_segment& operator=(memory_segment&& other) noexcept - { - memory_handle(std::move(other)); - region_ = std::exchange(other.region_, nullptr); - return *this; - } + // -------------------------------------------------------------------- + // move constructor, clear other region + memory_segment(memory_segment&& other) noexcept + : memory_handle(std::move(other)) + , base_addr_{std::exchange(other.base_addr_, nullptr)} + { + } - // -------------------------------------------------------------------- - // construct a memory region object by registering an existing address buffer - // we do not cache local/remote keys here because memory segments are only - // used by the heap to store chunks and the user will always receive - // a memory_handle - which does have keys cached - memory_segment(provider_domain* pd, const void* buffer, const uint64_t length, bool bind_mr, - void* ep, int device_id) - { - // an rma key counter to keep some providers (CXI) happy - static std::atomic key = 0; - // - address_ = static_cast(const_cast(buffer)); - size_ = length; - used_space_ = length; - region_ = nullptr; - // - base_addr_ = 
memory_handle::address_; - LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("memory_segment"), *this, device_id)); - - int ret = region_provider::fi_register_memory(pd, device_id, buffer, length, - region_provider::access_flags(), 0, key++, &(region_)); - if (!ret) + // -------------------------------------------------------------------- + // move assignment, clear other region + memory_segment& operator=(memory_segment&& other) noexcept { - LF_DEB(NS_MEMORY::mrn_deb, - trace(NS_DEBUG::str<>("Registered region"), "device", device_id, *this)); + memory_handle(std::move(other)); + region_ = std::exchange(other.region_, nullptr); + return *this; } - if (bind_mr) + // -------------------------------------------------------------------- + // construct a memory region object by registering an existing address buffer + // we do not cache local/remote keys here because memory segments are only + // used by the heap to store chunks and the user will always receive + // a memory_handle - which does have keys cached + memory_segment(provider_domain* pd, void const* buffer, uint64_t const length, bool bind_mr, + void* ep, int device_id) { - ret = fi_mr_bind(region_, (struct fid*)ep, 0); - if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_bind"); } - else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Bound region"), *this)); } + // an rma key counter to keep some providers (CXI) happy + static std::atomic key = 0; + // + address_ = static_cast(const_cast(buffer)); + size_ = length; + used_space_ = length; + region_ = nullptr; + // + base_addr_ = memory_handle::address_; + LF_DEB(NS_MEMORY::mrn_deb, trace(str<>("memory_segment"), *this, device_id)); - ret = fi_mr_enable(region_); - if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_enable"); } - else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Enabled region"), *this)); } + int ret = region_provider::fi_register_memory(pd, device_id, buffer, length, + region_provider::access_flags(), 0, key++, 
&(region_)); + if (!ret) + { + LF_DEB(NS_MEMORY::mrn_deb, + trace(str<>("Registered region"), "device", device_id, *this)); + } + + if (bind_mr) + { + ret = fi_mr_bind(region_, (struct fid*) ep, 0); + if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_bind"); } + else { LF_DEB(NS_MEMORY::mrn_deb, trace(str<>("Bound region"), *this)); } + + ret = fi_mr_enable(region_); + if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_enable"); } + else { LF_DEB(NS_MEMORY::mrn_deb, trace(str<>("Enabled region"), *this)); } + } } - } - // -------------------------------------------------------------------- - // destroy the region and memory according to flag settings - ~memory_segment() { deregister(); } + // -------------------------------------------------------------------- + // destroy the region and memory according to flag settings + ~memory_segment() { deregister(); } - handle_type get_handle(std::size_t offset, std::size_t size) const noexcept - { - return memory_handle(region_, base_addr_ + offset, size); - } + handle_type get_handle(std::size_t offset, std::size_t size) const noexcept + { + return memory_handle(region_, base_addr_ + offset, size); + } - // -------------------------------------------------------------------- - // Get the address of the base memory region. - // This is the address of the memory allocated from the system - inline unsigned char* get_base_address(void) const { return base_addr_; } + // -------------------------------------------------------------------- + // Get the address of the base memory region. 
+ // This is the address of the memory allocated from the system + inline unsigned char* get_base_address(void) const { return base_addr_; } - // -------------------------------------------------------------------- - friend std::ostream& operator<<(std::ostream& os, memory_segment const& region) - { - (void)region; + // -------------------------------------------------------------------- + friend std::ostream& operator<<(std::ostream& os, memory_segment const& region) + { + (void) region; #if has_debug - // clang-format off + // clang-format off os << *static_cast(®ion) - << " base address " << NS_DEBUG::ptr(region.base_addr_); - // clang-format on + << " base address " << NS_DEBUG::hptr(region.base_addr_); + // clang-format on #endif - return os; - } + return os; + } - public: - // this is the base address of the memory registered by this segment - // individual memory_handles are offset from this address - unsigned char* base_addr_; -}; + public: + // this is the base address of the memory registered by this segment + // individual memory_handles are offset from this address + unsigned char* base_addr_; + }; -} // namespace NS_MEMORY +} // namespace NS_MEMORY diff --git a/src/libfabric/operation_context.cpp b/src/libfabric/operation_context.cpp index ce5081dd..0f6de97a 100644 --- a/src/libfabric/operation_context.cpp +++ b/src/libfabric/operation_context.cpp @@ -8,49 +8,52 @@ * SPDX-License-Identifier: BSD-3-Clause */ // paths relative to backend -#include -#include #include #include +#include +#include -namespace oomph::libfabric -{ -void -operation_context::handle_cancelled() -{ - [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::ptr(this), __func__); - // enqueue the cancelled/callback - if (std::holds_alternative(m_req)) - { - // regular (non-shared) recv - auto s = std::get(m_req); - while (!(s->m_comm->m_recv_cb_cancel.push(s))) {} - } - else if (std::holds_alternative(m_req)) +namespace oomph::libfabric { + void operation_context::handle_cancelled() 
{ - // shared recv - auto s = std::get(m_req); - while (!(s->m_ctxt->m_recv_cb_cancel.push(s))) {} + [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::hptr(this), __func__); + // enqueue the cancelled/callback + if (std::holds_alternative(m_req)) + { + // regular (non-shared) recv + auto s = std::get(m_req); + while (!(s->m_comm->m_recv_cb_cancel.push(s))) {} + } + else if (std::holds_alternative(m_req)) + { + // shared recv + auto s = std::get(m_req); + while (!(s->m_ctxt->m_recv_cb_cancel.push(s))) {} + } + else { throw std::runtime_error("Request state invalid in handle_cancelled"); } } - else { throw std::runtime_error("Request state invalid in handle_cancelled"); } -} -int -operation_context::handle_tagged_recv_completion_impl(void* user_data) -{ - [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::ptr(this), __func__); - if (std::holds_alternative(m_req)) + int operation_context::handle_tagged_recv_completion_impl(void* user_data) { - // regular (non-shared) recv - auto s = std::get(m_req); - //if (std::this_thread::get_id() == thread_id_) - if (reinterpret_cast(user_data) == s->m_comm) + [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::hptr(this), __func__); + if (std::holds_alternative(m_req)) { - if (!s->m_comm->has_reached_recursion_depth()) + // regular (non-shared) recv + auto s = std::get(m_req); + //if (std::this_thread::get_id() == thread_id_) + if (reinterpret_cast(user_data) == s->m_comm) { - auto inc = s->m_comm->recursion(); - auto ptr = s->release_self_ref(); - s->invoke_cb(); + if (!s->m_comm->has_reached_recursion_depth()) + { + auto inc = s->m_comm->recursion(); + auto ptr = s->release_self_ref(); + s->invoke_cb(); + } + else + { + // enqueue the callback + while (!(s->m_comm->m_recv_cb_queue.push(s))) {} + } } else { @@ -58,82 +61,76 @@ operation_context::handle_tagged_recv_completion_impl(void* user_data) while (!(s->m_comm->m_recv_cb_queue.push(s))) {} } } - else - { - // enqueue the callback - while 
(!(s->m_comm->m_recv_cb_queue.push(s))) {} - } - } - else if (std::holds_alternative(m_req)) - { - // shared recv - auto s = std::get(m_req); - if (!s->m_comm->m_context->has_reached_recursion_depth()) + else if (std::holds_alternative(m_req)) { - auto inc = s->m_comm->m_context->recursion(); - auto ptr = s->release_self_ref(); - s->invoke_cb(); - } - else - { - // enqueue the callback - while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {} - } - } - else - { - detail::request_state** req = reinterpret_cast(&m_req); - LF_DEB(NS_MEMORY::opctx_deb<9>, - error(NS_DEBUG::str<>("invalid request_state"), this, "request", NS_DEBUG::ptr(req))); - throw std::runtime_error("Request state invalid in handle_tagged_recv"); - } - return 1; -} - -int -operation_context::handle_tagged_send_completion_impl(void* user_data) -{ - if (std::holds_alternative(m_req)) - { - // regular (non-shared) recv - auto s = std::get(m_req); - if (reinterpret_cast(user_data) == s->m_comm) - { - if (!s->m_comm->has_reached_recursion_depth()) + // shared recv + auto s = std::get(m_req); + if (!s->m_comm->m_context->has_reached_recursion_depth()) { - auto inc = s->m_comm->recursion(); + auto inc = s->m_comm->m_context->recursion(); auto ptr = s->release_self_ref(); s->invoke_cb(); } else { // enqueue the callback - while (!(s->m_comm->m_send_cb_queue.push(s))) {} + while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {} } } else { - // enqueue the callback - while (!(s->m_comm->m_send_cb_queue.push(s))) {} + detail::request_state** req = reinterpret_cast(&m_req); + LF_DEB(NS_MEMORY::opctx_deb<9>, + error( + str<>("invalid request_state"), this, "request", hptr(req))); + throw std::runtime_error("Request state invalid in handle_tagged_recv"); } + return 1; } - else if (std::holds_alternative(m_req)) + + int operation_context::handle_tagged_send_completion_impl(void* user_data) { - // shared recv - auto s = std::get(m_req); - if (!s->m_comm->m_context->has_reached_recursion_depth()) + if 
(std::holds_alternative(m_req)) { - auto inc = s->m_comm->m_context->recursion(); - auto ptr = s->release_self_ref(); - s->invoke_cb(); + // regular (non-shared) recv + auto s = std::get(m_req); + if (reinterpret_cast(user_data) == s->m_comm) + { + if (!s->m_comm->has_reached_recursion_depth()) + { + auto inc = s->m_comm->recursion(); + auto ptr = s->release_self_ref(); + s->invoke_cb(); + } + else + { + // enqueue the callback + while (!(s->m_comm->m_send_cb_queue.push(s))) {} + } + } + else + { + // enqueue the callback + while (!(s->m_comm->m_send_cb_queue.push(s))) {} + } } - else + else if (std::holds_alternative(m_req)) { - // enqueue the callback - while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {} + // shared recv + auto s = std::get(m_req); + if (!s->m_comm->m_context->has_reached_recursion_depth()) + { + auto inc = s->m_comm->m_context->recursion(); + auto ptr = s->release_self_ref(); + s->invoke_cb(); + } + else + { + // enqueue the callback + while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {} + } } + else { throw std::runtime_error("Request state invalid in handle_tagged_send"); } + return 1; } - else { throw std::runtime_error("Request state invalid in handle_tagged_send"); } - return 1; -} -} // namespace oomph::libfabric +} // namespace oomph::libfabric diff --git a/src/libfabric/operation_context.hpp b/src/libfabric/operation_context.hpp index ad106e6a..faed3d70 100644 --- a/src/libfabric/operation_context.hpp +++ b/src/libfabric/operation_context.hpp @@ -15,39 +15,38 @@ // #include "operation_context_base.hpp" // -namespace oomph::libfabric -{ - -template -inline /*constexpr*/ NS_DEBUG::print_threshold opctx_deb("OP__CXT"); - -// This struct holds the ready state of a future -// we must also store the context used in libfabric, in case -// a request is cancelled - fi_cancel(...) 
needs it -struct operation_context : public operation_context_base -{ - std::variant m_req; - - template - operation_context(RequestState* req) - : operation_context_base() - , m_req{req} - { - [[maybe_unused]] auto scp = - opctx_deb<9>.scope(NS_DEBUG::ptr(this), __func__, "request", req); - } - - // -------------------------------------------------------------------- - // When a completion returns FI_ECANCELED, this is called - void handle_cancelled(); +namespace oomph::libfabric { - // -------------------------------------------------------------------- - // Called when a tagged recv completes - int handle_tagged_recv_completion_impl(void* user_data); + template + inline NS_DEBUG::print_threshold opctx_deb("OP__CXT"); - // -------------------------------------------------------------------- - // Called when a tagged send completes - int handle_tagged_send_completion_impl(void* user_data); -}; - -} // namespace oomph::libfabric + // This struct holds the ready state of a future + // we must also store the context used in libfabric, in case + // a request is cancelled - fi_cancel(...) 
needs it + struct operation_context : public operation_context_base + { + std::variant m_req; + + template + operation_context(RequestState* req) + : operation_context_base() + , m_req{req} + { + [[maybe_unused]] auto scp = + opctx_deb<9>.scope(NS_DEBUG::hptr(this), __func__, "request", req); + } + + // -------------------------------------------------------------------- + // When a completion returns FI_ECANCELED, this is called + void handle_cancelled(); + + // -------------------------------------------------------------------- + // Called when a tagged recv completes + int handle_tagged_recv_completion_impl(void* user_data); + + // -------------------------------------------------------------------- + // Called when a tagged send completes + int handle_tagged_send_completion_impl(void* user_data); + }; + +} // namespace oomph::libfabric diff --git a/src/libfabric/operation_context_base.hpp b/src/libfabric/operation_context_base.hpp index e5156f99..462c79b5 100644 --- a/src/libfabric/operation_context_base.hpp +++ b/src/libfabric/operation_context_base.hpp @@ -12,85 +12,84 @@ #include #include "oomph_libfabric_defines.hpp" -namespace NS_LIBFABRIC -{ +namespace NS_LIBFABRIC { -class controller; + class controller; -static NS_DEBUG::enable_print ctx_bas("CTXBASE"); + static NS_DEBUG::enable_print ctx_bas("CTXBASE"); -// This struct holds the ready state of a future -// we must also store the context used in libfabric, in case -// a request is cancelled - fi_cancel(...) needs it -template -struct operation_context_base -{ - private: - // libfabric requires some space for it's internal bookkeeping - // so the first member of this struct must be fi_context - fi_context context_reserved_space; - - public: - operation_context_base() - : context_reserved_space() + // This struct holds the ready state of a future + // we must also store the context used in libfabric, in case + // a request is cancelled - fi_cancel(...) 
needs it + template + struct operation_context_base { - [[maybe_unused]] auto scp = ctx_bas.scope(NS_DEBUG::ptr(this), __func__); - } + private: + // libfabric requires some space for it's internal bookkeeping + // so the first member of this struct must be fi_context + fi_context context_reserved_space; - // error - void handle_error(struct fi_cq_err_entry& err) - { - static_cast(this)->handle_error_impl(err); - } - void handle_error_impl(struct fi_cq_err_entry& /*err*/) { std::terminate(); } + public: + operation_context_base() + : context_reserved_space() + { + [[maybe_unused]] auto scp = ctx_bas.scope(NS_DEBUG::hptr(this), __func__); + } - void handle_cancelled() { static_cast(this)->handle_cancelled_impl(); } - void handle_cancelled_impl() { std::terminate(); } + // error + void handle_error(struct fi_cq_err_entry& err) + { + static_cast(this)->handle_error_impl(err); + } + void handle_error_impl(struct fi_cq_err_entry& /*err*/) { std::terminate(); } - // send - int handle_send_completion() - { - return static_cast(this)->handle_send_completion_impl(); - } - int handle_send_completion_impl() { return 0; } + void handle_cancelled() { static_cast(this)->handle_cancelled_impl(); } + void handle_cancelled_impl() { std::terminate(); } - // tagged send - int handle_tagged_send_completion(void* user_data) - { - return static_cast(this)->handle_tagged_send_completion_impl(user_data); - } - int handle_tagged_send_completion_impl(void* /*user_data*/) { return 0; } + // send + int handle_send_completion() + { + return static_cast(this)->handle_send_completion_impl(); + } + int handle_send_completion_impl() { return 0; } - // recv - int handle_recv_completion(std::uint64_t len) - { - return static_cast(this)->handle_recv_completion_impl(len); - } - int handle_recv_completion_impl(std::uint64_t /*len*/) { return 0; } + // tagged send + int handle_tagged_send_completion(void* user_data) + { + return static_cast(this)->handle_tagged_send_completion_impl(user_data); + } + int 
handle_tagged_send_completion_impl(void* /*user_data*/) { return 0; } - // tagged recv - int handle_tagged_recv_completion(void* user_data) - { - return static_cast(this)->handle_tagged_recv_completion_impl(user_data); - } - int handle_tagged_recv_completion_impl(bool /*threadlocal*/) { return 0; } + // recv + int handle_recv_completion(std::uint64_t len) + { + return static_cast(this)->handle_recv_completion_impl(len); + } + int handle_recv_completion_impl(std::uint64_t /*len*/) { return 0; } - void handle_rma_read_completion() - { - static_cast(this)->handle_rma_read_completion_impl(); - } - void handle_rma_read_completion_impl() {} + // tagged recv + int handle_tagged_recv_completion(void* user_data) + { + return static_cast(this)->handle_tagged_recv_completion_impl(user_data); + } + int handle_tagged_recv_completion_impl(bool /*threadlocal*/) { return 0; } - // unknown sender = new connection - int handle_new_connection(controller* ctrl, std::uint64_t len) - { - return static_cast(this)->handle_new_connection_impl(ctrl, len); - } - int handle_new_connection_impl(controller*, std::uint64_t) { return 0; } -}; + void handle_rma_read_completion() + { + static_cast(this)->handle_rma_read_completion_impl(); + } + void handle_rma_read_completion_impl() {} -// provided so that a pointer can be cast to this and the operation_context_type queried -struct unspecialized_context : public operation_context_base -{ -}; -} // namespace NS_LIBFABRIC + // unknown sender = new connection + int handle_new_connection(controller* ctrl, std::uint64_t len) + { + return static_cast(this)->handle_new_connection_impl(ctrl, len); + } + int handle_new_connection_impl(controller*, std::uint64_t) { return 0; } + }; + + // provided so that a pointer can be cast to this and the operation_context_type queried + struct unspecialized_context : public operation_context_base + { + }; +} // namespace NS_LIBFABRIC diff --git a/src/libfabric/print.hpp b/src/libfabric/print.hpp index cf8de408..04364b98 
100644 --- a/src/libfabric/print.hpp +++ b/src/libfabric/print.hpp @@ -27,12 +27,12 @@ #include // #if defined(__linux) || defined(linux) || defined(__linux__) -#include -#include +# include +# include #elif defined(__APPLE__) -#include -#include -#define environ (*_NSGetEnviron()) +# include +# include +# define environ (*_NSGetEnviron()) #else extern char** environ; #endif @@ -73,670 +73,633 @@ extern char** environ; // ------------------------------------------------------------ #define NS_DEBUG oomph::debug -#define LF_DEB(printer, Expr) \ - if constexpr (printer.is_enabled()) { printer.Expr; }; +#ifndef LF_DEB +# define LF_DEB(printer, Expr) \ + { \ + using namespace NS_DEBUG; \ + if constexpr (printer.is_enabled()) { printer.Expr; }; \ + } +#endif // ------------------------------------------------------------ /// \cond NODETAIL -namespace NS_DEBUG -{ - -// ------------------------------------------------------------------ -// format as zero padded int -// ------------------------------------------------------------------ -namespace detail -{ - -template -struct dec -{ - constexpr dec(T const& v) - : data_(v) - { - } +namespace NS_DEBUG { - T const& data_; + // ------------------------------------------------------------------ + // format as zero padded int + // ------------------------------------------------------------------ + namespace detail { - friend std::ostream& operator<<(std::ostream& os, dec const& d) - { - os << std::right << std::setfill('0') << std::setw(N) << std::noshowbase << std::dec - << d.data_; - return os; - } -}; -} // namespace detail - -template -constexpr detail::dec -dec(T const& v) -{ - return detail::dec(v); -} - -// ------------------------------------------------------------------ -// format as pointer -// ------------------------------------------------------------------ -struct ptr -{ - ptr(void const* v) - : data_(v) - { - } - ptr(std::uintptr_t const v) - : data_(reinterpret_cast(v)) - { - } - void const* data_; - friend 
std::ostream& operator<<(std::ostream& os, ptr const& d) - { - os << std::right << "0x" << std::setfill('0') << std::setw(12) << std::noshowbase - << std::hex << reinterpret_cast(d.data_); - return os; - } -}; - -// ------------------------------------------------------------------ -// format as zero padded hex -// ------------------------------------------------------------------ -namespace detail -{ - -template -struct hex; - -template -struct hex::value>::type> -{ - constexpr hex(T const& v) - : data_(v) - { - } - T const& data_; - friend std::ostream& operator<<(std::ostream& os, const hex& d) - { - os << std::right << "0x" << std::setfill('0') << std::setw(N) << std::noshowbase << std::hex - << d.data_; - return os; - } -}; + template + struct dec + { + constexpr dec(T const& v) + : data_(v) + { + } -template -struct hex::value>::type> -{ - constexpr hex(T const& v) - : data_(v) - { - } - T const& data_; - friend std::ostream& operator<<(std::ostream& os, const hex& d) - { - os << std::right << std::setw(N) << std::noshowbase << std::hex << d.data_; - return os; - } -}; -} // namespace detail - -template -constexpr detail::hex -hex(T const& v) -{ - return detail::hex(v); -} - -// ------------------------------------------------------------------ -// format as binary bits -// ------------------------------------------------------------------ -namespace detail -{ - -template -struct bin -{ - constexpr bin(T const& v) - : data_(v) - { - } - T const& data_; - friend std::ostream& operator<<(std::ostream& os, const bin& d) + T const& data_; + + friend std::ostream& operator<<(std::ostream& os, dec const& d) + { + os << std::right << std::setfill('0') << std::setw(N) << std::noshowbase << std::dec + << d.data_; + return os; + } + }; + } // namespace detail + + template + constexpr detail::dec dec(T const& v) { - os << std::bitset(d.data_); - return os; + return detail::dec(v); } -}; -} // namespace detail - -template -constexpr detail::bin -bin(T const& v) -{ - 
return detail::bin(v); -} - -// ------------------------------------------------------------------ -// format as padded string -// ------------------------------------------------------------------ -template -struct str -{ - constexpr str(char const* v) - : data_(v) + + // ------------------------------------------------------------------ + // format as pointer + // ------------------------------------------------------------------ + struct hptr { - } + hptr(void const* v) + : data_(v) + { + } + hptr(std::uintptr_t const v) + : data_(reinterpret_cast(v)) + { + } + void const* data_; + friend std::ostream& operator<<(std::ostream& os, hptr const& d) + { + os << std::right << "0x" << std::setfill('0') << std::setw(12) << std::noshowbase + << std::hex << reinterpret_cast(d.data_); + return os; + } + }; - char const* data_; + // ------------------------------------------------------------------ + // format as zero padded hex + // ------------------------------------------------------------------ + namespace detail { - friend std::ostream& operator<<(std::ostream& os, str const& d) - { - os << std::left << std::setfill(' ') << std::setw(N) << d.data_; - return os; - } -}; - -// ------------------------------------------------------------------ -// format as ip address -// ------------------------------------------------------------------ -struct ipaddr -{ - ipaddr(const void* a) - : data_(reinterpret_cast(a)) - , ipdata_(0) + template + struct hex; + + template + struct hex::value>::type> + { + constexpr hex(T const& v) + : data_(v) + { + } + T const& data_; + friend std::ostream& operator<<(std::ostream& os, hex const& d) + { + os << std::right << "0x" << std::setfill('0') << std::setw(N) << std::noshowbase + << std::hex << d.data_; + return os; + } + }; + + template + struct hex::value>::type> + { + constexpr hex(T const& v) + : data_(v) + { + } + T const& data_; + friend std::ostream& operator<<(std::ostream& os, hex const& d) + { + os << std::right << std::setw(N) 
<< std::noshowbase << std::hex << d.data_; + return os; + } + }; + } // namespace detail + + template + constexpr detail::hex hex(T const& v) { + return detail::hex(v); } - ipaddr(const uint32_t a) - : data_(reinterpret_cast(&ipdata_)) - , ipdata_(a) + + // ------------------------------------------------------------------ + // format as binary bits + // ------------------------------------------------------------------ + namespace detail { + + template + struct bin + { + constexpr bin(T const& v) + : data_(v) + { + } + T const& data_; + friend std::ostream& operator<<(std::ostream& os, bin const& d) + { + os << std::bitset(d.data_); + return os; + } + }; + } // namespace detail + + template + constexpr detail::bin bin(T const& v) { + return detail::bin(v); } - const uint8_t* data_; - const uint32_t ipdata_; - friend std::ostream& operator<<(std::ostream& os, ipaddr const& p) + // ------------------------------------------------------------------ + // format as padded string + // ------------------------------------------------------------------ + template + struct str { - os << std::dec << int(p.data_[0]) << "." << int(p.data_[1]) << "." << int(p.data_[2]) << "." 
- << int(p.data_[3]); - return os; - } -}; - -// ------------------------------------------------------------------ -// helper fuction for printing CRC32 -// ------------------------------------------------------------------ -inline uint32_t -crc32(const void* address, size_t length) -{ - boost::crc_32_type result; - result.process_bytes(address, length); - return result.checksum(); -} - -// ------------------------------------------------------------------ -// helper fuction for printing short memory dump and crc32 -// useful for debugging corruptions in buffers during -// rma or other transfers -// ------------------------------------------------------------------ -struct mem_crc32 -{ - mem_crc32(const void* a, std::size_t len, const char* txt) - : addr_(reinterpret_cast(a)) - , len_(len) - , txt_(txt) + constexpr str(char const* v) + : data_(v) + { + } + + char const* data_; + + friend std::ostream& operator<<(std::ostream& os, str const& d) + { + os << std::left << std::setfill(' ') << std::setw(N) << d.data_; + return os; + } + }; + + // ------------------------------------------------------------------ + // helper fuction for printing CRC32 + // ------------------------------------------------------------------ + inline uint32_t crc32(void const* address, size_t length) { + boost::crc_32_type result; + result.process_bytes(address, length); + return result.checksum(); } - const std::uint8_t* addr_; - const std::size_t len_; - const char* txt_; - friend std::ostream& operator<<(std::ostream& os, mem_crc32 const& p) + + // ------------------------------------------------------------------ + // helper fuction for printing short memory dump and crc32 + // useful for debugging corruptions in buffers during + // rma or other transfers + // ------------------------------------------------------------------ + struct mem_crc32 { - const std::uint8_t* byte = static_cast(p.addr_); - os << "Memory:"; - os << " address " << ptr(p.addr_) << " length " << hex<6, 
std::size_t>(p.len_) - << " CRC32:" << hex<8, std::size_t>(crc32(p.addr_, p.len_)) << "\n"; - size_t i = 0; - while (i < std::min(size_t(128), p.len_)) - { - os << "0x"; - for (int j = 7; j >= 0; j--) + mem_crc32(void const* a, std::size_t len, char const* txt) + : addr_(reinterpret_cast(a)) + , len_(len) + , txt_(txt) + { + } + std::uint8_t const* addr_; + std::size_t const len_; + char const* txt_; + friend std::ostream& operator<<(std::ostream& os, mem_crc32 const& p) + { + using namespace NS_DEBUG; + std::uint8_t const* byte = static_cast(p.addr_); + os << "Memory:"; + os << " address " << hptr(p.addr_) << " length " << hex<6, std::size_t>(p.len_) + << " CRC32:" << hex<8, std::size_t>(crc32(p.addr_, p.len_)) << "\n"; + size_t i = 0; + while (i < std::min(size_t(128), p.len_)) { - os << std::hex << std::setfill('0') << std::setw(2) - << (((i + j) > p.len_) ? (int)0 : (int)byte[i + j]); + os << "0x"; + for (int j = 7; j >= 0; j--) + { + os << std::hex << std::setfill('0') << std::setw(2) + << (((i + j) > p.len_) ? (int) 0 : (int) byte[i + j]); + } + i += 8; + if (i % 32 == 0) + os << std::endl; + else + os << " "; } - i += 8; - if (i % 32 == 0) os << std::endl; - else - os << " "; + os << ": " << p.txt_; + return os; } - os << ": " << p.txt_; - return os; - } -}; - -namespace detail -{ - -template -void -tuple_print(std::ostream& os, TupleType const& t, std::index_sequence) -{ - (..., (os << (I == 0 ? 
"" : " ") << std::get(t))); -} - -template -void -tuple_print(std::ostream& os, const std::tuple& t) -{ - tuple_print(os, t, std::make_index_sequence()); -} -} // namespace detail - -namespace detail -{ - -// ------------------------------------------------------------------ -// helper class for printing thread ID -// ------------------------------------------------------------------ -struct current_thread_print_helper -{ -}; - -inline std::ostream& -operator<<(std::ostream& os, current_thread_print_helper const&) -{ - os << hex<12, std::thread::id>(std::this_thread::get_id()) + }; + + namespace detail { + + template + void tuple_print(std::ostream& os, TupleType const& t, std::index_sequence) + { + (..., (os << (I == 0 ? "" : " ") << std::get(t))); + } + + template + void tuple_print(std::ostream& os, std::tuple const& t) + { + tuple_print(os, t, std::make_index_sequence()); + } + } // namespace detail + + namespace detail { + + // ------------------------------------------------------------------ + // helper class for printing thread ID + // ------------------------------------------------------------------ + struct current_thread_print_helper + { + }; + + inline std::ostream& operator<<(std::ostream& os, current_thread_print_helper const&) + { + os << hex<12, std::thread::id>(std::this_thread::get_id()) #ifdef DEBUGGING_PRINT_LINUX - << " cpu " << debug::dec<3, int>(sched_getcpu()) << " "; + << " cpu " << debug::dec<3, int>(sched_getcpu()) << " "; #else - << " cpu " - << "--- "; + << " cpu " + << "--- "; #endif - return os; -} - -// ------------------------------------------------------------------ -// helper class for printing time since start -// ------------------------------------------------------------------ -struct hostname_print_helper -{ - const char* get_hostname() const - { - static bool initialized = false; - static char hostname_[20]; - if (!initialized) - { - initialized = true; - gethostname(hostname_, std::size_t(12)); - std::string temp = "(" + 
std::to_string(guess_rank()) + ")"; - std::strcat(hostname_, temp.c_str()); + return os; } - return hostname_; - } - int guess_rank() const - { - std::vector env_strings{"_RANK=", "_NODEID="}; - for (char** current = environ; *current; current++) + // ------------------------------------------------------------------ + // helper class for printing time since start + // ------------------------------------------------------------------ + struct hostname_print_helper { - auto e = std::string(*current); - for (auto s : env_strings) + char const* get_hostname() const { - auto pos = e.find(s); - if (pos != std::string::npos) + static bool initialized = false; + static char hostname_[20]; + if (!initialized) { - //std::cout << "Got a rank string : " << e << std::endl; - return std::stoi(e.substr(pos + s.size(), 5)); + initialized = true; + gethostname(hostname_, std::size_t(12)); + std::string temp = "(" + std::to_string(guess_rank()) + ")"; + std::strcat(hostname_, temp.c_str()); } + return hostname_; } + + int guess_rank() const + { + std::vector env_strings{"_RANK=", "_NODEID="}; + for (char** current = environ; *current; current++) + { + auto e = std::string(*current); + for (auto s : env_strings) + { + auto pos = e.find(s); + if (pos != std::string::npos) + { + //std::cout << "Got a rank string : " << e << std::endl; + return std::stoi(e.substr(pos + s.size(), 5)); + } + } + } + return -1; + } + }; + + inline std::ostream& operator<<(std::ostream& os, hostname_print_helper const& h) + { + os << debug::str<13>(h.get_hostname()) << " "; + return os; } - return -1; - } -}; - -inline std::ostream& -operator<<(std::ostream& os, hostname_print_helper const& h) -{ - os << debug::str<13>(h.get_hostname()) << " "; - return os; -} - -// ------------------------------------------------------------------ -// helper class for printing time since start -// ------------------------------------------------------------------ -struct current_time_print_helper -{ -}; - -inline 
std::ostream& -operator<<(std::ostream& os, current_time_print_helper const&) -{ - using namespace std::chrono; - static steady_clock::time_point log_t_start = steady_clock::now(); - // - auto now = steady_clock::now(); - auto nowt = duration_cast(now - log_t_start).count(); - // - os << debug::dec<10>(nowt) << " "; - return os; -} - -template -void -display(char const* prefix, Args const&... args) -{ - // using a temp stream object with a single copy to cout at the end - // prevents multiple threads from injecting overlapping text - std::stringstream tempstream; - tempstream << prefix << detail::current_time_print_helper() - << detail::current_thread_print_helper() << detail::hostname_print_helper(); - ((tempstream << args << " "), ...); - tempstream << "\n"; - std::cout << tempstream.str() << std::flush; -} - -template -void -debug(Args const&... args) -{ - display(" ", args...); -} - -template -void -warning(Args const&... args) -{ - display(" ", args...); -} - -template -void -error(Args const&... args) -{ - display(" ", args...); -} - -template -void -scope(Args const&... args) -{ - display(" ", args...); -} - -template -void -trace(Args const&... args) -{ - display(" ", args...); -} - -template -void -timed(Args const&... args) -{ - display(" ", args...); -} -} // namespace detail - -template -struct scoped_var -{ - // capture tuple elements by reference - no temp vars in constructor please - char const* prefix_; - std::tuple const message_; - std::string buffered_msg; - - // - scoped_var(char const* p, Args const&... args) - : prefix_(p) - , message_(args...) 
- { - std::stringstream tempstream; - detail::tuple_print(tempstream, message_); - buffered_msg = tempstream.str(); - detail::display(" ", prefix_, debug::str<>(">> enter <<"), tempstream.str()); - } - ~scoped_var() { detail::display(" ", prefix_, debug::str<>("<< leave >>"), buffered_msg); } -}; - -template -struct timed_var -{ - mutable std::chrono::steady_clock::time_point time_start_; - double const delay_; - std::tuple const message_; - // - timed_var(double const& delay, Args const&... args) - : time_start_(std::chrono::steady_clock::now()) - , delay_(delay) - , message_(args...) - { - } + // ------------------------------------------------------------------ + // helper class for printing time since start + // ------------------------------------------------------------------ + struct current_time_print_helper + { + }; - bool elapsed(std::chrono::steady_clock::time_point const& now) const - { - double elapsed_ = - std::chrono::duration_cast>(now - time_start_).count(); + inline std::ostream& operator<<(std::ostream& os, current_time_print_helper const&) + { + using namespace std::chrono; + static steady_clock::time_point log_t_start = steady_clock::now(); + // + auto now = steady_clock::now(); + auto nowt = duration_cast(now - log_t_start).count(); + // + os << debug::dec<10>(nowt) << " "; + return os; + } - if (elapsed_ > delay_) + template + void display(char const* prefix, Args const&... 
args) { - time_start_ = now; - return true; + // using a temp stream object with a single copy to cout at the end + // prevents multiple threads from injecting overlapping text + std::stringstream tempstream; + tempstream << prefix << detail::current_time_print_helper() + << detail::current_thread_print_helper() << detail::hostname_print_helper(); + ((tempstream << args << " "), ...); + tempstream << "\n"; + std::cout << tempstream.str() << std::flush; } - return false; - } - friend std::ostream& operator<<(std::ostream& os, timed_var const& ti) - { - detail::tuple_print(os, ti.message_); - return os; - } -}; + template + void debug(Args const&... args) + { + display(" ", args...); + } -/////////////////////////////////////////////////////////////////////////// -template -struct enable_print; + template + void warning(Args const&... args) + { + display(" ", args...); + } -// when false, debug statements should produce no code -template<> -struct enable_print -{ - constexpr enable_print(const char*) {} + template + void error(Args const&... args) + { + display(" ", args...); + } - constexpr bool is_enabled() const { return false; } + template + void scope(Args const&... args) + { + display(" ", args...); + } - template - constexpr void debug(Args const&...) const - { - } + template + void trace(Args const&... args) + { + display(" ", args...); + } - template - constexpr void warning(Args const&...) const - { - } + template + void timed(Args const&... args) + { + display(" ", args...); + } + } // namespace detail - template - constexpr void trace(Args const&...) const + template + struct scoped_var { - } + // capture tuple elements by reference - no temp vars in constructor please + char const* prefix_; + std::tuple const message_; + std::string buffered_msg; - template - constexpr void error(Args const&...) const - { - } + // + scoped_var(char const* p, Args const&... args) + : prefix_(p) + , message_(args...) 
+ { + std::stringstream tempstream; + detail::tuple_print(tempstream, message_); + buffered_msg = tempstream.str(); + detail::display(" ", prefix_, debug::str<>(">> enter <<"), tempstream.str()); + } - template - constexpr void timed(Args const&...) const - { - } + ~scoped_var() + { + detail::display(" ", prefix_, debug::str<>("<< leave >>"), buffered_msg); + } + }; + + template + struct timed_var + { + mutable std::chrono::steady_clock::time_point time_start_; + double const delay_; + std::tuple const message_; + // + timed_var(double const& delay, Args const&... args) + : time_start_(std::chrono::steady_clock::now()) + , delay_(delay) + , message_(args...) + { + } - template - constexpr void array(std::string const&, std::vector const&) const - { - } + bool elapsed(std::chrono::steady_clock::time_point const& now) const + { + double elapsed_ = + std::chrono::duration_cast>(now - time_start_) + .count(); - template - constexpr void array(std::string const&, std::array const&) const - { - } + if (elapsed_ > delay_) + { + time_start_ = now; + return true; + } + return false; + } - template - constexpr void array(std::string const&, Iter, Iter) const - { - } + friend std::ostream& operator<<(std::ostream& os, timed_var const& ti) + { + detail::tuple_print(os, ti.message_); + return os; + } + }; - template - constexpr bool scope(Args const&...) - { - return true; - } + /////////////////////////////////////////////////////////////////////////// + template + struct enable_print; - template - constexpr bool declare_variable(Args const&...) const + // when false, debug statements should produce no code + template <> + struct enable_print { - return true; - } + constexpr enable_print(char const*) {} - template - constexpr void set(T&, V const&) - { - } + constexpr bool is_enabled() const { return false; } - // @todo, return void so that timers have zero footprint when disabled - template - constexpr int make_timer(const double, Args const&...) 
const - { - return 0; - } + template + constexpr void debug(Args const&...) const + { + } - template - constexpr bool eval(Expr const&) - { - return true; - } -}; - -// when true, debug statements produce valid output -template<> -struct enable_print -{ - private: - char const* prefix_; - - public: - constexpr enable_print() - : prefix_("") - { - } + template + constexpr void warning(Args const&...) const + { + } - constexpr enable_print(const char* p) - : prefix_(p) - { - } + template + constexpr void trace(Args const&...) const + { + } - constexpr bool is_enabled() const { return true; } + template + constexpr void error(Args const&...) const + { + } - template - constexpr void debug(Args const&... args) const - { - detail::debug(prefix_, args...); - } + template + constexpr void timed(Args const&...) const + { + } - template - constexpr void warning(Args const&... args) const - { - detail::warning(prefix_, args...); - } + template + constexpr void array(std::string const&, std::vector const&) const + { + } - template - constexpr void trace(Args const&... args) const - { - detail::trace(prefix_, args...); - } + template + constexpr void array(std::string const&, std::array const&) const + { + } - template - constexpr void error(Args const&... args) const - { - detail::error(prefix_, args...); - } + template + constexpr void array(std::string const&, Iter, Iter) const + { + } - template - scoped_var scope(Args const&... args) - { - return scoped_var(prefix_, args...); - } + template + constexpr bool scope(Args const&...) + { + return true; + } - template - void timed(timed_var const& init, Args const&... args) const - { - auto now = std::chrono::steady_clock::now(); - if (init.elapsed(now)) { detail::timed(prefix_, init, args...); } - } + template + constexpr bool declare_variable(Args const&...) 
const + { + return true; + } - template - void array(std::string const& name, std::vector const& v) const - { - std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : "; - std::copy(std::begin(v), std::end(v), std::ostream_iterator(std::cout, ", ")); - std::cout << "\n"; - } + template + constexpr void set(T&, V const&) + { + } - template - void array(std::string const& name, const std::array& v) const - { - std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : "; - std::copy(std::begin(v), std::end(v), std::ostream_iterator(std::cout, ", ")); - std::cout << "\n"; - } + // @todo, return void so that timers have zero footprint when disabled + template + constexpr int make_timer(double const, Args const&...) const + { + return 0; + } - template - void array(std::string const& name, Iter begin, Iter end) const - { - std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(std::distance(begin, end)) - << "} : "; - std::copy(begin, end, - std::ostream_iterator::value_type>(std::cout, - ", ")); - std::cout << std::endl; - } + template + constexpr bool eval(Expr const&) + { + return true; + } + }; - template - T declare_variable(Args const&... args) const + // when true, debug statements produce valid output + template <> + struct enable_print { - return T(args...); - } + private: + char const* prefix_; - template - void set(T& var, V const& val) - { - var = val; - } + public: + constexpr enable_print() + : prefix_("") + { + } + + constexpr enable_print(char const* p) + : prefix_(p) + { + } + + constexpr bool is_enabled() const { return true; } + + template + constexpr void debug(Args const&... args) const + { + detail::debug(prefix_, args...); + } + + template + constexpr void warning(Args const&... args) const + { + detail::warning(prefix_, args...); + } - template - timed_var make_timer(const double delay, const Args... args) const + template + constexpr void trace(Args const&... 
args) const + { + detail::trace(prefix_, args...); + } + + template + constexpr void error(Args const&... args) const + { + detail::error(prefix_, args...); + } + + template + scoped_var scope(Args const&... args) + { + return scoped_var(prefix_, args...); + } + + template + void timed(timed_var const& init, Args const&... args) const + { + auto now = std::chrono::steady_clock::now(); + if (init.elapsed(now)) { detail::timed(prefix_, init, args...); } + } + + template + void array(std::string const& name, std::vector const& v) const + { + std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : "; + std::copy(std::begin(v), std::end(v), std::ostream_iterator(std::cout, ", ")); + std::cout << "\n"; + } + + template + void array(std::string const& name, std::array const& v) const + { + std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : "; + std::copy(std::begin(v), std::end(v), std::ostream_iterator(std::cout, ", ")); + std::cout << "\n"; + } + + template + void array(std::string const& name, Iter begin, Iter end) const + { + std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(std::distance(begin, end)) + << "} : "; + std::copy(begin, end, + std::ostream_iterator::value_type>( + std::cout, ", ")); + std::cout << std::endl; + } + + template + T declare_variable(Args const&... args) const + { + return T(args...); + } + + template + void set(T& var, V const& val) + { + var = val; + } + + template + timed_var make_timer(double const delay, Args const... 
args) const + { + return timed_var(delay, args...); + } + + template + auto eval(Expr const& e) + { + return e(); + } + }; + + // ------------------------------------------------------------------ + // helper for N>M true/false + // ------------------------------------------------------------------ + template + struct check_level : std::integral_constant { - return timed_var(delay, args...); - } + }; - template - auto eval(Expr const& e) + template + struct print_threshold : enable_print::value> { - return e(); - } -}; - -// ------------------------------------------------------------------ -// helper for N>M true/false -// ------------------------------------------------------------------ -template -struct check_level : std::integral_constant -{ -}; - -template -struct print_threshold : enable_print::value> -{ - using base_type = enable_print::value>; - // inherit constructor - using base_type::base_type; -}; - -} // namespace NS_DEBUG + using base_type = enable_print::value>; + // inherit constructor + using base_type::base_type; + }; + +} // namespace NS_DEBUG /// \endcond diff --git a/src/libfabric/request_state.hpp b/src/libfabric/request_state.hpp index d00e0367..74958fc5 100644 --- a/src/libfabric/request_state.hpp +++ b/src/libfabric/request_state.hpp @@ -13,90 +13,88 @@ #include "../request_state_base.hpp" #include "./operation_context.hpp" -namespace oomph -{ -namespace detail -{ - -struct request_state -: public util::enable_shared_from_this -, public request_state_base -{ - using base = request_state_base; - using shared_ptr_t = util::unsafe_shared_ptr; - using operation_context = libfabric::operation_context; - - operation_context m_operation_context; - util::unsafe_shared_ptr m_self_ptr; - - request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm, std::size_t* scheduled, - rank_type rank, tag_type tag, cb_type&& cb) - : base{ctxt, comm, scheduled, rank, tag, std::move(cb)} - , m_operation_context{this} - { - } - - void progress(); - - 
bool cancel(); - - void create_self_ref() - { - // create a self-reference cycle!! - // this is useful if we only keep a raw pointer around internally, which still is supposed - // to keep the object alive - m_self_ptr = shared_from_this(); - } - - shared_ptr_t release_self_ref() noexcept - { - assert(((bool)m_self_ptr) && "doesn't own a self-reference!"); - return std::move(m_self_ptr); - } -}; - -struct shared_request_state -: public std::enable_shared_from_this -, public request_state_base -{ - using base = request_state_base; - using shared_ptr_t = std::shared_ptr; - using operation_context = libfabric::operation_context; - - operation_context m_operation_context; - std::shared_ptr m_self_ptr; - - shared_request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm, - std::atomic* scheduled, rank_type rank, tag_type tag, cb_type&& cb) - : base{ctxt, comm, scheduled, rank, tag, std::move(cb)} - , m_operation_context{this} - { - [[maybe_unused]] auto scp = libfabric::opctx_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - } +namespace oomph { namespace detail { - ~shared_request_state() + struct request_state + : public util::enable_shared_from_this + , public request_state_base { - [[maybe_unused]] auto scp = libfabric::opctx_deb<9>.scope(NS_DEBUG::ptr(this), __func__); - } - - void progress(); - - bool cancel(); - - void create_self_ref() + using base = request_state_base; + using shared_ptr_t = util::unsafe_shared_ptr; + using operation_context = libfabric::operation_context; + + operation_context m_operation_context; + util::unsafe_shared_ptr m_self_ptr; + + request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm, + std::size_t* scheduled, rank_type rank, tag_type tag, cb_type&& cb) + : base{ctxt, comm, scheduled, rank, tag, std::move(cb)} + , m_operation_context{this} + { + } + + void progress(); + + bool cancel(); + + void create_self_ref() + { + // create a self-reference cycle!! 
+ // this is useful if we only keep a raw pointer around internally, which still is supposed + // to keep the object alive + m_self_ptr = shared_from_this(); + } + + shared_ptr_t release_self_ref() noexcept + { + assert(((bool) m_self_ptr) && "doesn't own a self-reference!"); + return std::move(m_self_ptr); + } + }; + + struct shared_request_state + : public std::enable_shared_from_this + , public request_state_base { - // create a self-reference cycle!! - // this is useful if we only keep a raw pointer around internally, which still is supposed - // to keep the object alive - m_self_ptr = shared_from_this(); - } - - shared_ptr_t release_self_ref() noexcept - { - assert(((bool)m_self_ptr) && "doesn't own a self-reference!"); - return std::move(m_self_ptr); - } -}; - -} // namespace detail -} // namespace oomph + using base = request_state_base; + using shared_ptr_t = std::shared_ptr; + using operation_context = libfabric::operation_context; + + operation_context m_operation_context; + std::shared_ptr m_self_ptr; + + shared_request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm, + std::atomic* scheduled, rank_type rank, tag_type tag, cb_type&& cb) + : base{ctxt, comm, scheduled, rank, tag, std::move(cb)} + , m_operation_context{this} + { + [[maybe_unused]] auto scp = + libfabric::opctx_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + } + + ~shared_request_state() + { + [[maybe_unused]] auto scp = + libfabric::opctx_deb<9>.scope(NS_DEBUG::hptr(this), __func__); + } + + void progress(); + + bool cancel(); + + void create_self_ref() + { + // create a self-reference cycle!! 
+ // this is useful if we only keep a raw pointer around internally, which still is supposed + // to keep the object alive + m_self_ptr = shared_from_this(); + } + + shared_ptr_t release_self_ref() noexcept + { + assert(((bool) m_self_ptr) && "doesn't own a self-reference!"); + return std::move(m_self_ptr); + } + }; + +}} // namespace oomph::detail diff --git a/src/libfabric/simple_counter.hpp b/src/libfabric/simple_counter.hpp index f44eac92..26ecf8d5 100644 --- a/src/libfabric/simple_counter.hpp +++ b/src/libfabric/simple_counter.hpp @@ -12,13 +12,13 @@ #include "oomph_libfabric_defines.hpp" // #include -#include #include +#include #ifdef OOMPH_LIBFABRIC_HAVE_PERFORMANCE_COUNTERS -#define PERFORMANCE_COUNTER_ENABLED true +# define PERFORMANCE_COUNTER_ENABLED true #else -#define PERFORMANCE_COUNTER_ENABLED false +# define PERFORMANCE_COUNTER_ENABLED false #endif // @@ -29,90 +29,86 @@ // the performance counter that will simply do nothing when disabled - but // still allow code that uses the counters in arithmetic to compile. 
// -namespace oomph -{ -namespace libfabric -{ -template::value>> -struct simple_counter -{ -}; - -// -------------------------------------------------------------------- -// specialization for performance counters Enabled -// we provide an atomic that can be incremented or added/subtracted to -template -struct simple_counter -{ - simple_counter() - : value_{T()} +namespace oomph { namespace libfabric { + template ::value>> + struct simple_counter { - } + }; - simple_counter(const T& init) - : value_{init} + // -------------------------------------------------------------------- + // specialization for performance counters Enabled + // we provide an atomic that can be incremented or added/subtracted to + template + struct simple_counter { - } + simple_counter() + : value_{T()} + { + } - inline operator T() const { return value_; } + simple_counter(T const& init) + : value_{init} + { + } - inline T operator=(const T& x) { return value_ = x; } + inline operator T() const { return value_; } - inline T operator++() { return ++value_; } + inline T operator=(T const& x) { return value_ = x; } - inline T operator++(int x) { return (value_ += x); } + inline T operator++() { return ++value_; } - inline T operator+=(const T& rhs) { return (value_ += rhs); } + inline T operator++(int x) { return (value_ += x); } - inline T operator--() { return --value_; } + inline T operator+=(T const& rhs) { return (value_ += rhs); } - inline T operator--(int x) { return (value_ -= x); } + inline T operator--() { return --value_; } - inline T operator-=(const T& rhs) { return (value_ -= rhs); } + inline T operator--(int x) { return (value_ -= x); } - friend std::ostream& operator<<(std::ostream& os, const simple_counter& x) - { - os << x.value_; - return os; - } + inline T operator-=(T const& rhs) { return (value_ -= rhs); } - std::atomic value_; -}; + friend std::ostream& operator<<(std::ostream& os, simple_counter const& x) + { + os << x.value_; + return os; + } -// 
-------------------------------------------------------------------- -// specialization for performance counters Disabled -// just return dummy values so that arithmetic operations compile ok -template -struct simple_counter -{ - simple_counter() {} + std::atomic value_; + }; - simple_counter(const T&) {} + // -------------------------------------------------------------------- + // specialization for performance counters Disabled + // just return dummy values so that arithmetic operations compile ok + template + struct simple_counter + { + simple_counter() {} - inline operator T() const { return 0; } + simple_counter(T const&) {} - // inline bool operator==(const T&) { return true; } + inline operator T() const { return 0; } - inline T operator=(const T&) { return 0; } + // inline bool operator==(const T&) { return true; } - inline T operator++() { return 0; } + inline T operator=(T const&) { return 0; } - inline T operator++(int) { return 0; } + inline T operator++() { return 0; } - inline T operator+=(const T&) { return 0; } + inline T operator++(int) { return 0; } - inline T operator--() { return 0; } + inline T operator+=(T const&) { return 0; } - inline T operator--(int) { return 0; } + inline T operator--() { return 0; } - inline T operator-=(const T&) { return 0; } + inline T operator--(int) { return 0; } - friend std::ostream& operator<<(std::ostream& os, const simple_counter&) - { - os << "undefined"; - return os; - } -}; -} // namespace libfabric -} // namespace oomph + inline T operator-=(T const&) { return 0; } + + friend std::ostream& operator<<(std::ostream& os, simple_counter const&) + { + os << "undefined"; + return os; + } + }; +}} // namespace oomph::libfabric diff --git a/src/libfabric/test/check_libfabric.cpp b/src/libfabric/test/check_libfabric.cpp new file mode 100644 index 00000000..11d9788e --- /dev/null +++ b/src/libfabric/test/check_libfabric.cpp @@ -0,0 +1,31 @@ +/* + * ghex-org + * + * Copyright (c) 2014-2023, ETH Zurich + * All rights 
reserved. + * + * Please, refer to the LICENSE file in the root directory. + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include +#include "../benchmarks/mpi_environment.hpp" +// +#include "../communicator.hpp" +#include "../context.hpp" + +#include + +int main(int argc, char** argv) +{ + using namespace oomph; + bool const message_pool_never_free = false; + std::size_t const message_pool_reserve = 1024 * 1024 * 128; + bool const multi_threaded = true; + bool debug = true; + // + mpi_environment env(multi_threaded, argc, argv); + hwmalloc::heap_config const& default_heap = hwmalloc::get_default_heap_config(); + auto ctxt = context_impl(MPI_COMM_WORLD, true, default_heap /*, debug*/); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5217bbaf..39affd0e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,8 @@ add_subdirectory(mpi_runner) -set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Daint)") +set(OOMPH_TEST_LEAK_GPU_MEMORY + OFF + CACHE BOOL "Do not free memory (bug on Piz Daint)") # --------------------------------------------------------------------- # compile tests @@ -10,30 +12,57 @@ set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Da set(serial_tests test_unique_function test_unsafe_shared_ptr) # list of parallel tests to be executed -set(parallel_tests test_context test_send_recv test_send_multi test_cancel test_locality) -#test_tag_range) -if (OOMPH_ENABLE_BARRIER) - list(APPEND parallel_tests test_barrier) +set(parallel_tests test_context test_send_recv test_send_multi test_cancel + test_locality) + +# list of parallel tests that also have device code variants +if(HWMALLOC_ENABLE_DEVICE) + set(device_tests test_send_recv) +endif() + +# test_tag_range) +if(OOMPH_ENABLE_BARRIER) + list(APPEND parallel_tests test_barrier) endif() -# creates an object library (i.e. *.o file) +# creates an object library (i.e. 
*.o file), if DEVICE is specified, extra flags +# are added and the target name has a suffix function(compile_test t_) - set(t ${t_}_obj) - add_library(${t} OBJECT ${t_}.cpp) - oomph_target_compile_options(${t}) - if (OOMPH_TEST_LEAK_GPU_MEMORY) - target_compile_definitions(${t} PRIVATE OOMPH_TEST_LEAK_GPU_MEMORY) - endif() - target_link_libraries(${t} PRIVATE ext-gtest) - target_link_libraries(${t} PUBLIC oomph) + set(options DEVICE) + cmake_parse_arguments(CT "${options}" "" "" ${ARGN}) + set(source_filename_ "${t_}.cpp") + set(suffix_ "") + if(CT_DEVICE) + # Make a copy the input source file in the build directory, add a suffix + set(suffix_ "_device") + cmake_path(REPLACE_EXTENSION source_filename_ LAST_ONLY "${suffix_}.cpp" + OUTPUT_VARIABLE src_name_) + set(dst_file "${CMAKE_CURRENT_BINARY_DIR}/${src_name_}") + configure_file("${source_filename_}" "${dst_file}" COPYONLY) + set(source_filename_ "${dst_file}") + endif() + set(target_ ${t}${suffix_}_obj) + add_library(${target_} OBJECT ${source_filename_}) + oomph_target_compile_options(${target_}) + target_compile_definitions( + ${target_} + PRIVATE $<$:OOMPH_TEST_LEAK_GPU_MEMORY>) + target_compile_definitions( + ${target_} PRIVATE $<$:TEST_DEVICE_MODE_ONLY>) + target_link_libraries(${target_} PRIVATE ext-gtest) + target_link_libraries(${target_} PUBLIC oomph) endfunction() -# compile an object library for each test -# tests will be compiled only once and then linked against all enabled oomph backends +# compile an object library for each test tests will be compiled only once and +# then linked against all enabled oomph backends list(APPEND all_tests ${serial_tests} ${parallel_tests}) list(REMOVE_DUPLICATES all_tests) foreach(t ${all_tests}) - compile_test(${t}) + compile_test(${t}) + if(${t} IN_LIST device_tests) + # generate a second version of the obj file, but with DEVICE code enabled + compile_test(${t} DEVICE) + endif() endforeach() # --------------------------------------------------------------------- 
@@ -48,10 +77,11 @@ function(reg_serial_test t) add_test( NAME ${t} COMMAND $) + set_tests_properties(${t} PROPERTIES LABELS "serial") endfunction() foreach(t ${serial_tests}) - reg_serial_test(${t}) + reg_serial_test(${t}) endforeach() # creates an executable by linking to object file and to selected oomph backend @@ -61,29 +91,42 @@ function(reg_parallel_test t_ lib n) oomph_target_compile_options(${t}) target_link_libraries(${t} PRIVATE gtest_main_mpi) target_link_libraries(${t} PRIVATE oomph_${lib}) - add_test( - NAME ${t} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} - $ ${MPIEXEC_POSTFLAGS}) - set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE) + if("${MPIEXEC_EXECUTABLE}" STREQUAL "") + add_test(NAME ${t} COMMAND $) + else() + add_test( + NAME ${t} + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} + $ ${MPIEXEC_POSTFLAGS}) + endif() + set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE LABELS "parallel-ranks-${n}") endfunction() -if (OOMPH_WITH_MPI) - foreach(t ${parallel_tests}) - reg_parallel_test(${t} mpi 4) - endforeach() +if(OOMPH_WITH_MPI) + foreach(t ${parallel_tests}) + reg_parallel_test(${t} mpi 4) + endforeach() + foreach(t ${device_tests}) + reg_parallel_test(${t}_device mpi 4) + endforeach() endif() -if (OOMPH_WITH_UCX) - foreach(t ${parallel_tests}) - reg_parallel_test(${t} ucx 4) - endforeach() +if(OOMPH_WITH_UCX) + foreach(t ${parallel_tests}) + reg_parallel_test(${t} ucx 4) + endforeach() + foreach(t ${device_tests}) + reg_parallel_test(${t}_device ucx 4) + endforeach() endif() -if (OOMPH_WITH_LIBFABRIC) - foreach(t ${parallel_tests}) - reg_parallel_test(${t} libfabric 4) - endforeach() +if(OOMPH_WITH_LIBFABRIC) + foreach(t ${parallel_tests}) + reg_parallel_test(${t} libfabric 4) + endforeach() + foreach(t ${device_tests}) + reg_parallel_test(${t}_device libfabric 4) + endforeach() endif() add_subdirectory(bindings) diff --git a/test/bindings/fortran/CMakeLists.txt 
b/test/bindings/fortran/CMakeLists.txt index 974d2f7c..2a5980c5 100644 --- a/test/bindings/fortran/CMakeLists.txt +++ b/test/bindings/fortran/CMakeLists.txt @@ -25,12 +25,17 @@ function(reg_parallel_test_f t_ lib n nthr) $ $ $) - add_test( - NAME ${t} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} - $ ${MPIEXEC_POSTFLAGS}) + if("${MPIEXEC_EXECUTABLE}" STREQUAL "") + add_test(NAME ${t} COMMAND $) + else() + add_test( + NAME ${t} + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} + $ ${MPIEXEC_POSTFLAGS}) + endif() set_tests_properties(${t} PROPERTIES - ENVIRONMENT OMP_NUM_THREADS=${nthr}) + ENVIRONMENT OMP_NUM_THREADS=${nthr} + LABELS "parallel-ranks-${n}") endfunction() if (OOMPH_WITH_MPI) diff --git a/test/test_send_recv.cpp b/test/test_send_recv.cpp index 0cfd1170..1326eecb 100644 --- a/test/test_send_recv.cpp +++ b/test/test_send_recv.cpp @@ -7,16 +7,21 @@ * Please, refer to the LICENSE file in the root directory. * SPDX-License-Identifier: BSD-3-Clause */ -#include +#ifdef TEST_DEVICE_MODE_ONLY +# ifdef HWMALLOC_ENABLE_DEVICE +# include +# endif +#endif + #include -#include "./mpi_runner/mpi_test_fixture.hpp" -#include -#include -#include +#include +// use this path because device version in build dir needs to find include #include +#include +#include "../test/mpi_runner/mpi_test_fixture.hpp" -#define NITERS 50 -#define SIZE 64 +#define NITERS 50 +#define SIZE 64 #define NTHREADS 4 std::vector> shared_received(NTHREADS); @@ -33,22 +38,22 @@ struct test_environment_base using tag_type = oomph::tag_type; using message = oomph::message_buffer; - oomph::context& ctxt; + oomph::context& ctxt; oomph::communicator comm; - rank_type speer_rank; - rank_type rpeer_rank; - int thread_id; - int num_threads; - tag_type tag; + rank_type speer_rank; + rank_type rpeer_rank; + int thread_id; + int num_threads; + tag_type tag; test_environment_base(oomph::context& c, int tid, int num_t) - : ctxt(c) - , 
comm(ctxt.get_communicator()) - , speer_rank((comm.rank() + 1) % comm.size()) - , rpeer_rank((comm.rank() + comm.size() - 1) % comm.size()) - , thread_id(tid) - , num_threads(num_t) - , tag(tid) + : ctxt(c) + , comm(ctxt.get_communicator()) + , speer_rank((comm.rank() + 1) % comm.size()) + , rpeer_rank((comm.rank() + comm.size() - 1) % comm.size()) + , thread_id(tid) + , num_threads(num_t) + , tag(tid) { } }; @@ -57,25 +62,26 @@ struct test_environment : public test_environment_base { using base = test_environment_base; - static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc, - rank_type* ptr) + static auto make_buffer( + oomph::communicator& comm, std::size_t size, bool user_alloc, rank_type* ptr) { - if (user_alloc) return comm.make_buffer(ptr, size); + if (user_alloc) + return comm.make_buffer(ptr, size); else return comm.make_buffer(size); } std::vector raw_smsg; std::vector raw_rmsg; - message smsg; - message rmsg; + message smsg; + message rmsg; test_environment(oomph::context& c, std::size_t size, int tid, int num_t, bool user_alloc) - : base(c, tid, num_t) - , raw_smsg(user_alloc ? size : 0) - , raw_rmsg(user_alloc ? size : 0) - , smsg(make_buffer(comm, size, user_alloc, raw_smsg.data())) - , rmsg(make_buffer(comm, size, user_alloc, raw_rmsg.data())) + : base(c, tid, num_t) + , raw_smsg(user_alloc ? size : 0) + , raw_rmsg(user_alloc ? 
size : 0) + , smsg(make_buffer(comm, size, user_alloc, raw_smsg.data())) + , rmsg(make_buffer(comm, size, user_alloc, raw_rmsg.data())) { fill_send_buffer(); fill_recv_buffer(); @@ -104,10 +110,11 @@ struct test_environment_device : public test_environment_base { using base = test_environment_base; - static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc, - rank_type* device_ptr) + static auto make_buffer( + oomph::communicator& comm, std::size_t size, bool user_alloc, rank_type* device_ptr) { - if (user_alloc) return comm.make_device_buffer(device_ptr, size, 0); + if (user_alloc) + return comm.make_device_buffer(device_ptr, size, 0); else return comm.make_device_buffer(size, 0); } @@ -120,37 +127,37 @@ struct test_environment_device : public test_environment_base if (size) m_ptr = hwmalloc::device_malloc(size * sizeof(rank_type)); } device_allocation(device_allocation&& other) - : m_ptr{std::exchange(other.m_ptr, nullptr)} + : m_ptr{std::exchange(other.m_ptr, nullptr)} { } ~device_allocation() { -#ifndef OOMPH_TEST_LEAK_GPU_MEMORY +# ifndef OOMPH_TEST_LEAK_GPU_MEMORY if (m_ptr) hwmalloc::device_free(m_ptr); -#endif +# endif } - rank_type* get() const noexcept { return (rank_type*)m_ptr; } + rank_type* get() const noexcept { return (rank_type*) m_ptr; } }; device_allocation raw_device_smsg; device_allocation raw_device_rmsg; - message smsg; - message rmsg; - - test_environment_device(oomph::context& c, std::size_t size, int tid, int num_t, - bool user_alloc) - : base(c, tid, num_t) -#ifndef OOMPH_TEST_LEAK_GPU_MEMORY - , raw_device_smsg(user_alloc ? size : 0) - , raw_device_rmsg(user_alloc ? 
size : 0) - , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get())) - , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get())) -#else - , raw_device_smsg(size) - , raw_device_rmsg(size) - , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get())) - , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get())) -#endif + message smsg; + message rmsg; + + test_environment_device( + oomph::context& c, std::size_t size, int tid, int num_t, bool user_alloc) + : base(c, tid, num_t) +# ifndef OOMPH_TEST_LEAK_GPU_MEMORY + , raw_device_smsg(user_alloc ? size : 0) + , raw_device_rmsg(user_alloc ? size : 0) + , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get())) + , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get())) +# else + , raw_device_smsg(size) + , raw_device_rmsg(size) + , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get())) + , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get())) +# endif { fill_send_buffer(); fill_recv_buffer(); @@ -178,9 +185,8 @@ struct test_environment_device : public test_environment_base }; #endif -template -void -launch_test(Func f) +template +void launch_test(Func f) { // single threaded { @@ -193,7 +199,7 @@ launch_test(Func f) // multi threaded { - oomph::context ctxt(MPI_COMM_WORLD, true); + oomph::context ctxt(MPI_COMM_WORLD, true); std::vector threads; threads.reserve(NTHREADS); reset_counters(); @@ -210,9 +216,9 @@ launch_test(Func f) // no callback // =========== -template -void -test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) +template +void test_send_recv( + oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) { Env env(ctxt, size, tid, num_threads, user_alloc); @@ -221,10 +227,7 @@ test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads, { auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag); auto sreq = env.comm.send(env.smsg, 
env.speer_rank, env.tag); - while (!(rreq.is_ready() && sreq.is_ready())) - { - env.comm.progress(); - }; + while (!(rreq.is_ready() && sreq.is_ready())) { env.comm.progress(); }; EXPECT_TRUE(env.check_recv_buffer()); env.fill_recv_buffer(); } @@ -250,19 +253,19 @@ test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads, } } -TEST_F(mpi_test_fixture, send_recv) -{ - launch_test(test_send_recv); -#if HWMALLOC_ENABLE_DEVICE - launch_test(test_send_recv); +#ifndef TEST_DEVICE_MODE_ONLY +TEST_F(mpi_test_fixture, send_recv) { launch_test(test_send_recv); } +#else +# if HWMALLOC_ENABLE_DEVICE +TEST_F(mpi_test_fixture, send_recv_device) { launch_test(test_send_recv); } +# endif #endif -} // callback: pass by l-value reference // =================================== -template -void -test_send_recv_cb(oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) +template +void test_send_recv_cb( + oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) { using rank_type = test_environment::rank_type; using tag_type = test_environment::tag_type; @@ -270,8 +273,8 @@ test_send_recv_cb(oomph::context& ctxt, std::size_t size, int tid, int num_threa Env env(ctxt, size, tid, num_threads, user_alloc); - volatile int received = 0; - volatile int sent = 0; + int volatile received = 0; + int volatile sent = 0; auto send_callback = [&](message const&, rank_type, tag_type) { ++sent; }; auto recv_callback = [&](message&, rank_type, tag_type) { ++received; }; @@ -317,20 +320,22 @@ test_send_recv_cb(oomph::context& ctxt, std::size_t size, int tid, int num_threa EXPECT_EQ(sent, NITERS); } -TEST_F(mpi_test_fixture, send_recv_cb) +#ifndef TEST_DEVICE_MODE_ONLY +TEST_F(mpi_test_fixture, send_recv_cb) { launch_test(test_send_recv_cb); } +#else +# if HWMALLOC_ENABLE_DEVICE +TEST_F(mpi_test_fixture, send_recv_cb_device) { - launch_test(test_send_recv_cb); -#if HWMALLOC_ENABLE_DEVICE launch_test(test_send_recv_cb); -#endif } 
+# endif +#endif // callback: pass by r-value reference (give up ownership) // ======================================================= -template -void -test_send_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int num_threads, - bool user_alloc) +template +void test_send_recv_cb_disown( + oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) { using rank_type = test_environment::rank_type; using tag_type = test_environment::tag_type; @@ -338,16 +343,14 @@ test_send_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int nu Env env(ctxt, size, tid, num_threads, user_alloc); - volatile int received = 0; - volatile int sent = 0; + int volatile received = 0; + int volatile sent = 0; - auto send_callback = [&](message msg, rank_type, tag_type) - { + auto send_callback = [&](message msg, rank_type, tag_type) { ++sent; env.smsg = std::move(msg); }; - auto recv_callback = [&](message msg, rank_type, tag_type) - { + auto recv_callback = [&](message msg, rank_type, tag_type) { ++received; env.rmsg = std::move(msg); }; @@ -393,20 +396,25 @@ test_send_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int nu EXPECT_EQ(sent, NITERS); } +#ifndef TEST_DEVICE_MODE_ONLY TEST_F(mpi_test_fixture, send_recv_cb_disown) { launch_test(test_send_recv_cb_disown); -#if HWMALLOC_ENABLE_DEVICE +} +#else +# if HWMALLOC_ENABLE_DEVICE +TEST_F(mpi_test_fixture, send_recv_cb_disown_device) +{ launch_test(test_send_recv_cb_disown); -#endif } +# endif +#endif // callback: pass by r-value reference (give up ownership), shared recv // ==================================================================== -template -void -test_send_shared_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int num_threads, - bool user_alloc) +template +void test_send_shared_recv_cb_disown( + oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) { using rank_type = test_environment::rank_type; using tag_type = 
test_environment::tag_type; @@ -416,19 +424,18 @@ test_send_shared_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, thread_id = env.thread_id; - //volatile int received = 0; - volatile int sent = 0; + // volatile int received = 0; + int volatile sent = 0; - auto send_callback = [&](message msg, rank_type, tag_type) - { + auto send_callback = [&](message msg, rank_type, tag_type) { ++sent; env.smsg = std::move(msg); }; - auto recv_callback = [&](message msg, rank_type, tag_type) - { - //std::cout << thread_id << " " << env.thread_id << std::endl; - //if (thread_id != env.thread_id) std::cout << "other thread picked up callback" << std::endl; - //else std::cout << "my thread picked up callback" << std::endl; + auto recv_callback = [&](message msg, rank_type, tag_type) { + // std::cout << thread_id << " " << env.thread_id << std::endl; + // if (thread_id != env.thread_id) std::cout << "other thread picked up + // callback" << std::endl; else std::cout << "my thread picked up callback" + // << std::endl; env.rmsg = std::move(msg); ++shared_received[env.thread_id]; }; @@ -475,20 +482,25 @@ test_send_shared_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, EXPECT_EQ(sent, NITERS); } +#ifndef TEST_DEVICE_MODE_ONLY TEST_F(mpi_test_fixture, send_shared_recv_cb_disown) { launch_test(test_send_shared_recv_cb_disown); -#if HWMALLOC_ENABLE_DEVICE +} +#else +# if HWMALLOC_ENABLE_DEVICE +TEST_F(mpi_test_fixture, send_shared_recv_cb_disown_device) +{ launch_test(test_send_shared_recv_cb_disown); -#endif } +# endif +#endif // callback: pass by l-value reference, and resubmit // ================================================= -template -void -test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int num_threads, - bool user_alloc) +template +void test_send_recv_cb_resubmit( + oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) { using rank_type = test_environment::rank_type; using tag_type = 
test_environment::tag_type; @@ -496,13 +508,13 @@ test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int Env env(ctxt, size, tid, num_threads, user_alloc); - volatile int received = 0; - volatile int sent = 0; + int volatile received = 0; + int volatile sent = 0; struct recursive_send_callback { - Env& env; - volatile int& sent; + Env& env; + int volatile& sent; void operator()(message& msg, rank_type dst, tag_type tag) { @@ -513,8 +525,8 @@ test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int struct recursive_recv_callback { - Env& env; - volatile int& received; + Env& env; + int volatile& received; void operator()(message& msg, rank_type src, tag_type tag) { @@ -531,20 +543,25 @@ test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int while (sent < NITERS || received < NITERS) { env.comm.progress(); }; } +#ifndef TEST_DEVICE_MODE_ONLY TEST_F(mpi_test_fixture, send_recv_cb_resubmit) { launch_test(test_send_recv_cb_resubmit); -#if HWMALLOC_ENABLE_DEVICE +} +#else +# if HWMALLOC_ENABLE_DEVICE +TEST_F(mpi_test_fixture, send_recv_cb_resubmit_device) +{ launch_test(test_send_recv_cb_resubmit); -#endif } +# endif +#endif // callback: pass by r-value reference (give up ownership), and resubmit // ===================================================================== -template -void -test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int tid, int num_threads, - bool user_alloc) +template +void test_send_recv_cb_resubmit_disown( + oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) { using rank_type = test_environment::rank_type; using tag_type = test_environment::tag_type; @@ -552,13 +569,13 @@ test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int ti Env env(ctxt, size, tid, num_threads, user_alloc); - volatile int received = 0; - volatile int sent = 0; + int volatile received = 0; + int volatile sent = 0; struct 
recursive_send_callback { - Env& env; - volatile int& sent; + Env& env; + int volatile& sent; void operator()(message msg, rank_type dst, tag_type tag) { @@ -570,8 +587,8 @@ test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int ti struct recursive_recv_callback { - Env& env; - volatile int& received; + Env& env; + int volatile& received; void operator()(message msg, rank_type src, tag_type tag) { @@ -590,10 +607,16 @@ test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int ti while (sent < NITERS || received < NITERS) { env.comm.progress(); }; } +#ifndef TEST_DEVICE_MODE_ONLY TEST_F(mpi_test_fixture, send_recv_cb_resubmit_disown) { launch_test(test_send_recv_cb_resubmit_disown); -#if HWMALLOC_ENABLE_DEVICE +} +#else +# if HWMALLOC_ENABLE_DEVICE +TEST_F(mpi_test_fixture, send_recv_cb_resubmit_disown_device) +{ launch_test(test_send_recv_cb_resubmit_disown); -#endif } +# endif +#endif