diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile
new file mode 100644
index 00000000..fe3e707f
--- /dev/null
+++ b/.cscs-ci/container/build.Containerfile
@@ -0,0 +1,20 @@
+ARG DEPS_IMAGE
+FROM $DEPS_IMAGE
+
+COPY . /oomph
+WORKDIR /oomph
+
+ARG BACKEND
+ARG NUM_PROCS
+RUN spack -e ci build-env oomph -- \
+        cmake -G Ninja -B build \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DOOMPH_WITH_TESTING=ON \
+            -DOOMPH_WITH_$(echo $BACKEND | tr '[:lower:]' '[:upper:]')=ON \
+            -DOOMPH_USE_BUNDLED_LIBS=ON \
+            -DOOMPH_USE_BUNDLED_HWMALLOC=OFF \
+            -DMPIEXEC_EXECUTABLE="" \
+            -DMPIEXEC_NUMPROC_FLAG="" \
+            -DMPIEXEC_PREFLAGS="" \
+            -DMPIEXEC_POSTFLAGS="" && \
+    spack -e ci build-env oomph -- cmake --build build -j$NUM_PROCS
diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile
new file mode 100644
index 00000000..f5867ac5
--- /dev/null
+++ b/.cscs-ci/container/deps.Containerfile
@@ -0,0 +1,24 @@
+ARG BASE_IMAGE
+FROM $BASE_IMAGE
+
+ARG SPACK_SHA
+RUN mkdir -p /opt/spack && \
+    curl -fLsS "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack
+
+ENV PATH="/opt/spack/bin:$PATH"
+
+ARG SPACK_PACKAGES_SHA
+RUN mkdir -p /opt/spack-packages && \
+    curl -fLsS "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages
+
+RUN spack repo remove --scope defaults:base builtin && \
+    spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin
+
+ARG SPACK_ENV_FILE
+COPY $SPACK_ENV_FILE /spack_environment/spack.yaml
+
+ARG NUM_PROCS
+RUN spack external find --all && \
+    spack env create ci /spack_environment/spack.yaml && \
+    spack -e ci concretize -f && \
+    spack -e ci install --jobs $NUM_PROCS --fail-fast --only=dependencies
diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml
new file mode 100644
index 00000000..c88a4522
--- /dev/null
+++ b/.cscs-ci/default.yaml
@@ -0,0 +1,192 @@
+include:
+  - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
+
+variables:
+  BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps4-dev
+  SPACK_SHA: v1.1.1
+  SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25
+  FF_TIMESTAMPS: true
+
+.build_deps_template:
+  timeout: 1 hour
+  before_script:
+    - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true
+    - export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16`
+    - export ENV_FILE_SHA=`sha256sum ${SPACK_ENV_FILE} | head -c 16`
+    - export CONFIG_TAG=`echo $DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA | sha256sum - | head -c 16`
+    - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-spack-deps-$BACKEND:$CONFIG_TAG
+    - echo -e "CONFIG_TAG=$CONFIG_TAG" >> base-${BACKEND}.env
+    - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env
+  variables:
+    DOCKERFILE: .cscs-ci/container/deps.Containerfile
+    DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]'
+    SPACK_ENV_FILE: .cscs-ci/spack/$BACKEND.yaml
+  artifacts:
+    reports:
+      dotenv: base-${BACKEND}.env
+
+# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
+# build_deps_nccl:
+#   variables:
+#     BACKEND: nccl
+#   extends:
+#     - .container-builder-cscs-gh200
+#     - .build_deps_template
+
+build_deps_mpi:
+  variables:
+    BACKEND: mpi
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_deps_template
+
+build_deps_ucx:
+  variables:
+    BACKEND: ucx
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_deps_template
+
+build_deps_libfabric:
+  variables:
+    BACKEND: libfabric
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_deps_template
+
+.build_template:
+  extends: .container-builder-cscs-gh200
+  timeout: 15 minutes
+  before_script:
+    - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true
+    - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-build-$BACKEND:$CI_COMMIT_SHA
+    - echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env
+  variables:
+    DOCKERFILE: .cscs-ci/container/build.Containerfile
+    DOCKER_BUILD_ARGS: '["DEPS_IMAGE", "BACKEND"]'
+  artifacts:
+    reports:
+      dotenv: build-${BACKEND}.env
+
+# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
+# build_nccl:
+#   variables:
+#     BACKEND: nccl
+#   extends: .build_template
+#   needs:
+#     - job: build_deps_nccl
+#       artifacts: true
+
+build_mpi:
+  variables:
+    BACKEND: mpi
+  extends: .build_template
+  needs:
+    - job: build_deps_mpi
+      artifacts: true
+
+build_ucx:
+  variables:
+    BACKEND: ucx
+  extends: .build_template
+  needs:
+    - job: build_deps_ucx
+      artifacts: true
+
+build_libfabric:
+  variables:
+    BACKEND: libfabric
+  extends: .build_template
+  needs:
+    - job: build_deps_libfabric
+      artifacts: true
+
+.test_template_base: # Slurm/PMIx settings shared by the serial and parallel test templates
+  extends: .container-runner-clariden-gh200
+  variables:
+    SLURM_JOB_NUM_NODES: 1
+    SLURM_GPUS_PER_TASK: 1
+    SLURM_TIMELIMIT: '5:00' # quoted so that it stays a string
+    SLURM_PARTITION: normal
+    SLURM_MPI_TYPE: pmix
+    SLURM_NETWORK: disable_rdzv_get
+    SLURM_LABELIO: 1
+    SLURM_UNBUFFEREDIO: 1
+    PMIX_MCA_psec: native
+    PMIX_MCA_gds: "^shmem2"
+    USE_MPI: 'NO' # quoted: YAML 1.1 parsers read a bare NO as boolean false
+
+.test_serial_template:
+  extends: .test_template_base
+  variables:
+    SLURM_NTASKS: 1
+  script:
+    - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8
+
+.test_parallel_template:
+  extends: .test_template_base
+  variables:
+    SLURM_NTASKS: 4
+  script:
+    # All ranks write ctest files to Testing, which can deadlock when writing
+    # inside the container, so rank 0 replaces it with a symlink to /tmp/Testing.
+    - if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; fi
+    - sleep 1
+    - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60
+
+# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
+# test_serial_nccl:
+#   extends: .test_serial_template
+#   needs:
+#     - job: build_nccl
+#       artifacts: true
+#   image: $BUILD_IMAGE
+
+# test_parallel_nccl:
+#   extends: .test_parallel_template
+#   needs:
+#     - job: build_nccl
+#       artifacts: true
+#   image: $BUILD_IMAGE
+
+test_serial_mpi:
+  extends: .test_serial_template
+  needs:
+    - job: build_mpi
+      artifacts: true
+  image: $BUILD_IMAGE
+
+test_parallel_mpi:
+  extends: .test_parallel_template
+  needs:
+    - job: build_mpi
+      artifacts: true
+  image: $BUILD_IMAGE
+
+test_serial_ucx:
+  extends: .test_serial_template
+  needs:
+    - job: build_ucx
+      artifacts: true
+  image: $BUILD_IMAGE
+
+test_parallel_ucx:
+  extends: .test_parallel_template
+  needs:
+    - job: build_ucx
+      artifacts: true
+  image: $BUILD_IMAGE
+
+test_serial_libfabric:
+  extends: .test_serial_template
+  needs:
+    - job: build_libfabric
+      artifacts: true
+  image: $BUILD_IMAGE
+
+test_parallel_libfabric:
+  extends: .test_parallel_template
+  needs:
+    - job: build_libfabric
+      artifacts: true
+  image: $BUILD_IMAGE
diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml
new file mode 100644
index 00000000..fac7f88f
--- /dev/null
+++ b/.cscs-ci/spack/libfabric.yaml
@@ -0,0 +1,6 @@
+spack:
+  specs:
+  - oomph@main backend=libfabric +cuda
+  view: false
+  concretizer:
+    unify: true
diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml
new file mode 100644
index 00000000..d59aab13
--- /dev/null
+++ b/.cscs-ci/spack/mpi.yaml
@@ -0,0 +1,6 @@
+spack:
+  specs:
+  - oomph@main backend=mpi +cuda
+  view: false
+  concretizer:
+    unify: true
diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml
new file mode 100644
index 00000000..94f0dd31
--- /dev/null
+++ b/.cscs-ci/spack/nccl.yaml
@@ -0,0 +1,6 @@
+spack:
+  specs:
+  - oomph@main backend=nccl +cuda
+  view: false
+  concretizer:
+    unify: true
diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml
new file mode 100644
index 00000000..51377dd8
--- /dev/null
+++ b/.cscs-ci/spack/ucx.yaml
@@ -0,0 +1,6 @@
+spack:
+  specs:
+  - oomph@main backend=ucx +cuda
+  view: false
+  concretizer:
+    unify: true
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90a582d1..3db53422 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,12 +1,6 @@
 cmake_minimum_required(VERSION 3.17)
 # CMake version is set at 3.17 because of find_package(CUDAToolkit)
 
-if (NOT ${CMAKE_VERSION} VERSION_LESS 3.27)
-    # new in 3.27: additionally use uppercase <PACKAGENAME>_ROOT
-    # environment and CMake variables for find_package
-    cmake_policy(SET CMP0144 NEW)
-endif()
-
 set(OOMPH_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 list(APPEND CMAKE_MODULE_PATH "${OOMPH_MODULE_PATH}")
 
@@ -28,6 +22,7 @@ endfunction()
 
 set_policy(CMP0074 NEW) # find_package uses XXX_ROOT vars using PackageName
 set_policy(CMP0144 NEW) # find_package allows XXX_ROOT vars using PACKAGENAME Uppercase
+set_policy(CMP0167 NEW) # find_package uses new boost config (boost 1.70 onwards)
 
 # ---------------------------------------------------------------------
 # CMake setup, C++ version, build type, modules, etc
@@ -92,7 +87,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.hpp.in
     ${CMAKE_CURRENT_BINARY_DIR}/include/oomph/config.hpp @ONLY)
 install(FILES ${PROJECT_BINARY_DIR}/include/oomph/config.hpp
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/oomph)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_config.inc.in 
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_config.inc.in
     ${CMAKE_CURRENT_BINARY_DIR}/include/oomph/cmake_config.inc)
 
 # ---------------------------------------------------------------------
diff --git a/cmake/config.hpp.in b/cmake/config.hpp.in
index 458b038a..e9fcf5e4 100644
--- a/cmake/config.hpp.in
+++ b/cmake/config.hpp.in
@@ -26,9 +26,12 @@
 
 #cmakedefine01 OOMPH_USE_FAST_PIMPL
 #cmakedefine01 OOMPH_ENABLE_BARRIER
+
+// clang-format off
 #define OOMPH_RECURSION_DEPTH @OOMPH_RECURSION_DEPTH@
 
 #define OOMPH_VERSION @OOMPH_VERSION_NUMERIC@
 #define OOMPH_VERSION_MAJOR @OOMPH_VERSION_MAJOR@
 #define OOMPH_VERSION_MINOR @OOMPH_VERSION_MINOR@
 #define OOMPH_VERSION_PATCH @OOMPH_VERSION_PATCH@
+// clang-format on
diff --git a/cmake/oomph_defs.hpp.in b/cmake/oomph_defs.hpp.in
index 70ae8732..a52a943f 100644
--- a/cmake/oomph_defs.hpp.in
+++ b/cmake/oomph_defs.hpp.in
@@ -15,7 +15,9 @@ namespace oomph
 {
     namespace fort
     {
+        // clang-format off
         using fp_type = @OOMPH_FORTRAN_FP@;
+        // clang-format on
         typedef enum {
             OomphBarrierGlobal=1,
             OomphBarrierThread=2,
diff --git a/cmake/oomph_libfabric.cmake b/cmake/oomph_libfabric.cmake
index 758f3f4d..1ddaf71d 100644
--- a/cmake/oomph_libfabric.cmake
+++ b/cmake/oomph_libfabric.cmake
@@ -1,176 +1,196 @@
 # set all libfabric related options and values
 
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # Enable libfabric support
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 set(OOMPH_WITH_LIBFABRIC OFF CACHE BOOL "Build with LIBFABRIC backend")
 
-if (OOMPH_WITH_LIBFABRIC)
-    find_package(Libfabric REQUIRED)
-    add_library(oomph_libfabric SHARED)
-    add_library(oomph::libfabric ALIAS oomph_libfabric)
-    oomph_shared_lib_options(oomph_libfabric)
-    target_link_libraries(oomph_libfabric PUBLIC libfabric::libfabric)
-    install(TARGETS oomph_libfabric
-        EXPORT oomph-targets
-        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
-
-    # ---------------------------------------------------------------------
-    # Function to add config defines to a list that depends on a namespace variable
-    # #defines that match the namespace can later be written out to a file
-    # ---------------------------------------------------------------------
-    function(oomph_libfabric_add_config_define_namespace)
-      set(options)
-      set(one_value_args DEFINE NAMESPACE)
-      set(multi_value_args VALUE)
-      cmake_parse_arguments(OPTION
-        "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN})
-
-      set(DEF_VAR OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE})
-
-      # to avoid extra trailing spaces (no value), use an if check
-      if(OPTION_VALUE)
-        set_property(GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE} ${OPTION_VALUE}")
-      else()
-        set_property(GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE}")
-      endif()
-
-    endfunction()
-
-    # ---------------------------------------------------------------------
-    # Function to write out all the config defines for a given namespace
-    # into a config file
-    # ---------------------------------------------------------------------
-    function(oomph_libfabric_write_config_defines_file)
-      set(options)
-      set(one_value_args TEMPLATE NAMESPACE FILENAME)
-      set(multi_value_args)
-      cmake_parse_arguments(OPTION
-        "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN})
-
-      get_property(DEFINITIONS_VAR GLOBAL PROPERTY
-        OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE})
-
-      if(DEFINED DEFINITIONS_VAR)
-        list(SORT DEFINITIONS_VAR)
-        list(REMOVE_DUPLICATES DEFINITIONS_VAR)
-      endif()
-
-      set(oomph_config_defines "\n")
-      foreach(def ${DEFINITIONS_VAR})
-        set(oomph_config_defines "${oomph_config_defines}#define ${def}\n")
-      endforeach()
-
-      # if the user has not specified a template, generate a proper header file
-      if (NOT OPTION_TEMPLATE)
-        string(TOUPPER ${OPTION_NAMESPACE} NAMESPACE_UPPER)
-        set(PREAMBLE
-          "\n"
-          "// Do not edit this file! It has been generated by the cmake configuration step.\n"
-          "\n"
-          "#ifndef OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n"
-          "#define OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n"
-        )
-        set(TEMP_FILENAME "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${NAMESPACE_UPPER}")
-        file(WRITE ${TEMP_FILENAME}
-            ${PREAMBLE}
-            ${oomph_config_defines}
-            "#endif\n"
-        )
-        configure_file("${TEMP_FILENAME}" "${OPTION_FILENAME}" COPYONLY)
-        file(REMOVE "${TEMP_FILENAME}")
-      else()
-        configure_file("${OPTION_TEMPLATE}"
-                       "${OPTION_FILENAME}"
-                       @ONLY)
-      endif()
-    endfunction()
-
-    include(CMakeParseArguments)
-
-    #------------------------------------------------------------------------------
-    # Hardware device selection
-    #------------------------------------------------------------------------------
-    set(OOMPH_LIBFABRIC_PROVIDER "tcp" CACHE
-        STRING "The provider (cxi(Cray Slingshot)/efa(Amazon Elastic)/gni(Cray Gemini)/psm2(Intel Omni-Path)/tcp/verbs(Infiniband))")
-    set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS
-        "cxi" "efa" "gni" "psm2" "tcp" "verbs")
+if(OOMPH_WITH_LIBFABRIC)
+  find_package(Libfabric REQUIRED)
+  add_library(oomph_libfabric SHARED)
+  add_library(oomph::libfabric ALIAS oomph_libfabric)
+  oomph_shared_lib_options(oomph_libfabric)
+  target_link_libraries(oomph_libfabric PUBLIC libfabric::libfabric)
+  install(TARGETS oomph_libfabric EXPORT oomph-targets
+          LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  )
+
+  # ---------------------------------------------------------------------
+  # Function to add config defines to a list that depends on a namespace
+  # variable. #defines that match the namespace can later be written out to a
+  # file
+  # ---------------------------------------------------------------------
+  function(oomph_libfabric_add_config_define_namespace)
+    set(options)
+    set(one_value_args DEFINE NAMESPACE)
+    set(multi_value_args VALUE)
+    cmake_parse_arguments(
+      OPTION "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}
+    )
 
-    oomph_libfabric_add_config_define_namespace(
-        DEFINE HAVE_LIBFABRIC_PROVIDER
-        VALUE  "\"${OOMPH_LIBFABRIC_PROVIDER}\""
-        NAMESPACE libfabric)
-
-      option(OOMPH_LIBFABRIC_V1_API "Support older libfabric@1.15" OFF)
-      if (OOMPH_LIBFABRIC_V1_API)
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE OOMPH_LIBFABRIC_V1_API
-            NAMESPACE libfabric)
-      endif()
-
-    if(OOMPH_LIBFABRIC_PROVIDER MATCHES "verbs")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_VERBS
-            NAMESPACE libfabric)
-    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "gni")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_GNI
-            NAMESPACE libfabric)
-        # add pmi library
-        set(_libfabric_libraries ${_libfabric_libraries} PMIx::libpmix)
-    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "cxi")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_CXI
-            NAMESPACE libfabric)
-    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "efa")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_EFA
-            NAMESPACE libfabric)
-    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "tcp")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_TCP
-            NAMESPACE libfabric)
-    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "sockets")
-        message(WARNING "The Sockets provider is deprecated in favor of the tcp, udp, "
-            "and utility providers")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_SOCKETS
-            NAMESPACE libfabric)
-    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "psm2")
-        oomph_libfabric_add_config_define_namespace(
-            DEFINE HAVE_LIBFABRIC_PSM2
-            NAMESPACE libfabric)
-    endif()
+    set(DEF_VAR OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE})
 
-    #------------------------------------------------------------------------------
-    # Performance counters
-    #------------------------------------------------------------------------------
-    set(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS OFF BOOL
-        STRING "Enable libfabric parcelport performance counters (default: OFF)")
-    mark_as_advanced(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS)
-
-    if (OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS)
-      oomph_libfabric_add_config_define_namespace(
-          DEFINE    OOMPH_LIBFABRIC_HAVE_PERFORMANCE_COUNTERS
-          NAMESPACE libfabric)
+    # to avoid extra trailing spaces (no value), use an if check
+    if(OPTION_VALUE)
+      set_property(
+        GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE} ${OPTION_VALUE}"
+      )
+    else()
+      set_property(GLOBAL APPEND PROPERTY ${DEF_VAR} "${OPTION_DEFINE}")
     endif()
 
-    #------------------------------------------------------------------------------
-    # used by template expansion for location of print.hpp
-    #------------------------------------------------------------------------------
-    set(OOMPH_SRC_LIBFABRIC_DIR "${PROJECT_SOURCE_DIR}/src/libfabric")
-
-    #------------------------------------------------------------------------------
-    # Write options to file in build dir
-    #------------------------------------------------------------------------------
-    oomph_libfabric_write_config_defines_file(
-        NAMESPACE libfabric
-        FILENAME  "${PROJECT_BINARY_DIR}/src/libfabric/oomph_libfabric_defines.hpp"
-        TEMPLATE  "${OOMPH_SRC_LIBFABRIC_DIR}/libfabric_defines_template.hpp"
+  endfunction()
+
+  # ---------------------------------------------------------------------
+  # Function to write out all the config defines for a given namespace into a
+  # config file
+  # ---------------------------------------------------------------------
+  function(oomph_libfabric_write_config_defines_file)
+    set(options)
+    set(one_value_args TEMPLATE NAMESPACE FILENAME)
+    set(multi_value_args)
+    cmake_parse_arguments(
+      OPTION "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}
     )
-    target_include_directories(oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}/src/libfabric")
-endif()
 
+    get_property(
+      DEFINITIONS_VAR GLOBAL
+      PROPERTY OOMPH_LIBFABRIC_CONFIG_DEFINITIONS_${OPTION_NAMESPACE}
+    )
+
+    if(DEFINED DEFINITIONS_VAR)
+      list(SORT DEFINITIONS_VAR)
+      list(REMOVE_DUPLICATES DEFINITIONS_VAR)
+    endif()
 
+    set(oomph_config_defines "\n")
+    foreach(def ${DEFINITIONS_VAR})
+      set(oomph_config_defines "${oomph_config_defines}#define ${def}\n")
+    endforeach()
 
+    # if the user has not specified a template, generate a proper header file
+    if(NOT OPTION_TEMPLATE)
+      string(TOUPPER ${OPTION_NAMESPACE} NAMESPACE_UPPER)
+      set(PREAMBLE
+          "\n"
+          "// Do not edit this file! It has been generated by the cmake configuration step.\n"
+          "\n"
+          "#ifndef OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n"
+          "#define OOMPH_LIBFABRIC_CONFIG_${NAMESPACE_UPPER}_HPP\n"
+      )
+      set(TEMP_FILENAME
+          "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${NAMESPACE_UPPER}"
+      )
+      file(WRITE ${TEMP_FILENAME} ${PREAMBLE} ${oomph_config_defines}
+                                  "#endif\n"
+      )
+      configure_file("${TEMP_FILENAME}" "${OPTION_FILENAME}" COPYONLY)
+      file(REMOVE "${TEMP_FILENAME}")
+    else()
+      configure_file("${OPTION_TEMPLATE}" "${OPTION_FILENAME}" @ONLY)
+    endif()
+  endfunction()
+
+  include(CMakeParseArguments)
+
+  # ------------------------------------------------------------------------------
+  # Hardware device selection
+  # ------------------------------------------------------------------------------
+  set(OOMPH_LIBFABRIC_PROVIDER
+      "tcp"
+      CACHE
+        STRING
+        "The provider cxi(Cray Slingshot)/efa(Amazon Elastic)/gni(Cray Gemini)/psm2(Intel Omni-Path)/tcp/verbs(Infiniband), shm, lnx"
+  )
+  set_property(
+    CACHE OOMPH_LIBFABRIC_PROVIDER
+    PROPERTY STRINGS
+             "cxi"
+             "efa"
+             "gni"
+             "psm2"
+             "tcp"
+             "verbs"
+             "shm"
+             "lnx"
+  )
+
+  oomph_libfabric_add_config_define_namespace(
+    DEFINE HAVE_LIBFABRIC_PROVIDER VALUE "\"${OOMPH_LIBFABRIC_PROVIDER}\""
+    NAMESPACE libfabric
+  )
+
+  option(OOMPH_LIBFABRIC_V1_API "Support older libfabric@1.15" OFF)
+  if(OOMPH_LIBFABRIC_V1_API)
+    oomph_libfabric_add_config_define_namespace(
+      DEFINE OOMPH_LIBFABRIC_V1_API NAMESPACE libfabric
+    )
+  endif()
+
+  # Map provider string to uppercase and create a define
+  string(TOUPPER "${OOMPH_LIBFABRIC_PROVIDER}" PROVIDER_UPPER)
+  oomph_libfabric_add_config_define_namespace(
+    DEFINE "HAVE_LIBFABRIC_${PROVIDER_UPPER}" NAMESPACE libfabric
+  )
+
+  # Special handling for the deprecated sockets provider.
+  #
+  # Its HAVE_LIBFABRIC_SOCKETS define already comes from the generic
+  # provider-to-define mapping, so only the deprecation warning is issued
+  # here.
+  #
+  # The text is passed as two adjacent quoted arguments. message()
+  # concatenates them without a separator, so the warning is printed as one
+  # sentence (the first argument ends in a space to keep the words apart).
+  #
+  # Do not merge the text into a single quoted argument that spans several
+  # source lines: the embedded newlines and the indentation would then
+  # become part of the warning text.
+  if(OOMPH_LIBFABRIC_PROVIDER STREQUAL "sockets")
+    message(
+      WARNING
+        "The Sockets provider is deprecated in favor of the tcp, udp, "
+        "and utility providers"
+    )
+  endif()
+
+  # Special handling for gni provider needing PMIx
+  if(OOMPH_LIBFABRIC_PROVIDER STREQUAL "gni")
+    set(_libfabric_libraries ${_libfabric_libraries} PMIx::libpmix)
+  endif()
+
+  # ------------------------------------------------------------------------------
+  # Performance counters
+  # ------------------------------------------------------------------------------
+  # Declare the switch as a CACHE BOOL. Without the CACHE keyword, set() stores
+  # the list "OFF;BOOL;STRING;<doc>" in a normal variable, which if() treats as
+  # true, so the performance-counter define would be emitted unconditionally.
+  set(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS OFF CACHE BOOL
+      "Enable libfabric performance counters (default: OFF)"
+  )
+  mark_as_advanced(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS)
+
+  if(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS)
+    oomph_libfabric_add_config_define_namespace(
+      DEFINE OOMPH_LIBFABRIC_HAVE_PERFORMANCE_COUNTERS NAMESPACE libfabric
+    )
+  endif()
+
+  # ------------------------------------------------------------------------------
+  # used by template expansion for location of print.hpp
+  # ------------------------------------------------------------------------------
+  set(OOMPH_SRC_LIBFABRIC_DIR "${PROJECT_SOURCE_DIR}/src/libfabric")
+
+  # ------------------------------------------------------------------------------
+  # Write options to file in build dir
+  # ------------------------------------------------------------------------------
+  oomph_libfabric_write_config_defines_file(
+    NAMESPACE libfabric FILENAME
+    "${PROJECT_BINARY_DIR}/src/libfabric/oomph_libfabric_defines.hpp" TEMPLATE
+    "${OOMPH_SRC_LIBFABRIC_DIR}/libfabric_defines_template.hpp"
+  )
+  target_include_directories(
+    oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}/src/libfabric"
+  )
+endif()
diff --git a/include/oomph/detail/communicator_helper.hpp b/include/oomph/detail/communicator_helper.hpp
index 6e0e97d5..8335c6eb 100644
--- a/include/oomph/detail/communicator_helper.hpp
+++ b/include/oomph/detail/communicator_helper.hpp
@@ -11,6 +11,7 @@
 
 #include <atomic>
 #include <boost/callable_traits.hpp>
+#include <oomph/message_buffer.hpp>
 #include <oomph/request.hpp>
 #include <oomph/util/pool_factory.hpp>
 //#include <oomph/util/tag_range.hpp>
@@ -33,7 +34,7 @@
 
 #define OOMPH_CHECK_CALLBACK_MSG_REF                                                               \
     static_assert(std::is_same<arg0_t, message_buffer<TT>&>::value ||                              \
-                      std::is_same<arg0_t, message_buffer<TT> const&>::value,                      \
+            std::is_same<arg0_t, message_buffer<TT> const&>::value,                                \
         "first callback argument type is not an l-value reference to a message_buffer");
 
 #define OOMPH_CHECK_CALLBACK_MSG_CONST_REF                                                         \
@@ -41,129 +42,107 @@
         "first callback argument type is not a const l-value reference to a message_buffer");
 
 #define OOMPH_CHECK_CALLBACK(CALLBACK)                                                             \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type)                                      \
-        OOMPH_CHECK_CALLBACK_MSG                                                                   \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) OOMPH_CHECK_CALLBACK_MSG}
 
 #define OOMPH_CHECK_CALLBACK_MULTI(CALLBACK)                                                       \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, tag_type)                         \
-        OOMPH_CHECK_CALLBACK_MSG                                                                   \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, tag_type) OOMPH_CHECK_CALLBACK_MSG}
 
 #define OOMPH_CHECK_CALLBACK_MULTI_TAGS(CALLBACK)                                                  \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, std::vector<tag_type>)            \
-        OOMPH_CHECK_CALLBACK_MSG                                                                   \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, std::vector<tag_type>)               \
+            OOMPH_CHECK_CALLBACK_MSG}
 
 #define OOMPH_CHECK_CALLBACK_REF(CALLBACK)                                                         \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type)                                      \
-        OOMPH_CHECK_CALLBACK_MSG_REF                                                               \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) OOMPH_CHECK_CALLBACK_MSG_REF}
 
 #define OOMPH_CHECK_CALLBACK_MULTI_REF(CALLBACK)                                                   \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, tag_type)                         \
-        OOMPH_CHECK_CALLBACK_MSG_REF                                                               \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, tag_type)                            \
+            OOMPH_CHECK_CALLBACK_MSG_REF}
 
 #define OOMPH_CHECK_CALLBACK_MULTI_REF_TAGS(CALLBACK)                                              \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, std::vector<tag_type>)            \
-        OOMPH_CHECK_CALLBACK_MSG_REF                                                               \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, std::vector<tag_type>)               \
+            OOMPH_CHECK_CALLBACK_MSG_REF}
 
 #define OOMPH_CHECK_CALLBACK_CONST_REF(CALLBACK)                                                   \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type)                                      \
-        OOMPH_CHECK_CALLBACK_MSG_CONST_REF                                                         \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, rank_type, tag_type) OOMPH_CHECK_CALLBACK_MSG_CONST_REF}
 
 #define OOMPH_CHECK_CALLBACK_MULTI_CONST_REF(CALLBACK)                                             \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, tag_type)                         \
-        OOMPH_CHECK_CALLBACK_MSG_CONST_REF                                                         \
-    }
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, tag_type)                            \
+            OOMPH_CHECK_CALLBACK_MSG_CONST_REF}
 
 #define OOMPH_CHECK_CALLBACK_MULTI_CONST_REF_TAGS(CALLBACK)                                        \
-    {                                                                                              \
-        OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, std::vector<tag_type>)            \
-        OOMPH_CHECK_CALLBACK_MSG_CONST_REF                                                         \
-    }
-
-namespace oomph
-{
-class communicator_impl;
-
-namespace detail
-{
-struct communicator_state
-{
-    using impl_type = communicator_impl;
-    impl_type*                              m_impl;
-    std::atomic<std::size_t>*               m_shared_scheduled_recvs;
-    util::pool_factory<multi_request_state> m_mrs_factory;
-    std::size_t                             scheduled_sends = 0;
-    std::size_t                             scheduled_recvs = 0;
-
-    communicator_state(impl_type* impl_, std::atomic<std::size_t>* shared_scheduled_recvs);
-    ~communicator_state();
-    communicator_state(communicator_state const&) = delete;
-    communicator_state(communicator_state&&) = delete;
-    communicator_state& operator=(communicator_state const&) = delete;
-    communicator_state& operator=(communicator_state&&) = delete;
-
-    auto make_multi_request_state(std::size_t ns) { return m_mrs_factory.make(m_impl, ns); }
-
-    template<typename T>
-    auto make_multi_request_state(std::vector<rank_type>&& neighs,
-        oomph::message_buffer<T> const&                    msg)
-    {
-        return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::vector<tag_type>{},
-            msg.size(), &msg);
-    }
-
-    template<typename T>
-    auto make_multi_request_state(std::vector<rank_type>&& neighs, std::vector<tag_type>&& tags,
-        oomph::message_buffer<T> const& msg)
-    {
-        return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags),
-            msg.size(), &msg);
-    }
-
-    template<typename T>
-    auto make_multi_request_state(std::vector<rank_type>&& neighs, oomph::message_buffer<T>& msg)
-    {
-        return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::vector<tag_type>{},
-            msg.size(), &msg);
-    }
-
-    template<typename T>
-    auto make_multi_request_state(std::vector<rank_type>&& neighs, std::vector<tag_type>&& tags,
-        oomph::message_buffer<T>& msg)
-    {
-        return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags),
-            msg.size(), &msg);
-    }
-
-    template<typename T>
-    auto make_multi_request_state(std::vector<rank_type>&& neighs, oomph::message_buffer<T>&& msg)
-    {
-        return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::vector<tag_type>{},
-            msg.size(), nullptr, std::move(msg.m));
-    }
-
-    template<typename T>
-    auto make_multi_request_state(std::vector<rank_type>&& neighs, std::vector<tag_type>&& tags,
-        oomph::message_buffer<T>&& msg)
-    {
-        return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags),
-            msg.size(), nullptr, std::move(msg.m));
-    }
-};
-
-} // namespace detail
-} // namespace oomph
+    {OOMPH_CHECK_CALLBACK_F(CALLBACK, std::vector<rank_type>, std::vector<tag_type>)               \
+            OOMPH_CHECK_CALLBACK_MSG_CONST_REF}
+
+namespace oomph {
+    class communicator_impl;
+
+    namespace detail {
+        struct communicator_state
+        {
+            using impl_type = communicator_impl;
+            impl_type* m_impl;
+            std::atomic<std::size_t>* m_shared_scheduled_recvs;
+            util::pool_factory<multi_request_state> m_mrs_factory;
+            std::size_t scheduled_sends = 0;
+            std::size_t scheduled_recvs = 0;
+
+            communicator_state(impl_type* impl_, std::atomic<std::size_t>* shared_scheduled_recvs);
+            ~communicator_state();
+            communicator_state(communicator_state const&) = delete;
+            communicator_state(communicator_state&&) = delete;
+            communicator_state& operator=(communicator_state const&) = delete;
+            communicator_state& operator=(communicator_state&&) = delete;
+
+            auto make_multi_request_state(std::size_t ns) { return m_mrs_factory.make(m_impl, ns); }
+
+            template <typename T>
+            auto make_multi_request_state(
+                std::vector<rank_type>&& neighs, oomph::message_buffer<T> const& msg)
+            {
+                return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs),
+                    std::vector<tag_type>{}, msg.size(), &msg);
+            }
+
+            template <typename T>
+            auto make_multi_request_state(std::vector<rank_type>&& neighs,
+                std::vector<tag_type>&& tags, oomph::message_buffer<T> const& msg)
+            {
+                return m_mrs_factory.make(
+                    m_impl, neighs.size(), std::move(neighs), std::move(tags), msg.size(), &msg);
+            }
+
+            template <typename T>
+            auto
+            make_multi_request_state(std::vector<rank_type>&& neighs, oomph::message_buffer<T>& msg)
+            {
+                return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs),
+                    std::vector<tag_type>{}, msg.size(), &msg);
+            }
+
+            template <typename T>
+            auto make_multi_request_state(std::vector<rank_type>&& neighs,
+                std::vector<tag_type>&& tags, oomph::message_buffer<T>& msg)
+            {
+                return m_mrs_factory.make(
+                    m_impl, neighs.size(), std::move(neighs), std::move(tags), msg.size(), &msg);
+            }
+
+            template <typename T>
+            auto make_multi_request_state(
+                std::vector<rank_type>&& neighs, oomph::message_buffer<T>&& msg)
+            {
+                return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs),
+                    std::vector<tag_type>{}, msg.size(), nullptr, std::move(msg.m));
+            }
+
+            template <typename T>
+            auto make_multi_request_state(std::vector<rank_type>&& neighs,
+                std::vector<tag_type>&& tags, oomph::message_buffer<T>&& msg)
+            {
+                return m_mrs_factory.make(m_impl, neighs.size(), std::move(neighs), std::move(tags),
+                    msg.size(), nullptr, std::move(msg.m));
+            }
+        };
+
+    }    // namespace detail
+}    // namespace oomph
diff --git a/src/libfabric/CMakeLists.txt b/src/libfabric/CMakeLists.txt
index c82e387d..92128897 100644
--- a/src/libfabric/CMakeLists.txt
+++ b/src/libfabric/CMakeLists.txt
@@ -19,4 +19,24 @@ list(TRANSFORM oomph_sources PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/../
 target_sources(oomph_libfabric PRIVATE ${oomph_sources_libfabric})
 target_sources(oomph_libfabric PRIVATE context.cpp)
 target_sources(oomph_libfabric PRIVATE operation_context.cpp)
-target_sources(oomph_libfabric PRIVATE locality.cpp)
+
+# if we are using GPU, then the libfabric library was probably built with
+# gpu support, and we should link to cuda to prevent link errors
+if(HWMALLOC_ENABLE_DEVICE)
+  include(CheckLanguage)
+  check_language(CUDA)
+
+  if(CMAKE_CUDA_COMPILER)
+    enable_language(CUDA)
+    # REQUIRED: fail here with a clear message, not on a missing CUDA::cudart
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(oomph_libfabric PRIVATE CUDA::cudart)
+  else()
+    # no return() here: the targets declared below must still be created
+    message(STATUS "No CUDA support")
+  endif()
+endif()
+
+add_executable(check_libfabric test/check_libfabric.cpp)
+target_link_libraries(check_libfabric PUBLIC oomph_libfabric)
+target_include_directories(check_libfabric PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/src/libfabric/communicator.hpp b/src/libfabric/communicator.hpp
index ff8fc945..6bec497b 100644
--- a/src/libfabric/communicator.hpp
+++ b/src/libfabric/communicator.hpp
@@ -14,131 +14,133 @@
 
 #include <boost/lockfree/queue.hpp>
 
-#include <oomph/context.hpp>
 #include <oomph/communicator.hpp>
+#include <oomph/context.hpp>
 
 // paths relative to backend
 #include <../communicator_base.hpp>
 #include <../device_guard.hpp>
+#include <context.hpp>
+#include <controller.hpp>
 #include <operation_context.hpp>
 #include <request_state.hpp>
-#include <controller.hpp>
-#include <context.hpp>
-
-namespace oomph
-{
 
-using operation_context = libfabric::operation_context;
+namespace oomph {
 
-using tag_disp = NS_DEBUG::detail::hex<12, uintptr_t>;
+    using operation_context = libfabric::operation_context;
 
-template<int Level>
-inline /*constexpr*/ NS_DEBUG::print_threshold<Level, 0> com_deb("COMMUNI");
+    using tag_disp = NS_DEBUG::detail::hex<12, uintptr_t>;
 
-static NS_DEBUG::enable_print<false> com_err("COMMUNI");
+    template <int Level>
+    inline NS_DEBUG::print_threshold<Level, 0> com_deb("COMMUNI");
 
-class communicator_impl : public communicator_base<communicator_impl>
-{
-    using tag_type = std::uint64_t;
-    //
-    using segment_type = libfabric::memory_segment;
-    using region_type = segment_type::handle_type;
+    static NS_DEBUG::enable_print<false> com_err("COMMUNI");
 
-    using callback_queue = boost::lockfree::queue<detail::request_state*,
-        boost::lockfree::fixed_sized<false>, boost::lockfree::allocator<std::allocator<void>>>;
-
-  public:
-    context_impl*               m_context;
-    libfabric::endpoint_wrapper m_tx_endpoint;
-    libfabric::endpoint_wrapper m_rx_endpoint;
-    //
-    callback_queue m_send_cb_queue;
-    callback_queue m_recv_cb_queue;
-    callback_queue m_recv_cb_cancel;
-
-    // --------------------------------------------------------------------
-    communicator_impl(context_impl* ctxt)
-    : communicator_base(ctxt)
-    , m_context(ctxt)
-    , m_send_cb_queue(128)
-    , m_recv_cb_queue(128)
-    , m_recv_cb_cancel(8)
+    class communicator_impl : public communicator_base<communicator_impl>
     {
-        LF_DEB(com_deb<9>, debug(NS_DEBUG::str<>("MPI_comm"), NS_DEBUG::ptr(mpi_comm())));
-        m_tx_endpoint = m_context->get_controller()->get_tx_endpoint();
-        m_rx_endpoint = m_context->get_controller()->get_rx_endpoint();
-    }
+        using tag_type = std::uint64_t;
+        //
+        using segment_type = libfabric::memory_segment;
+        using region_type = segment_type::handle_type;
+
+        using callback_queue = boost::lockfree::queue<detail::request_state*,
+            boost::lockfree::fixed_sized<false>, boost::lockfree::allocator<std::allocator<void>>>;
+
+    public:
+        context_impl* m_context;
+        libfabric::endpoint_wrapper m_tx_endpoint;
+        libfabric::endpoint_wrapper m_rx_endpoint;
+        //
+        callback_queue m_send_cb_queue;
+        callback_queue m_recv_cb_queue;
+        callback_queue m_recv_cb_cancel;
+
+        // --------------------------------------------------------------------
+        communicator_impl(context_impl* ctxt)
+          : communicator_base(ctxt)
+          , m_context(ctxt)
+          , m_send_cb_queue(128)
+          , m_recv_cb_queue(128)
+          , m_recv_cb_cancel(8)
+        {
+            LF_DEB(com_deb<9>, debug(str<>("MPI_comm"), hptr(mpi_comm())));
+            m_tx_endpoint = m_context->get_controller()->get_tx_endpoint();
+            m_rx_endpoint = m_context->get_controller()->get_rx_endpoint();
+        }
 
-    // --------------------------------------------------------------------
-    ~communicator_impl() { clear_callback_queues(); }
+        // --------------------------------------------------------------------
+        ~communicator_impl() { clear_callback_queues(); }
 
-    // --------------------------------------------------------------------
-    auto& get_heap() noexcept { return m_context->get_heap(); }
+        // --------------------------------------------------------------------
+        auto& get_heap() noexcept { return m_context->get_heap(); }
 
-    // --------------------------------------------------------------------
-    /// generate a tag with 0xRRRRRRRRtttttttt rank, tag.
-    /// original tag can be 32bits, then we add 32bits of rank info.
-    inline std::uint64_t make_tag64(std::uint32_t tag, /*std::uint32_t rank, */ std::uintptr_t ctxt)
-    {
-        return (((ctxt & 0x0000000000FFFFFF) << 24) | ((std::uint64_t(tag) & 0x0000000000FFFFFF)));
-    }
+        // --------------------------------------------------------------------
+        /// generate a 48-bit tag 0x0000CCCCCCtttttt: context tag (C) above message tag (t).
+        /// only the low 24 bits of each input are kept; the rank is not encoded.
+        inline std::uint64_t make_tag64(
+            std::uint32_t tag, /*std::uint32_t rank, */ std::uintptr_t ctxt)
+        {
+            return (((ctxt & 0x0000'0000'00FF'FFFF) << 24) |
+                ((std::uint64_t(tag) & 0x0000'0000'00FF'FFFF)));
+        }
 
-    // --------------------------------------------------------------------
-    template<typename Func, typename... Args>
-    inline void execute_fi_function(Func F, const char* msg, Args&&... args)
-    {
-        bool ok = false;
-        while (!ok)
+        // --------------------------------------------------------------------
+        template <typename Func, typename... Args>
+        inline void execute_fi_function(Func F, char const* msg, Args&&... args)
         {
-            ssize_t ret = F(std::forward<Args>(args)...);
-            if (ret == 0) { return; }
-            else if (ret == -FI_EAGAIN)
-            {
-                // com_deb<9>.error("Reposting", msg);
-                // no point stressing the system
-                m_context->get_controller()->poll_for_work_completions(this);
-            }
-            else if (ret == -FI_ENOENT)
+            bool ok = false;
+            while (!ok)
             {
-                // if a node has failed, we can recover
-                // @TODO : put something better here
-                com_err.error("No destination endpoint, terminating.");
-                std::terminate();
+                ssize_t ret = F(std::forward<Args>(args)...);
+                if (ret == 0) { return; }
+                else if (ret == -FI_EAGAIN)
+                {
+                    // com_deb<9>.error("Reposting", msg);
+                    // no point stressing the system
+                    m_context->get_controller()->poll_for_work_completions(this);
+                }
+                else if (ret == -FI_ENOENT)
+                {
+                    // destination endpoint not found (e.g. a node has failed): recovery is
+                    // not implemented, so terminate. @TODO : put something better here
+                    com_err.error("No destination endpoint, terminating.");
+                    std::terminate();
+                }
+                else if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), msg); }
             }
-            else if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), msg); }
         }
-    }
 
-    // --------------------------------------------------------------------
-    // this takes a pinned memory region and sends it
-    void send_tagged_region(region_type const& send_region, std::size_t size, fi_addr_t dst_addr_,
-        uint64_t tag_, operation_context* ctxt)
-    {
-        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-        // clang-format off
+        // --------------------------------------------------------------------
+        // this takes a pinned memory region and sends it
+        void send_tagged_region(region_type const& send_region, std::size_t size,
+            fi_addr_t dst_addr_, uint64_t tag_, operation_context* ctxt)
+        {
+            [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+            // clang-format off
         LF_DEB(com_deb<9>,
-            debug(NS_DEBUG::str<>("send_tagged_region"),
-                  "->", NS_DEBUG::dec<2>(dst_addr_),
+            debug(str<>("send_tagged_region"),
+                  "->", dec<2>(dst_addr_),
                   send_region,
                   "tag", tag_disp(tag_),
-                  "context", NS_DEBUG::ptr(ctxt),
-                  "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep())));
-        // clang-format on
-        execute_fi_function(fi_tsend, "fi_tsend", m_tx_endpoint.get_ep(), send_region.get_address(),
-            size, send_region.get_local_key(), dst_addr_, tag_, ctxt);
-    }
+                  "context", hptr(ctxt),
+                  "tx endpoint", hptr(m_tx_endpoint.get_ep())));
+            // clang-format on
+            execute_fi_function(fi_tsend, "fi_tsend", m_tx_endpoint.get_ep(),
+                send_region.get_address(), size, send_region.get_local_key(), dst_addr_, tag_,
+                ctxt);
+        }
 
-    // --------------------------------------------------------------------
-    // this takes a pinned memory region and sends it using inject instead of send
-    void inject_tagged_region(region_type const& send_region, std::size_t size, fi_addr_t dst_addr_,
-        uint64_t tag_)
-    {
-        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-        // clang-format on
-        LF_DEB(com_deb<9>,
-            debug(NS_DEBUG::str<>("inject tagged"), "->", NS_DEBUG::dec<2>(dst_addr_), send_region,
-                "tag", tag_disp(tag_), "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep())));
-        // clang-format off
+        // --------------------------------------------------------------------
+        // this takes a pinned memory region and sends it using inject instead of send
+        void inject_tagged_region(
+            region_type const& send_region, std::size_t size, fi_addr_t dst_addr_, uint64_t tag_)
+        {
+            [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+            // clang-format on
+            LF_DEB(com_deb<9>,
+                debug(str<>("inject tagged"), "->", dec<2>(dst_addr_), send_region, "tag",
+                    tag_disp(tag_), "tx endpoint", hptr(m_tx_endpoint.get_ep())));
+            // clang-format off
         execute_fi_function(fi_tinject, "fi_tinject", m_tx_endpoint.get_ep(),
             send_region.get_address(), size, dst_addr_, tag_);
     }
@@ -150,285 +152,283 @@ class communicator_impl : public communicator_base<communicator_impl>
     void recv_tagged_region(region_type const& recv_region, std::size_t size, fi_addr_t src_addr_,
         uint64_t tag_, operation_context* ctxt)
     {
-        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
+        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
         // clang-format off
         LF_DEB(com_deb<1>,
-            debug(NS_DEBUG::str<>("recv_tagged_region"),
-                  "<-", NS_DEBUG::dec<2>(src_addr_),
+            debug(str<>("recv_tagged_region"),
+                  "<-", dec<2>(src_addr_),
                   recv_region,
                   "tag", tag_disp(tag_),
-                  "context", NS_DEBUG::ptr(ctxt),
-                  "rx endpoint", NS_DEBUG::ptr(m_rx_endpoint.get_ep())));
-        // clang-format on
-        constexpr uint64_t ignore = 0;
-        execute_fi_function(fi_trecv, "fi_trecv", m_rx_endpoint.get_ep(), recv_region.get_address(),
-            size, recv_region.get_local_key(), src_addr_, tag_, ignore, ctxt);
-        // if (l.owns_lock()) l.unlock();
-    }
+                  "context", hptr(ctxt),
+                  "rx endpoint", hptr(m_rx_endpoint.get_ep())));
+            // clang-format on
+            constexpr uint64_t ignore = 0;
+            execute_fi_function(fi_trecv, "fi_trecv", m_rx_endpoint.get_ep(),
+                recv_region.get_address(), size, recv_region.get_local_key(), src_addr_, tag_,
+                ignore, ctxt);
+            // if (l.owns_lock()) l.unlock();
+        }
 
-    // --------------------------------------------------------------------
-    send_request send(context_impl::heap_type::pointer const& ptr, std::size_t size, rank_type dst,
-        oomph::tag_type tag, util::unique_function<void(rank_type, oomph::tag_type)>&& cb,
-        std::size_t* scheduled)
-    {
-        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-        std::uint64_t stag = make_tag64(tag, /*this->rank(), */ this->m_context->get_context_tag());
+        // --------------------------------------------------------------------
+        send_request send(context_impl::heap_type::pointer const& ptr, std::size_t size,
+            rank_type dst, oomph::tag_type tag,
+            util::unique_function<void(rank_type, oomph::tag_type)>&& cb, std::size_t* scheduled)
+        {
+            [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+            std::uint64_t stag =
+                make_tag64(tag, /*this->rank(), */ this->m_context->get_context_tag());
 
 #if OOMPH_ENABLE_DEVICE
-        auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle();
+            auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle();
 #else
-        auto const& reg = ptr.handle();
+            auto const& reg = ptr.handle();
 #endif
 
 #ifdef EXTRA_SIZE_CHECKS
-        if (size != reg.get_size())
-        {
-            LF_DEB(com_err, error(NS_DEBUG::str<>("send mismatch"), "size", NS_DEBUG::hex<6>(size),
-                                "reg size", NS_DEBUG::hex<6>(reg.get_size())));
-        }
-#endif
-        m_context->get_controller()->sends_posted_++;
-
-        // use optimized inject if msg is very small
-        if (size <= m_context->get_controller()->get_tx_inject_size())
-        {
-            inject_tagged_region(reg, size, fi_addr_t(dst), stag);
-            if (!has_reached_recursion_depth())
+            if (size != reg.get_size())
             {
-                auto inc = recursion();
-                cb(dst, tag);
-                return {};
+                LF_DEB(com_err,
+                    error(str<>("send mismatch"), "size", hex<6>(size), "reg size",
+                        hex<6>(reg.get_size())));
             }
-            else
+#endif
+            m_context->get_controller()->sends_posted_++;
+
+            // use optimized inject if msg is very small
+            if (size <= m_context->get_controller()->get_tx_inject_size())
             {
-                // construct request which is also an operation context
-                auto s =
-                    m_req_state_factory.make(m_context, this, scheduled, dst, tag, std::move(cb));
-                s->create_self_ref();
-                while (!m_send_cb_queue.push(s.get())) {}
-                return {std::move(s)};
+                inject_tagged_region(reg, size, fi_addr_t(dst), stag);
+                if (!has_reached_recursion_depth())
+                {
+                    auto inc = recursion();
+                    cb(dst, tag);
+                    return {};
+                }
+                else
+                {
+                    // construct request which is also an operation context
+                    auto s = m_req_state_factory.make(
+                        m_context, this, scheduled, dst, tag, std::move(cb));
+                    s->create_self_ref();
+                    while (!m_send_cb_queue.push(s.get())) {}
+                    return {std::move(s)};
+                }
             }
-        }
 
-        // construct request which is also an operation context
-        auto s = m_req_state_factory.make(m_context, this, scheduled, dst, tag, std::move(cb));
-        s->create_self_ref();
+            // construct request which is also an operation context
+            auto s = m_req_state_factory.make(m_context, this, scheduled, dst, tag, std::move(cb));
+            s->create_self_ref();
 
-        // clang-format off
+            // clang-format off
         LF_DEB(com_deb<9>,
-            debug(NS_DEBUG::str<>("Send"),
-                  "thisrank", NS_DEBUG::dec<>(rank()),
-                  "rank", NS_DEBUG::dec<>(dst),
+            debug(str<>("Send"),
+                  "thisrank", dec<>(rank()),
+                  "rank", dec<>(dst),
                   "tag", tag_disp(std::uint64_t(tag)),
                   //"wrapped tag", tag_disp(std::uint64_t(tag.get())),
                   "stag", tag_disp(stag),
-                  "addr", NS_DEBUG::ptr(reg.get_address()),
-                  "size", NS_DEBUG::hex<6>(size),
-                  "reg size", NS_DEBUG::hex<6>(reg.get_size()),
-                  "op_ctx", NS_DEBUG::ptr(&(s->m_operation_context)),
-                  "req", NS_DEBUG::ptr(s.get())));
+                  "addr", hptr(reg.get_address()),
+                  "size", hex<6>(size),
+                  "reg size", hex<6>(reg.get_size()),
+                  "op_ctx", hptr(&(s->m_operation_context)),
+                  "req", hptr(s.get())));
 #if OOMPH_ENABLE_DEVICE
         if (!ptr.on_device()) {
             LF_DEB(com_deb<9>,
-                debug(NS_DEBUG::str<>("send region CRC32"),
-                      NS_DEBUG::mem_crc32(reg.get_address(), size, "CRC32")));
+                debug(str<>("send region CRC32"),
+                      mem_crc32(reg.get_address(), size, "CRC32")));
         }
 #endif
-        // clang-format on
+            // clang-format on
 
-        send_tagged_region(reg, size, fi_addr_t(dst), stag, &(s->m_operation_context));
-        return {std::move(s)};
-    }
+            send_tagged_region(reg, size, fi_addr_t(dst), stag, &(s->m_operation_context));
+            return {std::move(s)};
+        }
 
-    recv_request recv(context_impl::heap_type::pointer& ptr, std::size_t size, rank_type src,
-        oomph::tag_type tag, util::unique_function<void(rank_type, oomph::tag_type)>&& cb,
-        std::size_t* scheduled)
-    {
-        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-        std::uint64_t         stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag());
+        recv_request recv(context_impl::heap_type::pointer& ptr, std::size_t size, rank_type src,
+            oomph::tag_type tag, util::unique_function<void(rank_type, oomph::tag_type)>&& cb,
+            std::size_t* scheduled)
+        {
+            [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+            std::uint64_t stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag());
 
 #if OOMPH_ENABLE_DEVICE
-        auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle();
+            auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle();
 #else
-        auto const& reg = ptr.handle();
+            auto const& reg = ptr.handle();
 #endif
 
 #ifdef EXTRA_SIZE_CHECKS
-        if (size != reg.get_size())
-        {
-            LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size),
-                                "reg size", NS_DEBUG::hex<6>(reg.get_size())));
-        }
+            if (size != reg.get_size())
+            {
+                LF_DEB(com_err,
+                    error(str<>("recv mismatch"), "size", hex<6>(size), "reg size",
+                        hex<6>(reg.get_size())));
+            }
 #endif
-        m_context->get_controller()->recvs_posted_++;
+            m_context->get_controller()->recvs_posted_++;
 
-        // construct request which is also an operation context
-        auto s = m_req_state_factory.make(m_context, this, scheduled, src, tag, std::move(cb));
-        s->create_self_ref();
+            // construct request which is also an operation context
+            auto s = m_req_state_factory.make(m_context, this, scheduled, src, tag, std::move(cb));
+            s->create_self_ref();
 
-        // clang-format off
+            // clang-format off
         LF_DEB(com_deb<9>,
-            debug(NS_DEBUG::str<>("recv"),
-                  "thisrank", NS_DEBUG::dec<>(rank()),
-                  "rank", NS_DEBUG::dec<>(src),
+            debug(str<>("recv"),
+                  "thisrank", dec<>(rank()),
+                  "rank", dec<>(src),
                   "tag", tag_disp(std::uint64_t(tag)),
                   //"wrapped tag", tag_disp(std::uint64_t(tag.get())),
                   "stag", tag_disp(stag),
-                  "addr", NS_DEBUG::ptr(reg.get_address()),
-                  "size", NS_DEBUG::hex<6>(size),
-                  "reg size", NS_DEBUG::hex<6>(reg.get_size()),
-                  "op_ctx", NS_DEBUG::ptr(&(s->m_operation_context)),
-                  "req", NS_DEBUG::ptr(s.get())));
+                  "addr", hptr(reg.get_address()),
+                  "size", hex<6>(size),
+                  "reg size", hex<6>(reg.get_size()),
+                  "op_ctx", hptr(&(s->m_operation_context)),
+                  "req", hptr(s.get())));
 #if OOMPH_ENABLE_DEVICE
         if (!ptr.on_device()) {
             LF_DEB(com_deb<9>,
-                debug(NS_DEBUG::str<>("recv region CRC32"),
-                      NS_DEBUG::mem_crc32(reg.get_address(), size, "CRC32")));
+                debug(str<>("recv region CRC32"),
+                      mem_crc32(reg.get_address(), size, "CRC32")));
         }
 #endif
-        // clang-format on
+            // clang-format on
 
-        recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context));
-        return {std::move(s)};
-    }
+            recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context));
+            return {std::move(s)};
+        }
 
-    shared_recv_request shared_recv(context_impl::heap_type::pointer& ptr, std::size_t size,
-        rank_type src, oomph::tag_type tag,
-        util::unique_function<void(rank_type, oomph::tag_type)>&& cb,
-        std::atomic<std::size_t>*                                 scheduled)
-    {
-        [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-        std::uint64_t         stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag());
+        shared_recv_request shared_recv(context_impl::heap_type::pointer& ptr, std::size_t size,
+            rank_type src, oomph::tag_type tag,
+            util::unique_function<void(rank_type, oomph::tag_type)>&& cb,
+            std::atomic<std::size_t>* scheduled)
+        {
+            [[maybe_unused]] auto scp = com_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+            std::uint64_t stag = make_tag64(tag, /*src, */ this->m_context->get_context_tag());
 
 #if OOMPH_ENABLE_DEVICE
-        auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle();
+            auto const& reg = ptr.on_device() ? ptr.device_handle() : ptr.handle();
 #else
-        auto const& reg = ptr.handle();
+            auto const& reg = ptr.handle();
 #endif
 
 #ifdef EXTRA_SIZE_CHECKS
-        if (size != reg.get_size())
-        {
-            LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size),
-                                "reg size", NS_DEBUG::hex<6>(reg.get_size())));
-        }
+            if (size != reg.get_size())
+            {
+                LF_DEB(com_err,
+                    error(str<>("recv mismatch"), "size", hex<6>(size), "reg size",
+                        hex<6>(reg.get_size())));
+            }
 #endif
-        m_context->get_controller()->recvs_posted_++;
+            m_context->get_controller()->recvs_posted_++;
 
-        // construct request which is also an operation context
-        auto s = std::make_shared<detail::shared_request_state>(m_context, this, scheduled, src,
-            tag, std::move(cb));
-        s->create_self_ref();
+            // construct request which is also an operation context
+            auto s = std::make_shared<detail::shared_request_state>(
+                m_context, this, scheduled, src, tag, std::move(cb));
+            s->create_self_ref();
 
-        // clang-format off
+            // clang-format off
         LF_DEB(com_deb<9>,
-            debug(NS_DEBUG::str<>("shared_recv"),
-                  "thisrank", NS_DEBUG::dec<>(rank()),
-                  "rank", NS_DEBUG::dec<>(src),
+            debug(str<>("shared_recv"),
+                  "thisrank", dec<>(rank()),
+                  "rank", dec<>(src),
                   "tag", tag_disp(std::uint64_t(tag)),
                   //"wrapped tag", tag_disp(std::uint64_t(tag.get())),
                   "stag", tag_disp(stag),
-                  "addr", NS_DEBUG::ptr(reg.get_address()),
-                  "size", NS_DEBUG::hex<6>(size),
-                  "reg size", NS_DEBUG::hex<6>(reg.get_size()),
-                  "op_ctx", NS_DEBUG::ptr(&(s->m_operation_context)),
-                  "req", NS_DEBUG::ptr(s.get())));
-        // clang-format on
-
-        recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context));
-        m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this);
-        return {std::move(s)};
-    }
+                  "addr", hptr(reg.get_address()),
+                  "size", hex<6>(size),
+                  "reg size", hex<6>(reg.get_size()),
+                  "op_ctx", hptr(&(s->m_operation_context)),
+                  "req", hptr(s.get())));
+            // clang-format on
+
+            recv_tagged_region(reg, size, fi_addr_t(src), stag, &(s->m_operation_context));
+            m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this);
+            return {std::move(s)};
+        }
 
-    void progress()
-    {
-        m_context->get_controller()->poll_for_work_completions(this);
-        clear_callback_queues();
-    }
+        void progress()
+        {
+            m_context->get_controller()->poll_for_work_completions(this);
+            clear_callback_queues();
+        }
 
-    void clear_callback_queues()
-    {
-        // work through ready callbacks, which were pushed to the queue
-        // (by other threads)
-        m_send_cb_queue.consume_all(
-            [](oomph::detail::request_state* req)
-            {
+        void clear_callback_queues()
+        {
+            // work through ready callbacks, which were pushed to the queue
+            // (by other threads)
+            m_send_cb_queue.consume_all([](oomph::detail::request_state* req) {
                 [[maybe_unused]] auto scp =
-                    com_deb<9>.scope("m_send_cb_queue.consume_all", NS_DEBUG::ptr(req));
+                    com_deb<9>.scope("m_send_cb_queue.consume_all", NS_DEBUG::hptr(req));
                 auto ptr = req->release_self_ref();
                 req->invoke_cb();
             });
 
-        m_recv_cb_queue.consume_all(
-            [](oomph::detail::request_state* req)
-            {
+            m_recv_cb_queue.consume_all([](oomph::detail::request_state* req) {
                 [[maybe_unused]] auto scp =
-                    com_deb<9>.scope("m_recv_cb_queue.consume_all", NS_DEBUG::ptr(req));
+                    com_deb<9>.scope("m_recv_cb_queue.consume_all", NS_DEBUG::hptr(req));
                 auto ptr = req->release_self_ref();
                 req->invoke_cb();
             });
-        m_context->m_recv_cb_queue.consume_all(
-            [](detail::shared_request_state* req)
-            {
+            m_context->m_recv_cb_queue.consume_all([](detail::shared_request_state* req) {
                 auto ptr = req->release_self_ref();
                 req->invoke_cb();
             });
-    }
+        }
 
-    // Cancel is a problem with libfabric because fi_cancel is asynchronous.
-    // The item to be cancelled will either complete with CANCELLED status
-    // or will complete as usual (ie before the cancel could take effect)
-    //
-    // We can only be certain if we poll until the completion happens
-    // or attach a callback to the cancel notification which is not supported
-    // by oomph.
-    bool cancel_recv(detail::request_state* s)
-    {
-        // get the original message operation context
-        operation_context* op_ctx = &(s->m_operation_context);
+        // Cancel is a problem with libfabric because fi_cancel is asynchronous.
+        // The item to be cancelled will either complete with CANCELLED status
+        // or will complete as usual (i.e. before the cancel could take effect).
+        //
+        // We can only be certain if we poll until the completion happens
+        // or attach a callback to the cancel notification, which is not supported
+        // by oomph.
+        bool cancel_recv(detail::request_state* s)
+        {
+            // get the original message operation context
+            operation_context* op_ctx = &(s->m_operation_context);
 
-        // submit the cancellation request
-        bool ok = (fi_cancel(&m_rx_endpoint.get_ep()->fid, op_ctx) == 0);
-        LF_DEB(com_deb<9>,
-            debug(NS_DEBUG::str<>("Cancel"), "ok", ok, "op_ctx", NS_DEBUG::ptr(op_ctx)));
+            // submit the cancellation request
+            bool ok = (fi_cancel(&m_rx_endpoint.get_ep()->fid, op_ctx) == 0);
+            LF_DEB(com_deb<9>, debug(str<>("Cancel"), "ok", ok, "op_ctx", hptr(op_ctx)));
 
-        // if the cancel operation failed completely, return
-        if (!ok) return false;
+            // if the cancel operation failed completely, return
+            if (!ok) return false;
 
-        bool found = false;
-        while (!found)
-        {
-            m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this);
-            // otherwise, poll until we know if it worked
-            std::stack<detail::request_state*> temp_stack;
-            detail::request_state*             temp;
-            while (!found && m_recv_cb_cancel.pop(temp))
+            bool found = false;
+            while (!found)
             {
-                if (temp == s)
+                m_context->get_controller()->poll_recv_queue(m_rx_endpoint.get_rx_cq(), this);
+                // otherwise, poll until we know if it worked
+                std::stack<detail::request_state*> temp_stack;
+                detail::request_state* temp;
+                while (!found && m_recv_cb_cancel.pop(temp))
                 {
-                    // our recv was cancelled correctly
-                    found = true;
-                    LF_DEB(com_deb<9>, debug(NS_DEBUG::str<>("Cancel"), "succeeded", "op_ctx",
-                                           NS_DEBUG::ptr(op_ctx)));
-                    auto ptr = s->release_self_ref();
-                    s->set_canceled();
+                    if (temp == s)
+                    {
+                        // our recv was cancelled correctly
+                        found = true;
+                        LF_DEB(com_deb<9>,
+                            debug(str<>("Cancel"), "succeeded", "op_ctx", hptr(op_ctx)));
+                        auto ptr = s->release_self_ref();
+                        s->set_canceled();
+                    }
+                    else
+                    {
+                        // a different cancel operation
+                        temp_stack.push(temp);
+                    }
                 }
-                else
+                // return cancel notifications belonging to other requests back to the queue
+                while (!temp_stack.empty())
                 {
-                    // a different cancel operation
-                    temp_stack.push(temp);
+                    auto temp = temp_stack.top();
+                    temp_stack.pop();
+                    m_recv_cb_cancel.push(temp);
                 }
             }
-            // return any weird unhandled cancels back to the queue
-            while (!temp_stack.empty())
-            {
-                auto temp = temp_stack.top();
-                temp_stack.pop();
-                m_recv_cb_cancel.push(temp);
-            }
+            return found;
         }
-        return found;
-    }
-};
+    };
 
-} // namespace oomph
+}    // namespace oomph
diff --git a/src/libfabric/context.cpp b/src/libfabric/context.cpp
index 5621a83b..a1debfd7 100644
--- a/src/libfabric/context.cpp
+++ b/src/libfabric/context.cpp
@@ -10,88 +10,95 @@
 #include <cstdint>
 //
 #include <boost/thread.hpp>
-
-#include <hwmalloc/heap_config.hpp>
-
 // paths relative to backend
-#include <oomph_libfabric_defines.hpp>
-#include <controller.hpp>
 #include <communicator.hpp>
 #include <context.hpp>
+#include <controller.hpp>
+#include <oomph_libfabric_defines.hpp>
 
-namespace oomph
-{
-// cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> src_deb("__SRC__");
+namespace oomph {
+    // cppcheck-suppress ConfigurationNotChecked
+    static NS_DEBUG::enable_print<false> src_deb("__SRC__");
 
-using controller_type = libfabric::controller;
+    using controller_type = libfabric::controller;
 
-context_impl::context_impl(MPI_Comm comm, bool thread_safe,
-    hwmalloc::heap_config const& heap_config)
-: context_base(comm, thread_safe)
-, m_heap{this, heap_config}
-, m_recv_cb_queue(128)
-, m_recv_cb_cancel(8)
-{
-    int rank, size;
-    OOMPH_CHECK_MPI_RESULT(MPI_Comm_rank(comm, &rank));
-    OOMPH_CHECK_MPI_RESULT(MPI_Comm_size(comm, &size));
+    context_impl::context_impl(
+        MPI_Comm comm, bool thread_safe, hwmalloc::heap_config const& heap_config, bool debug)
+      : context_base(comm, thread_safe)
+      , m_heap{this, heap_config}
+      , m_recv_cb_queue(128)
+      , m_recv_cb_cancel(8)
+    {
+        int rank, size;
+        OOMPH_CHECK_MPI_RESULT(MPI_Comm_rank(comm, &rank));
+        OOMPH_CHECK_MPI_RESULT(MPI_Comm_size(comm, &size));
 
-    m_ctxt_tag = reinterpret_cast<std::uintptr_t>(this);
-    OOMPH_CHECK_MPI_RESULT(MPI_Bcast(&m_ctxt_tag, 1, MPI_UINT64_T, 0, comm));
-    LF_DEB(src_deb, debug(NS_DEBUG::str<>("Broadcast"), "rank", debug::dec<3>(rank), "context",
-                        debug::ptr(m_ctxt_tag)));
+        m_ctxt_tag = reinterpret_cast<std::uintptr_t>(this);
+        OOMPH_CHECK_MPI_RESULT(MPI_Bcast(&m_ctxt_tag, 1, MPI_UINT64_T, 0, comm));
+        LF_DEB(
+            src_deb, debug(str<>("Broadcast"), "rank", dec<3>(rank), "context", hptr(m_ctxt_tag)));
 
-    // TODO fix the thread safety
-    // problem: controller is a singleton and has problems when 2 contexts are created in the
-    // following order: single threaded first, then multi-threaded after
-    //int threads = thread_safe ? std::thread::hardware_concurrency() : 1;
-    //int threads = std::thread::hardware_concurrency();
-    int threads = boost::thread::physical_concurrency();
-    m_controller = init_libfabric_controller(this, comm, rank, size, threads);
-    m_domain = m_controller->get_domain();
-}
+        // TODO: fix the thread safety.
+        // Problem: the controller is a singleton and has problems when 2 contexts are created
+        // in the following order: single-threaded first, then multi-threaded after.
+        // int threads = thread_safe ? std::thread::hardware_concurrency() : 1;
+        // int threads = std::thread::hardware_concurrency();
+        // Determine the number of threads based on the CPU affinity mask
+        int threads = 1;
+#if defined(_GNU_SOURCE)
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+        if (sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0)
+            threads = CPU_COUNT(&cpuset);
+        else
+            threads = boost::thread::physical_concurrency();
+#else
+        threads = boost::thread::physical_concurrency();
+#endif
+        m_controller = init_libfabric_controller(this, comm, rank, size, threads, debug);
+        m_domain = m_controller->get_domain();
+    }
 
-communicator_impl*
-context_impl::get_communicator()
-{
-    auto comm = new communicator_impl{this};
-    m_comms_set.insert(comm);
-    return comm;
-}
+    communicator_impl* context_impl::get_communicator()
+    {
+        auto comm = new communicator_impl{this};
+        m_comms_set.insert(comm);
+        return comm;
+    }
 
-const char*
-context_impl::get_transport_option(const std::string& opt)
-{
-    if (opt == "name") { return "libfabric"; }
-    else if (opt == "progress") { return libfabric_progress_string(); }
-    else if (opt == "endpoint") { return libfabric_endpoint_string(); }
-    else if (opt == "rendezvous_threshold")
+    char const* context_impl::get_transport_option(std::string const& opt)
     {
-        static char buffer[32];
-        std::string temp = std::to_string(m_controller->rendezvous_threshold());
-        strncpy(buffer, temp.c_str(), std::min(size_t(31), std::strlen(temp.c_str())));
-        return buffer;
+        if (opt == "name") { return "libfabric"; }
+        else if (opt == "progress") { return libfabric_progress_string(); }
+        else if (opt == "endpoint") { return libfabric_endpoint_string(); }
+        else if (opt == "rendezvous_threshold")
+        {
+            static char buffer[32];
+            std::string temp = std::to_string(m_controller->rendezvous_threshold());
+            if (temp.size() > 31) throw std::runtime_error("Bad string option check, fix please");
+            strncpy(buffer, temp.c_str(), 32);
+            return buffer;
+        }
+        else { return "unspecified"; }
     }
-    else { return "unspecified"; }
-}
 
-std::shared_ptr<controller_type>
-context_impl::init_libfabric_controller(oomph::context_impl* /*ctx*/, MPI_Comm comm, int rank,
-    int size, int threads)
-{
-    // only allow one thread to pass, make other wait
-    static std::mutex                       m_init_mutex;
-    std::lock_guard<std::mutex>             lock(m_init_mutex);
-    static std::shared_ptr<controller_type> instance(nullptr);
-    if (!instance.get())
+    std::shared_ptr<controller_type> context_impl::init_libfabric_controller(
+        oomph::context_impl* /*ctx*/, MPI_Comm comm, int rank, int size, int threads, bool debug)
     {
-        LF_DEB(src_deb, debug(NS_DEBUG::str<>("New Controller"), "rank", debug::dec<3>(rank),
-                            "size", debug::dec<3>(size), "threads", debug::dec<3>(threads)));
-        instance.reset(new controller_type());
-        instance->initialize(HAVE_LIBFABRIC_PROVIDER, rank == 0, size, threads, comm);
+        // only allow one thread to pass, make other wait
+        static std::mutex m_init_mutex;
+        std::lock_guard<std::mutex> lock(m_init_mutex);
+        static std::shared_ptr<controller_type> instance(nullptr);
+        if (!instance.get())
+        {
+            LF_DEB(src_deb,
+                debug(NS_DEBUG::str<>("New Controller"), "rank", dec<3>(rank), "size", dec<3>(size),
+                    "threads", dec<3>(threads)));
+            instance.reset(new controller_type());
+            if (debug) instance->enable_debug();
+            instance->initialize(HAVE_LIBFABRIC_PROVIDER, rank == 0, size, threads, comm);
+        }
+        return instance;
     }
-    return instance;
-}
 
-} // namespace oomph
+}    // namespace oomph
diff --git a/src/libfabric/context.hpp b/src/libfabric/context.hpp
index a7c0c112..76654d66 100644
--- a/src/libfabric/context.hpp
+++ b/src/libfabric/context.hpp
@@ -9,148 +9,152 @@
  */
 #pragma once
 
-#include <thread>
 #include <stack>
 
 #include <hwmalloc/heap.hpp>
-#include <hwmalloc/heap_config.hpp>
 #include <hwmalloc/register.hpp>
 
 #include <oomph/config.hpp>
 
 // paths relative to backend
 #include <../context_base.hpp>
-#include <memory_region.hpp>
 #include <controller.hpp>
+#include <memory_region.hpp>
 #include <request_state.hpp>
 
-namespace oomph
-{
-
-static NS_DEBUG::enable_print<false> ctx_deb("CONTEXT");
-
-using controller_type = libfabric::controller;
-
-class context_impl : public context_base
-{
-  public:
-    using region_type = libfabric::memory_segment;
-    using domain_type = region_type::provider_domain;
-    using device_region_type = libfabric::memory_segment;
-    using heap_type = hwmalloc::heap<context_impl>;
-    using callback_queue = boost::lockfree::queue<detail::shared_request_state*,
-        boost::lockfree::fixed_sized<false>, boost::lockfree::allocator<std::allocator<void>>>;
-
-  private:
-    heap_type                        m_heap;
-    domain_type*                     m_domain;
-    std::shared_ptr<controller_type> m_controller;
-    std::uintptr_t                   m_ctxt_tag;
-
-  public:
-    // --------------------------------------------------
-    // create a singleton ptr to a libfabric controller that
-    // can be shared between oomph context objects
-    static std::shared_ptr<controller_type> init_libfabric_controller(oomph::context_impl* ctx,
-        MPI_Comm comm, int rank, int size, int threads);
-
-    // queue for shared recv callbacks
-    callback_queue m_recv_cb_queue;
-    // queue for canceled shared recv requests
-    callback_queue m_recv_cb_cancel;
-
-  public:
-    context_impl(MPI_Comm comm, bool thread_safe, hwmalloc::heap_config const& heap_config);
-    context_impl(context_impl const&) = delete;
-    context_impl(context_impl&&) = delete;
-
-    region_type make_region(void* const ptr, std::size_t size, int device_id)
+namespace oomph {
+
+    static NS_DEBUG::enable_print<false> ctx_deb("CONTEXT");
+
+    using controller_type = libfabric::controller;
+
+    class context_impl : public context_base
     {
-        if (m_controller->get_mrbind())
+    public:
+        using region_type = libfabric::memory_segment;
+        using domain_type = region_type::provider_domain;
+        using device_region_type = libfabric::memory_segment;
+        using heap_type = hwmalloc::heap<context_impl>;
+        using callback_queue = boost::lockfree::queue<detail::shared_request_state*,
+            boost::lockfree::fixed_sized<false>, boost::lockfree::allocator<std::allocator<void>>>;
+
+    private:
+        heap_type m_heap;
+        domain_type* m_domain;
+        std::shared_ptr<controller_type> m_controller;
+        std::uintptr_t m_ctxt_tag;
+
+    public:
+        // --------------------------------------------------
+        // create a singleton ptr to a libfabric controller that
+        // can be shared between oomph context objects
+        static std::shared_ptr<controller_type> init_libfabric_controller(oomph::context_impl* ctx,
+            MPI_Comm comm, int rank, int size, int threads, bool debug = false);
+
+        // queue for shared recv callbacks
+        callback_queue m_recv_cb_queue;
+        // queue for canceled shared recv requests
+        callback_queue m_recv_cb_cancel;
+
+    public:
+        context_impl(MPI_Comm comm, bool thread_safe, hwmalloc::heap_config const& heap_config,
+            bool debug = false);
+        // context_impl(MPI_Comm comm, bool thread_safe, bool message_pool_never_free,
+        //     std::size_t message_pool_reserve, bool debug = false);
+        context_impl(context_impl const&) = delete;
+        context_impl(context_impl&&) = delete;
+
+        region_type make_region(void* const ptr, std::size_t size, int device_id)
         {
-            void* endpoint = m_controller->get_rx_endpoint().get_ep();
-            return libfabric::memory_segment(m_domain, ptr, size, true, endpoint, device_id);
+            if (m_controller->get_mrbind())
+            {
+                void* endpoint = m_controller->get_rx_endpoint().get_ep();
+                return libfabric::memory_segment(m_domain, ptr, size, true, endpoint, device_id);
+            }
+            else
+            {
+                return libfabric::memory_segment(m_domain, ptr, size, false, nullptr, device_id);
+            }
         }
-        else { return libfabric::memory_segment(m_domain, ptr, size, false, nullptr, device_id); }
-    }
 
-    auto& get_heap() noexcept { return m_heap; }
+        auto& get_heap() noexcept { return m_heap; }
 
-    communicator_impl* get_communicator();
+        communicator_impl* get_communicator();
 
-    // we must modify all tags to use 32bits of context ptr for uniqueness
-    inline std::uintptr_t get_context_tag() { return m_ctxt_tag; }
+        // we must modify all tags to use 32 bits of the context pointer for uniqueness
+        inline std::uintptr_t get_context_tag() { return m_ctxt_tag; }
 
-    inline controller_type* get_controller() /*const */ { return m_controller.get(); }
-    const char*             get_transport_option(const std::string& opt);
+        inline controller_type* get_controller() /*const */ { return m_controller.get(); }
+        char const* get_transport_option(std::string const& opt);
 
-    void progress() { get_controller()->poll_for_work_completions(nullptr); }
+        void progress() { get_controller()->poll_for_work_completions(nullptr); }
 
-    bool cancel_recv(detail::shared_request_state* s)
-    {
-        // get the original message operation context
-        auto op_ctx = &(s->m_operation_context);
+        bool cancel_recv(detail::shared_request_state* s)
+        {
+            // get the original message operation context
+            auto op_ctx = &(s->m_operation_context);
 
-        // submit the cancellation request
-        bool ok = (fi_cancel(&(get_controller()->get_rx_endpoint().get_ep()->fid), op_ctx) == 0);
+            // submit the cancellation request
+            bool ok =
+                (fi_cancel(&(get_controller()->get_rx_endpoint().get_ep()->fid), op_ctx) == 0);
 
-        // if the cancel operation failed completely, return
-        if (!ok) return false;
+            // if the cancel operation failed completely, return
+            if (!ok) return false;
 
-        bool found = false;
-        while (!found)
-        {
-            get_controller()->poll_recv_queue(get_controller()->get_rx_endpoint().get_rx_cq(),
-                nullptr);
-            // otherwise, poll until we know if it worked
-            std::stack<detail::shared_request_state*> temp_stack;
-            detail::shared_request_state*             temp;
-            while (!found && m_recv_cb_cancel.pop(temp))
+            bool found = false;
+            while (!found)
             {
-                if (temp == s)
+                get_controller()->poll_recv_queue(
+                    get_controller()->get_rx_endpoint().get_rx_cq(), nullptr);
+                // otherwise, poll until we know if it worked
+                std::stack<detail::shared_request_state*> temp_stack;
+                detail::shared_request_state* temp;
+                while (!found && m_recv_cb_cancel.pop(temp))
                 {
-                    // our recv was cancelled correctly
-                    found = true;
-                    LF_DEB(oomph::ctx_deb, debug(NS_DEBUG::str<>("Cancel shared"), "succeeded",
-                                               "op_ctx", NS_DEBUG::ptr(op_ctx)));
-                    auto ptr = s->release_self_ref();
-                    s->set_canceled();
+                    if (temp == s)
+                    {
+                        // our recv was cancelled correctly
+                        found = true;
+                        LF_DEB(oomph::ctx_deb,
+                            debug(str<>("Cancel shared"), "succeeded", "op_ctx", hptr(op_ctx)));
+                        auto ptr = s->release_self_ref();
+                        s->set_canceled();
+                    }
+                    else
+                    {
+                        // a different cancel operation
+                        temp_stack.push(temp);
+                    }
                 }
-                else
+                // return cancel notifications belonging to other requests back to the queue
+                while (!temp_stack.empty())
                 {
-                    // a different cancel operation
-                    temp_stack.push(temp);
+                    auto temp = temp_stack.top();
+                    temp_stack.pop();
+                    m_recv_cb_cancel.push(temp);
                 }
             }
-            // return any weird unhandled cancels back to the queue
-            while (!temp_stack.empty())
-            {
-                auto temp = temp_stack.top();
-                temp_stack.pop();
-                m_recv_cb_cancel.push(temp);
-            }
+            return found;
         }
-        return found;
-    }
 
-    unsigned int num_tag_bits() const noexcept { return 32; }
-};
+        unsigned int num_tag_bits() const noexcept { return 32; }
+    };
 
-// --------------------------------------------------------------------
-template<>
-inline oomph::libfabric::memory_segment
-register_memory<oomph::context_impl>(oomph::context_impl& c, void* const ptr, std::size_t size)
-{
-    return c.make_region(ptr, size, -2);
-}
+    // --------------------------------------------------------------------
+    template <>
+    inline oomph::libfabric::memory_segment
+    register_memory<oomph::context_impl>(oomph::context_impl& c, void* const ptr, std::size_t size)
+    {
+        return c.make_region(ptr, size, -2);
+    }
 
 #if OOMPH_ENABLE_DEVICE
-template<>
-inline oomph::libfabric::memory_segment
-register_device_memory<context_impl>(context_impl& c, int device_id, void* ptr, std::size_t size)
-{
-    return c.make_region(ptr, size, device_id);
-}
+    template <>
+    inline oomph::libfabric::memory_segment register_device_memory<context_impl>(
+        context_impl& c, int device_id, void* ptr, std::size_t size)
+    {
+        return c.make_region(ptr, size, device_id);
+    }
 #endif
 
-} // namespace oomph
+}    // namespace oomph
diff --git a/src/libfabric/controller.hpp b/src/libfabric/controller.hpp
index 5becc148..f015a0c4 100644
--- a/src/libfabric/controller.hpp
+++ b/src/libfabric/controller.hpp
@@ -9,23 +9,13 @@
  */
 #pragma once
 
-#include <array>
-#include <atomic>
-#include <chrono>
-#include <deque>
-#include <functional>
-#include <iostream>
-#include <map>
-#include <memory>
 #include <mutex>
 #include <string>
-#include <utility>
 #include <vector>
 //
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <sstream>
 //
 #include <rdma/fabric.h>
 #include <rdma/fi_domain.h>
@@ -35,428 +25,440 @@
 #include <rdma/fi_rma.h>
 #include <rdma/fi_tagged.h>
 //
-#include "oomph_libfabric_defines.hpp"
+#include "controller_base.hpp"
 #include "fabric_error.hpp"
 #include "locality.hpp"
-#include "memory_region.hpp"
+#include "oomph_libfabric_defines.hpp"
 #include "operation_context.hpp"
-#include "controller_base.hpp"
 //
 #include <oomph/util/unique_function.hpp>
 //
 #include <mpi.h>
 
-namespace NS_DEBUG
-{
-// cppcheck-suppress ConfigurationNotChecked
+namespace NS_DEBUG {
+    // cppcheck-suppress ConfigurationNotChecked
 
-using namespace oomph::debug;
-template<int Level>
-inline /*constexpr*/ NS_DEBUG::print_threshold<Level, 0> cnt_deb("CONTROL");
-//
-static NS_DEBUG::enable_print<true> cnt_err("CONTROL");
-} // namespace NS_DEBUG
-
-namespace oomph::libfabric
-{
-
-class controller : public controller_base<controller>
-{
-  public:
-    // --------------------------------------------------------------------
-    controller()
-    : controller_base()
-    {
-    }
+    using namespace oomph::debug;
+    template <int Level>
+    inline NS_DEBUG::print_threshold<Level, 0> cnt_deb("CONTROL");
+    //
+    static NS_DEBUG::enable_print<false> cnt_err("CONTROL");
+}    // namespace NS_DEBUG
 
-    // --------------------------------------------------------------------
-    void initialize_derived(std::string const&, bool, int, size_t, MPI_Comm mpi_comm)
-    {
-        // Broadcast address of all endpoints to all ranks
-        // and fill address vector with info
-        exchange_addresses(av_, mpi_comm);
-    }
+namespace oomph::libfabric {
 
-    // --------------------------------------------------------------------
-    constexpr fi_threading threadlevel_flags()
+    class controller : public controller_base<controller>
     {
-#if defined(HAVE_LIBFABRIC_GNI) /*|| defined(HAVE_LIBFABRIC_CXI)*/
-        return FI_THREAD_ENDPOINT;
-#else
-        return FI_THREAD_SAFE;
-#endif
-    }
+    public:
+        // --------------------------------------------------------------------
+        controller()
+          : controller_base()
+        {
+        }
 
-    // --------------------------------------------------------------------
-    constexpr uint64_t caps_flags()
-    {
-#if OOMPH_ENABLE_DEVICE && !defined(HAVE_LIBFABRIC_TCP)
-        std::int64_t hmem_flags = FI_HMEM;
+        // --------------------------------------------------------------------
+        void initialize_derived(std::string const&, bool, int, size_t, MPI_Comm mpi_comm)
+        {
+            // Broadcast address of all endpoints to all ranks
+            // and fill address vector with info
+            exchange_addresses(av_, mpi_comm);
+        }
+
+        // --------------------------------------------------------------------
+        constexpr fi_threading threadlevel_flags()
+        {
+#if defined(HAVE_LIBFABRIC_GNI) || defined(HAVE_LIBFABRIC_LNX)
+            return FI_THREAD_ENDPOINT;
 #else
-        std::int64_t hmem_flags = 0;
+            return FI_THREAD_SAFE;
 #endif
-        return hmem_flags | FI_MSG | FI_TAGGED | FI_RMA | FI_READ | FI_WRITE | FI_RECV | FI_SEND |
-               FI_TRANSMIT | FI_REMOTE_READ | FI_REMOTE_WRITE;
-    }
-
-    // --------------------------------------------------------------------
-    // we do not need to perform any special actions on init (to contact root node)
-    void setup_root_node_address(struct fi_info* /*info*/) {}
+        }
 
-    // --------------------------------------------------------------------
-    // send address to rank 0 and receive array of all localities
-    void MPI_exchange_localities(fid_av* av, MPI_Comm comm, int rank, int size)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-        std::vector<char>     localities(size * locality_defs::array_size, 0);
-        //
-        if (rank > 0)
+        // --------------------------------------------------------------------
+        uint64_t caps_flags(uint64_t /*available_flags*/) const
         {
-            LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("sending here"), iplocality(here_),
-                                             "size", locality_defs::array_size));
-            /*int err = */ MPI_Send(here_.fabric_data(), locality_defs::array_size, MPI_CHAR,
-                0, // dst rank
-                0, // tag
-                comm);
-
-            LF_DEB(NS_DEBUG::cnt_deb<9>,
-                debug(debug::str<>("receiving all"), "size", locality_defs::array_size));
-
-            MPI_Status status;
-            /*err = */ MPI_Recv(localities.data(), size * locality_defs::array_size, MPI_CHAR,
-                0, // src rank
-                0, // tag
-                comm, &status);
-            LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("received addresses")));
+            uint64_t flags_required = FI_TAGGED;
+#ifndef HAVE_LIBFABRIC_LNX
+            flags_required |= FI_MSG | FI_TAGGED | FI_RECV | FI_SEND | FI_RMA | FI_READ | FI_WRITE |
+                FI_REMOTE_READ | FI_REMOTE_WRITE;
+# if OOMPH_ENABLE_DEVICE
+            flags_required |= FI_HMEM;
+# endif
+#endif
+            return flags_required;
         }
-        else
+
+        // --------------------------------------------------------------------
+        // we do not need to perform any special actions on init (to contact root node)
+        void setup_root_node_address(struct fi_info* /*info*/) {}
+
+        // --------------------------------------------------------------------
+        // send address to rank 0 and receive array of all localities
+        void MPI_exchange_localities(fid_av* av, MPI_Comm comm, int rank, int size)
         {
-            LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("receiving addresses")));
-            memcpy(&localities[0], here_.fabric_data(), locality_defs::array_size);
-            for (int i = 1; i < size; ++i)
+            [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+
+            // array of empty locality objects
+            std::vector<locality::locality_data> localities(size);
+            //
+            if (rank > 0)
             {
-                LF_DEB(NS_DEBUG::cnt_deb<9>,
-                    debug(debug::str<>("receiving address"), debug::dec<>(i)));
+                LF_DEB(cnt_deb<9>,
+                    debug(
+                        str<>("sending here"), here_.to_str(), "size", locality_defs::array_size));
+                /*int err = */ MPI_Send(here_.fabric_data().data(), locality_defs::array_size,
+                    MPI_CHAR,
+                    0,    // dst rank
+                    0,    // tag
+                    comm);
+
+                LF_DEB(
+                    cnt_deb<9>, debug(str<>("receiving all"), "size", locality_defs::array_size));
+
                 MPI_Status status;
-                /*int err = */ MPI_Recv(&localities[i * locality_defs::array_size],
-                    size * locality_defs::array_size, MPI_CHAR,
-                    i, // src rank
-                    0, // tag
+                /*err = */ MPI_Recv(localities.data(), size * locality_defs::array_size, MPI_CHAR,
+                    0,    // src rank
+                    0,    // tag
                     comm, &status);
-                LF_DEB(NS_DEBUG::cnt_deb<9>,
-                    debug(debug::str<>("received address"), debug::dec<>(i)));
+                LF_DEB(cnt_deb<9>, debug(str<>("received addresses")));
+            }
+            else
+            {
+                LF_DEB(cnt_deb<9>, debug(str<>("receiving addresses")));
+                memcpy(&localities[0], here_.fabric_data().data(), locality_defs::array_size);
+                for (int i = 1; i < size; ++i)
+                {
+                    LF_DEB(cnt_deb<9>, debug(str<>("receiving address"), dec<>(i)));
+                    MPI_Status status;
+                    // each rank sends exactly one locality, so bound the receive to one slot
+                    /*int err = */ MPI_Recv(&localities[i], locality_defs::array_size, MPI_CHAR,
+                        i,    // src rank
+                        0,    // tag
+                        comm, &status);
+                    LF_DEB(cnt_deb<9>, debug(str<>("received address"), dec<>(i)));
+                }
+
+                LF_DEB(cnt_deb<9>, debug(str<>("sending all")));
+                for (int i = 1; i < size; ++i)
+                {
+                    LF_DEB(cnt_deb<9>, debug(str<>("sending to"), dec<>(i)));
+                    /*int err = */ MPI_Send(&localities[0], size * locality_defs::array_size,
+                        MPI_CHAR,
+                        i,    // dst rank
+                        0,    // tag
+                        comm);
+                }
             }
 
-            LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("sending all")));
-            for (int i = 1; i < size; ++i)
+            // all ranks should now have a full localities vector
+            LF_DEB(cnt_deb<9>, debug(str<>("populating vector")));
+            for (int i = 0; i < size; ++i)
             {
-                LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("sending to"), debug::dec<>(i)));
-                /*int err = */ MPI_Send(&localities[0], size * locality_defs::array_size, MPI_CHAR,
-                    i, // dst rank
-                    0, // tag
-                    comm);
+                locality temp(localities[i], av);
+                insert_address(temp);
             }
         }
 
-        // all ranks should now have a full localities vector
-        LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("populating vector")));
-        for (int i = 0; i < size; ++i)
+        // --------------------------------------------------------------------
+        // if we did not bootstrap, then fetch the list of all localities
+        // and insert each one into the address vector
+        void exchange_addresses(fid_av* av, MPI_Comm mpi_comm)
         {
-            locality temp;
-            int      offset = i * locality_defs::array_size;
-            memcpy(temp.fabric_data_writable(), &localities[offset], locality_defs::array_size);
-            insert_address(av, temp);
-        }
-    }
-
-    // --------------------------------------------------------------------
-    // if we did not bootstrap, then fetch the list of all localities
-    // and insert each one into the address vector
-    void exchange_addresses(fid_av* av, MPI_Comm mpi_comm)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
+            [[maybe_unused]] auto scp = NS_DEBUG::cnt_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
 
-        int rank, size;
-        MPI_Comm_rank(mpi_comm, &rank);
-        MPI_Comm_size(mpi_comm, &size);
+            int rank, size;
+            MPI_Comm_rank(mpi_comm, &rank);
+            MPI_Comm_size(mpi_comm, &size);
 
-        LF_DEB(NS_DEBUG::cnt_deb<9>,
-            debug(debug::str<>("initialize_localities"), size, "localities"));
+            LF_DEB(cnt_deb<9>, debug(str<>("initialize_localities"), size, "localities"));
 
-        MPI_exchange_localities(av, mpi_comm, rank, size);
-        debug_print_av_vector(size);
-        LF_DEB(NS_DEBUG::cnt_deb<9>, debug(debug::str<>("Done localities")));
-    }
+            MPI_exchange_localities(av, mpi_comm, rank, size);
+#ifndef HAVE_LIBFABRIC_LNX    // address stuff not yet supported
+            debug_print_av_vector(size);
+#endif
+            LF_DEB(cnt_deb<9>, debug(str<>("Done localities")));
+        }
 
-    // --------------------------------------------------------------------
-    inline constexpr bool bypass_tx_lock()
-    {
+        // --------------------------------------------------------------------
+        inline constexpr bool bypass_tx_lock()
+        {
 #if defined(HAVE_LIBFABRIC_GNI)
-        return true;
-#elif defined(HAVE_LIBFABRIC_CXI)
-        // @todo : cxi provider is not yet thread safe using scalable endpoints
-        return false;
+            return true;
+#elif defined(HAVE_LIBFABRIC_LNX)
+            // @todo : cxi provider is not yet thread safe using scalable endpoints
+            return false;
 #else
-        return (threadlevel_flags() == FI_THREAD_SAFE ||
+            return (threadlevel_flags() == FI_THREAD_SAFE ||
                 endpoint_type_ == endpoint_type::threadlocalTx);
 #endif
-    }
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock get_tx_lock()
-    {
-        if (bypass_tx_lock()) return unique_lock();
-        return unique_lock(send_mutex_);
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock get_tx_lock()
+        {
+            if (bypass_tx_lock()) return unique_lock();
+            return unique_lock(send_mutex_);
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock try_tx_lock()
-    {
-        if (bypass_tx_lock()) return unique_lock();
-        return unique_lock(send_mutex_, std::try_to_lock_t{});
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock try_tx_lock()
+        {
+            if (bypass_tx_lock()) return unique_lock();
+            return unique_lock(send_mutex_, std::try_to_lock_t{});
+        }
 
-    // --------------------------------------------------------------------
-    inline constexpr bool bypass_rx_lock()
-    {
+        // --------------------------------------------------------------------
+        inline constexpr bool bypass_rx_lock()
+        {
 #ifdef HAVE_LIBFABRIC_GNI
-        return true;
+            return true;
 #else
-        return (
-            threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::scalableTxRx);
+            return (threadlevel_flags() == FI_THREAD_SAFE ||
+                endpoint_type_ == endpoint_type::scalableTxRx);
 #endif
-    }
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock get_rx_lock()
-    {
-        if (bypass_rx_lock()) return unique_lock();
-        return unique_lock(recv_mutex_);
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock get_rx_lock()
+        {
+            if (bypass_rx_lock()) return unique_lock();
+            return unique_lock(recv_mutex_);
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock try_rx_lock()
-    {
-        if (bypass_rx_lock()) return unique_lock();
-        return unique_lock(recv_mutex_, std::try_to_lock_t{});
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock try_rx_lock()
+        {
+            if (bypass_rx_lock()) return unique_lock();
+            return unique_lock(recv_mutex_, std::try_to_lock_t{});
+        }
 
-    // --------------------------------------------------------------------
-    int poll_send_queue(fid_cq* send_cq, void* user_data)
-    {
+        // --------------------------------------------------------------------
+        int poll_send_queue(fid_cq* send_cq, void* user_data)
+        {
 #ifdef EXCESSIVE_POLLING_BACKOFF_MICRO_S
-        std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
-        if (std::chrono::duration_cast<std::chrono::microseconds>(now - send_poll_stamp).count() <
-            EXCESSIVE_POLLING_BACKOFF_MICRO_S)
-            return 0;
-        send_poll_stamp = now;
+            std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
+            if (std::chrono::duration_cast<std::chrono::microseconds>(now - send_poll_stamp)
+                    .count() < EXCESSIVE_POLLING_BACKOFF_MICRO_S)
+                return 0;
+            send_poll_stamp = now;
 #endif
-        int             ret;
-        fi_cq_msg_entry entry[max_completions_array_limit_];
-        assert(max_completions_per_poll_ <= max_completions_array_limit_);
-        {
-            auto lock = try_tx_lock();
-
-            // if we're not threadlocal and didn't get the lock,
-            // then another thread is polling now, just exit
-            if (!bypass_tx_lock() && !lock.owns_lock()) { return -1; }
+            int ret;
+            fi_cq_msg_entry entry[max_completions_array_limit_];
+            assert(max_completions_per_poll_ <= max_completions_array_limit_);
+            {
+                auto lock = try_tx_lock();
 
-            static auto polling =
-                NS_DEBUG::cnt_deb<9>.make_timer(1, debug::str<>("poll send queue"));
-            LF_DEB(NS_DEBUG::cnt_deb<9>, timed(polling, NS_DEBUG::ptr(send_cq)));
+                // if we're not threadlocal and didn't get the lock,
+                // then another thread is polling now, just exit
+                if (!bypass_tx_lock() && !lock.owns_lock()) { return -1; }
 
-            // poll for completions
-            {
-                ret = fi_cq_read(send_cq, &entry[0], max_completions_per_poll_);
-            }
-            // if there is an error, retrieve it
-            if (ret == -FI_EAVAIL)
-            {
-                struct fi_cq_err_entry e = {};
-                int                    err_sz = fi_cq_readerr(send_cq, &e, 0);
-                (void)err_sz;
+                static auto polling =
+                    NS_DEBUG::cnt_deb<9>.make_timer(1, NS_DEBUG::str<>("poll send queue"));
+                LF_DEB(cnt_deb<9>, timed(polling, hptr(send_cq)));
 
-                // flags might not be set correctly
-                if ((e.flags & (FI_MSG | FI_SEND | FI_TAGGED)) != 0)
+                // poll for completions
                 {
-                    NS_DEBUG::cnt_err.error("txcq Error FI_EAVAIL for "
-                                            "FI_SEND with len",
-                        debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context), "code",
-                        NS_DEBUG::dec<3>(e.err), "flags", debug::bin<16>(e.flags), "error",
-                        fi_cq_strerror(send_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len));
+                    ret = fi_cq_read(send_cq, &entry[0], max_completions_per_poll_);
                 }
-                else if ((e.flags & FI_RMA) != 0)
+                // if there is an error, retrieve it
+                if (ret == -FI_EAVAIL)
                 {
-                    NS_DEBUG::cnt_err.error("txcq Error FI_EAVAIL for "
-                                            "FI_RMA with len",
-                        debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context), "code",
-                        NS_DEBUG::dec<3>(e.err), "flags", debug::bin<16>(e.flags), "error",
-                        fi_cq_strerror(send_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len));
+                    struct fi_cq_err_entry e = {};
+                    int err_sz = fi_cq_readerr(send_cq, &e, 0);
+                    (void) err_sz;
+
+                    // flags might not be set correctly
+                    if ((e.flags & (FI_MSG | FI_SEND | FI_TAGGED)) != 0)
+                    {
+                        LF_DEB(cnt_err,
+                            error("txcq Error FI_EAVAIL for FI_SEND with len", hex<6>(e.len),
+                                "context", hptr(e.op_context), "code", dec<3>(e.err), "flags",
+                                bin<16>(e.flags), "error",
+                                fi_cq_strerror(
+                                    send_cq, e.prov_errno, e.err_data, (char*) e.buf, e.len)));
+                    }
+                    else if ((e.flags & FI_RMA) != 0)
+                    {
+                        LF_DEB(cnt_err,
+                            error("txcq Error FI_EAVAIL for FI_RMA with len", hex<6>(e.len),
+                                "context", hptr(e.op_context), "code", dec<3>(e.err), "flags",
+                                bin<16>(e.flags), "error",
+                                fi_cq_strerror(
+                                    send_cq, e.prov_errno, e.err_data, (char*) e.buf, e.len)));
+                    }
+                    operation_context* handler = reinterpret_cast<operation_context*>(e.op_context);
+                    if (handler) handler->handle_error(e);    // op_context may be null
+                    return 0;
                 }
-                operation_context* handler = reinterpret_cast<operation_context*>(e.op_context);
-                handler->handle_error(e);
-                return 0;
             }
-        }
-        //
-        // exit possibly locked region and process each completion
-        //
-        if (ret > 0)
-        {
-            int processed = 0;
-            for (int i = 0; i < ret; ++i)
+            //
+            // exit possibly locked region and process each completion
+            //
+            if (ret > 0)
             {
-                ++sends_complete;
-                LF_DEB(NS_DEBUG::cnt_deb<9>,
-                    debug(debug::str<>("Completion"), i, debug::dec<2>(i), "txcq flags",
-                        fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(",
-                        debug::dec<>(entry[i].flags), ")", "context",
-                        NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len)));
-                if ((entry[i].flags & (FI_TAGGED | FI_SEND | FI_MSG)) != 0)
+                std::array<char, 1024> buf;
+                int processed = 0;
+                for (int i = 0; i < ret; ++i)
                 {
-                    LF_DEB(NS_DEBUG::cnt_deb<9>,
-                        debug(debug::str<>("Completion"), "txcq tagged send completion",
-                            NS_DEBUG::ptr(entry[i].op_context)));
-
-                    operation_context* handler =
-                        reinterpret_cast<operation_context*>(entry[i].op_context);
-                    processed += handler->handle_tagged_send_completion(user_data);
-                }
-                else
-                {
-                    NS_DEBUG::cnt_err.error("Received an unknown txcq completion",
-                        debug::dec<>(entry[i].flags), debug::bin<64>(entry[i].flags));
-                    std::terminate();
+                    ++sends_complete;
+                    LF_DEB(cnt_deb<9>,
+                        debug(str<>("Completion"), i, dec<2>(i), "txcq flags",
+                            fi_tostr_r(
+                                buf.data(), buf.size(), &entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS),
+                            "(", dec<>(entry[i].flags), ")", "context", hptr(entry[i].op_context),
+                            "length", hex<6>(entry[i].len)));
+                    if ((entry[i].flags & (FI_TAGGED | FI_SEND | FI_MSG)) != 0)
+                    {
+                        LF_DEB(cnt_deb<9>,
+                            debug(str<>("Completion"), "txcq tagged send completion",
+                                hptr(entry[i].op_context)));
+
+                        operation_context* handler =
+                            reinterpret_cast<operation_context*>(entry[i].op_context);
+                        processed += handler->handle_tagged_send_completion(user_data);
+                    }
+                    else
+                    {
+                        LF_DEB(cnt_err,
+                            error("Received an unknown txcq completion", dec<>(entry[i].flags),
+                                bin<64>(entry[i].flags)));
+                        std::terminate();
+                    }
                 }
+                return processed;
             }
-            return processed;
-        }
-        else if (ret == 0 || ret == -FI_EAGAIN)
-        {
-            // do nothing, we will try again on the next check
+            else if (ret == 0 || ret == -FI_EAGAIN)
+            {
+                // do nothing, we will try again on the next check
+            }
+            else { LF_DEB(cnt_err, error("unknown error in completion txcq read")); }
+            return 0;
         }
-        else { NS_DEBUG::cnt_err.error("unknown error in completion txcq read"); }
-        return 0;
-    }
 
-    // --------------------------------------------------------------------
-    int poll_recv_queue(fid_cq* rx_cq, void* user_data)
-    {
+        // --------------------------------------------------------------------
+        int poll_recv_queue(fid_cq* rx_cq, void* user_data)
+        {
 #ifdef EXCESSIVE_POLLING_BACKOFF_MICRO_S
-        std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
-        if (std::chrono::duration_cast<std::chrono::microseconds>(now - recv_poll_stamp).count() <
-            EXCESSIVE_POLLING_BACKOFF_MICRO_S)
-            return 0;
-        recv_poll_stamp = now;
+            std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
+            if (std::chrono::duration_cast<std::chrono::microseconds>(now - recv_poll_stamp)
+                    .count() < EXCESSIVE_POLLING_BACKOFF_MICRO_S)
+                return 0;
+            recv_poll_stamp = now;
 #endif
-        int             ret;
-        fi_cq_msg_entry entry[max_completions_array_limit_];
-        assert(max_completions_per_poll_ <= max_completions_array_limit_);
-        {
-            auto lock = get_rx_lock();
+            int ret;
+            fi_cq_msg_entry entry[max_completions_array_limit_];
+            assert(max_completions_per_poll_ <= max_completions_array_limit_);
+            {
+                auto lock = get_rx_lock();
 
-            // if we're not threadlocal and didn't get the lock,
-            // then another thread is polling now, just exit
-            if (!bypass_rx_lock() && !lock.owns_lock()) { return -1; }
+                // if we're not threadlocal and didn't get the lock,
+                // then another thread is polling now, just exit
+                if (!bypass_rx_lock() && !lock.owns_lock()) { return -1; }
 
-            static auto polling =
-                NS_DEBUG::cnt_deb<2>.make_timer(1, debug::str<>("poll recv queue"));
-            LF_DEB(NS_DEBUG::cnt_deb<2>, timed(polling, NS_DEBUG::ptr(rx_cq)));
+                static auto polling =
+                    NS_DEBUG::cnt_deb<2>.make_timer(1, NS_DEBUG::str<>("poll recv queue"));
+                LF_DEB(cnt_deb<2>, timed(polling, hptr(rx_cq)));
 
-            // poll for completions
-            {
-                ret = fi_cq_read(rx_cq, &entry[0], max_completions_per_poll_);
-            }
-            // if there is an error, retrieve it
-            if (ret == -FI_EAVAIL)
-            {
-                // read the full error status
-                struct fi_cq_err_entry e = {};
-                int                    err_sz = fi_cq_readerr(rx_cq, &e, 0);
-                (void)err_sz;
-                // from the manpage 'man 3 fi_cq_readerr'
-                if (e.err == FI_ECANCELED)
+                // poll for completions
                 {
-                    LF_DEB(NS_DEBUG::cnt_deb<1>,
-                        debug(debug::str<>("rxcq Cancelled"), "flags", debug::hex<6>(e.flags),
-                            "len", debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context)));
-                    // the request was cancelled, we can simply exit
-                    // as the canceller will have doone any cleanup needed
-                    operation_context* handler = reinterpret_cast<operation_context*>(e.op_context);
-                    handler->handle_cancelled();
-                    return 0;
+                    ret = fi_cq_read(rx_cq, &entry[0], max_completions_per_poll_);
                 }
-                else if (e.err != FI_SUCCESS)
+                // if there is an error, retrieve it
+                if (ret == -FI_EAVAIL)
                 {
-                    NS_DEBUG::cnt_err.error(debug::str<>("poll_recv_queue"), "error code",
-                        debug::dec<>(-e.err), "flags", debug::hex<6>(e.flags), "len",
-                        debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context), "error msg",
-                        fi_cq_strerror(rx_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len));
+                    // read the full error status
+                    struct fi_cq_err_entry e = {};
+                    int err_sz = fi_cq_readerr(rx_cq, &e, 0);
+                    (void) err_sz;
+                    // from the manpage 'man 3 fi_cq_readerr'
+                    if (e.err == FI_ECANCELED)
+                    {
+                        LF_DEB(cnt_deb<1>,
+                            debug(str<>("rxcq Cancelled"), "flags", hex<6>(e.flags), "len",
+                                hex<6>(e.len), "context", hptr(e.op_context)));
+                        // the request was cancelled, we can simply exit
+                        // as the canceller will have done any cleanup needed
+                        operation_context* handler =
+                            reinterpret_cast<operation_context*>(e.op_context);
+                        handler->handle_cancelled();
+                        return 0;
+                    }
+                    else if (e.err != FI_SUCCESS)
+                    {
+                        LF_DEB(cnt_err,
+                            error(str<>("poll_recv_queue"), "error code", dec<>(-e.err), "flags",
+                                hex<6>(e.flags), "len", hex<6>(e.len), "context",
+                                hptr(e.op_context), "error msg",
+                                fi_cq_strerror(
+                                    rx_cq, e.prov_errno, e.err_data, (char*) e.buf, e.len)));
+                    }
+                    operation_context* handler = reinterpret_cast<operation_context*>(e.op_context);
+                    if (handler) handler->handle_error(e);
+                    return 0;
                 }
-                operation_context* handler = reinterpret_cast<operation_context*>(e.op_context);
-                if (handler) handler->handle_error(e);
-                return 0;
             }
-        }
-        //
-        // release the lock and process each completion
-        //
-        if (ret > 0)
-        {
-            int processed = 0;
-            for (int i = 0; i < ret; ++i)
+            //
+            // release the lock and process each completion
+            //
+            if (ret > 0)
             {
-                ++recvs_complete;
-                LF_DEB(NS_DEBUG::cnt_deb<2>,
-                    debug(debug::str<>("Completion"), i, "rxcq flags",
-                        fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(",
-                        debug::dec<>(entry[i].flags), ")", "context",
-                        NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len)));
-                if ((entry[i].flags & (FI_TAGGED | FI_RECV)) != 0)
+                std::array<char, 1024> buf;
+                int processed = 0;
+                for (int i = 0; i < ret; ++i)
                 {
-                    LF_DEB(NS_DEBUG::cnt_deb<2>,
-                        debug(debug::str<>("Completion"), "rxcq tagged recv completion",
-                            NS_DEBUG::ptr(entry[i].op_context)));
-
-                    operation_context* handler =
-                        reinterpret_cast<operation_context*>(entry[i].op_context);
-                    processed += handler->handle_tagged_recv_completion(user_data);
-                }
-                else
-                {
-                    NS_DEBUG::cnt_err.error("Received an unknown rxcq completion",
-                        debug::dec<>(entry[i].flags), debug::bin<64>(entry[i].flags));
-                    std::terminate();
+                    ++recvs_complete;
+                    LF_DEB(cnt_deb<2>,
+                        debug(str<>("Completion"), i, "rxcq flags",
+                            fi_tostr_r(
+                                buf.data(), buf.size(), &entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS),
+                            "(", dec<>(entry[i].flags), ")", "context", hptr(entry[i].op_context),
+                            "length", hex<6>(entry[i].len)));
+                    if ((entry[i].flags & (FI_TAGGED | FI_RECV)) != 0)
+                    {
+                        LF_DEB(cnt_deb<2>,
+                            debug(str<>("Completion"), "rxcq tagged recv completion",
+                                hptr(entry[i].op_context)));
+
+                        operation_context* handler =
+                            reinterpret_cast<operation_context*>(entry[i].op_context);
+                        processed += handler->handle_tagged_recv_completion(user_data);
+                    }
+                    else
+                    {
+                        LF_DEB(cnt_err,
+                            error("Received an unknown rxcq completion", dec<>(entry[i].flags),
+                                bin<64>(entry[i].flags)));
+                        std::terminate();
+                    }
                 }
+                return processed;
+            }
+            else if (ret == 0 || ret == -FI_EAGAIN)
+            {
+                // do nothing, we will try again on the next check
             }
-            return processed;
+            else { LF_DEB(cnt_err, error("unknown error in completion rxcq read")); }
+            return 0;
         }
-        else if (ret == 0 || ret == -FI_EAGAIN)
+
+        // Jobs started using mpi don't have this info
+        struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx)
         {
-            // do nothing, we will try again on the next check
+            (void) info;    // unused variable warning
+            (void) tx;      // unused variable warning
+
+            LF_DEB(cnb_deb, debug(str<>("fi_dupinfo")));
+            struct fi_info* hints = fi_dupinfo(info);
+            if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo");
+            // clear any Rx address data that might be set
+            // free(hints->src_addr);
+            // hints->src_addr = nullptr;
+            // hints->src_addrlen = 0;
+            free(hints->dest_addr);
+            hints->dest_addr = nullptr;
+            hints->dest_addrlen = 0;
+            return hints;
         }
-        else { NS_DEBUG::cnt_err.error("unknown error in completion rxcq read"); }
-        return 0;
-    }
+    };
 
-    // Jobs started using mpi don't have this info
-    struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx)
-    {
-        (void)info; // unused variable warning
-        (void)tx;   // unused variable warning
-
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo")));
-        struct fi_info* hints = fi_dupinfo(info);
-        if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo");
-        // clear any Rx address data that might be set
-        // free(hints->src_addr);
-        // hints->src_addr = nullptr;
-        // hints->src_addrlen = 0;
-        free(hints->dest_addr);
-        hints->dest_addr = nullptr;
-        hints->dest_addrlen = 0;
-        return hints;
-    }
-};
-
-} // namespace oomph::libfabric
+}    // namespace oomph::libfabric
diff --git a/src/libfabric/controller_base.hpp b/src/libfabric/controller_base.hpp
index e1ce377e..d423803b 100644
--- a/src/libfabric/controller_base.hpp
+++ b/src/libfabric/controller_base.hpp
@@ -9,18 +9,12 @@
  */
 #pragma once
 
-#include <array>
-#include <atomic>
 #include <chrono>
-#include <deque>
-#include <functional>
 #include <iostream>
-#include <map>
 #include <memory>
 #include <mutex>
 #include <string>
 #include <utility>
-#include <vector>
 //
 #include <cstddef>
 #include <cstdint>
@@ -45,23 +39,25 @@
 #include "memory_region.hpp"
 #include "operation_context_base.hpp"
 
-//#define DISABLE_FI_INJECT
-//#define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50
+#if ((FI_MAJOR_VERSION == 1) && FI_MINOR_VERSION <= 12)
+#define fi_tostr_r(a,b,c,d) " "
+#endif
+
+// #define DISABLE_FI_INJECT
+// #define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50
 
 // ------------------------------------------------------------------
 
 // ----------------------------------------
 // auto progress (libfabric thread) or manual
 // ----------------------------------------
-static fi_progress
-libfabric_progress_type()
+static fi_progress libfabric_progress_type()
 {
     if (std::getenv("LIBFABRIC_AUTO_PROGRESS") == nullptr) return FI_PROGRESS_MANUAL;
     return FI_PROGRESS_AUTO;
 }
 
-static const char*
-libfabric_progress_string()
+static char const* libfabric_progress_string()
 {
     if (libfabric_progress_type() == FI_PROGRESS_AUTO) return "auto";
     return "manual";
@@ -93,8 +89,7 @@ enum class endpoint_type : int
 // ----------------------------------------
 // single endpoint or separate for send/recv
 // ----------------------------------------
-static endpoint_type
-libfabric_endpoint_type()
+static endpoint_type libfabric_endpoint_type()
 {
     auto env_str = std::getenv("LIBFABRIC_ENDPOINT_TYPE");
     if (env_str == nullptr) return endpoint_type::single;
@@ -114,8 +109,7 @@ libfabric_endpoint_type()
     return endpoint_type::single;
 }
 
-static const char*
-libfabric_endpoint_string()
+static char const* libfabric_endpoint_string()
 {
     auto lf_ep_type = libfabric_endpoint_type();
     if (lf_ep_type == endpoint_type::multiple) return "multiple";
@@ -128,8 +122,7 @@ libfabric_endpoint_string()
 // ----------------------------------------
 // number of completions to handle per poll
 // ----------------------------------------
-static int
-libfabric_completions_per_poll()
+static int libfabric_completions_per_poll()
 {
     auto env_str = std::getenv("LIBFABRIC_POLL_SIZE");
     if (env_str != nullptr)
@@ -148,8 +141,7 @@ libfabric_completions_per_poll()
 // ----------------------------------------
 // Eager/Rendezvous threshold
 // ----------------------------------------
-static int
-libfabric_rendezvous_threshold(int def_val)
+static int libfabric_rendezvous_threshold(int def_val)
 {
     auto env_str = std::getenv("LIBFABRIC_RENDEZVOUS_THRESHOLD");
     if (env_str != nullptr)
@@ -170,10 +162,10 @@ libfabric_rendezvous_threshold(int def_val)
 // Needed on Cray for GNI extensions
 // ------------------------------------------------
 #ifdef HAVE_LIBFABRIC_GNI
-#include "rdma/fi_ext_gni.h"
-//#define OOMPH_GNI_REG "none"
-#define OOMPH_GNI_REG "internal"
-//#define OOMPH_GNI_REG "udreg"
+# include "rdma/fi_ext_gni.h"
+// #define OOMPH_GNI_REG "none"
+# define OOMPH_GNI_REG "internal"
+// #define OOMPH_GNI_REG "udreg"
 
 static std::vector<std::pair<int, std::string>> gni_strs = {
     {GNI_MR_CACHE, "GNI_MR_CACHE"},
@@ -209,23 +201,22 @@ static std::vector<std::pair<int, std::string>> gni_ints = {
 // clang-format on
 #endif
 
-// the libfabric library expects us to ask for an API supported version, so if we know we support
-// api 2.0, then we ask for that, but the cxi legacy library on daint only supports 1.15,
-// so drop back to that version if needed
+// the libfabric library expects us to ask for an API supported version, so if
+// we know we support api 2.2, then we ask for that, but the cxi legacy library
+// on daint only supports 1.15, so drop back to that version if needed
 #if defined(OOMPH_LIBFABRIC_V1_API)
-#define LIBFABRIC_FI_VERSION_MAJOR 1
-#define LIBFABRIC_FI_VERSION_MINOR 15
+# define LIBFABRIC_FI_VERSION_MAJOR 1
+# define LIBFABRIC_FI_VERSION_MINOR 15
 #else
-#define LIBFABRIC_FI_VERSION_MAJOR 2
-#define LIBFABRIC_FI_VERSION_MINOR 0
+# define LIBFABRIC_FI_VERSION_MAJOR 2
+# define LIBFABRIC_FI_VERSION_MINOR 2
 #endif
 
-namespace NS_DEBUG
-{
-// cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> cnb_deb("CONBASE");
-static NS_DEBUG::enable_print<false> cnb_err("CONBASE");
-} // namespace NS_DEBUG
+namespace NS_DEBUG {
+    // cppcheck-suppress ConfigurationNotChecked
+    static NS_DEBUG::enable_print<false> cnb_deb("CONBASE");
+    static NS_DEBUG::enable_print<false> cnb_err("CONBASE");
+}    // namespace NS_DEBUG
 
 /** @brief a class to return the number of progressed callbacks */
 struct progress_status
@@ -237,7 +228,7 @@ struct progress_status
     int num_sends() const noexcept { return m_num_sends; }
     int num_recvs() const noexcept { return m_num_recvs; }
 
-    progress_status& operator+=(const progress_status& other) noexcept
+    progress_status& operator+=(progress_status const& other) noexcept
     {
         m_num_sends += other.m_num_sends;
         m_num_recvs += other.m_num_recvs;
@@ -245,1255 +236,1329 @@ struct progress_status
     }
 };
 
-namespace NS_LIBFABRIC
-{
-/// A wrapper around fi_close that reports any error
-/// Because we use so many handles, we must be careful to
-/// delete them all before closing resources that use them
-template<typename Handle>
-void
-fidclose(Handle fid, const char* msg)
-{
-    LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("closing"), msg));
-    int ret = fi_close(fid);
-    if (ret == -FI_EBUSY) { throw NS_LIBFABRIC::fabric_error(ret, "fi_close EBUSY"); }
-    else if (ret == FI_SUCCESS) { return; }
-    throw NS_LIBFABRIC::fabric_error(ret, "fi_close error");
-}
-
-/// when using thread local endpoints, we encapsulate things that
-/// are needed to manage an endpoint
-struct endpoint_wrapper
-{
-  private:
-    friend class controller;
-
-    fid_ep*     ep_ = nullptr;
-    fid_cq*     rq_ = nullptr;
-    fid_cq*     tq_ = nullptr;
-    const char* name_ = nullptr;
-
-  public:
-    endpoint_wrapper() {}
-    endpoint_wrapper(fid_ep* ep, fid_cq* rq, fid_cq* tq, const char* name)
-    : ep_(ep)
-    , rq_(rq)
-    , tq_(tq)
-    , name_(name)
+namespace NS_LIBFABRIC {
+    /// A wrapper around fi_close that reports any error
+    /// Because we use so many handles, we must be careful to
+    /// delete them all before closing resources that use them
+    template <typename Handle>
+    void fidclose(Handle fid, char const* msg)
     {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, name_);
+        LF_DEB(cnb_deb, debug(str<>("closing"), msg));
+        int ret = fi_close(fid);
+        if (ret == -FI_EBUSY) { throw NS_LIBFABRIC::fabric_error(ret, "fi_close EBUSY"); }
+        else if (ret == FI_SUCCESS) { return; }
+        throw NS_LIBFABRIC::fabric_error(ret, "fi_close error");
     }
 
-    // to keep boost::lockfree happy, we need these copy operators
-    endpoint_wrapper(const endpoint_wrapper& ep) = default;
-    endpoint_wrapper& operator=(const endpoint_wrapper& ep) = default;
-
-    void cleanup()
+    /// when using thread local endpoints, we encapsulate things that
+    /// are needed to manage an endpoint
+    struct endpoint_wrapper
     {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, name_);
-        if (ep_)
+    private:
+        friend class controller;
+
+        fid_ep* ep_ = nullptr;
+        fid_cq* rq_ = nullptr;
+        fid_cq* tq_ = nullptr;
+        char const* name_ = nullptr;
+
+    public:
+        endpoint_wrapper() {}
+        endpoint_wrapper(fid_ep* ep, fid_cq* rq, fid_cq* tq, char const* name)
+          : ep_(ep)
+          , rq_(rq)
+          , tq_(tq)
+          , name_(name)
         {
-            fidclose(&ep_->fid, "endpoint");
-            ep_ = nullptr;
-        }
-        if (rq_)
-        {
-            fidclose(&rq_->fid, "rq");
-            rq_ = nullptr;
+            [[maybe_unused]] auto scp =
+                NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, name_);
         }
-        if (tq_)
+
+        // to keep boost::lockfree happy, we need these copy operators
+        endpoint_wrapper(endpoint_wrapper const& ep) = default;
+        endpoint_wrapper& operator=(endpoint_wrapper const& ep) = default;
+
+        void cleanup()
         {
-            fidclose(&tq_->fid, "tq");
-            tq_ = nullptr;
+            [[maybe_unused]] auto scp =
+                NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, name_);
+            if (ep_)
+            {
+                fidclose(&ep_->fid, "endpoint");
+                ep_ = nullptr;
+            }
+            if (rq_)
+            {
+                fidclose(&rq_->fid, "rq");
+                rq_ = nullptr;
+            }
+            if (tq_)
+            {
+                fidclose(&tq_->fid, "tq");
+                tq_ = nullptr;
+            }
         }
-    }
 
-    inline fid_ep*     get_ep() { return ep_; }
-    inline fid_cq*     get_rx_cq() { return rq_; }
-    inline fid_cq*     get_tx_cq() { return tq_; }
-    inline void        set_tx_cq(fid_cq* cq) { tq_ = cq; }
-    inline const char* get_name() { return name_; }
-};
+        inline fid_ep* get_ep() { return ep_; }
+        inline fid_cq* get_rx_cq() { return rq_; }
+        inline fid_cq* get_tx_cq() { return tq_; }
+        inline void set_tx_cq(fid_cq* cq) { tq_ = cq; }
+        inline char const* get_name() { return name_; }
+    };
 
-using region_type = NS_MEMORY::memory_handle;
-using endpoint_context_pool =
-    boost::lockfree::queue<endpoint_wrapper, boost::lockfree::fixed_sized<false>>;
-
-struct stack_endpoint
-{
-    endpoint_wrapper       endpoint_;
-    endpoint_context_pool* pool_;
-    //
-    stack_endpoint()
-    : endpoint_()
-    , pool_(nullptr)
-    {
-    }
-    //
-    stack_endpoint(fid_ep* ep, fid_cq* rq, fid_cq* tq, const char* name,
-        endpoint_context_pool* pool)
-    : endpoint_(ep, rq, tq, name)
-    , pool_(pool)
-    {
-    }
-    //
-    stack_endpoint& operator=(stack_endpoint&& other)
-    {
-        endpoint_ = std::move(other.endpoint_);
-        pool_ = std::exchange(other.pool_, nullptr);
-        return *this;
-    }
+    using region_type = NS_MEMORY::memory_handle;
+    using endpoint_context_pool =
+        boost::lockfree::queue<endpoint_wrapper, boost::lockfree::fixed_sized<false>>;
 
-    ~stack_endpoint()
+    struct stack_endpoint
     {
-        if (!pool_) return;
-        LF_DEB(NS_DEBUG::cnb_deb,
-            trace(debug::str<>("Scalable Ep"), "used push", "ep", NS_DEBUG::ptr(get_ep()), "tx cq",
-                NS_DEBUG::ptr(get_tx_cq()), "rx cq", NS_DEBUG::ptr(get_rx_cq())));
-        pool_->push(endpoint_);
-    }
-
-    inline fid_ep* get_ep() { return endpoint_.get_ep(); }
-
-    inline fid_cq* get_rx_cq() { return endpoint_.get_rx_cq(); }
-
-    inline fid_cq* get_tx_cq() { return endpoint_.get_tx_cq(); }
-};
-
-struct endpoints_lifetime_manager
-{
-    // threadlocal endpoints
-    static inline thread_local stack_endpoint tl_tx_;
-    static inline thread_local stack_endpoint tl_stx_;
-    static inline thread_local stack_endpoint tl_srx_;
-    // non threadlocal endpoints, tx/rx
-    endpoint_wrapper ep_tx_;
-    endpoint_wrapper ep_rx_;
-};
-
-template<typename Derived>
-class controller_base
-{
-  public:
-    typedef std::mutex                   mutex_type;
-    typedef std::lock_guard<mutex_type>  scoped_lock;
-    typedef std::unique_lock<mutex_type> unique_lock;
-
-  protected:
-    // For threadlocal/scalable endpoints,
-    // we use a dedicated threadlocal endpoint wrapper
-    std::unique_ptr<endpoints_lifetime_manager> eps_;
+        endpoint_wrapper endpoint_;
+        endpoint_context_pool* pool_;
+        //
+        stack_endpoint()
+          : endpoint_()
+          , pool_(nullptr)
+        {
+        }
+        //
+        stack_endpoint(
+            fid_ep* ep, fid_cq* rq, fid_cq* tq, char const* name, endpoint_context_pool* pool)
+          : endpoint_(ep, rq, tq, name)
+          , pool_(pool)
+        {
+        }
+        //
+        stack_endpoint& operator=(stack_endpoint&& other)
+        {
+            endpoint_ = std::move(other.endpoint_);
+            pool_ = std::exchange(other.pool_, nullptr);
+            return *this;
+        }
 
-    using endpoint_context_pool =
-        boost::lockfree::queue<endpoint_wrapper, boost::lockfree::fixed_sized<false>>;
-    endpoint_context_pool tx_endpoints_;
-    endpoint_context_pool rx_endpoints_;
+        ~stack_endpoint()
+        {
+            if (!pool_) return;
+            LF_DEB(cnb_deb,
+                trace(str<>("Scalable Ep"), "used push", "ep", hptr(get_ep()), "tx cq",
+                    hptr(get_tx_cq()), "rx cq", hptr(get_rx_cq())));
+            pool_->push(endpoint_);
+        }
 
-    struct fi_info*    fabric_info_;
-    struct fid_fabric* fabric_;
-    struct fid_domain* fabric_domain_;
-    struct fid_pep*    ep_passive_;
+        inline fid_ep* get_ep() { return endpoint_.get_ep(); }
 
-    struct fid_av* av_;
-    endpoint_type  endpoint_type_;
+        inline fid_cq* get_rx_cq() { return endpoint_.get_rx_cq(); }
 
-    locality here_;
-    locality root_;
+        inline fid_cq* get_tx_cq() { return endpoint_.get_tx_cq(); }
+    };
 
-    // used during queue creation setup and during polling
-    mutex_type controller_mutex_;
+    struct endpoints_lifetime_manager
+    {
+        // threadlocal endpoints
+        static inline thread_local stack_endpoint tl_tx_;
+        static inline thread_local stack_endpoint tl_stx_;
+        static inline thread_local stack_endpoint tl_srx_;
+        // non threadlocal endpoints, tx/rx
+        endpoint_wrapper ep_tx_;
+        endpoint_wrapper ep_rx_;
+    };
 
-    // used to protect send/recv resources
-    alignas(64) mutex_type send_mutex_;
-    alignas(64) mutex_type recv_mutex_;
+    template <typename Derived>
+    class controller_base
+    {
+    public:
+        typedef std::mutex mutex_type;
+        typedef std::lock_guard<mutex_type> scoped_lock;
+        typedef std::unique_lock<mutex_type> unique_lock;
+
+    protected:
+        // For threadlocal/scalable endpoints,
+        // we use a dedicated threadlocal endpoint wrapper
+        std::unique_ptr<endpoints_lifetime_manager> eps_;
+
+        using endpoint_context_pool =
+            boost::lockfree::queue<endpoint_wrapper, boost::lockfree::fixed_sized<false>>;
+        endpoint_context_pool tx_endpoints_;
+        endpoint_context_pool rx_endpoints_;
+
+        bool display_fabric_info_;    // for debugging purposes, show fi_info hints
+        struct fi_info* fabric_info_;
+        struct fid_fabric* fabric_;
+        struct fid_domain* fabric_domain_;
+        struct fid_pep* ep_passive_;
+
+        struct fid_av* av_;
+        endpoint_type endpoint_type_;
+
+        locality here_;
+        locality root_;
+
+        // used during queue creation setup and during polling
+        mutex_type controller_mutex_;
+
+        // used to protect send/recv resources
+        alignas(64) mutex_type send_mutex_;
+        alignas(64) mutex_type recv_mutex_;
+
+        std::size_t tx_inject_size_;
+        std::size_t tx_attr_size_;
+        std::size_t rx_attr_size_;
+
+        uint32_t max_completions_per_poll_;
+        uint32_t msg_rendezvous_threshold_;
+        inline static constexpr uint32_t max_completions_array_limit_ = 256;
+
+        static inline thread_local std::chrono::steady_clock::time_point send_poll_stamp;
+        static inline thread_local std::chrono::steady_clock::time_point recv_poll_stamp;
+
+        // set if FI_MR_LOCAL is required (local access requires binding)
+        bool mrlocal = false;
+        // set if FI_MR_ENDPOINT is required (per endpoint memory binding)
+        bool mrbind = false;
+        // set if FI_MR_HMEM is required (provider needs heterogeneous memory registration)
+        bool mrhmem = false;
+
+    public:
+        bool get_mrbind() { return mrbind; }
+
+    public:
+        NS_LIBFABRIC::simple_counter<int, false> sends_posted_;
+        NS_LIBFABRIC::simple_counter<int, false> recvs_posted_;
+        NS_LIBFABRIC::simple_counter<int, false> sends_readied_;
+        NS_LIBFABRIC::simple_counter<int, false> recvs_readied_;
+        NS_LIBFABRIC::simple_counter<int, false> sends_complete;
+        NS_LIBFABRIC::simple_counter<int, false> recvs_complete;
+
+        void finvoke(char const* msg, char const* err, int ret)
+        {
+            LF_DEB(cnb_deb, trace(str<>(msg)));
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, err);
+        }
 
-    std::size_t tx_inject_size_;
-    std::size_t tx_attr_size_;
-    std::size_t rx_attr_size_;
+    public:
+        // --------------------------------------------------------------------
+        controller_base()
+          : eps_(nullptr)
+          , tx_endpoints_(1)
+          , rx_endpoints_(1)
+          , display_fabric_info_(false)
+          , fabric_info_(nullptr)
+          , fabric_(nullptr)
+          , fabric_domain_(nullptr)
+          , ep_passive_(nullptr)
+          , av_(nullptr)
+          , tx_inject_size_(0)
+          , tx_attr_size_(0)
+          , rx_attr_size_(0)
+          , max_completions_per_poll_(1)
+          , msg_rendezvous_threshold_(0x4000)
+          , sends_posted_(0)
+          , recvs_posted_(0)
+          , sends_readied_(0)
+          , recvs_readied_(0)
+          , sends_complete(0)
+          , recvs_complete(0)
+        {
+        }
 
-    uint32_t                         max_completions_per_poll_;
-    uint32_t                         msg_rendezvous_threshold_;
-    inline static constexpr uint32_t max_completions_array_limit_ = 256;
+        // --------------------------------------------------------------------
+        // clean up all resources
+        ~controller_base()
+        {
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+            unsigned int messages_handled_ = 0;
+            unsigned int rma_reads_ = 0;
+            unsigned int recv_deletes_ = 0;
 
-    static inline thread_local std::chrono::steady_clock::time_point send_poll_stamp;
-    static inline thread_local std::chrono::steady_clock::time_point recv_poll_stamp;
+            LF_DEB(cnb_deb,
+                debug(str<>("counters"), "Received messages", dec<>(messages_handled_),
+                    "Total reads", dec<>(rma_reads_), "Total deletes", dec<>(recv_deletes_),
+                    "deletes error", dec<>(messages_handled_ - recv_deletes_)));
 
-    // set if FI_MR_LOCAL is required (local access requires binding)
-    bool mrlocal = false;
-    // set if FI_MR_ENDPOINT is required (per endpoint memory binding)
-    bool mrbind = false;
-    // set if FI_MR_HRMEM provider requires heterogeneous memory registration
-    bool mrhmem = false;
+            tx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); });
+            rx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); });
 
-  public:
-    bool get_mrbind() { return mrbind; }
+            // No cleanup of threadlocals needed: done by the consume_all cleanup above
+            // eps_->tl_tx_.endpoint_.cleanup();
+            // eps_->tl_stx_.endpoint_.cleanup();
+            // eps_->tl_srx_.endpoint_.cleanup();
 
-  public:
-    NS_LIBFABRIC::simple_counter<int, false> sends_posted_;
-    NS_LIBFABRIC::simple_counter<int, false> recvs_posted_;
-    NS_LIBFABRIC::simple_counter<int, false> sends_readied_;
-    NS_LIBFABRIC::simple_counter<int, false> recvs_readied_;
-    NS_LIBFABRIC::simple_counter<int, false> sends_complete;
-    NS_LIBFABRIC::simple_counter<int, false> recvs_complete;
+            // non threadlocal endpoints, tx/rx
+            eps_->ep_tx_.cleanup();
+            eps_->ep_rx_.cleanup();
 
-    void finvoke(const char* msg, const char* err, int ret)
-    {
-        LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>(msg)));
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, err);
-    }
+            // Cleanup endpoints
+            eps_.reset(nullptr);
 
-  public:
-    // --------------------------------------------------------------------
-    controller_base()
-    : eps_(nullptr)
-    , tx_endpoints_(1)
-    , rx_endpoints_(1)
-    , fabric_info_(nullptr)
-    , fabric_(nullptr)
-    , fabric_domain_(nullptr)
-    , ep_passive_(nullptr)
-    , av_(nullptr)
-    , tx_inject_size_(0)
-    , tx_attr_size_(0)
-    , rx_attr_size_(0)
-    , max_completions_per_poll_(1)
-    , msg_rendezvous_threshold_(0x4000)
-    , sends_posted_(0)
-    , recvs_posted_(0)
-    , sends_readied_(0)
-    , recvs_readied_(0)
-    , sends_complete(0)
-    , recvs_complete(0)
-    {
-    }
+            // delete address vector
+            fidclose(&av_->fid, "Address Vector");
 
-    // --------------------------------------------------------------------
-    // clean up all resources
-    ~controller_base()
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-        unsigned int          messages_handled_ = 0;
-        unsigned int          rma_reads_ = 0;
-        unsigned int          recv_deletes_ = 0;
+            try
+            {
+                fidclose(&fabric_domain_->fid, "Domain");
+            }
+            catch (fabric_error& e)
+            {
+                std::cout << "fabric domain close failed : Ensure all RMA "
+                             "objects are freed before program termination"
+                          << std::endl;
+            }
+            fidclose(&fabric_->fid, "Fabric");
 
-        LF_DEB(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("counters"), "Received messages", debug::dec<>(messages_handled_),
-                "Total reads", debug::dec<>(rma_reads_), "Total deletes",
-                debug::dec<>(recv_deletes_), "deletes error",
-                debug::dec<>(messages_handled_ - recv_deletes_)));
+            // clean up
+            LF_DEB(cnb_deb, debug(str<>("freeing fabric_info")));
 
-        tx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); });
-        rx_endpoints_.consume_all([](auto&& ep) { ep.cleanup(); });
+            fi_freeinfo(fabric_info_);
+        }
 
-        // No cleanup threadlocals : done by consume_all cleanup above
-        // eps_->tl_tx_.endpoint_.cleanup();
-        // eps_->tl_stx_.endpoint_.cleanup();
-        // eps_->tl_srx_.endpoint_.cleanup();
+        // --------------------------------------------------------------------
+        // only used in check_libfabric quick test for helpful output
+        void enable_debug() { display_fabric_info_ = true; }
 
-        // non threadlocal endpoints, tx/rx
-        eps_->ep_tx_.cleanup();
-        eps_->ep_rx_.cleanup();
+        // --------------------------------------------------------------------
+        // setup an endpoint for receiving messages,
+        // usually an rx endpoint is shared by all threads
+        endpoint_wrapper create_rx_endpoint(
+            struct fid_domain* domain, struct fi_info* info, struct fid_av* av)
+        {
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+            auto ep_rx = new_endpoint_active(domain, info, false);
 
-        // Cleanup endpoints
-        eps_.reset(nullptr);
+            // bind address vector
+            bind_address_vector_to_endpoint(ep_rx, av);
 
-        // delete adddress vector
-        fidclose(&av_->fid, "Address Vector");
+            // create a completion queue for the rx endpoint
+            info->rx_attr->op_flags |= FI_COMPLETION;
+            auto rx_cq = create_completion_queue(domain, info->rx_attr->size, "rx");
 
-        try
-        {
-            fidclose(&fabric_domain_->fid, "Domain");
+            // bind CQ to endpoint
+            bind_queue_to_endpoint(ep_rx, rx_cq, FI_RECV, "rx");
+            return endpoint_wrapper(ep_rx, rx_cq, nullptr, "rx");
         }
-        catch (fabric_error& e)
+
+        // --------------------------------------------------------------------
+        // initialize the basic fabric/domain/name
+        template <typename... Args>
+        void initialize(
+            std::string const& provider, bool rootnode, int size, size_t threads, Args&&... args)
         {
-            std::cout << "fabric domain close failed : Ensure all RMA "
-                         "objects are freed before program termination"
-                      << std::endl;
-        }
-        fidclose(&fabric_->fid, "Fabric");
+            LF_DEB(cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); }));
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
 
-        // clean up
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("freeing fabric_info")));
+            max_completions_per_poll_ = libfabric_completions_per_poll();
+            LF_DEB(cnb_err, debug(str<>("Poll completions"), dec<3>(max_completions_per_poll_)));
 
-        fi_freeinfo(fabric_info_);
-    }
+            uint32_t default_val = (threads == 1) ? 0x400 : 0x4000;
+            msg_rendezvous_threshold_ = libfabric_rendezvous_threshold(default_val);
+            LF_DEB(
+                cnb_err, debug(str<>("Rendezvous threshold"), hex<4>(msg_rendezvous_threshold_)));
 
-    // --------------------------------------------------------------------
-    // setup an endpoint for receiving messages,
-    // usually an rx endpoint is shared by all threads
-    endpoint_wrapper create_rx_endpoint(struct fid_domain* domain, struct fi_info* info,
-        struct fid_av* av)
-    {
-        auto ep_rx = new_endpoint_active(domain, info, false);
+            endpoint_type_ = static_cast<endpoint_type>(libfabric_endpoint_type());
+            LF_DEB(cnb_err, debug(str<>("Endpoints"), libfabric_endpoint_string()));
 
-        // bind address vector
-        bind_address_vector_to_endpoint(ep_rx, av);
+            eps_ = std::make_unique<endpoints_lifetime_manager>();
 
-        // create a completion queue for the rx endpoint
-        info->rx_attr->op_flags |= FI_COMPLETION;
-        auto rx_cq = create_completion_queue(domain, info->rx_attr->size, "rx");
+            LF_DEB(cnb_deb, debug(str<>("Threads"), dec<3>(threads)));
 
-        // bind CQ to endpoint
-        bind_queue_to_endpoint(ep_rx, rx_cq, FI_RECV, "rx");
-        return endpoint_wrapper(ep_rx, rx_cq, nullptr, "rx");
-    }
-
-    // --------------------------------------------------------------------
-    // initialize the basic fabric/domain/name
-    template<typename... Args>
-    void initialize(std::string const& provider, bool rootnode, int size, size_t threads,
-        Args&&... args)
-    {
-        LF_DEB(NS_DEBUG::cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); }));
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
+            open_fabric(provider, threads, rootnode);
 
-        max_completions_per_poll_ = libfabric_completions_per_poll();
-        LF_DEB(NS_DEBUG::cnb_err,
-            debug(debug::str<>("Poll completions"), debug::dec<3>(max_completions_per_poll_)));
+            // create an address vector that will be bound to (all) endpoints
+            av_ = create_address_vector(fabric_info_, size, threads);
 
-        uint32_t default_val = (threads == 1) ? 0x400 : 0x4000;
-        msg_rendezvous_threshold_ = libfabric_rendezvous_threshold(default_val);
-        LF_DEB(NS_DEBUG::cnb_err,
-            debug(debug::str<>("Rendezvous threshold"), debug::hex<4>(msg_rendezvous_threshold_)));
+            // we need an rx endpoint in all cases except scalable rx
+            if (endpoint_type_ != endpoint_type::scalableTxRx)
+            {
+                // setup an endpoint for receiving messages
+                // rx endpoint is typically shared by all threads
+                eps_->ep_rx_ = create_rx_endpoint(fabric_domain_, fabric_info_, av_);
+            }
 
-        endpoint_type_ = static_cast<endpoint_type>(libfabric_endpoint_type());
-        LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("Endpoints"), libfabric_endpoint_string()));
+            if (endpoint_type_ == endpoint_type::single)
+            {
+                // always bind a tx cq to the rx endpoint for single endpoint type
+                auto tx_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep());
+                eps_->ep_rx_.set_tx_cq(tx_cq);
+            }
+            else if (endpoint_type_ != endpoint_type::scalableTxRx)
+            {
+#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) ||                              \
+    defined(HAVE_LIBFABRIC_SHM) || defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) || \
+    defined(HAVE_LIBFABRIC_EFA)
+                // it appears that the rx endpoint cannot be enabled if it does not
+                // have a Tx CQ (at least when using sockets), so we create a dummy
+                // Tx CQ and bind it just to stop libfabric from triggering an error.
+                // The tx_cq won't actually be used because the user will get the real
+                // tx endpoint which will have the correct cq bound to it
+                auto dummy_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep());
+                eps_->ep_rx_.set_tx_cq(dummy_cq);
+#endif
+            }
 
-        eps_ = std::make_unique<endpoints_lifetime_manager>();
+            if (endpoint_type_ == endpoint_type::multiple)
+            {
+                // create a separate Tx endpoint for sending messages
+                // note that the CQ needs FI_RECV even though it's a Tx cq to keep
+                // some providers happy as they trigger an error if an endpoint
+                // has no Rx cq attached (appears to be a progress related bug)
+                auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true);
 
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Threads"), debug::dec<3>(threads)));
+                // create a completion queue for tx endpoint
+                fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
+                auto tx_cq = create_completion_queue(
+                    fabric_domain_, fabric_info_->tx_attr->size, "tx multiple");
 
-        open_fabric(provider, threads, rootnode);
+                bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx multiple");
+                bind_address_vector_to_endpoint(ep_tx, av_);
+                enable_endpoint(ep_tx, "tx multiple");
 
-        // create an address vector that will be bound to (all) endpoints
-        av_ = create_address_vector(fabric_info_, size, threads);
+                // combine endpoints and CQ into wrapper for convenience
+                eps_->ep_tx_ = endpoint_wrapper(ep_tx, nullptr, tx_cq, "tx multiple");
+            }
+            else if (endpoint_type_ == endpoint_type::threadlocalTx)
+            {
+                // each thread creates a Tx endpoint on first call to get_tx_endpoint()
+            }
+            else if (endpoint_type_ == endpoint_type::scalableTx ||
+                endpoint_type_ == endpoint_type::scalableTxRx)
+            {
+                // setup tx contexts for each possible thread
+                size_t threads_allocated = 0;
+                auto ep_sx = new_endpoint_scalable(
+                    fabric_domain_, fabric_info_, true /*Tx*/, threads, threads_allocated);
+
+                LF_DEB(cnb_deb,
+                    trace(str<>("scalable endpoint ok"), "Contexts allocated",
+                        dec<4>(threads_allocated)));
+
+                finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind",
+                    fi_scalable_ep_bind(ep_sx, &av_->fid, 0));
+
+                // prepare the stack for insertions
+                tx_endpoints_.reserve(threads_allocated);
+                //
+                for (unsigned int i = 0; i < threads_allocated; i++)
+                {
+                    [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(
+                        NS_DEBUG::hptr(this), "scalable", NS_DEBUG::dec<4>(i));
+
+                    // For threadlocal/scalable endpoints, tx/rx resources
+                    fid_ep* scalable_ep_tx;
+                    fid_cq* scalable_cq_tx;
+
+                    // Create a Tx context, cq, bind and enable
+                    finvoke("create tx context", "fi_tx_context",
+                        fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL));
+                    scalable_cq_tx = create_completion_queue(
+                        fabric_domain_, fabric_info_->tx_attr->size, "tx scalable");
+                    bind_queue_to_endpoint(
+                        scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable");
+                    enable_endpoint(scalable_ep_tx, "tx scalable");
+
+                    endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable");
+                    LF_DEB(cnb_deb,
+                        trace(str<>("Scalable Ep"), "initial tx push", "ep", hptr(tx.get_ep()),
+                            "tx cq", hptr(tx.get_tx_cq()), "rx cq", hptr(tx.get_rx_cq())));
+                    tx_endpoints_.push(tx);
+                }
 
-        // we need an rx endpoint in all cases except scalable rx
-        if (endpoint_type_ != endpoint_type::scalableTxRx)
-        {
-            // setup an endpoint for receiving messages
-            // rx endpoint is typically shared by all threads
-            eps_->ep_rx_ = create_rx_endpoint(fabric_domain_, fabric_info_, av_);
-        }
+                eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable");
+            }
 
-        if (endpoint_type_ == endpoint_type::single)
-        {
-            // always bind a tx cq to the rx endpoint for single endpoint type
-            auto tx_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep());
-            eps_->ep_rx_.set_tx_cq(tx_cq);
-        }
-        else if (endpoint_type_ != endpoint_type::scalableTxRx)
-        {
-#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) ||                              \
-    defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) || defined(HAVE_LIBFABRIC_EFA)
-            // it appears that the rx endpoint cannot be enabled if it does not
-            // have a Tx CQ (at least when using sockets), so we create a dummy
-            // Tx CQ and bind it just to stop libfabric from triggering an error.
-            // The tx_cq won't actually be used because the user will get the real
-            // tx endpoint which will have the correct cq bound to it
-            auto dummy_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep());
-            eps_->ep_rx_.set_tx_cq(dummy_cq);
-#endif
+            // once enabled we can get the address
+            enable_endpoint(eps_->ep_rx_.get_ep(), "rx here");
+            here_ = get_endpoint_address(&eps_->ep_rx_.get_ep()->fid);
+            LF_DEB(cnb_deb, debug(str<>("setting 'here'"), here_.to_str()));
+
+            //        // if we are using scalable endpoints, then set up tx/rx contexts
+            //        // we will use a single endpoint for all Tx/Rx contexts
+            //        if (endpoint_type_ == endpoint_type::scalableTx ||
+            //            endpoint_type_ == endpoint_type::scalableTxRx)
+            //        {
+
+            //            // thread slots might not be same as what we asked for
+            //            size_t threads_allocated = 0;
+            //            auto   ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads,
+            //                  threads_allocated);
+            //            if (!ep_sx)
+            //                throw NS_LIBFABRIC::fabric_error(FI_EOTHER, "fi_scalable endpoint creation failed");
+
+            //            LF_DEB(cnb_deb, trace(str<>("scalable endpoint ok"),
+            //                                         "Contexts allocated", dec<4>(threads_allocated)));
+
+            //            // prepare the stack for insertions
+            //            tx_endpoints_.reserve(threads_allocated);
+            //            rx_endpoints_.reserve(threads_allocated);
+            //            //
+            //            for (unsigned int i = 0; i < threads_allocated; i++)
+            //            {
+            //                [[maybe_unused]] auto scp =
+            //                    NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), "scalable", dec<4>(i));
+
+            //                // For threadlocal/scalable endpoints, tx/rx resources
+            //                fid_ep* scalable_ep_tx;
+            //                fid_cq* scalable_cq_tx;
+            ////                fid_ep* scalable_ep_rx;
+            ////                fid_cq* scalable_cq_rx;
+
+            //                // Tx context setup
+            //                finvoke("create tx context", "fi_tx_context",
+            //                    fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL));
+
+            //                scalable_cq_tx = create_completion_queue(fabric_domain_,
+            //                    fabric_info_->tx_attr->size, "tx scalable");
+
+            //                bind_queue_to_endpoint(scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable");
+
+            //                enable_endpoint(scalable_ep_tx, "tx scalable");
+
+            //                endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable");
+            //                LF_DEB(cnb_deb,
+            //                    trace(str<>("Scalable Ep"), "initial tx push", "ep",
+            //                        NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq",
+            //                        NS_DEBUG::ptr(tx.get_rx_cq())));
+            //                tx_endpoints_.push(tx);
+
+            //                // Rx contexts
+            ////                finvoke("create rx context", "fi_rx_context",
+            ////                    fi_rx_context(ep_sx, i, NULL, &scalable_ep_rx, NULL));
+
+            ////                scalable_cq_rx =
+            ////                    create_completion_queue(fabric_domain_, fabric_info_->rx_attr->size, "rx");
+
+            ////                bind_queue_to_endpoint(scalable_ep_rx, scalable_cq_rx, FI_RECV, "rx scalable");
+
+            ////                enable_endpoint(scalable_ep_rx, "rx scalable");
+
+            ////                endpoint_wrapper rx(scalable_ep_rx, scalable_cq_rx, nullptr, "rx scalable");
+            ////                LF_DEB(cnb_deb,
+            ////                    trace(str<>("Scalable Ep"), "initial rx push", "ep",
+            ////                        NS_DEBUG::ptr(rx.get_ep()), "tx cq", NS_DEBUG::ptr(rx.get_tx_cq()), "rx cq",
+            ////                        NS_DEBUG::ptr(rx.get_rx_cq())));
+            ////                rx_endpoints_.push(rx);
+            //            }
+
+            //            finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind",
+            //                fi_scalable_ep_bind(ep_sx, &av_->fid, 0));
+
+            //            eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable");
+
+            return static_cast<Derived*>(this)->initialize_derived(
+                provider, rootnode, size, threads, std::forward<Args>(args)...);
         }
 
-        if (endpoint_type_ == endpoint_type::multiple)
+        // --------------------------------------------------------------------
+        uint64_t caps_flags(uint64_t available_flags) const
         {
-            // create a separate Tx endpoint for sending messages
-            // note that the CQ needs FI_RECV even though its a Tx cq to keep
-            // some providers happy as they trigger an error if an endpoint
-            // has no Rx cq attached (appears to be a progress related bug)
-            auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true);
-
-            // create a completion queue for tx endpoint
-            fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
-            auto tx_cq =
-                create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, "tx multiple");
-
-            bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx multiple");
-            bind_address_vector_to_endpoint(ep_tx, av_);
-            enable_endpoint(ep_tx, "tx multiple");
-
-            // combine endpoints and CQ into wrapper for convenience
-            eps_->ep_tx_ = endpoint_wrapper(ep_tx, nullptr, tx_cq, "tx multiple");
-        }
-        else if (endpoint_type_ == endpoint_type::threadlocalTx)
-        {
-            // each thread creates a Tx endpoint on first call to get_tx_endpoint()
-        }
-        else if (endpoint_type_ == endpoint_type::scalableTx ||
-                 endpoint_type_ == endpoint_type::scalableTxRx)
-        {
-            // setup tx contexts for each possible thread
-            size_t threads_allocated = 0;
-            auto   ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads,
-                  threads_allocated);
-
-            LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"),
-                                          "Contexts allocated", debug::dec<4>(threads_allocated)));
-
-            finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind",
-                fi_scalable_ep_bind(ep_sx, &av_->fid, 0));
-
-            // prepare the stack for insertions
-            tx_endpoints_.reserve(threads_allocated);
+            char buf[1024];
+            LF_DEB(cnb_err,
+                debug(str<>("caps available"), hex(available_flags),
+                    fi_tostr_r(buf, 1024, &available_flags, FI_TYPE_CAPS)));
+            uint64_t required_flags =
+                static_cast<Derived const*>(this)->caps_flags(available_flags);
             //
-            for (unsigned int i = 0; i < threads_allocated; i++)
+            uint64_t final_flags = required_flags;
+            for (uint64_t bit = 0; bit < 64; ++bit)
             {
-                [[maybe_unused]] auto scp =
-                    NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "scalable", debug::dec<4>(i));
-
-                // For threadlocal/scalable endpoints, tx/rx resources
-                fid_ep* scalable_ep_tx;
-                fid_cq* scalable_cq_tx;
-
-                // Create a Tx context, cq, bind and enable
-                finvoke("create tx context", "fi_tx_context",
-                    fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL));
-                scalable_cq_tx = create_completion_queue(fabric_domain_,
-                    fabric_info_->tx_attr->size, "tx scalable");
-                bind_queue_to_endpoint(scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable");
-                enable_endpoint(scalable_ep_tx, "tx scalable");
-
-                endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable");
-                LF_DEB(NS_DEBUG::cnb_deb,
-                    trace(debug::str<>("Scalable Ep"), "initial tx push", "ep",
-                        NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq",
-                        NS_DEBUG::ptr(tx.get_rx_cq())));
-                tx_endpoints_.push(tx);
+                uint64_t f = (1ULL << bit);
+                if ((required_flags & f) && ((available_flags & f) == 0))
+                {
+                    LF_DEB(cnb_err,
+                        error(str<>("caps flags unavailable"),
+                            fi_tostr_r(buf, 1024, &f, FI_TYPE_CAPS)));
+                    final_flags &= ~f;
+                }
             }
-
-            eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable");
+            LF_DEB(cnb_err,
+                debug(str<>("caps flags requested"), hex(final_flags),
+                    fi_tostr_r(buf, 1024, &final_flags, FI_TYPE_CAPS)));
+            return final_flags;
         }
 
-        // once enabled we can get the address
-        enable_endpoint(eps_->ep_rx_.get_ep(), "rx here");
-        here_ = get_endpoint_address(&eps_->ep_rx_.get_ep()->fid);
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting 'here'"), iplocality(here_)));
-
-        //        // if we are using scalable endpoints, then setup tx/rx contexts
-        //        // we will us a single endpoint for all Tx/Rx contexts
-        //        if (endpoint_type_ == endpoint_type::scalableTx ||
-        //            endpoint_type_ == endpoint_type::scalableTxRx)
-        //        {
-
-        //            // thread slots might not be same as what we asked for
-        //            size_t threads_allocated = 0;
-        //            auto   ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads,
-        //                  threads_allocated);
-        //            if (!ep_sx)
-        //                throw NS_LIBFABRIC::fabric_error(FI_EOTHER, "fi_scalable endpoint creation failed");
-
-        //            LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"),
-        //                                         "Contexts allocated", debug::dec<4>(threads_allocated)));
-
-        //            // prepare the stack for insertions
-        //            tx_endpoints_.reserve(threads_allocated);
-        //            rx_endpoints_.reserve(threads_allocated);
-        //            //
-        //            for (unsigned int i = 0; i < threads_allocated; i++)
-        //            {
-        //                [[maybe_unused]] auto scp =
-        //                    NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "scalable", debug::dec<4>(i));
-
-        //                // For threadlocal/scalable endpoints, tx/rx resources
-        //                fid_ep* scalable_ep_tx;
-        //                fid_cq* scalable_cq_tx;
-        ////                fid_ep* scalable_ep_rx;
-        ////                fid_cq* scalable_cq_rx;
-
-        //                // Tx context setup
-        //                finvoke("create tx context", "fi_tx_context",
-        //                    fi_tx_context(ep_sx, i, NULL, &scalable_ep_tx, NULL));
-
-        //                scalable_cq_tx = create_completion_queue(fabric_domain_,
-        //                    fabric_info_->tx_attr->size, "tx scalable");
-
-        //                bind_queue_to_endpoint(scalable_ep_tx, scalable_cq_tx, FI_TRANSMIT, "tx scalable");
-
-        //                enable_endpoint(scalable_ep_tx, "tx scalable");
-
-        //                endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable");
-        //                LF_DEB(NS_DEBUG::cnb_deb,
-        //                    trace(debug::str<>("Scalable Ep"), "initial tx push", "ep",
-        //                        NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq",
-        //                        NS_DEBUG::ptr(tx.get_rx_cq())));
-        //                tx_endpoints_.push(tx);
-
-        //                // Rx contexts
-        ////                finvoke("create rx context", "fi_rx_context",
-        ////                    fi_rx_context(ep_sx, i, NULL, &scalable_ep_rx, NULL));
-
-        ////                scalable_cq_rx =
-        ////                    create_completion_queue(fabric_domain_, fabric_info_->rx_attr->size, "rx");
-
-        ////                bind_queue_to_endpoint(scalable_ep_rx, scalable_cq_rx, FI_RECV, "rx scalable");
-
-        ////                enable_endpoint(scalable_ep_rx, "rx scalable");
-
-        ////                endpoint_wrapper rx(scalable_ep_rx, scalable_cq_rx, nullptr, "rx scalable");
-        ////                LF_DEB(NS_DEBUG::cnb_deb,
-        ////                    trace(debug::str<>("Scalable Ep"), "initial rx push", "ep",
-        ////                        NS_DEBUG::ptr(rx.get_ep()), "tx cq", NS_DEBUG::ptr(rx.get_tx_cq()), "rx cq",
-        ////                        NS_DEBUG::ptr(rx.get_rx_cq())));
-        ////                rx_endpoints_.push(rx);
-        //            }
-
-        //            finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind",
-        //                fi_scalable_ep_bind(ep_sx, &av_->fid, 0));
-
-        //            eps_->ep_tx_ = endpoint_wrapper(ep_sx, nullptr, nullptr, "rx scalable");
-
-        return static_cast<Derived*>(this)->initialize_derived(provider, rootnode, size, threads,
-            std::forward<Args>(args)...);
-    }
-
-    // --------------------------------------------------------------------
-    constexpr uint64_t caps_flags() { return static_cast<Derived*>(this)->caps_flags(); }
-
-    // --------------------------------------------------------------------
-    constexpr fi_threading threadlevel_flags()
-    {
-        return static_cast<Derived*>(this)->threadlevel_flags();
-    }
+        // --------------------------------------------------------------------
+        constexpr fi_threading threadlevel_flags()
+        {
+            return static_cast<Derived*>(this)->threadlevel_flags();
+        }
 
-    // --------------------------------------------------------------------
-    constexpr std::int64_t memory_registration_mode_flags()
-    {
-        std::int64_t base_flags = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY;
+        // --------------------------------------------------------------------
+        constexpr std::int64_t memory_registration_mode_flags()
+        {
+#if defined(HAVE_LIBFABRIC_LNX)
+            return FI_MR_HMEM;
+#endif
+            std::int64_t base_flags = FI_MR_ALLOCATED;    // | FI_MR_VIRT_ADDR | FI_MR_PROV_KEY;
 #if OOMPH_ENABLE_DEVICE
-        base_flags = base_flags | FI_MR_HMEM;
+            base_flags = base_flags | FI_MR_HMEM;
 #endif
-        base_flags = base_flags | FI_MR_LOCAL;
+            base_flags = base_flags | FI_MR_LOCAL;
 
 #if defined(HAVE_LIBFABRIC_CXI)
-        return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT;
+            return base_flags | FI_MR_ENDPOINT;
 
 #elif defined(HAVE_LIBFABRIC_EFA)
-        return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT;
+            return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT;
 #else
-        return base_flags;
+            return base_flags;
 #endif
-    }
+        }
 
-    // --------------------------------------------------------------------
-    uint32_t rendezvous_threshold() { return msg_rendezvous_threshold_; }
-    // --------------------------------------------------------------------
-    // initialize the basic fabric/domain/name
-    void open_fabric(std::string const& provider, int threads, bool rootnode)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
+        // --------------------------------------------------------------------
+        uint32_t rendezvous_threshold() { return msg_rendezvous_threshold_; }
 
-        struct fi_info* fabric_hints_ = fi_allocinfo();
-        if (!fabric_hints_)
+        // --------------------------------------------------------------------
+        // initialize the basic fabric/domain/name
+        void open_fabric(std::string const& provider, int threads, bool rootnode)
         {
-            throw NS_LIBFABRIC::fabric_error(-1, "Failed to allocate fabric hints");
-        }
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
 
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Here locality"), iplocality(here_)));
+            struct fi_info* fabric_hints_ = fi_allocinfo();
+            if (!fabric_hints_)
+            {
+                throw NS_LIBFABRIC::fabric_error(-1, "Failed to allocate fabric hints");
+            }
 
-#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || defined(HAVE_LIBFABRIC_VERBS)
-        fabric_hints_->addr_format = FI_SOCKADDR_IN;
-#elif defined(HAVE_LIBFABRIC_EFA)
-        fabric_hints_->addr_format = FI_ADDR_EFA;
+            // set up the provider we want to use before getting info
+            if ((provider.c_str() == std::string("tcp")) ||
+                (provider.c_str() == std::string("verbs")))
+            {
+                fabric_hints_->fabric_attr->prov_name =
+                    strdup(std::string(provider + ";ofi_rxm").c_str());
+            }
+            else { fabric_hints_->fabric_attr->prov_name = strdup(provider.c_str()); }
+            LF_DEB(cnb_deb, debug(str<>("fabric provider"), fabric_hints_->fabric_attr->prov_name));
+
+#if defined(HAVE_LIBFABRIC_CXI)
+            // libfabric domain for multi-nic CXI provider
+            char const* cxi_domain = std::getenv("FI_CXI_DEVICE_NAME");
+            if (cxi_domain == nullptr)
+            {
+                LF_DEB(cnb_err, error(str<>("Domain"), "FI_CXI_DEVICE_NAME not set"));
+            }
+            else { fabric_hints_->domain_attr->name = strdup(cxi_domain); }
+            LF_DEB(
+                NS_DEBUG::cnb_deb, debug(str<>("fabric domain"), fabric_hints_->domain_attr->name));
 #endif
 
-        fabric_hints_->caps = caps_flags();
+            fabric_hints_->domain_attr->mr_mode = memory_registration_mode_flags();
 
-        fabric_hints_->mode = FI_CONTEXT /*| FI_MR_LOCAL*/;
-        if (provider.c_str() == std::string("tcp"))
-        {
-            fabric_hints_->fabric_attr->prov_name =
-                strdup(std::string(provider + ";ofi_rxm").c_str());
-        }
-        else if (provider.c_str() == std::string("verbs"))
-        {
-            fabric_hints_->fabric_attr->prov_name =
-                strdup(std::string(provider + ";ofi_rxm").c_str());
-        }
-        else { fabric_hints_->fabric_attr->prov_name = strdup(provider.c_str()); }
-        LF_DEB(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("fabric provider"), fabric_hints_->fabric_attr->prov_name));
+            // get an info object to see what might be available before we set any flags
+            uint64_t flags = 0;
+            int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR),
+                nullptr, nullptr, flags, fabric_hints_, &fabric_info_);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fabric info");
+            if (display_fabric_info_ && fabric_info_)
+            {
+                std::array<char, 8192> buf;
+                LF_DEB(cnb_err,
+                    trace(str<>("Fabric info"), "pre-check ->",
+                        fabric_hints_->fabric_attr->prov_name, "\n",
+                        fi_tostr_r(buf.data(), buf.size(), fabric_info_, FI_TYPE_INFO)));
+            }
 
-        fabric_hints_->domain_attr->mr_mode = memory_registration_mode_flags();
+            // set capabilities we want to request
+            uint64_t all_caps =
+                caps_flags(fabric_info_->rx_attr->caps | fabric_info_->tx_attr->caps);
 
-        // Enable/Disable the use of progress threads
-        auto progress = libfabric_progress_type();
-        fabric_hints_->domain_attr->control_progress = progress;
-        fabric_hints_->domain_attr->data_progress = progress;
-        LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("progress"), libfabric_progress_string()));
+            // fabric_hints_->caps = all_caps;
+            fabric_hints_->tx_attr->caps = fabric_info_->tx_attr->caps & all_caps;
+            fabric_hints_->rx_attr->caps = fabric_info_->rx_attr->caps & all_caps;
 
-        if (threads > 1)
-        {
-            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_FID")));
-            // Enable thread safe mode (Does not work with psm2 provider)
-            // fabric_hints_->domain_attr->threading = FI_THREAD_SAFE;
-            //fabric_hints_->domain_attr->threading = FI_THREAD_FID;
-            fabric_hints_->domain_attr->threading = threadlevel_flags();
-        }
-        else
-        {
-            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_DOMAIN")));
-            // we serialize everything
-            fabric_hints_->domain_attr->threading = FI_THREAD_DOMAIN;
-        }
+            if ((fabric_info_->mode & FI_CONTEXT) == 0)
+            {
+                std::array<char, 1024> buf;
+                LF_DEB(cnb_err,
+                    debug(str<>("mode FI_CONTEXT!=0"),
+                        fi_tostr_r(buf.data(), buf.size(), &fabric_hints_->domain_attr->mode,
+                            FI_TYPE_MODE)));
+            }
+            fabric_hints_->domain_attr->name = strdup(fabric_info_->domain_attr->name);
+
+            // Enable/Disable the use of progress threads
+            auto progress = libfabric_progress_type();
+            fabric_hints_->domain_attr->control_progress = progress;
+            fabric_hints_->domain_attr->data_progress = progress;
+            LF_DEB(cnb_err, debug(str<>("progress"), libfabric_progress_string()));
+
+            if (threads > 1)
+            {
+                LF_DEB(cnb_deb, debug(str<>("Setting Threads>1 level")));
+                // fabric_hints_->domain_attr->threading = FI_THREAD_SAFE;
+                // fabric_hints_->domain_attr->threading = FI_THREAD_FID;
+                fabric_hints_->domain_attr->threading = threadlevel_flags();
+            }
+            else
+            {
+                LF_DEB(cnb_deb, debug(str<>("FI_THREAD_DOMAIN")));
+                // we serialize everything
+                fabric_hints_->domain_attr->threading = FI_THREAD_DOMAIN;
+            }
 
-        // Enable resource management
-        fabric_hints_->domain_attr->resource_mgmt = FI_RM_ENABLED;
+            // Enable resource management
+            fabric_hints_->domain_attr->resource_mgmt = FI_RM_ENABLED;
 
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric endpoint"), "RDM"));
-        fabric_hints_->ep_attr->type = FI_EP_RDM;
+            LF_DEB(cnb_deb, debug(str<>("fabric endpoint"), "RDM"));
+            fabric_hints_->ep_attr->type = FI_EP_RDM;
 
-        uint64_t flags = 0;
-        LF_DEB(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("get fabric info"), "FI_VERSION",
-                debug::dec(LIBFABRIC_FI_VERSION_MAJOR), debug::dec(LIBFABRIC_FI_VERSION_MINOR)));
+            LF_DEB(cnb_deb,
+                debug(str<>("get fabric info"), "FI_VERSION", dec(LIBFABRIC_FI_VERSION_MAJOR),
+                    dec(LIBFABRIC_FI_VERSION_MINOR)));
 
-        int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR),
-            nullptr, nullptr, flags, fabric_hints_, &fabric_info_);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fabric info");
+            ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR),
+                nullptr, nullptr, flags, fabric_hints_, &fabric_info_);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fabric info");
 
-        if (rootnode)
-        {
-            LF_DEB(NS_DEBUG::cnb_err,
-                trace(debug::str<>("Fabric info"), "\n", fi_tostr(fabric_info_, FI_TYPE_INFO)));
-        }
+            if (rootnode)
+            {
+                std::array<char, 8192> buf;
+                LF_DEB(cnb_err,
+                    trace(str<>("Fabric info"), "\n",
+                        fi_tostr_r(buf.data(), buf.size(), fabric_info_, FI_TYPE_INFO)));
+            }
 
-        bool context = (fabric_hints_->mode & FI_CONTEXT) != 0;
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_CONTEXT"), context));
+            int mrkey = (fabric_info_->domain_attr->mr_mode & FI_MR_PROV_KEY) != 0;
+            LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_PROV_KEY"), mrkey));
 
-        mrlocal = (fabric_hints_->domain_attr->mr_mode & FI_MR_LOCAL) != 0;
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_LOCAL"), mrlocal));
+            bool context = (fabric_info_->mode & FI_CONTEXT) != 0;
+            LF_DEB(cnb_deb, debug(str<>("Requires FI_CONTEXT"), context));
 
-        mrbind = (fabric_hints_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0;
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ENDPOINT"), mrbind));
+            mrlocal = (fabric_info_->domain_attr->mr_mode & FI_MR_LOCAL) != 0;
+            LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_LOCAL"), mrlocal));
 
-        /* Check if provider requires heterogeneous memory registration */
-        mrhmem = (fabric_hints_->domain_attr->mr_mode & FI_MR_HMEM) != 0;
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_HMEM"), mrhmem));
+            mrbind = (fabric_info_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0;
+            LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_ENDPOINT"), mrbind));
 
-        bool mrhalloc = (fabric_hints_->domain_attr->mr_mode & FI_MR_ALLOCATED) != 0;
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ALLOCATED"), mrhalloc));
+            /* Check if provider requires heterogeneous memory registration */
+            mrhmem = (fabric_info_->domain_attr->mr_mode & FI_MR_HMEM) != 0;
+            LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_HMEM"), mrhmem));
 
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating fi_fabric")));
-        ret = fi_fabric(fabric_info_->fabric_attr, &fabric_, nullptr);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fi_fabric");
+            bool mrhalloc = (fabric_info_->domain_attr->mr_mode & FI_MR_ALLOCATED) != 0;
+            LF_DEB(cnb_deb, debug(str<>("Requires FI_MR_ALLOCATED"), mrhalloc));
+#if (FI_MAJOR_VERSION > 1) || ((FI_MAJOR_VERSION == 1) && FI_MINOR_VERSION >= 20)
+            int auth_key = (fabric_info_->domain_attr->max_ep_auth_key);
+            LF_DEB(cnb_deb, debug(str<>("Supported max_ep_auth_key"), auth_key));
+            fabric_info_->domain_attr->max_ep_auth_key = 0;
+#endif
+            LF_DEB(cnb_deb, debug(str<>("Creating fi_fabric")));
+            ret = fi_fabric(fabric_info_->fabric_attr, &fabric_, nullptr);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fi_fabric");
 
-        // Allocate a domain.
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Allocating domain")));
-        ret = fi_domain(fabric_, fabric_info_, &fabric_domain_, nullptr);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_domain");
+            // Allocate a domain.
+            LF_DEB(cnb_deb, debug(str<>("Allocating domain")));
+            ret = fi_domain(fabric_, fabric_info_, &fabric_domain_, nullptr);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_domain");
 
 #if defined(HAVE_LIBFABRIC_GNI)
-        {
-            [[maybe_unused]] auto scp =
-                NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "GNI memory registration block");
-
-            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI String values"));
-            // Dump out all vars for debug purposes
-            for (auto& gni_data : gni_strs)
             {
-                _set_check_domain_op_value<const char*>(gni_data.first, 0, gni_data.second.c_str(),
-                    false);
+                [[maybe_unused]] auto scp =
+                    NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), "GNI memory registration block");
+
+                LF_DEB(cnb_err, debug(str<>("-------"), "GNI String values"));
+                // Dump out all vars for debug purposes
+                for (auto& gni_data : gni_strs)
+                {
+                    _set_check_domain_op_value<char const*>(
+                        gni_data.first, 0, gni_data.second.c_str(), false);
+                }
+                LF_DEB(cnb_err, debug(str<>("-------"), "GNI Int values"));
+                for (auto& gni_data : gni_ints)
+                {
+                    _set_check_domain_op_value<uint32_t>(
+                        gni_data.first, 0, gni_data.second.c_str(), false);
+                }
+                LF_DEB(cnb_err, debug(str<>("-------")));
+
+                // --------------------------
+                // GNI_MR_CACHE
+                // set GNI mem reg to be either none, internal or udreg
+                //
+                _set_check_domain_op_value<char*>(
+                    GNI_MR_CACHE, const_cast<char*>(OOMPH_GNI_REG), "GNI_MR_CACHE");
+
+                // --------------------------
+                // GNI_MR_UDREG_REG_LIMIT
+                // Experiments showed default value of 2048 too high if
+                // launching multiple clients on one node
+                //
+                int32_t udreg_limit = 0x0800;    // 0x0400 = 1024, 0x0800 = 2048
+                _set_check_domain_op_value<int32_t>(
+                    GNI_MR_UDREG_REG_LIMIT, udreg_limit, "GNI_MR_UDREG_REG_LIMIT");
+
+                // --------------------------
+                // GNI_MR_CACHE_LAZY_DEREG
+                // Enable lazy deregistration in MR cache
+                //
+                int32_t enable = 1;
+                LF_DEB(cnb_deb, debug(str<>("setting GNI_MR_CACHE_LAZY_DEREG")));
+                _set_check_domain_op_value<int32_t>(
+                    GNI_MR_CACHE_LAZY_DEREG, enable, "GNI_MR_CACHE_LAZY_DEREG");
+
+                // --------------------------
+                // GNI_MSG_RENDEZVOUS_THRESHOLD (c.f. GNI_RMA_RDMA_THRESHOLD)
+                //
+                int32_t thresh = msg_rendezvous_threshold_;
+                _set_check_domain_op_value<int32_t>(
+                    GNI_MSG_RENDEZVOUS_THRESHOLD, thresh, "GNI_MSG_RENDEZVOUS_THRESHOLD");
             }
-            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI Int values"));
-            for (auto& gni_data : gni_ints)
+#endif
+            tx_inject_size_ = fabric_info_->tx_attr->inject_size;
+
+            // the number of preposted receives, and sender queue depth
+            // is set by querying the tx/rx attr sizes
+            tx_attr_size_ = std::min(size_t(512), fabric_info_->tx_attr->size / 2);
+            rx_attr_size_ = std::min(size_t(512), fabric_info_->rx_attr->size / 2);
+            // Print fabric info to a human-readable string if available
+            if (display_fabric_info_ && fabric_info_)
             {
-                _set_check_domain_op_value<uint32_t>(gni_data.first, 0, gni_data.second.c_str(),
-                    false);
+                std::array<char, 8192> buf;
+                std::cout << "Libfabric fabric info:\n"
+                          << fi_tostr_r(buf.data(), buf.size(), fabric_info_, FI_TYPE_INFO)
+                          << std::endl;
             }
-            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------")));
-
-            // --------------------------
-            // GNI_MR_CACHE
-            // set GNI mem reg to be either none, internal or udreg
-            //
-            _set_check_domain_op_value<char*>(GNI_MR_CACHE, const_cast<char*>(OOMPH_GNI_REG),
-                "GNI_MR_CACHE");
-
-            // --------------------------
-            // GNI_MR_UDREG_REG_LIMIT
-            // Experiments showed default value of 2048 too high if
-            // launching multiple clients on one node
-            //
-            int32_t udreg_limit = 0x0800; // 0x0400 = 1024, 0x0800 = 2048
-            _set_check_domain_op_value<int32_t>(GNI_MR_UDREG_REG_LIMIT, udreg_limit,
-                "GNI_MR_UDREG_REG_LIMIT");
-
-            // --------------------------
-            // GNI_MR_CACHE_LAZY_DEREG
-            // Enable lazy deregistration in MR cache
-            //
-            int32_t enable = 1;
-            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting GNI_MR_CACHE_LAZY_DEREG")));
-            _set_check_domain_op_value<int32_t>(GNI_MR_CACHE_LAZY_DEREG, enable,
-                "GNI_MR_CACHE_LAZY_DEREG");
-
-            // --------------------------
-            // GNI_MSG_RENDEZVOUS_THRESHOLD (c.f. GNI_RMA_RDMA_THRESHOLD)
-            //
-            int32_t thresh = msg_rendezvous_threshold_;
-            _set_check_domain_op_value<int32_t>(GNI_MSG_RENDEZVOUS_THRESHOLD, thresh,
-                "GNI_MSG_RENDEZVOUS_THRESHOLD");
+            fi_freeinfo(fabric_hints_);
         }
-#endif
-        tx_inject_size_ = fabric_info_->tx_attr->inject_size;
-
-        // the number of preposted receives, and sender queue depth
-        // is set by querying the tx/tx attr sizes
-        tx_attr_size_ = std::min(size_t(512), fabric_info_->tx_attr->size / 2);
-        rx_attr_size_ = std::min(size_t(512), fabric_info_->rx_attr->size / 2);
-        fi_freeinfo(fabric_hints_);
-    }
 
-    // --------------------------------------------------------------------
-    struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx)
-    {
-        return static_cast<Derived*>(this)->set_src_dst_addresses(info, tx);
-    }
+        // --------------------------------------------------------------------
+        struct fi_info* set_src_dst_addresses(struct fi_info* info, bool tx)
+        {    // static dispatch: the call is forwarded to the Derived implementation
+            return static_cast<Derived*>(this)->set_src_dst_addresses(info, tx);
+        }
 
 #ifdef HAVE_LIBFABRIC_GNI
-    // --------------------------------------------------------------------
-    // Special GNI extensions to disable memory registration cache
+        // --------------------------------------------------------------------
+        // Special GNI extensions to disable memory registration cache
 
-    // if set is false, the old value is returned and nothing is set
-    template<typename T>
-    int _set_check_domain_op_value(int op, T value, const char* info, bool set = true)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-        static struct fi_gni_ops_domain* gni_domain_ops = nullptr;
-        int                              ret = 0;
-
-        if (gni_domain_ops == nullptr)
+        // set == false: nothing is set, the current value is only read back and logged
+        template <typename T>
+        int _set_check_domain_op_value(int op, T value, char const* info, bool set = true)
         {
-            ret = fi_open_ops(&fabric_domain_->fid, FI_GNI_DOMAIN_OPS_1, 0, (void**)&gni_domain_ops,
-                nullptr);
-            LF_DEB(NS_DEBUG::cnb_deb,
-                debug(debug::str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"),
-                    NS_DEBUG::ptr(gni_domain_ops)));
-        }
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+            static struct fi_gni_ops_domain* gni_domain_ops = nullptr;
+            int ret = 0;
 
-        // if open was ok and set flag is present, then set value
-        if (ret == 0 && set)
-        {
-            ret = gni_domain_ops->set_val(&fabric_domain_->fid, (dom_ops_val_t)(op),
-                reinterpret_cast<void*>(&value));
+            if (gni_domain_ops == nullptr)
+            {
+                ret = fi_open_ops(&fabric_domain_->fid, FI_GNI_DOMAIN_OPS_1, 0,
+                    (void**) &gni_domain_ops, nullptr);
+                LF_DEB(cnb_deb, debug(str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"),
+                    NS_DEBUG::ptr(gni_domain_ops)));
+                if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_open_ops (GNI domain ops)");
+            }
 
-            LF_DEB(NS_DEBUG::cnb_deb,
-                debug(debug::str<>("gni set ops val"), value, (ret == 0 ? "OK" : "FAIL")));
-        }
+            // open failures have thrown above; set the value only when the set flag is present
+            if (ret == 0 && set)
+            {
+                ret = gni_domain_ops->set_val(
+                    &fabric_domain_->fid, (dom_ops_val_t) (op), reinterpret_cast<void*>(&value));
 
-        // Get the value (so we can check that the value we set is now returned)
-        T new_value;
-        ret = gni_domain_ops->get_val(&fabric_domain_->fid, (dom_ops_val_t)(op), &new_value);
-        if constexpr (std::is_integral<T>::value)
-        {
-            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"),
-                                          info, debug::hex<8>(new_value)));
-        }
-        else
-        {
-            LF_DEB(NS_DEBUG::cnb_err,
-                debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, new_value));
-        }
-        //
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info);
+                LF_DEB(cnb_deb, debug(str<>("gni set ops val"), value, (ret == 0 ? "OK" : "FAIL")));
+            }
 
-        return ret;
-    }
+            // Read the value back to verify it; note this replaces the set_val status held in ret
+            T new_value;
+            ret = gni_domain_ops->get_val(&fabric_domain_->fid, (dom_ops_val_t) (op), &new_value);
+            if constexpr (std::is_integral<T>::value)
+            {
+                LF_DEB(cnb_err,
+                    debug(
+                        str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, hex<8>(new_value)));
+            }
+            else
+            {
+                LF_DEB(cnb_err,
+                    debug(str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, new_value));
+            }
+            // a failed read-back is an error (this also applies when set == false)
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info);
+
+            return ret;    // always 0 here: a non-zero status has thrown just above
+        }
 #endif
 
-    // --------------------------------------------------------------------
-    struct fid_ep* new_endpoint_active(struct fid_domain* domain, struct fi_info* info, bool tx)
-    {
-        // don't allow multiple threads to call endpoint create at the same time
-        scoped_lock lock(controller_mutex_);
+        // --------------------------------------------------------------------
+        struct fid_ep* new_endpoint_active(struct fid_domain* domain, struct fi_info* info, bool tx)
+        {
+            // don't allow multiple threads to call endpoint create at the same time
+            scoped_lock lock(controller_mutex_);
 
-        // make sure src_addr/dst_addr are set accordingly
-        // and we do not create two endpoint with the same src address
-        struct fi_info* hints = set_src_dst_addresses(info, tx);
+            // make sure src_addr/dst_addr are set accordingly
+            // and we do not create two endpoints with the same src address
+            struct fi_info* hints = set_src_dst_addresses(info, tx);
 
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-        LF_DEB(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("Got info mode"), (info->mode & FI_NOTIFY_FLAGS_ONLY)));
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+            LF_DEB(cnb_deb, debug(str<>("Got info mode"), (info->mode & FI_NOTIFY_FLAGS_ONLY)));
 
-        struct fid_ep* ep;
-        int            ret = fi_endpoint(domain, hints, &ep, nullptr);
-        if (ret)
-        {
-            throw NS_LIBFABRIC::fabric_error(ret, "fi_endpoint (too many threadlocal "
-                                                  "endpoints?)");
+            struct fid_ep* ep = nullptr;
+            int ret = fi_endpoint(domain, hints, &ep, nullptr);
+            fi_freeinfo(hints);    // owned here; freed before the check so a throw cannot leak it
+            if (ret)
+            {
+                throw NS_LIBFABRIC::fabric_error(
+                    ret, "fi_endpoint (too many threadlocal endpoints?)");
+            }
+            LF_DEB(cnb_deb, debug(str<>("new_endpoint_active"), hptr(ep)));
+            return ep;    // the newly created endpoint
         }
-        fi_freeinfo(hints);
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_active"), NS_DEBUG::ptr(ep)));
-        return ep;
-    }
 
-    // --------------------------------------------------------------------
-    struct fid_ep* new_endpoint_scalable(struct fid_domain* domain, struct fi_info* info, bool tx,
-        size_t threads, size_t& threads_allocated)
-    {
-        // don't allow multiple threads to call endpoint create at the same time
-        scoped_lock lock(controller_mutex_);
+        // --------------------------------------------------------------------
+        struct fid_ep* new_endpoint_scalable(struct fid_domain* domain, struct fi_info* info,
+            bool tx, size_t threads, size_t& threads_allocated)
+        {
+            // don't allow multiple threads to call endpoint create at the same time
+            scoped_lock lock(controller_mutex_);
 
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
 
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo")));
-        struct fi_info* hints = fi_dupinfo(info);
-        if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo");
+            LF_DEB(cnb_deb, debug(str<>("fi_dupinfo")));
+            struct fi_info* hints = fi_dupinfo(info);
+            if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo");
 
-        int             flags = 0;
-        struct fi_info* new_hints = nullptr;
-        int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR),
-            nullptr, nullptr, flags, hints, &new_hints);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_getinfo");
+            struct fi_info* new_hints = nullptr;
+            int ret = fi_getinfo(FI_VERSION(LIBFABRIC_FI_VERSION_MAJOR, LIBFABRIC_FI_VERSION_MINOR),
+                nullptr, nullptr, /*flags*/ 0, hints, &new_hints);
+            fi_freeinfo(hints);    // only needed as fi_getinfo input; freed before any throw
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_getinfo");
 
-        // Check the optimal number of TX/RX contexts supported by the provider
-        size_t context_count = 0;
-        if (tx) { context_count = std::min(new_hints->domain_attr->tx_ctx_cnt, threads); }
-        else { context_count = std::min(new_hints->domain_attr->rx_ctx_cnt, threads); }
+            // Check the optimal number of TX/RX contexts supported by the provider
+            size_t context_count = 0;
+            if (tx) { context_count = std::min(new_hints->domain_attr->tx_ctx_cnt, threads); }
+            else { context_count = std::min(new_hints->domain_attr->rx_ctx_cnt, threads); }
 
-        // clang-format off
-        LF_DEB(NS_DEBUG::cnb_deb,
-            trace(debug::str<>("scalable endpoint"),
+            // clang-format off
+        LF_DEB(cnb_deb,
+            trace(str<>("scalable endpoint"),
                   "Tx", tx,
-                  "Threads", debug::dec<3>(threads),
-                  "tx_ctx_cnt", debug::dec<3>(new_hints->domain_attr->tx_ctx_cnt),
-                  "rx_ctx_cnt", debug::dec<3>(new_hints->domain_attr->rx_ctx_cnt),
-                  "context_count", debug::dec<3>(context_count)));
-        // clang-format on
-
-        threads_allocated = context_count;
-        new_hints->ep_attr->tx_ctx_cnt = context_count;
-        new_hints->ep_attr->rx_ctx_cnt = context_count;
-
-        struct fid_ep* ep;
-        ret = fi_scalable_ep(domain, new_hints, &ep, nullptr);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_scalable_ep");
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_scalable"), NS_DEBUG::ptr(ep)));
-        fi_freeinfo(hints);
-        return ep;
-    }
-
-    // --------------------------------------------------------------------
-    endpoint_wrapper& get_rx_endpoint()
-    {
-        static auto rx = NS_DEBUG::cnb_deb.make_timer(1, debug::str<>("get_rx_endpoint"));
-        LF_DEB(NS_DEBUG::cnb_deb, timed(rx));
+                  "Threads", dec<3>(threads),
+                  "tx_ctx_cnt", dec<3>(new_hints->domain_attr->tx_ctx_cnt),
+                  "rx_ctx_cnt", dec<3>(new_hints->domain_attr->rx_ctx_cnt),
+                  "context_count", dec<3>(context_count)));
+            // clang-format on
+
+            threads_allocated = context_count;    // reported back to the caller
+            new_hints->ep_attr->tx_ctx_cnt = context_count;
+            new_hints->ep_attr->rx_ctx_cnt = context_count;
+
+            struct fid_ep* ep = nullptr;
+            ret = fi_scalable_ep(domain, new_hints, &ep, nullptr);
+            fi_freeinfo(new_hints);    // allocated by fi_getinfo above and was never released
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_scalable_ep");
+            LF_DEB(cnb_deb, debug(str<>("new_endpoint_scalable"), hptr(ep)));
+            return ep;
+        }
 
-        if (endpoint_type_ == endpoint_type::scalableTxRx)
+        // --------------------------------------------------------------------
+        endpoint_wrapper& get_rx_endpoint()
         {
-            if (eps_->tl_srx_.get_ep() == nullptr)
+            static auto rx = NS_DEBUG::cnb_deb.make_timer(1, NS_DEBUG::str<>("get_rx_endpoint"));
+            LF_DEB(cnb_deb, timed(rx));
+
+            if (endpoint_type_ == endpoint_type::scalableTxRx)
             {
-                endpoint_wrapper ep;
-                bool             ok = rx_endpoints_.pop(ep);
-                if (!ok)
+                if (eps_->tl_srx_.get_ep() == nullptr)    // none cached yet: pop one
                 {
-                    // clang-format off
-                    LF_DEB(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop rx",
-                        "ep", NS_DEBUG::ptr(ep.get_ep()),
-                        "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()),
-                        "rx cq", NS_DEBUG::ptr(ep.get_rx_cq())));
-                    // clang-format on
-                    throw std::runtime_error("rx endpoint wrapper pop fail");
+                    endpoint_wrapper ep;
+                    bool ok = rx_endpoints_.pop(ep);
+                    if (!ok)
+                    {
+                        // clang-format off
+                    LF_DEB(cnb_deb, error(str<>("Scalable Ep"), "pop rx",
+                        "ep", hptr(ep.get_ep()),
+                        "tx cq", hptr(ep.get_tx_cq()),
+                        "rx cq", hptr(ep.get_rx_cq())));
+                        // clang-format on
+                        throw std::runtime_error("rx endpoint wrapper pop fail");
+                    }
+                    eps_->tl_srx_ = stack_endpoint(
+                        ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), ep.get_name(), &rx_endpoints_);
+                    LF_DEB(cnb_deb,
+                        trace(str<>("Scalable Ep"), "pop rx", "ep", hptr(eps_->tl_srx_.get_ep()),
+                            "tx cq", hptr(eps_->tl_srx_.get_tx_cq()), "rx cq",
+                            hptr(eps_->tl_srx_.get_rx_cq())));
                 }
-                eps_->tl_srx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(),
-                    ep.get_name(), &rx_endpoints_);
-                LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop rx", "ep",
-                                              NS_DEBUG::ptr(eps_->tl_srx_.get_ep()), "tx cq",
-                                              NS_DEBUG::ptr(eps_->tl_srx_.get_tx_cq()), "rx cq",
-                                              NS_DEBUG::ptr(eps_->tl_srx_.get_rx_cq())));
+                return eps_->tl_srx_.endpoint_;    // the wrapper held inside tl_srx_
             }
-            return eps_->tl_srx_.endpoint_;
+            // every other endpoint type just returns the normal Rx endpoint
+            return eps_->ep_rx_;
         }
-        // otherwise just return the normal Rx endpoint
-        return eps_->ep_rx_;
-    }
 
-    // --------------------------------------------------------------------
-    endpoint_wrapper& get_tx_endpoint()
-    {
-        if (endpoint_type_ == endpoint_type::threadlocalTx)
+        // --------------------------------------------------------------------
+        endpoint_wrapper& get_tx_endpoint()
         {
-            if (eps_->tl_tx_.get_ep() == nullptr)
+            if (endpoint_type_ == endpoint_type::threadlocalTx)
             {
-                [[maybe_unused]] auto scp =
-                    NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, "threadlocal");
-
-                // create a completion queue for tx endpoint
-                fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
-                auto tx_cq = create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size,
-                    "tx threadlocal");
-
-                // setup an endpoint for sending messages
-                // note that the CQ needs FI_RECV even though its a Tx cq to keep
-                // some providers happy as they trigger an error if an endpoint
-                // has no Rx cq attached (progress bug)
-                auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true);
-                bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx threadlocal");
-                bind_address_vector_to_endpoint(ep_tx, av_);
-                enable_endpoint(ep_tx, "tx threadlocal");
-
-                // set threadlocal endpoint wrapper
-                LF_DEB(NS_DEBUG::cnb_deb,
-                    trace(debug::str<>("Threadlocal Ep"), "create Tx", "ep", NS_DEBUG::ptr(ep_tx),
-                        "tx cq", NS_DEBUG::ptr(tx_cq), "rx cq", NS_DEBUG::ptr(nullptr)));
-                // for cleaning up at termination
-                endpoint_wrapper ep(ep_tx, nullptr, tx_cq, "tx threadlocal");
-                tx_endpoints_.push(ep);
-                eps_->tl_tx_ = stack_endpoint(ep_tx, nullptr, tx_cq, "threadlocal", nullptr);
+                if (eps_->tl_tx_.get_ep() == nullptr)    // not created yet: build it now
+                {
+                    [[maybe_unused]] auto scp =
+                        NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, "threadlocal");
+
+                    // create a completion queue for tx endpoint
+                    fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
+                    auto tx_cq = create_completion_queue(
+                        fabric_domain_, fabric_info_->tx_attr->size, "tx threadlocal");
+
+                    // setup an endpoint for sending messages
+                    // note that the CQ needs FI_RECV even though it's a Tx cq to keep
+                    // some providers happy as they trigger an error if an endpoint
+                    // has no Rx cq attached (progress bug)
+                    auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true);
+                    bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx threadlocal");
+                    bind_address_vector_to_endpoint(ep_tx, av_);
+                    enable_endpoint(ep_tx, "tx threadlocal");
+
+                    // set threadlocal endpoint wrapper
+                    LF_DEB(cnb_deb,
+                        trace(str<>("Threadlocal Ep"), "create Tx", "ep", hptr(ep_tx), "tx cq",
+                            hptr(tx_cq), "rx cq", hptr(nullptr)));
+                    // also recorded in tx_endpoints_ for cleaning up at termination
+                    endpoint_wrapper ep(ep_tx, nullptr, tx_cq, "tx threadlocal");
+                    tx_endpoints_.push(ep);
+                    eps_->tl_tx_ = stack_endpoint(ep_tx, nullptr, tx_cq, "threadlocal", nullptr);
+                }
+                return eps_->tl_tx_.endpoint_;
             }
-            return eps_->tl_tx_.endpoint_;
-        }
-        else if (endpoint_type_ == endpoint_type::scalableTx ||
-                 endpoint_type_ == endpoint_type::scalableTxRx)
-        {
-            if (eps_->tl_stx_.get_ep() == nullptr)
+            else if (endpoint_type_ == endpoint_type::scalableTx ||
+                endpoint_type_ == endpoint_type::scalableTxRx)
             {
-                endpoint_wrapper ep;
-                bool             ok = tx_endpoints_.pop(ep);
-                if (!ok)
+                if (eps_->tl_stx_.get_ep() == nullptr)    // none cached yet: pop one
                 {
-                    LF_DEB(NS_DEBUG::cnb_deb,
-                        error(debug::str<>("Scalable Ep"), "pop tx", "ep",
-                            NS_DEBUG::ptr(ep.get_ep()), "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()),
-                            "rx cq", NS_DEBUG::ptr(ep.get_rx_cq())));
-                    throw std::runtime_error("tx endpoint wrapper pop fail");
+                    endpoint_wrapper ep;
+                    bool ok = tx_endpoints_.pop(ep);
+                    if (!ok)
+                    {
+                        LF_DEB(cnb_deb,
+                            error(str<>("Scalable Ep"), "pop tx", "ep", hptr(ep.get_ep()), "tx cq",
+                                hptr(ep.get_tx_cq()), "rx cq", hptr(ep.get_rx_cq())));
+                        throw std::runtime_error("tx endpoint wrapper pop fail");
+                    }
+                    eps_->tl_stx_ = stack_endpoint(
+                        ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), ep.get_name(), &tx_endpoints_);
+                    LF_DEB(cnb_deb,
+                        trace(str<>("Scalable Ep"), "pop tx", "ep", hptr(eps_->tl_stx_.get_ep()),
+                            "tx cq", hptr(eps_->tl_stx_.get_tx_cq()), "rx cq",
+                            hptr(eps_->tl_stx_.get_rx_cq())));
                 }
-                eps_->tl_stx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(),
-                    ep.get_name(), &tx_endpoints_);
-                LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop tx", "ep",
-                                              NS_DEBUG::ptr(eps_->tl_stx_.get_ep()), "tx cq",
-                                              NS_DEBUG::ptr(eps_->tl_stx_.get_tx_cq()), "rx cq",
-                                              NS_DEBUG::ptr(eps_->tl_stx_.get_rx_cq())));
+                return eps_->tl_stx_.endpoint_;
             }
-            return eps_->tl_stx_.endpoint_;
+            else if (endpoint_type_ == endpoint_type::multiple) { return eps_->ep_tx_; }
+            // single : shared tx/rx endpoint, so the rx endpoint is returned for sending too
+            return eps_->ep_rx_;
         }
-        else if (endpoint_type_ == endpoint_type::multiple) { return eps_->ep_tx_; }
-        // single : shared tx/rx endpoint
-        return eps_->ep_rx_;
-    }
-
-    // --------------------------------------------------------------------
-    void bind_address_vector_to_endpoint(struct fid_ep* endpoint, struct fid_av* av)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding AV"), "to", NS_DEBUG::ptr(endpoint)));
-        int ret = fi_ep_bind(endpoint, &av->fid, 0);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind address_vector");
-    }
 
-    // --------------------------------------------------------------------
-    void bind_queue_to_endpoint(struct fid_ep* endpoint, struct fid_cq*& cq, uint32_t cqtype,
-        const char* type)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type);
-
-        LF_DEB(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("Binding CQ"), "to", NS_DEBUG::ptr(endpoint), type));
-        int ret = fi_ep_bind(endpoint, &cq->fid, cqtype);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind cq");
-    }
+        // --------------------------------------------------------------------
+        void bind_address_vector_to_endpoint(struct fid_ep* endpoint, struct fid_av* av)
+        {    // binds av to endpoint via fi_ep_bind (flags = 0); throws fabric_error on failure
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
 
-    // --------------------------------------------------------------------
-    fid_cq* bind_tx_queue_to_rx_endpoint(struct fi_info* info, struct fid_ep* ep)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-        info->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
-        fid_cq* tx_cq = create_completion_queue(fabric_domain_, info->tx_attr->size, "tx->rx");
-        // shared send/recv endpoint - bind send cq to the recv endpoint
-        bind_queue_to_endpoint(ep, tx_cq, FI_TRANSMIT, "tx->rx bug fix");
-        return tx_cq;
-    }
+            LF_DEB(cnb_deb, debug(str<>("Binding AV"), "to", hptr(endpoint)));
+            int ret = fi_ep_bind(endpoint, &av->fid, 0);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind address_vector");
+        }
 
-    // --------------------------------------------------------------------
-    void enable_endpoint(struct fid_ep* endpoint, const char* type)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type);
+        // --------------------------------------------------------------------
+        void bind_queue_to_endpoint(
+            struct fid_ep* endpoint, struct fid_cq*& cq, uint32_t cqtype, char const* type)
+        {    // binds cq to endpoint using cqtype as the fi_ep_bind flags; throws on failure
+            [[maybe_unused]] auto scp =
+                NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, type);
 
-        LF_DEB(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("Enabling endpoint"), NS_DEBUG::ptr(endpoint)));
-        int ret = fi_enable(endpoint);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_enable");
-    }
+            LF_DEB(cnb_deb, debug(str<>("Binding CQ"), "to", hptr(endpoint), type));
+            int ret = fi_ep_bind(endpoint, &cq->fid, cqtype);    // type is only used for logging
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind cq");
+        }
 
-    // --------------------------------------------------------------------
-    locality get_endpoint_address(struct fid* id)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
+        // --------------------------------------------------------------------
+        fid_cq* bind_tx_queue_to_rx_endpoint(struct fi_info* info, struct fid_ep* ep)
+        {    // creates a tx CQ (setting completion flags in info), binds it to ep and returns it
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+            info->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
+            fid_cq* tx_cq = create_completion_queue(fabric_domain_, info->tx_attr->size, "tx->rx");
+            // shared send/recv endpoint - bind send cq to the recv endpoint
+            bind_queue_to_endpoint(ep, tx_cq, FI_TRANSMIT, "tx->rx bug fix");
+            return tx_cq;    // the caller receives the newly created queue
+        }
 
-        locality::locality_data local_addr;
-        std::size_t             addrlen = locality_defs::array_size;
-        int                     ret = fi_getname(id, local_addr.data(), &addrlen);
-        if (ret || (addrlen > locality_defs::array_size))
+        // --------------------------------------------------------------------
+        void enable_endpoint(struct fid_ep* endpoint, char const* type)
         {
-            std::string err =
-                std::to_string(addrlen) + "=" + std::to_string(locality_defs::array_size);
-            NS_LIBFABRIC::fabric_error(ret, "fi_getname - size error or other problem " + err);
+            [[maybe_unused]] auto scp =
+                NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, type);
+
+            LF_DEB(cnb_deb, debug(str<>("Enabling endpoint"), hptr(endpoint)));
+            int ret = fi_enable(endpoint);    // type only labels the debug scope above
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_enable");
         }
 
-        // optimized out when debug logging is false
-        if constexpr (NS_DEBUG::cnb_deb.is_enabled())
+        // --------------------------------------------------------------------
+        locality get_endpoint_address(struct fid* id)
         {
-            std::stringstream temp1;
-            for (std::size_t i = 0; i < locality_defs::array_length; ++i)
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+
+            locality::locality_data local_addr;
+            std::size_t addrlen = locality_defs::array_size;    // in/out for fi_getname
+            int ret = fi_getname(id, local_addr.data(), &addrlen);
+            if (ret || (addrlen > locality_defs::array_size))
             {
-                temp1 << debug::ipaddr(&local_addr[i]) << " - ";
+                std::string err =
+                    std::to_string(addrlen) + "=" + std::to_string(locality_defs::array_size);
+                throw NS_LIBFABRIC::fabric_error(ret, "fi_getname - error (address size ?) " + err);
             }
 
-            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), "size",
-                                          debug::dec<>(addrlen), " : ", temp1.str().c_str()));
-            std::stringstream temp2;
-            for (std::size_t i = 0; i < locality_defs::array_length; ++i)
+            // optimized out when debug logging is false
+            if constexpr (NS_DEBUG::cnb_deb.is_enabled())
             {
-                temp2 << debug::hex<8>(local_addr[i]) << " - ";
+                LF_DEB(cnb_deb,
+                    debug(str<>("raw address data"), "size", dec<4>(addrlen), " : ",
+                        locality(local_addr, av_).to_str()));
+
+                std::stringstream temp2;
+                for (std::size_t i = 0; i < locality_defs::array_length; ++i)
+                {
+                    temp2 << NS_DEBUG::hex<8>(local_addr[i]) << " - ";
+                }
+                LF_DEB(cnb_deb, debug(str<>("raw address data"), temp2.str().c_str()));
             }
-            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), temp2.str().c_str()));
+            return locality(local_addr, av_);    // the fetched address paired with av_
         }
-        return locality(local_addr);
-    }
 
-    // --------------------------------------------------------------------
-    fid_pep* create_passive_endpoint(struct fid_fabric* fabric, struct fi_info* info)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
+        // --------------------------------------------------------------------
+        fid_pep* create_passive_endpoint(struct fid_fabric* fabric, struct fi_info* info)
+        {
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
 
-        struct fid_pep* ep;
-        int             ret = fi_passive_ep(fabric, info, &ep, nullptr);
-        if (ret) { throw NS_LIBFABRIC::fabric_error(ret, "Failed to create fi_passive_ep"); }
-        return ep;
-    }
+            struct fid_pep* ep;
+            int ret = fi_passive_ep(fabric, info, &ep, nullptr);
+            if (ret) { throw NS_LIBFABRIC::fabric_error(ret, "Failed to create fi_passive_ep"); }
+            return ep;
+        }
 
-    // --------------------------------------------------------------------
-    inline const locality& here() const { return here_; }
+        // --------------------------------------------------------------------
+        inline locality const& here() const { return here_; }
 
-    // --------------------------------------------------------------------
-    inline const fi_addr_t& fi_address() const { return here_.fi_address(); }
+        // --------------------------------------------------------------------
+        inline fi_addr_t const& fi_address() const { return here_.fi_address(); }
 
-    // --------------------------------------------------------------------
-    inline void setHere(const locality& val) { here_ = val; }
+        // --------------------------------------------------------------------
+        inline void setHere(locality const& val) { here_ = val; }
 
-    // --------------------------------------------------------------------
-    inline const locality& root() const { return root_; }
+        // --------------------------------------------------------------------
+        inline locality const& root() const { return root_; }
 
-    // --------------------------------------------------------------------
-    inline struct fid_domain* get_domain() const { return fabric_domain_; }
+        // --------------------------------------------------------------------
+        inline struct fid_domain* get_domain() const { return fabric_domain_; }
 
-    // --------------------------------------------------------------------
-    inline std::size_t get_rma_protocol_size() { return 65536; }
+        // --------------------------------------------------------------------
+        inline std::size_t get_rma_protocol_size() { return 65536; }
 #ifdef DISABLE_FI_INJECT
-    // --------------------------------------------------------------------
-    inline std::size_t get_tx_inject_size() { return 0; }
+        // --------------------------------------------------------------------
+        inline std::size_t get_tx_inject_size() { return 0; }
 #else
-    // --------------------------------------------------------------------
-    inline std::size_t get_tx_inject_size() { return tx_inject_size_; }
+        // --------------------------------------------------------------------
+        inline std::size_t get_tx_inject_size() { return tx_inject_size_; }
 #endif
 
-    // --------------------------------------------------------------------
-    inline std::size_t get_tx_size() { return tx_attr_size_; }
+        // --------------------------------------------------------------------
+        inline std::size_t get_tx_size() { return tx_attr_size_; }
 
-    // --------------------------------------------------------------------
-    inline std::size_t get_rx_size() { return rx_attr_size_; }
+        // --------------------------------------------------------------------
+        inline std::size_t get_rx_size() { return rx_attr_size_; }
 
-    // --------------------------------------------------------------------
-    // returns true when all connections have been disconnected and none are active
-    inline bool isTerminated()
-    {
-        return false;
-        //return (qp_endpoint_map_.size() == 0);
-    }
+        // --------------------------------------------------------------------
+        // returns true when all connections have been disconnected and none are active
+        inline bool isTerminated()
+        {
+            return false;
+            // return (qp_endpoint_map_.size() == 0);
+        }
 
-    // --------------------------------------------------------------------
-    void debug_print_av_vector(std::size_t N)
-    {
-        locality    addr;
-        std::size_t addrlen = locality_defs::array_size;
-        for (std::size_t i = 0; i < N; ++i)
+        // --------------------------------------------------------------------
+        void debug_print_av_vector(std::size_t N)
         {
-            int ret = fi_av_lookup(av_, fi_addr_t(i), addr.fabric_data_writable(), &addrlen);
-            addr.set_fi_address(fi_addr_t(i));
-            if ((ret == 0) && (addrlen == locality_defs::array_size))
-            {
-                LF_DEB(NS_DEBUG::cnb_deb,
-                    debug(debug::str<>("address vector"), debug::dec<3>(i), iplocality(addr)));
-            }
-            else
+            locality addr;
+            std::size_t addrlen = locality_defs::array_size;
+            for (std::size_t i = 0; i < N; ++i)
             {
-                LF_DEB(NS_DEBUG::cnb_err,
-                    error(debug::str<>("address length"), debug::dec<3>(addrlen),
-                        debug::dec<3>(locality_defs::array_size)));
-                throw std::runtime_error("debug_print_av_vector : address vector "
-                                         "traversal failure");
+                int ret = fi_av_lookup(av_, fi_addr_t(i), addr.fabric_data_writable(), &addrlen);
+                addr.set_fi_address(fi_addr_t(i));
+                if ((ret == 0) && (addrlen <= locality_defs::array_size))
+                {
+                    LF_DEB(cnb_deb, debug(str<>("address vector"), dec<3>(i), addr.to_str()));
+                }
+                else
+                {
+                    LF_DEB(cnb_err,
+                        error(str<>("address length"), dec<3>(addrlen),
+                            dec<3>(locality_defs::array_size)));
+                    throw std::runtime_error("debug_print_av_vector : address vector "
+                                             "traversal failure");
+                }
             }
         }
-    }
 
-    // --------------------------------------------------------------------
-    inline constexpr bool bypass_tx_lock()
-    {
+        // --------------------------------------------------------------------
+        inline constexpr bool bypass_tx_lock()
+        {
 #if defined(HAVE_LIBFABRIC_GNI)
-        return true;
-#elif defined(HAVE_LIBFABRIC_CXI)
-        // @todo : cxi provider is not yet thread safe using scalable endpoints
-        return false;
+            return true;
+#elif defined(HAVE_LIBFABRIC_LNX)
+            // @todo : provider is not yet thread safe using scalable endpoints
+            return false;
 #else
-        return (threadlevel_flags() == FI_THREAD_SAFE ||
+            return (threadlevel_flags() == FI_THREAD_SAFE ||
                 endpoint_type_ == endpoint_type::threadlocalTx);
 #endif
-    }
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock get_tx_lock()
-    {
-        if (bypass_tx_lock()) return unique_lock();
-        return unique_lock(send_mutex_);
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock get_tx_lock()
+        {
+            if (bypass_tx_lock()) return unique_lock();
+            return unique_lock(send_mutex_);
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock try_tx_lock()
-    {
-        if (bypass_tx_lock()) return unique_lock();
-        return unique_lock(send_mutex_, std::try_to_lock_t{});
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock try_tx_lock()
+        {
+            if (bypass_tx_lock()) return unique_lock();
+            return unique_lock(send_mutex_, std::try_to_lock_t{});
+        }
 
-    // --------------------------------------------------------------------
-    inline constexpr bool bypass_rx_lock()
-    {
+        // --------------------------------------------------------------------
+        inline constexpr bool bypass_rx_lock()
+        {
 #ifdef HAVE_LIBFABRIC_GNI
-        return true;
+            return true;
 #else
-        return (
-            threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::scalableTxRx);
+            return (threadlevel_flags() == FI_THREAD_SAFE ||
+                endpoint_type_ == endpoint_type::scalableTxRx);
 #endif
-    }
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock get_rx_lock()
-    {
-        if (bypass_rx_lock()) return unique_lock();
-        return unique_lock(recv_mutex_);
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock get_rx_lock()
+        {
+            if (bypass_rx_lock()) return unique_lock();
+            return unique_lock(recv_mutex_);
+        }
 
-    // --------------------------------------------------------------------
-    inline controller_base::unique_lock try_rx_lock()
-    {
-        if (bypass_rx_lock()) return unique_lock();
-        return unique_lock(recv_mutex_, std::try_to_lock_t{});
-    }
+        // --------------------------------------------------------------------
+        inline controller_base::unique_lock try_rx_lock()
+        {
+            if (bypass_rx_lock()) return unique_lock();
+            return unique_lock(recv_mutex_, std::try_to_lock_t{});
+        }
 
-    // --------------------------------------------------------------------
-    progress_status poll_for_work_completions(void* user_data)
-    {
-        progress_status p{0, 0};
-        bool            retry = false;
-        do {
-            // sends
-            uint32_t nsend = static_cast<Derived*>(this)->poll_send_queue(
-                get_tx_endpoint().get_tx_cq(), user_data);
-            p.m_num_sends += nsend;
-            retry = (nsend == max_completions_per_poll_);
-            // recvs
-            uint32_t nrecv = static_cast<Derived*>(this)->poll_recv_queue(
-                get_rx_endpoint().get_rx_cq(), user_data);
-            p.m_num_recvs += nrecv;
-            retry |= (nrecv == max_completions_per_poll_);
-        } while (retry);
-        return p;
-    }
+        // --------------------------------------------------------------------
+        progress_status poll_for_work_completions(void* user_data)
+        {
+            progress_status p{0, 0};
+            bool retry = false;
+            do {
+                // sends
+                uint32_t nsend = static_cast<Derived*>(this)->poll_send_queue(
+                    get_tx_endpoint().get_tx_cq(), user_data);
+                p.m_num_sends += nsend;
+                retry = (nsend == max_completions_per_poll_);
+                // recvs
+                uint32_t nrecv = static_cast<Derived*>(this)->poll_recv_queue(
+                    get_rx_endpoint().get_rx_cq(), user_data);
+                p.m_num_recvs += nrecv;
+                retry |= (nrecv == max_completions_per_poll_);
+            } while (retry);
+            return p;
+        }
 
-    // --------------------------------------------------------------------
-    inline int poll_send_queue(fid_cq* tx_cq, void* user_data)
-    {
-        return static_cast<Derived*>(this)->poll_send_queue(tx_cq, user_data);
-    }
+        // --------------------------------------------------------------------
+        inline int poll_send_queue(fid_cq* tx_cq, void* user_data)
+        {
+            return static_cast<Derived*>(this)->poll_send_queue(tx_cq, user_data);
+        }
 
-    // --------------------------------------------------------------------
-    inline int poll_recv_queue(fid_cq* rx_cq, void* user_data)
-    {
-        return static_cast<Derived*>(this)->poll_recv_queue(rx_cq, user_data);
-    }
+        // --------------------------------------------------------------------
+        inline int poll_recv_queue(fid_cq* rx_cq, void* user_data)
+        {
+            return static_cast<Derived*>(this)->poll_recv_queue(rx_cq, user_data);
+        }
 
-    // --------------------------------------------------------------------
-    struct fid_cq* create_completion_queue(struct fid_domain* domain, size_t size, const char* type)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type);
-
-        struct fid_cq* cq;
-        fi_cq_attr     cq_attr = {};
-        cq_attr.format = FI_CQ_FORMAT_MSG;
-        cq_attr.wait_obj = FI_WAIT_NONE;
-        cq_attr.wait_cond = FI_CQ_COND_NONE;
-        cq_attr.size = size;
-        cq_attr.flags = 0 /*FI_COMPLETION*/;
-        LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("CQ size"), debug::dec<4>(size)));
-        // open completion queue on fabric domain and set context to null
-        int ret = fi_cq_open(domain, &cq_attr, &cq, nullptr);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_cq_open");
-        return cq;
-    }
+        // --------------------------------------------------------------------
+        struct fid_cq* create_completion_queue(
+            struct fid_domain* domain, size_t size, char const* type)
+        {
+            [[maybe_unused]] auto scp =
+                NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__, type);
+
+            struct fid_cq* cq;
+            fi_cq_attr cq_attr = {};
+            cq_attr.format = FI_CQ_FORMAT_MSG;
+            cq_attr.wait_obj = FI_WAIT_NONE;
+            cq_attr.wait_cond = FI_CQ_COND_NONE;
+            cq_attr.size = size;
+            cq_attr.flags = 0 /*FI_COMPLETION*/;
+            LF_DEB(cnb_deb, trace(str<>("CQ size"), dec<4>(size)));
+            // open completion queue on fabric domain and set context to null
+            int ret = fi_cq_open(domain, &cq_attr, &cq, nullptr);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_cq_open");
+            return cq;
+        }
 
-    // --------------------------------------------------------------------
-    fid_av* create_address_vector(struct fi_info* info, int N, int num_rx_contexts)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
+        // --------------------------------------------------------------------
+        fid_av* create_address_vector(struct fi_info* info, int N, int num_rx_contexts)
+        {
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
 
-        fid_av*    av;
-        fi_av_attr av_attr = {fi_av_type(0), 0, 0, 0, nullptr, nullptr, 0};
+            fid_av* av;
+            fi_av_attr av_attr = {fi_av_type(0), 0, 0, 0, nullptr, nullptr, 0};
 
-        // number of addresses expected
-        av_attr.count = N;
+            // number of addresses expected
+            av_attr.count = N;
 
-        // number of receive contexts used
-        int rx_ctx_bits = 0;
+            // number of receive contexts used
+            int rx_ctx_bits = 0;
 #ifdef RX_CONTEXTS_SUPPORT
-        while (num_rx_contexts >> ++rx_ctx_bits)
-            ;
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("rx_ctx_bits"), rx_ctx_bits));
+            while (num_rx_contexts >> ++rx_ctx_bits);
+            LF_DEB(cnb_deb, debug(str<>("rx_ctx_bits"), rx_ctx_bits));
 #endif
-        av_attr.rx_ctx_bits = rx_ctx_bits;
-        // if contexts is nonzero, then we are using a single scalable endpoint
-        av_attr.ep_per_node = (num_rx_contexts > 0) ? 2 : 0;
+            av_attr.rx_ctx_bits = rx_ctx_bits;
+            // if contexts is nonzero, then we are using a single scalable endpoint
+            av_attr.ep_per_node = (num_rx_contexts > 0) ? 2 : 0;
 
-        if (info->domain_attr->av_type != FI_AV_UNSPEC)
-        {
-            av_attr.type = info->domain_attr->av_type;
-        }
-        else
-        {
-            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("map FI_AV_TABLE")));
-            av_attr.type = FI_AV_TABLE;
-        }
+            if (info->domain_attr->av_type != FI_AV_UNSPEC)
+            {
+                av_attr.type = info->domain_attr->av_type;
+            }
+            else
+            {
+                LF_DEB(cnb_deb, debug(str<>("map FI_AV_TABLE")));
+                av_attr.type = FI_AV_TABLE;
+            }
 
-        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating AV")));
-        int ret = fi_av_open(fabric_domain_, &av_attr, &av, nullptr);
-        if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_av_open");
-        return av;
-    }
+            LF_DEB(cnb_deb, debug(str<>("Creating AV")));
+            int ret = fi_av_open(fabric_domain_, &av_attr, &av, nullptr);
+            if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_av_open");
+            return av;
+        }
 
-    // --------------------------------------------------------------------
-    locality insert_address(const locality& address) { return insert_address(av_, address); }
+        // --------------------------------------------------------------------
+        locality insert_address(locality const& address) { return insert_address(av_, address); }
 
-    // --------------------------------------------------------------------
-    locality insert_address(fid_av* av, const locality& address)
-    {
-        [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-
-        LF_DEB(NS_DEBUG::cnb_deb,
-            trace(debug::str<>("inserting AV"), iplocality(address), NS_DEBUG::ptr(av)));
-        fi_addr_t fi_addr = 0xffffffff;
-        int       ret = fi_av_insert(av, address.fabric_data(), 1, &fi_addr, 0, nullptr);
-        if (ret < 0) { throw NS_LIBFABRIC::fabric_error(ret, "fi_av_insert"); }
-        else if (ret == 0)
+        // --------------------------------------------------------------------
+        locality insert_address(fid_av* av, locality const& address)
         {
-            NS_DEBUG::cnb_deb.error("fi_av_insert called with existing address");
-            NS_LIBFABRIC::fabric_error(ret, "fi_av_insert did not return 1");
+            [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::hptr(this), __func__);
+
+            LF_DEB(cnb_deb, trace(str<>("inserting AV"), address.to_str(), hptr(av)));
+            fi_addr_t fi_addr = 0xffff'ffff;
+            int ret = fi_av_insert(av, address.fabric_data().data(), 1, &fi_addr, 0, nullptr);
+            if (ret < 0) { throw NS_LIBFABRIC::fabric_error(ret, "fi_av_insert"); }
+            else if (ret == 0)
+            {
+                LF_DEB(cnb_deb, error("fi_av_insert called with existing address"));
+                NS_LIBFABRIC::fabric_error(ret, "fi_av_insert did not return 1");
+            }
+            // address was generated correctly, now update the locality with the fi_addr
+            locality new_locality(address, fi_addr, av);
+            LF_DEB(cnb_deb,
+                trace(str<>("AV add"), "rank", dec<>(fi_addr), new_locality.to_str(), "fi_addr",
+                    hex<4>(fi_addr)));
+            return new_locality;
         }
-        // address was generated correctly, now update the locality with the fi_addr
-        locality new_locality(address, fi_addr);
-        LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("AV add"), "rank", debug::dec<>(fi_addr),
-                                      iplocality(new_locality), "fi_addr", debug::hex<4>(fi_addr)));
-        return new_locality;
-    }
-};
+    };
 
-} // namespace NS_LIBFABRIC
+}    // namespace NS_LIBFABRIC
diff --git a/src/libfabric/fabric_error.hpp b/src/libfabric/fabric_error.hpp
index 0f2db4c1..84e43dd5 100644
--- a/src/libfabric/fabric_error.hpp
+++ b/src/libfabric/fabric_error.hpp
@@ -11,42 +11,39 @@
 
 #include <stdexcept>
 #include <string>
-#include <string.h>
 //
 #include <rdma/fi_errno.h>
 //
 #include "oomph_libfabric_defines.hpp"
 
-namespace NS_DEBUG
-{
-// cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> err_deb("ERROR__");
-} // namespace NS_DEBUG
+namespace NS_DEBUG {
+    // cppcheck-suppress ConfigurationNotChecked
+    static NS_DEBUG::enable_print<false> err_deb("ERROR__");
+}    // namespace NS_DEBUG
 
-namespace NS_LIBFABRIC
-{
+namespace NS_LIBFABRIC {
 
-class fabric_error : public std::runtime_error
-{
-  public:
-    // --------------------------------------------------------------------
-    fabric_error(int err, const std::string& msg)
-    : std::runtime_error(std::string(fi_strerror(-err)) + msg)
-    , error_(err)
+    class fabric_error : public std::runtime_error
     {
-        NS_DEBUG::err_deb.error(msg, ":", fi_strerror(-err));
-        std::terminate();
-    }
+    public:
+        // --------------------------------------------------------------------
+        fabric_error(int err, std::string const& msg)
+          : std::runtime_error(std::string(fi_strerror(-err)) + msg)
+          , error_(err)
+        {
+            NS_DEBUG::err_deb.error(msg, ":", fi_strerror(-err));
+            std::terminate();
+        }
 
-    fabric_error(int err)
-    : std::runtime_error(fi_strerror(-err))
-    , error_(-err)
-    {
-        NS_DEBUG::err_deb.error(what());
-        std::terminate();
-    }
+        fabric_error(int err)
+          : std::runtime_error(fi_strerror(-err))
+          , error_(-err)
+        {
+            NS_DEBUG::err_deb.error(what());
+            std::terminate();
+        }
 
-    int error_;
-};
+        int error_;
+    };
 
-} // namespace NS_LIBFABRIC
+}    // namespace NS_LIBFABRIC
diff --git a/src/libfabric/libfabric_defines_template.hpp b/src/libfabric/libfabric_defines_template.hpp
index 64c04944..ea2a105b 100644
--- a/src/libfabric/libfabric_defines_template.hpp
+++ b/src/libfabric/libfabric_defines_template.hpp
@@ -14,26 +14,29 @@
 // some namespaces for the lib and for debugging are setup correctly
 
 #define NS_LIBFABRIC oomph::libfabric
-#define NS_MEMORY    oomph::libfabric
-#define NS_DEBUG     oomph::debug
+#define NS_MEMORY oomph::libfabric
+#define NS_DEBUG oomph::debug
 
 #ifndef LF_DEB
-#define LF_DEB(printer, Expr)                                                                      \
-    if constexpr (printer.is_enabled()) { printer.Expr; };
+# define LF_DEB(printer, Expr)                                                                     \
+     {                                                                                             \
+         using namespace NS_DEBUG;                                                                 \
+         if constexpr (printer.is_enabled()) { printer.Expr; };                                    \
+     }
 #endif
 
 #define LFSOURCE_DIR "@OOMPH_SRC_LIBFABRIC_DIR@"
-#define LFPRINT_HPP  "@OOMPH_SRC_LIBFABRIC_DIR@/print.hpp"
-#define LFCOUNT_HPP  "@OOMPH_SRC_LIBFABRIC_DIR@/simple_counter.hpp"
+#define LFPRINT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/print.hpp"
+#define LFCOUNT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/simple_counter.hpp"
 
 // oomph has a debug print helper file in the main source tree
 #if __has_include(LFPRINT_HPP)
-#include LFPRINT_HPP
-#define has_debug 1
+# include LFPRINT_HPP
+# define has_debug 1
 #endif
 
 #if __has_include(LFCOUNT_HPP)
-#include LFCOUNT_HPP
+# include LFCOUNT_HPP
 #endif
 
 #endif
diff --git a/src/libfabric/locality.cpp b/src/libfabric/locality.cpp
deleted file mode 100644
index 487912f5..00000000
--- a/src/libfabric/locality.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * ghex-org
- *
- * Copyright (c) 2014-2023, ETH Zurich
- * All rights reserved.
- *
- * Please, refer to the LICENSE file in the root directory.
- * SPDX-License-Identifier: BSD-3-Clause
- */
-
-#include <locality.hpp>
-
-namespace oomph
-{
-namespace libfabric
-{
-
-// ------------------------------------------------------------------
-// format as ip address, port, libfabric address
-// ------------------------------------------------------------------
-iplocality::iplocality(const locality& l)
-: data(l)
-{
-}
-
-std::ostream&
-operator<<(std::ostream& os, const iplocality& p)
-{
-    os << std::dec << NS_DEBUG::ipaddr(p.data.fabric_data()) << " - "
-       << NS_DEBUG::ipaddr(p.data.ip_address()) << ":" << NS_DEBUG::dec<>(p.data.port()) << " ("
-       << NS_DEBUG::dec<>(p.data.fi_address()) << ") ";
-    return os;
-}
-
-} // namespace libfabric
-} // namespace oomph
diff --git a/src/libfabric/locality.hpp b/src/libfabric/locality.hpp
index 74f6b290..9e91cec1 100644
--- a/src/libfabric/locality.hpp
+++ b/src/libfabric/locality.hpp
@@ -16,242 +16,191 @@
 #include <utility>
 //
 #include <rdma/fabric.h>
-#include <netinet/in.h>
+#include <rdma/fi_domain.h>
+//
 #include <arpa/inet.h>
+#include <netinet/in.h>
 //
 #include "oomph_libfabric_defines.hpp"
 
-// Different providers use different address formats that we must accommodate
-// in our locality object.
+// Different providers use different address formats that we must accommodate in our locality object.
 #ifdef HAVE_LIBFABRIC_GNI
-#define HAVE_LIBFABRIC_LOCALITY_SIZE 48
+# define HAVE_LIBFABRIC_LOCALITY_SIZE 48
 #endif
 
 #ifdef HAVE_LIBFABRIC_CXI
-#ifdef HAVE_LIBFABRIC_CXI_1_15
-#define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(int)
-#else
-#define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(long int)
-#endif
+# ifdef HAVE_LIBFABRIC_CXI_1_15
+#  define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(int)
+# else
+#  define HAVE_LIBFABRIC_LOCALITY_SIZE sizeof(long int)
+# endif
 #endif
 
 #ifdef HAVE_LIBFABRIC_EFA
-#define HAVE_LIBFABRIC_LOCALITY_SIZE 32
+# define HAVE_LIBFABRIC_LOCALITY_SIZE 32
 #endif
 
 #if defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_TCP) ||                                \
     defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_PSM2)
-#define HAVE_LIBFABRIC_LOCALITY_SIZE 16
-#define HAVE_LIBFABRIC_LOCALITY_SOCKADDR
+# define HAVE_LIBFABRIC_LOCALITY_SIZE 16
 #endif
 
-namespace oomph
-{
-// cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> loc_deb("LOCALTY");
-} // namespace oomph
-
-namespace oomph
-{
-namespace libfabric
-{
-
-struct locality;
-
-// ------------------------------------------------------------------
-// format as ip address, port, libfabric address
-// ------------------------------------------------------------------
-struct iplocality
-{
-    const locality& data;
-    iplocality(const locality& a);
-    friend std::ostream& operator<<(std::ostream& os, const iplocality& p);
-};
-
-// --------------------------------------------------------------------
-// Locality, in this structure we store the information required by
-// libfabric to make a connection to another node.
-// With libfabric 1.4.x the array contains the fabric ip address stored
-// as the second uint32_t in the array. For this reason we use an
-// array of uint32_t rather than uint8_t/char so we can easily access
-// the ip for debug/validation purposes
-// --------------------------------------------------------------------
-namespace locality_defs
-{
-// the number of 32bit ints stored in our array
-const uint32_t array_size = HAVE_LIBFABRIC_LOCALITY_SIZE;
-const uint32_t array_length = HAVE_LIBFABRIC_LOCALITY_SIZE / 4;
-} // namespace locality_defs
-
-struct locality
-{
-    // array type of our locality data
-    typedef std::array<uint32_t, locality_defs::array_length> locality_data;
-
-    static const char* type() { return "libfabric"; }
-
-    explicit locality(const locality_data& in_data)
-    {
-        std::memcpy(&data_[0], &in_data[0], locality_defs::array_size);
-        fi_address_ = 0;
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("expl constructing"), iplocality((*this))));
-    }
-
-    locality()
-    {
-        std::memset(&data_[0], 0x00, locality_defs::array_size);
-        fi_address_ = 0;
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("default construct"), iplocality((*this))));
-    }
-
-    locality(const locality& other)
-    : data_(other.data_)
-    , fi_address_(other.fi_address_)
-    {
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy construct"), iplocality((*this))));
-    }
-
-    locality(const locality& other, fi_addr_t addr)
-    : data_(other.data_)
-    , fi_address_(addr)
-    {
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), iplocality((*this))));
-    }
-
-    locality(locality&& other)
-    : data_(std::move(other.data_))
-    , fi_address_(other.fi_address_)
-    {
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("move construct"), iplocality((*this))));
-    }
-
-    // provided to support sockets mode bootstrap
-    explicit locality(const std::string& address, const std::string& portnum)
-    {
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), address, ":", portnum));
-        //
-        struct sockaddr_in socket_data;
-        memset(&socket_data, 0, sizeof(socket_data));
-        socket_data.sin_family = AF_INET;
-        socket_data.sin_port = htons(std::stol(portnum));
-        inet_pton(AF_INET, address.c_str(), &(socket_data.sin_addr));
-        //
-        std::memcpy(&data_[0], &socket_data, locality_defs::array_size);
-        fi_address_ = 0;
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("string constructing"), iplocality((*this))));
-    }
-
-    // some condition marking this locality as valid
-    explicit inline operator bool() const
-    {
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("bool operator"), iplocality((*this))));
-        return (ip_address() != 0);
-    }
-
-    inline bool valid() const
-    {
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("valid operator"), iplocality((*this))));
-        return (ip_address() != 0);
-    }
-
-    locality& operator=(const locality& other)
-    {
-        data_ = other.data_;
-        fi_address_ = other.fi_address_;
-        LF_DEB(loc_deb,
-            trace(NS_DEBUG::str<>("copy operator"), iplocality(*this), iplocality(other)));
-        return *this;
-    }
-
-    bool operator==(const locality& other)
-    {
-        LF_DEB(loc_deb,
-            trace(NS_DEBUG::str<>("equality operator"), iplocality(*this), iplocality(other)));
-        return std::memcmp(&data_, &other.data_, locality_defs::array_size) == 0;
-    }
-
-    bool less_than(const locality& other)
-    {
-        LF_DEB(loc_deb,
-            trace(NS_DEBUG::str<>("less operator"), iplocality(*this), iplocality(other)));
-        if (ip_address() < other.ip_address()) return true;
-        if (ip_address() == other.ip_address()) return port() < other.port();
-        return false;
-    }
+#if defined(HAVE_LIBFABRIC_SHM)
+# define HAVE_LIBFABRIC_LOCALITY_SIZE 24
+#endif
 
-    const uint32_t& ip_address() const
-    {
-#if defined(HAVE_LIBFABRIC_LOCALITY_SOCKADDR)
-        return reinterpret_cast<const struct sockaddr_in*>(data_.data())->sin_addr.s_addr;
-#elif defined(HAVE_LIBFABRIC_GNI)
-        return data_[0];
-#elif defined(HAVE_LIBFABRIC_CXI)
-        return data_[0];
-#elif defined(HAVE_LIBFABRIC_EFA)
-        return data_[0];
-#else
-        throw fabric_error(0, "unsupported fabric provider, please fix ASAP");
+#if defined(HAVE_LIBFABRIC_LNX)
+# define HAVE_LIBFABRIC_LOCALITY_SIZE 512
 #endif
-    }
 
-    static const uint32_t& ip_address(const locality_data& data)
-    {
-#if defined(HAVE_LIBFABRIC_LOCALITY_SOCKADDR)
-        return reinterpret_cast<const struct sockaddr_in*>(&data)->sin_addr.s_addr;
-#elif defined(HAVE_LIBFABRIC_GNI)
-        return data[0];
-#elif defined(HAVE_LIBFABRIC_CXI)
-        return data[0];
-#elif defined(HAVE_LIBFABRIC_EFA)
-        return data[0];
+namespace oomph {
+    // cppcheck-suppress ConfigurationNotChecked
+    static NS_DEBUG::enable_print<false> loc_deb("LOCALTY");
+}    // namespace oomph
+
+namespace oomph { namespace libfabric {
+
+    struct locality;
+
+    // --------------------------------------------------------------------
+    // Locality, in this structure we store the information required by
+    // libfabric to make a connection to another node.
+    // With libfabric 1.4.x the array contains the fabric ip address stored
+    // as the second uint32_t in the array. For this reason we use an
+    // array of uint32_t rather than uint8_t/char so we can easily access
+    // the ip for debug/validation purposes
+    // --------------------------------------------------------------------
+    namespace locality_defs {
+        // array_size: provider address size in bytes; array_length: count of 32bit ints stored
+        uint32_t const array_size = HAVE_LIBFABRIC_LOCALITY_SIZE;
+        uint32_t const array_length = HAVE_LIBFABRIC_LOCALITY_SIZE / 4;
+    }    // namespace locality_defs
+
+    struct locality
+    {
+        // array type of our locality data
+        typedef std::array<uint32_t, locality_defs::array_length> locality_data;
+
+        static char const* type() { return "libfabric"; }
+
+        explicit locality(locality_data const& in_data, struct fid_av* av)
+          : data_(in_data)
+          , fi_address_(0)
+          , av_(av)
+        {    // takes a copy of the whole address array; fi_address_ starts at 0
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), to_str()));
+        }
+
+        locality()
+          : data_{}
+          , fi_address_(0)
+          , av_(nullptr)
+        {    // all-zero address bytes, fi_address_ 0 and no address vector
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("default construct"), to_str()));
+        }
+
+        locality(locality const& other)
+          : data_(other.data_)
+          , fi_address_(other.fi_address_)
+          , av_(other.av_)
+        {    // member-wise copy; av_ is copied as a plain pointer
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy construct"), to_str()));
+        }
+
+        locality(locality const& other, fi_addr_t addr, struct fid_av* av)
+          : data_(other.data_)
+          , fi_address_(addr)
+          , av_(av)
+        {    // address bytes come from `other`; fi address and address vector from the arguments
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), to_str()));
+        }
+
+        locality(locality&& other)
+          : data_(std::move(other.data_))
+          , fi_address_(other.fi_address_)
+          , av_(other.av_)
+        {    // data_ is an array of uint32_t, so the move copies it; `other` is not reset
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("move construct"), to_str()));
+        }
+
+        // provided to support sockets mode bootstrap: builds the address from an IPv4 sockaddr_in
+        explicit locality(std::string const& address, std::string const& portnum)
+        {
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct-2"), address, ":", portnum));
+            struct sockaddr_in socket_data;
+            memset(&socket_data, 0, sizeof(socket_data));
+            socket_data.sin_family = AF_INET;
+            socket_data.sin_port = htons(std::stol(portnum));
+            inet_pton(AF_INET, address.c_str(), &(socket_data.sin_addr));
+            data_.fill(0);    // data_ can be larger than sockaddr_in: never read past socket_data
+            std::memcpy(&data_[0], &socket_data,
+                sizeof(socket_data) < sizeof(data_) ? sizeof(socket_data) : sizeof(data_));
+            fi_address_ = 0;
+            av_ = nullptr;
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("string constructing"), to_str()));
+        }
+
+        locality& operator=(locality const& other)
+        {    // copies data_, fi_address_ and av_ from `other`, then traces both operands
+            data_ = other.data_;
+            fi_address_ = other.fi_address_;
+            av_ = other.av_;
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy operator"), to_str(), other.to_str()));
+            return *this;
+        }
+
+        bool operator==(locality const& other)
+        {    // NOTE(review): non-const, data_ only; the friend operator== also checks fi_address_
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("equality operator"), to_str(), other.to_str()));
+            return std::memcmp(&data_, &other.data_, locality_defs::array_size) == 0;
+        }
+
+        inline fi_addr_t const& fi_address() const { return fi_address_; }
+
+        inline void set_fi_address(fi_addr_t fi_addr) { fi_address_ = fi_addr; }
+
+        inline uint16_t port() const
+        {
+            // bytes 2 and 3 of the address, combined high byte first
+            auto const* bytes = reinterpret_cast<uint8_t const*>(data_.data());
+            return static_cast<uint16_t>((bytes[2] << 8) | bytes[3]);
+        }
+
+        inline locality_data const& fabric_data() const { return data_; }
+
+        inline char* fabric_data_writable() { return reinterpret_cast<char*>(data_.data()); }
+
+        std::string to_str() const
+        {    // text form of the address: fixed messages for no av_ / LNX, else fi_av_straddr output
+            if (!av_) { return "No address vector"; }
+            [[maybe_unused]] std::array<char, 1024> buf;
+            [[maybe_unused]] size_t buflen = buf.size();
+#ifdef HAVE_LIBFABRIC_LNX
+            // fi_av_straddr is skipped here: its result was never used for LNX
+            return "LNX does not yet support straddr";
 #else
-        throw fabric_error(0, "unsupported fabric provider, please fix ASAP");
+            char const* straddr_ret = fi_av_straddr(av_, data_.data(), buf.data(), &buflen);
+            return straddr_ret ? straddr_ret : "Address formatting Error";
 #endif
-    }
-
-    inline const fi_addr_t& fi_address() const { return fi_address_; }
-
-    inline void set_fi_address(fi_addr_t fi_addr) { fi_address_ = fi_addr; }
-
-    inline uint16_t port() const
-    {
-        uint16_t port = 256 * reinterpret_cast<const uint8_t*>(data_.data())[2] +
-                        reinterpret_cast<const uint8_t*>(data_.data())[3];
-        return port;
-    }
-
-    inline const void* fabric_data() const { return data_.data(); }
-
-    inline char* fabric_data_writable() { return reinterpret_cast<char*>(data_.data()); }
-
-  private:
-    friend bool operator==(locality const& lhs, locality const& rhs)
-    {
-        LF_DEB(loc_deb,
-            trace(NS_DEBUG::str<>("equality friend"), iplocality(lhs), iplocality(rhs)));
-        return ((lhs.data_ == rhs.data_) && (lhs.fi_address_ == rhs.fi_address_));
-    }
-
-    friend bool operator<(locality const& lhs, locality const& rhs)
-    {
-        const uint32_t&  a1 = lhs.ip_address();
-        const uint32_t&  a2 = rhs.ip_address();
-        const fi_addr_t& f1 = lhs.fi_address();
-        const fi_addr_t& f2 = rhs.fi_address();
-        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("less friend"), iplocality(lhs), iplocality(rhs)));
-        return (a1 < a2) || (a1 == a2 && f1 < f2);
-    }
-
-    friend std::ostream& operator<<(std::ostream& os, locality const& loc)
-    {
-        for (uint32_t i = 0; i < locality_defs::array_length; ++i) { os << loc.data_[i]; }
-        return os;
-    }
-
-  private:
-    locality_data data_;
-    fi_addr_t     fi_address_;
-};
-
-} // namespace libfabric
-} // namespace oomph
+        }
+
+    private:
+        friend bool operator==(locality const& lhs, locality const& rhs)
+        {    // equal when both the address bytes and fi_address_ match; av_ is not compared
+            LF_DEB(loc_deb, trace(NS_DEBUG::str<>("equality friend"), lhs.to_str(), rhs.to_str()));
+            return ((lhs.data_ == rhs.data_) && (lhs.fi_address_ == rhs.fi_address_));
+        }
+
+        friend std::ostream& operator<<(std::ostream& os, locality const& loc)
+        {    // writes every 32bit word of data_ back to back, with no separator
+            for (uint32_t const word : loc.data_) { os << word; }
+            return os;
+        }
+
+    private:
+        locality_data data_;
+        fi_addr_t fi_address_;
+        struct fid_av* av_;
+    };
+
+}}    // namespace oomph::libfabric
diff --git a/src/libfabric/memory_region.hpp b/src/libfabric/memory_region.hpp
index 0cd5c4a7..2028fc41 100644
--- a/src/libfabric/memory_region.hpp
+++ b/src/libfabric/memory_region.hpp
@@ -15,23 +15,21 @@
 #include <rdma/fi_domain.h>
 //
 #include <iostream>
-#include <memory>
 #include <utility>
 
-#include "oomph_libfabric_defines.hpp"
 #include "fabric_error.hpp"
+#include "oomph_libfabric_defines.hpp"
 
 #ifdef OOMPH_ENABLE_DEVICE
-#include <hwmalloc/device.hpp>
+# include <hwmalloc/device.hpp>
 #endif
 // ------------------------------------------------------------------
 
-namespace NS_MEMORY
-{
+namespace NS_MEMORY {
 
-static NS_DEBUG::enable_print<false> mrn_deb("REGION_");
+    static NS_DEBUG::enable_print<false> mrn_deb("REGION_");
 
-/*
+    /*
 struct fi_mr_attr {
     union {
         const struct iovec *mr_iov;
@@ -60,342 +58,352 @@ struct fi_mr_attr {
 
 */
 
-// This is the only part of the code that actually
-// calls libfabric functions
-struct region_provider
-{
-    // The internal memory region handle
-    using provider_region = struct fid_mr;
-    using provider_domain = struct fid_domain;
-
-    // register region
-    static inline int fi_register_memory(provider_domain* pd, int device_id, const void* buf,
-        size_t len, uint64_t access_flags, uint64_t offset, uint64_t request_key,
-        struct fid_mr** mr)
+    // This is the only part of the code that actually
+    // calls libfabric functions
+    struct region_provider
     {
-        [[maybe_unused]] auto scp =
-            NS_MEMORY::mrn_deb.scope(__func__, NS_DEBUG::ptr(buf), NS_DEBUG::dec<>(len), device_id);
-        //
-        struct iovec addresses = {/*.iov_base = */ const_cast<void*>(buf), /*.iov_len = */ len};
-        fi_mr_attr   attr = {
-            /*.mr_iov         = */ &addresses,
-            /*.iov_count      = */ 1,
-            /*.access         = */ access_flags,
-            /*.offset         = */ offset,
-            /*.requested_key  = */ request_key,
-            /*.context        = */ nullptr,
-            /*.auth_key_size  = */ 0,
-            /*.auth_key       = */ nullptr,
-            /*.iface          = */ FI_HMEM_SYSTEM,
-            /*.device         = */ {0},
+        // The internal memory region handle
+        using provider_region = struct fid_mr;
+        using provider_domain = struct fid_domain;
+
+        // register region
+        static inline int fi_register_memory(provider_domain* pd, int device_id, void const* buf,
+            size_t len, uint64_t access_flags, uint64_t offset, uint64_t request_key,
+            struct fid_mr** mr)
+        {
+            [[maybe_unused]] auto scp = NS_MEMORY::mrn_deb.scope(
+                __func__, NS_DEBUG::hptr(buf), NS_DEBUG::dec<>(len), device_id);
+            //
+            struct iovec addresses = {/*.iov_base = */ const_cast<void*>(buf), /*.iov_len = */ len};
+            fi_mr_attr attr = {
+                /*.mr_iov         = */ {&addresses},
+                /*.iov_count      = */ 1,
+                /*.access         = */ access_flags,
+                /*.offset         = */ offset,
+                /*.requested_key  = */ request_key,
+                /*.context        = */ nullptr,
+                /*.auth_key_size  = */ 0,
+                /*.auth_key       = */ nullptr,
+                /*.iface          = */ FI_HMEM_SYSTEM,
+                /*.device         = */ {0},
 #if (FI_MAJOR_VERSION > 1) || ((FI_MAJOR_VERSION == 1) && FI_MINOR_VERSION > 17)
-            /*.hmem_data      = */ nullptr,
+                /*.hmem_data      = */ nullptr,
 #endif
 #if (FI_MAJOR_VERSION >= 2)
-            /*page_size       = */ static_cast<size_t>(sysconf(_SC_PAGESIZE)),
-            /*base_mr         = */ nullptr,
-            /*sub_mr_cnt      = */ 0,
-        };
+                /*page_size       = */ static_cast<size_t>(sysconf(_SC_PAGESIZE)),
+                /*base_mr         = */ nullptr,
+                /*sub_mr_cnt      = */ 0,
+            };
 #else
-        };
+            };
 #endif
 
-        if (device_id >= 0)
-        {
+            if (device_id >= 0)
+            {
 #ifdef OOMPH_ENABLE_DEVICE
-            attr.device.cuda = device_id;
-            int handle = hwmalloc::get_device_id();
-            attr.device.cuda = handle;
-#if defined(OOMPH_DEVICE_CUDA)
-            attr.iface = FI_HMEM_CUDA;
-            LF_DEB(NS_MEMORY::mrn_deb,
-                trace(NS_DEBUG::str<>("CUDA"), "set device id", device_id, handle));
-#elif defined(OOMPH_DEVICE_HIP)
-            attr.iface = FI_HMEM_ROCR;
-            LF_DEB(NS_MEMORY::mrn_deb,
-                trace(NS_DEBUG::str<>("HIP"), "set device id", device_id, handle));
-#endif
+                attr.device.cuda = device_id;
+                int handle = hwmalloc::get_device_id();
+                attr.device.cuda = handle;
+# if defined(OOMPH_DEVICE_CUDA)
+                attr.iface = FI_HMEM_CUDA;
+                LF_DEB(NS_MEMORY::mrn_deb,
+                    trace(NS_DEBUG::str<>("CUDA"), "set device id", device_id, handle));
+# elif defined(OOMPH_DEVICE_HIP)
+                attr.iface = FI_HMEM_ROCR;
+                LF_DEB(NS_MEMORY::mrn_deb,
+                    trace(NS_DEBUG::str<>("HIP"), "set device id", device_id, handle));
+# endif
 #endif
+            }
+            uint64_t flags = 0;
+            int ret = fi_mr_regattr(pd, &attr, flags, mr);
+            if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "register_memory"); }
+            return ret;
         }
-        uint64_t flags = 0;
-        int      ret = fi_mr_regattr(pd, &attr, flags, mr);
-        if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "register_memory"); }
-        return ret;
-    }
 
-    // unregister region
-    static inline int unregister_memory(provider_region* region) { return fi_close(&region->fid); }
-
-    // Default registration flags for this provider
-    static inline constexpr int access_flags()
-    {
-        return FI_READ | FI_WRITE | FI_RECV | FI_SEND /*| FI_REMOTE_READ | FI_REMOTE_WRITE*/;
-    }
+        // unregister region: closes the region's fid and returns fi_close's result unchanged
+        static inline int unregister_memory(provider_region* region)
+        {
+            return fi_close(&region->fid);
+        }
 
-    // Get the local descriptor of the memory region.
-    static inline void* get_local_key(provider_region* const region) { return fi_mr_desc(region); }
+        // Default registration flags for this provider (the remote access flags are commented out)
+        static inline constexpr int access_flags()
+        {
+            return FI_READ | FI_WRITE | FI_RECV | FI_SEND /*| FI_REMOTE_READ | FI_REMOTE_WRITE*/;
+        }
 
-    // Get the remote key of the memory region.
-    static inline uint64_t get_remote_key(provider_region* const region)
-    {
-        return fi_mr_key(region);
-    }
-};
+        // Get the local descriptor of the memory region (the value fi_mr_desc returns for it).
+        static inline void* get_local_key(provider_region* const region)
+        {
+            return fi_mr_desc(region);
+        }
 
-// --------------------------------------------------------------------
-// This is a handle to a small chunk of memory that has been registered
-// as part of a much larger allocation (a memory_segment)
-struct memory_handle
-{
-    // --------------------------------------------------------------------
-    using provider_region = region_provider::provider_region;
+        // Get the remote key of the memory region (the value fi_mr_key returns for it).
+        static inline uint64_t get_remote_key(provider_region* const region)
+        {
+            return fi_mr_key(region);
+        }
+    };
 
     // --------------------------------------------------------------------
-    // Default constructor creates unusable handle(region)
-    memory_handle()
-    : address_{nullptr}
-    , region_{nullptr}
-    , size_{0}
-    , used_space_{0}
+    // This is a handle to a small chunk of memory that has been registered
+    // as part of a much larger allocation (a memory_segment)
+    struct memory_handle
     {
-    }
-    memory_handle(memory_handle const&) noexcept = default;
-    memory_handle& operator=(memory_handle const&) noexcept = default;
-
-    memory_handle(provider_region* region, unsigned char* addr,
-        std::size_t size /*, uint32_t flags*/) noexcept
-    : address_{addr}
-    , region_{region}
-    , size_{uint32_t(size)}
-    , used_space_{0}
-    {
-        //            LF_DEB(NS_MEMORY::mrn_deb,
-        //                trace(NS_DEBUG::str<>("memory_handle"), *this));
-    }
+        // --------------------------------------------------------------------
+        using provider_region = region_provider::provider_region;
+
+        // --------------------------------------------------------------------
+        // Default constructor creates unusable handle(region)
+        memory_handle()
+          : address_{nullptr}
+          , region_{nullptr}
+          , size_{0}
+          , used_space_{0}
+        {
+        }
+        memory_handle(memory_handle const&) noexcept = default;
+        memory_handle& operator=(memory_handle const&) noexcept = default;
+
+        memory_handle(provider_region* region, unsigned char* addr,
+            std::size_t size /*, uint32_t flags*/) noexcept
+          : address_{addr}
+          , region_{region}
+          , size_{uint32_t(size)}
+          , used_space_{0}
+        {
+            //            LF_DEB(NS_MEMORY::mrn_deb,
+            //                trace(NS_DEBUG::str<>("memory_handle"), *this));
+        }
 
-    // --------------------------------------------------------------------
-    // move constructor, clear other region so that it is not unregistered twice
-    memory_handle(memory_handle&& other) noexcept
-    : address_{other.address_}
-    , region_{std::exchange(other.region_, nullptr)}
-    , size_{other.size_}
-    , used_space_{other.used_space_}
-    {
-    }
+        // --------------------------------------------------------------------
+        // move constructor, clear other region so that it is not unregistered twice
+        memory_handle(memory_handle&& other) noexcept
+          : address_{other.address_}
+          , region_{std::exchange(other.region_, nullptr)}
+          , size_{other.size_}
+          , used_space_{other.used_space_}
+        {
+        }
 
-    // --------------------------------------------------------------------
-    // move assignment, clear other region so that it is not unregistered twice
-    memory_handle& operator=(memory_handle&& other) noexcept
-    {
-        address_ = other.address_;
-        region_ = std::exchange(other.region_, nullptr);
-        size_ = other.size_;
-        used_space_ = other.used_space_;
-        return *this;
-    }
+        // --------------------------------------------------------------------
+        // move assignment, clear other region so that it is not unregistered twice
+        memory_handle& operator=(memory_handle&& other) noexcept
+        {
+            address_ = other.address_;
+            region_ = std::exchange(other.region_, nullptr);
+            size_ = other.size_;
+            used_space_ = other.used_space_;
+            return *this;
+        }
 
-    // --------------------------------------------------------------------
-    // Return the address of this memory region block.
-    inline unsigned char* get_address(void) const { return address_; }
+        // --------------------------------------------------------------------
+        // Return the address of this memory region block.
+        inline unsigned char* get_address(void) const { return address_; }
 
-    // --------------------------------------------------------------------
-    // Get the local descriptor of the memory region.
-    inline void* get_local_key(void) const { return region_provider::get_local_key(region_); }
+        // --------------------------------------------------------------------
+        // Get the local descriptor of the memory region.
+        inline void* get_local_key(void) const { return region_provider::get_local_key(region_); }
 
-    // --------------------------------------------------------------------
-    // Get the remote key of the memory region.
-    inline uint64_t get_remote_key(void) const { return region_provider::get_remote_key(region_); }
+        // --------------------------------------------------------------------
+        // Get the remote key of the memory region.
+        inline uint64_t get_remote_key(void) const
+        {
+            return region_provider::get_remote_key(region_);
+        }
 
-    // --------------------------------------------------------------------
-    // Get the size of the memory chunk usable by this memory region,
-    // this may be smaller than the value returned by get_length
-    // if the region is a sub region (partial region) within another block
-    inline uint64_t get_size(void) const { return size_; }
+        // --------------------------------------------------------------------
+        // Get the size of the memory chunk usable by this memory region,
+        // this may be smaller than the value returned by get_length
+        // if the region is a sub region (partial region) within another block
+        inline uint64_t get_size(void) const { return size_; }
 
-    // --------------------------------------------------------------------
-    // Get the size used by a message in the memory region.
-    inline uint32_t get_message_length(void) const { return used_space_; }
+        // --------------------------------------------------------------------
+        // Get the size used by a message in the memory region.
+        inline uint32_t get_message_length(void) const { return used_space_; }
 
-    // --------------------------------------------------------------------
-    // Set the size used by a message in the memory region.
-    inline void set_message_length(uint32_t length) { used_space_ = length; }
+        // --------------------------------------------------------------------
+        // Set the size used by a message in the memory region.
+        inline void set_message_length(uint32_t length) { used_space_ = length; }
 
-    // --------------------------------------------------------------------
-    void release_region() noexcept { region_ = nullptr; }
+        // --------------------------------------------------------------------
+        void release_region() noexcept { region_ = nullptr; }
 
-    // --------------------------------------------------------------------
-    // return the underlying libfabric region handle
-    inline provider_region* get_region() const { return region_; }
+        // --------------------------------------------------------------------
+        // return the underlying libfabric region handle
+        inline provider_region* get_region() const { return region_; }
 
-    // --------------------------------------------------------------------
-    // Deregister the memory region.
-    // returns 0 when successful, -1 otherwise
-    int deregister(void) const
-    {
-        if (region_ /*&& !get_user_region()*/)
+        // --------------------------------------------------------------------
+        // Deregister the memory region.
+        // returns 0 when successful, -1 otherwise
+        int deregister(void) const
         {
-            LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_));
-            //
-            if (region_provider::unregister_memory(region_))
-            {
-                LF_DEB(NS_MEMORY::mrn_deb, error("fi_close mr failed"));
-                return -1;
-            }
-            else
+            if (region_ /*&& !get_user_region()*/)
             {
-                LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this));
+                LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_));
+                //
+                if (region_provider::unregister_memory(region_))
+                {
+                    LF_DEB(NS_MEMORY::mrn_deb, error("fi_close mr failed"));
+                    return -1;
+                }
+                else
+                {
+                    LF_DEB(
+                        NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this));
+                }
+                region_ = nullptr;
             }
-            region_ = nullptr;
+            return 0;
         }
-        return 0;
-    }
 
-    // --------------------------------------------------------------------
-    friend std::ostream& operator<<(std::ostream& os, memory_handle const& region)
-    {
-        (void)region;
+        // --------------------------------------------------------------------
+        friend std::ostream& operator<<(std::ostream& os, memory_handle const& region)
+        {
+            (void) region;
 #if 1 || has_debug
-        os << "region "
-           << NS_DEBUG::ptr(&region)
-           //<< " fi_region "  << NS_DEBUG::ptr(region.region_)
-           << " address " << NS_DEBUG::ptr(region.address_) << " size "
-           << NS_DEBUG::hex<6>(region.size_)
-           //<< " used_space " << NS_DEBUG::hex<6>(region.used_space_/*size_*/)
-           << " loc key "
-           << NS_DEBUG::ptr(
-                  region.region_ ? region_provider::get_local_key(region.region_) : nullptr)
-           << " rem key "
-           << NS_DEBUG::ptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0);
-        ///// clang-format off
-        ///// clang-format on
+            using namespace NS_DEBUG;
+            os << "region "
+               << hptr(&region)
+               //<< " fi_region "  << hptr(region.region_)
+               << " address " << hptr(region.address_) << " size "
+               << hex<6>(region.size_)
+               //<< " used_space " << hex<6>(region.used_space_/*size_*/)
+               << " loc key "
+               << hptr(region.region_ ? region_provider::get_local_key(region.region_) : nullptr)
+               << " rem key "
+               << hptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0);
+            ///// clang-format off
+            ///// clang-format on
 #endif
-        return os;
-    }
-
-  protected:
-    // This gives the start address of this region.
-    // This is the address that should be used for data storage
-    unsigned char* address_;
+            return os;
+        }
 
-    // The hardware level handle to the region (as returned from libfabric fi_mr_reg)
-    mutable provider_region* region_;
+    protected:
+        // This gives the start address of this region.
+        // This is the address that should be used for data storage
+        unsigned char* address_;
 
-    // The (maximum available) size of the memory buffer
-    uint32_t size_;
+        // The hardware level handle to the region (as returned from libfabric fi_mr_reg)
+        mutable provider_region* region_;
 
-    // Space used by a message in the memory region.
-    // This may be smaller/less than the size available if more space
-    // was allocated than it turns out was needed
-    mutable uint32_t used_space_;
-};
+        // The (maximum available) size of the memory buffer
+        uint32_t size_;
 
-// --------------------------------------------------------------------
-// a memory segment is a pinned block of memory that has been specialized
-// by a particular region provider. Each provider (infiniband, libfabric,
-// other) has a different definition for the object and the protection
-// domain used to limit access.
-// --------------------------------------------------------------------
-struct memory_segment : public memory_handle
-{
-    using provider_domain = region_provider::provider_domain;
-    using provider_region = region_provider::provider_region;
-    using handle_type = memory_handle;
+        // Space used by a message in the memory region.
+        // This may be smaller/less than the size available if more space
+        // was allocated than it turns out was needed
+        mutable uint32_t used_space_;
+    };
 
     // --------------------------------------------------------------------
-    memory_segment(provider_region* region, unsigned char* address, unsigned char* base_address,
-        uint64_t size)
-    : memory_handle(region, address, size)
-    , base_addr_(base_address)
-    {
-    }
-
+    // a memory segment is a pinned block of memory that has been specialized
+    // by a particular region provider. Each provider (infiniband, libfabric,
+    // other) has a different definition for the object and the protection
+    // domain used to limit access.
     // --------------------------------------------------------------------
-    // move constructor, clear other region
-    memory_segment(memory_segment&& other) noexcept
-    : memory_handle(std::move(other))
-    , base_addr_{std::exchange(other.base_addr_, nullptr)}
+    struct memory_segment : public memory_handle
     {
-    }
+        using provider_domain = region_provider::provider_domain;
+        using provider_region = region_provider::provider_region;
+        using handle_type = memory_handle;
+
+        // --------------------------------------------------------------------
+        memory_segment(provider_region* region, unsigned char* address, unsigned char* base_address,
+            uint64_t size)
+          : memory_handle(region, address, size)
+          , base_addr_(base_address)
+        {
+        }
 
-    // --------------------------------------------------------------------
-    // move assignment, clear other region
-    memory_segment& operator=(memory_segment&& other) noexcept
-    {
-        memory_handle(std::move(other));
-        region_ = std::exchange(other.region_, nullptr);
-        return *this;
-    }
+        // --------------------------------------------------------------------
+        // move constructor, clear other region
+        memory_segment(memory_segment&& other) noexcept
+          : memory_handle(std::move(other))
+          , base_addr_{std::exchange(other.base_addr_, nullptr)}
+        {
+        }
 
-    // --------------------------------------------------------------------
-    // construct a memory region object by registering an existing address buffer
-    // we do not cache local/remote keys here because memory segments are only
-    // used by the heap to store chunks and the user will always receive
-    // a memory_handle - which does have keys cached
-    memory_segment(provider_domain* pd, const void* buffer, const uint64_t length, bool bind_mr,
-        void* ep, int device_id)
-    {
-        // an rma key counter to keep some providers (CXI) happy
-        static std::atomic<std::uint64_t> key = 0;
-        //
-        address_ = static_cast<unsigned char*>(const_cast<void*>(buffer));
-        size_ = length;
-        used_space_ = length;
-        region_ = nullptr;
-        //
-        base_addr_ = memory_handle::address_;
-        LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("memory_segment"), *this, device_id));
-
-        int ret = region_provider::fi_register_memory(pd, device_id, buffer, length,
-            region_provider::access_flags(), 0, key++, &(region_));
-        if (!ret)
+        // --------------------------------------------------------------------
+        // move assignment, clear other region
+        memory_segment& operator=(memory_segment&& other) noexcept
         {
-            LF_DEB(NS_MEMORY::mrn_deb,
-                trace(NS_DEBUG::str<>("Registered region"), "device", device_id, *this));
+            memory_handle(std::move(other));
+            region_ = std::exchange(other.region_, nullptr);
+            return *this;
         }
 
-        if (bind_mr)
+        // --------------------------------------------------------------------
+        // construct a memory region object by registering an existing address buffer
+        // we do not cache local/remote keys here because memory segments are only
+        // used by the heap to store chunks and the user will always receive
+        // a memory_handle - which does have keys cached
+        memory_segment(provider_domain* pd, void const* buffer, uint64_t const length, bool bind_mr,
+            void* ep, int device_id)
         {
-            ret = fi_mr_bind(region_, (struct fid*)ep, 0);
-            if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_bind"); }
-            else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Bound region"), *this)); }
+            // an rma key counter to keep some providers (CXI) happy
+            static std::atomic<std::uint64_t> key = 0;
+            //
+            address_ = static_cast<unsigned char*>(const_cast<void*>(buffer));
+            size_ = length;
+            used_space_ = length;
+            region_ = nullptr;
+            //
+            base_addr_ = memory_handle::address_;
+            LF_DEB(NS_MEMORY::mrn_deb, trace(str<>("memory_segment"), *this, device_id));
 
-            ret = fi_mr_enable(region_);
-            if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_enable"); }
-            else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Enabled region"), *this)); }
+            int ret = region_provider::fi_register_memory(pd, device_id, buffer, length,
+                region_provider::access_flags(), 0, key++, &(region_));
+            if (!ret)
+            {
+                LF_DEB(NS_MEMORY::mrn_deb,
+                    trace(str<>("Registered region"), "device", device_id, *this));
+            }
+
+            if (bind_mr)
+            {
+                ret = fi_mr_bind(region_, (struct fid*) ep, 0);
+                if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_bind"); }
+                else { LF_DEB(NS_MEMORY::mrn_deb, trace(str<>("Bound region"), *this)); }
+
+                ret = fi_mr_enable(region_);
+                if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_enable"); }
+                else { LF_DEB(NS_MEMORY::mrn_deb, trace(str<>("Enabled region"), *this)); }
+            }
         }
-    }
 
-    // --------------------------------------------------------------------
-    // destroy the region and memory according to flag settings
-    ~memory_segment() { deregister(); }
+        // --------------------------------------------------------------------
+        // destroy the region and memory according to flag settings
+        ~memory_segment() { deregister(); }
 
-    handle_type get_handle(std::size_t offset, std::size_t size) const noexcept
-    {
-        return memory_handle(region_, base_addr_ + offset, size);
-    }
+        handle_type get_handle(std::size_t offset, std::size_t size) const noexcept
+        {
+            return memory_handle(region_, base_addr_ + offset, size);
+        }
 
-    // --------------------------------------------------------------------
-    // Get the address of the base memory region.
-    // This is the address of the memory allocated from the system
-    inline unsigned char* get_base_address(void) const { return base_addr_; }
+        // --------------------------------------------------------------------
+        // Get the address of the base memory region.
+        // This is the address of the memory allocated from the system
+        inline unsigned char* get_base_address(void) const { return base_addr_; }
 
-    // --------------------------------------------------------------------
-    friend std::ostream& operator<<(std::ostream& os, memory_segment const& region)
-    {
-        (void)region;
+        // --------------------------------------------------------------------
+        friend std::ostream& operator<<(std::ostream& os, memory_segment const& region)
+        {
+            (void) region;
 #if has_debug
-        // clang-format off
+            // clang-format off
             os << *static_cast<const memory_handle*>(&region)
-               << " base address " << NS_DEBUG::ptr(region.base_addr_);
-        // clang-format on
+               << " base address " << NS_DEBUG::hptr(region.base_addr_);
+            // clang-format on
 #endif
-        return os;
-    }
+            return os;
+        }
 
-  public:
-    // this is the base address of the memory registered by this segment
-    // individual memory_handles are offset from this address
-    unsigned char* base_addr_;
-};
+    public:
+        // this is the base address of the memory registered by this segment
+        // individual memory_handles are offset from this address
+        unsigned char* base_addr_;
+    };
 
-} // namespace NS_MEMORY
+}    // namespace NS_MEMORY
diff --git a/src/libfabric/operation_context.cpp b/src/libfabric/operation_context.cpp
index ce5081dd..0f6de97a 100644
--- a/src/libfabric/operation_context.cpp
+++ b/src/libfabric/operation_context.cpp
@@ -8,49 +8,52 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // paths relative to backend
-#include <oomph_libfabric_defines.hpp>
-#include <controller.hpp>
 #include <communicator.hpp>
 #include <context.hpp>
+#include <controller.hpp>
+#include <oomph_libfabric_defines.hpp>
 
-namespace oomph::libfabric
-{
-void
-operation_context::handle_cancelled()
-{
-    [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::ptr(this), __func__);
-    // enqueue the cancelled/callback
-    if (std::holds_alternative<detail::request_state*>(m_req))
-    {
-        // regular (non-shared) recv
-        auto s = std::get<detail::request_state*>(m_req);
-        while (!(s->m_comm->m_recv_cb_cancel.push(s))) {}
-    }
-    else if (std::holds_alternative<detail::shared_request_state*>(m_req))
+namespace oomph::libfabric {
+    void operation_context::handle_cancelled()
     {
-        // shared recv
-        auto s = std::get<detail::shared_request_state*>(m_req);
-        while (!(s->m_ctxt->m_recv_cb_cancel.push(s))) {}
+        [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::hptr(this), __func__);
+        // enqueue the cancelled/callback
+        if (std::holds_alternative<detail::request_state*>(m_req))
+        {
+            // regular (non-shared) recv
+            auto s = std::get<detail::request_state*>(m_req);
+            while (!(s->m_comm->m_recv_cb_cancel.push(s))) {}
+        }
+        else if (std::holds_alternative<detail::shared_request_state*>(m_req))
+        {
+            // shared recv
+            auto s = std::get<detail::shared_request_state*>(m_req);
+            while (!(s->m_ctxt->m_recv_cb_cancel.push(s))) {}
+        }
+        else { throw std::runtime_error("Request state invalid in handle_cancelled"); }
     }
-    else { throw std::runtime_error("Request state invalid in handle_cancelled"); }
-}
 
-int
-operation_context::handle_tagged_recv_completion_impl(void* user_data)
-{
-    [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::ptr(this), __func__);
-    if (std::holds_alternative<detail::request_state*>(m_req))
+    int operation_context::handle_tagged_recv_completion_impl(void* user_data)
     {
-        // regular (non-shared) recv
-        auto s = std::get<detail::request_state*>(m_req);
-        //if (std::this_thread::get_id() == thread_id_)
-        if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
+        [[maybe_unused]] auto scp = opctx_deb<1>.scope(NS_DEBUG::hptr(this), __func__);
+        if (std::holds_alternative<detail::request_state*>(m_req))
         {
-            if (!s->m_comm->has_reached_recursion_depth())
+            // regular (non-shared) recv
+            auto s = std::get<detail::request_state*>(m_req);
+            //if (std::this_thread::get_id() == thread_id_)
+            if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
             {
-                auto inc = s->m_comm->recursion();
-                auto ptr = s->release_self_ref();
-                s->invoke_cb();
+                if (!s->m_comm->has_reached_recursion_depth())
+                {
+                    auto inc = s->m_comm->recursion();
+                    auto ptr = s->release_self_ref();
+                    s->invoke_cb();
+                }
+                else
+                {
+                    // enqueue the callback
+                    while (!(s->m_comm->m_recv_cb_queue.push(s))) {}
+                }
             }
             else
             {
@@ -58,82 +61,76 @@ operation_context::handle_tagged_recv_completion_impl(void* user_data)
                 while (!(s->m_comm->m_recv_cb_queue.push(s))) {}
             }
         }
-        else
-        {
-            // enqueue the callback
-            while (!(s->m_comm->m_recv_cb_queue.push(s))) {}
-        }
-    }
-    else if (std::holds_alternative<detail::shared_request_state*>(m_req))
-    {
-        // shared recv
-        auto s = std::get<detail::shared_request_state*>(m_req);
-        if (!s->m_comm->m_context->has_reached_recursion_depth())
+        else if (std::holds_alternative<detail::shared_request_state*>(m_req))
         {
-            auto inc = s->m_comm->m_context->recursion();
-            auto ptr = s->release_self_ref();
-            s->invoke_cb();
-        }
-        else
-        {
-            // enqueue the callback
-            while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {}
-        }
-    }
-    else
-    {
-        detail::request_state** req = reinterpret_cast<detail::request_state**>(&m_req);
-        LF_DEB(NS_MEMORY::opctx_deb<9>,
-            error(NS_DEBUG::str<>("invalid request_state"), this, "request", NS_DEBUG::ptr(req)));
-        throw std::runtime_error("Request state invalid in handle_tagged_recv");
-    }
-    return 1;
-}
-
-int
-operation_context::handle_tagged_send_completion_impl(void* user_data)
-{
-    if (std::holds_alternative<detail::request_state*>(m_req))
-    {
-        // regular (non-shared) recv
-        auto s = std::get<detail::request_state*>(m_req);
-        if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
-        {
-            if (!s->m_comm->has_reached_recursion_depth())
+            // shared recv
+            auto s = std::get<detail::shared_request_state*>(m_req);
+            if (!s->m_comm->m_context->has_reached_recursion_depth())
             {
-                auto inc = s->m_comm->recursion();
+                auto inc = s->m_comm->m_context->recursion();
                 auto ptr = s->release_self_ref();
                 s->invoke_cb();
             }
             else
             {
                 // enqueue the callback
-                while (!(s->m_comm->m_send_cb_queue.push(s))) {}
+                while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {}
             }
         }
         else
         {
-            // enqueue the callback
-            while (!(s->m_comm->m_send_cb_queue.push(s))) {}
+            detail::request_state** req = reinterpret_cast<detail::request_state**>(&m_req);
+            LF_DEB(NS_MEMORY::opctx_deb<9>,
+                error(
+                    str<>("invalid request_state"), this, "request", hptr(req)));
+            throw std::runtime_error("Request state invalid in handle_tagged_recv");
         }
+        return 1;
     }
-    else if (std::holds_alternative<detail::shared_request_state*>(m_req))
+
+    int operation_context::handle_tagged_send_completion_impl(void* user_data)
     {
-        // shared recv
-        auto s = std::get<detail::shared_request_state*>(m_req);
-        if (!s->m_comm->m_context->has_reached_recursion_depth())
+        if (std::holds_alternative<detail::request_state*>(m_req))
         {
-            auto inc = s->m_comm->m_context->recursion();
-            auto ptr = s->release_self_ref();
-            s->invoke_cb();
+            // regular (non-shared) send
+            auto s = std::get<detail::request_state*>(m_req);
+            if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
+            {
+                if (!s->m_comm->has_reached_recursion_depth())
+                {
+                    auto inc = s->m_comm->recursion();
+                    auto ptr = s->release_self_ref();
+                    s->invoke_cb();
+                }
+                else
+                {
+                    // enqueue the callback
+                    while (!(s->m_comm->m_send_cb_queue.push(s))) {}
+                }
+            }
+            else
+            {
+                // enqueue the callback
+                while (!(s->m_comm->m_send_cb_queue.push(s))) {}
+            }
         }
-        else
+        else if (std::holds_alternative<detail::shared_request_state*>(m_req))
         {
-            // enqueue the callback
-            while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {}
+            // shared recv
+            auto s = std::get<detail::shared_request_state*>(m_req);
+            if (!s->m_comm->m_context->has_reached_recursion_depth())
+            {
+                auto inc = s->m_comm->m_context->recursion();
+                auto ptr = s->release_self_ref();
+                s->invoke_cb();
+            }
+            else
+            {
+                // enqueue the callback
+                while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {}
+            }
         }
+        else { throw std::runtime_error("Request state invalid in handle_tagged_send"); }
+        return 1;
     }
-    else { throw std::runtime_error("Request state invalid in handle_tagged_send"); }
-    return 1;
-}
-} // namespace oomph::libfabric
+}    // namespace oomph::libfabric
diff --git a/src/libfabric/operation_context.hpp b/src/libfabric/operation_context.hpp
index ad106e6a..faed3d70 100644
--- a/src/libfabric/operation_context.hpp
+++ b/src/libfabric/operation_context.hpp
@@ -15,39 +15,38 @@
 //
 #include "operation_context_base.hpp"
 //
-namespace oomph::libfabric
-{
-
-template<int Level>
-inline /*constexpr*/ NS_DEBUG::print_threshold<Level, 0> opctx_deb("OP__CXT");
-
-// This struct holds the ready state of a future
-// we must also store the context used in libfabric, in case
-// a request is cancelled - fi_cancel(...) needs it
-struct operation_context : public operation_context_base<operation_context>
-{
-    std::variant<oomph::detail::request_state*, oomph::detail::shared_request_state*> m_req;
-
-    template<typename RequestState>
-    operation_context(RequestState* req)
-    : operation_context_base()
-    , m_req{req}
-    {
-        [[maybe_unused]] auto scp =
-            opctx_deb<9>.scope(NS_DEBUG::ptr(this), __func__, "request", req);
-    }
-
-    // --------------------------------------------------------------------
-    // When a completion returns FI_ECANCELED, this is called
-    void handle_cancelled();
+namespace oomph::libfabric {
 
-    // --------------------------------------------------------------------
-    // Called when a tagged recv completes
-    int handle_tagged_recv_completion_impl(void* user_data);
+    template <int Level>
+    inline NS_DEBUG::print_threshold<Level, 0> opctx_deb("OP__CXT");
 
-    // --------------------------------------------------------------------
-    // Called when a tagged send completes
-    int handle_tagged_send_completion_impl(void* user_data);
-};
-
-} // namespace oomph::libfabric
+    // This struct holds the ready state of a future
+    // we must also store the context used in libfabric, in case
+    // a request is cancelled - fi_cancel(...) needs it
+    struct operation_context : public operation_context_base<operation_context>
+    {
+        std::variant<oomph::detail::request_state*, oomph::detail::shared_request_state*> m_req;
+
+        template <typename RequestState>
+        operation_context(RequestState* req)
+          : operation_context_base()
+          , m_req{req}
+        {
+            [[maybe_unused]] auto scp =
+                opctx_deb<9>.scope(NS_DEBUG::hptr(this), __func__, "request", req);
+        }
+
+        // --------------------------------------------------------------------
+        // When a completion returns FI_ECANCELED, this is called
+        void handle_cancelled();
+
+        // --------------------------------------------------------------------
+        // Called when a tagged recv completes
+        int handle_tagged_recv_completion_impl(void* user_data);
+
+        // --------------------------------------------------------------------
+        // Called when a tagged send completes
+        int handle_tagged_send_completion_impl(void* user_data);
+    };
+
+}    // namespace oomph::libfabric
diff --git a/src/libfabric/operation_context_base.hpp b/src/libfabric/operation_context_base.hpp
index e5156f99..462c79b5 100644
--- a/src/libfabric/operation_context_base.hpp
+++ b/src/libfabric/operation_context_base.hpp
@@ -12,85 +12,84 @@
 #include <rdma/fi_eq.h>
 #include "oomph_libfabric_defines.hpp"
 
-namespace NS_LIBFABRIC
-{
+namespace NS_LIBFABRIC {
 
-class controller;
+    class controller;
 
-static NS_DEBUG::enable_print<false> ctx_bas("CTXBASE");
+    static NS_DEBUG::enable_print<false> ctx_bas("CTXBASE");
 
-// This struct holds the ready state of a future
-// we must also store the context used in libfabric, in case
-// a request is cancelled - fi_cancel(...) needs it
-template<typename Derived>
-struct operation_context_base
-{
-  private:
-    // libfabric requires some space for it's internal bookkeeping
-    // so the first member of this struct must be fi_context
-    fi_context context_reserved_space;
-
-  public:
-    operation_context_base()
-    : context_reserved_space()
+    // This struct holds the ready state of a future
+    // we must also store the context used in libfabric, in case
+    // a request is cancelled - fi_cancel(...) needs it
+    template <typename Derived>
+    struct operation_context_base
     {
-        [[maybe_unused]] auto scp = ctx_bas.scope(NS_DEBUG::ptr(this), __func__);
-    }
+    private:
+        // libfabric requires some space for its internal bookkeeping
+        // so the first member of this struct must be fi_context
+        fi_context context_reserved_space;
 
-    // error
-    void handle_error(struct fi_cq_err_entry& err)
-    {
-        static_cast<Derived*>(this)->handle_error_impl(err);
-    }
-    void handle_error_impl(struct fi_cq_err_entry& /*err*/) { std::terminate(); }
+    public:
+        operation_context_base()
+          : context_reserved_space()
+        {
+            [[maybe_unused]] auto scp = ctx_bas.scope(NS_DEBUG::hptr(this), __func__);
+        }
 
-    void handle_cancelled() { static_cast<Derived*>(this)->handle_cancelled_impl(); }
-    void handle_cancelled_impl() { std::terminate(); }
+        // error
+        void handle_error(struct fi_cq_err_entry& err)
+        {
+            static_cast<Derived*>(this)->handle_error_impl(err);
+        }
+        void handle_error_impl(struct fi_cq_err_entry& /*err*/) { std::terminate(); }
 
-    // send
-    int handle_send_completion()
-    {
-        return static_cast<Derived*>(this)->handle_send_completion_impl();
-    }
-    int handle_send_completion_impl() { return 0; }
+        void handle_cancelled() { static_cast<Derived*>(this)->handle_cancelled_impl(); }
+        void handle_cancelled_impl() { std::terminate(); }
 
-    // tagged send
-    int handle_tagged_send_completion(void* user_data)
-    {
-        return static_cast<Derived*>(this)->handle_tagged_send_completion_impl(user_data);
-    }
-    int handle_tagged_send_completion_impl(void* /*user_data*/) { return 0; }
+        // send
+        int handle_send_completion()
+        {
+            return static_cast<Derived*>(this)->handle_send_completion_impl();
+        }
+        int handle_send_completion_impl() { return 0; }
 
-    // recv
-    int handle_recv_completion(std::uint64_t len)
-    {
-        return static_cast<Derived*>(this)->handle_recv_completion_impl(len);
-    }
-    int handle_recv_completion_impl(std::uint64_t /*len*/) { return 0; }
+        // tagged send
+        int handle_tagged_send_completion(void* user_data)
+        {
+            return static_cast<Derived*>(this)->handle_tagged_send_completion_impl(user_data);
+        }
+        int handle_tagged_send_completion_impl(void* /*user_data*/) { return 0; }
 
-    // tagged recv
-    int handle_tagged_recv_completion(void* user_data)
-    {
-        return static_cast<Derived*>(this)->handle_tagged_recv_completion_impl(user_data);
-    }
-    int handle_tagged_recv_completion_impl(bool /*threadlocal*/) { return 0; }
+        // recv
+        int handle_recv_completion(std::uint64_t len)
+        {
+            return static_cast<Derived*>(this)->handle_recv_completion_impl(len);
+        }
+        int handle_recv_completion_impl(std::uint64_t /*len*/) { return 0; }
 
-    void handle_rma_read_completion()
-    {
-        static_cast<Derived*>(this)->handle_rma_read_completion_impl();
-    }
-    void handle_rma_read_completion_impl() {}
+        // tagged recv
+        int handle_tagged_recv_completion(void* user_data)
+        {
+            return static_cast<Derived*>(this)->handle_tagged_recv_completion_impl(user_data);
+        }
+        int handle_tagged_recv_completion_impl(bool /*threadlocal*/) { return 0; }
 
-    // unknown sender = new connection
-    int handle_new_connection(controller* ctrl, std::uint64_t len)
-    {
-        return static_cast<Derived*>(this)->handle_new_connection_impl(ctrl, len);
-    }
-    int handle_new_connection_impl(controller*, std::uint64_t) { return 0; }
-};
+        void handle_rma_read_completion()
+        {
+            static_cast<Derived*>(this)->handle_rma_read_completion_impl();
+        }
+        void handle_rma_read_completion_impl() {}
 
-// provided so that a pointer can be cast to this and the operation_context_type queried
-struct unspecialized_context : public operation_context_base<unspecialized_context>
-{
-};
-} // namespace NS_LIBFABRIC
+        // unknown sender = new connection
+        int handle_new_connection(controller* ctrl, std::uint64_t len)
+        {
+            return static_cast<Derived*>(this)->handle_new_connection_impl(ctrl, len);
+        }
+        int handle_new_connection_impl(controller*, std::uint64_t) { return 0; }
+    };
+
+    // provided so that a pointer can be cast to this and the operation_context_type queried
+    struct unspecialized_context : public operation_context_base<unspecialized_context>
+    {
+    };
+}    // namespace NS_LIBFABRIC
diff --git a/src/libfabric/print.hpp b/src/libfabric/print.hpp
index cf8de408..04364b98 100644
--- a/src/libfabric/print.hpp
+++ b/src/libfabric/print.hpp
@@ -27,12 +27,12 @@
 #include <vector>
 //
 #if defined(__linux) || defined(linux) || defined(__linux__)
-#include <sys/mman.h>
-#include <unistd.h>
+# include <sys/mman.h>
+# include <unistd.h>
 #elif defined(__APPLE__)
-#include <crt_externs.h>
-#include <unistd.h>
-#define environ (*_NSGetEnviron())
+# include <crt_externs.h>
+# include <unistd.h>
+# define environ (*_NSGetEnviron())
 #else
 extern char** environ;
 #endif
@@ -73,670 +73,633 @@ extern char** environ;
 // ------------------------------------------------------------
 
 #define NS_DEBUG oomph::debug
-#define LF_DEB(printer, Expr)                                                                      \
-    if constexpr (printer.is_enabled()) { printer.Expr; };
+#ifndef LF_DEB
+# define LF_DEB(printer, Expr)                                                                     \
+     {                                                                                             \
+         using namespace NS_DEBUG;                                                                 \
+         if constexpr (printer.is_enabled()) { printer.Expr; };                                    \
+     }
+#endif
 
 // ------------------------------------------------------------
 /// \cond NODETAIL
-namespace NS_DEBUG
-{
-
-// ------------------------------------------------------------------
-// format as zero padded int
-// ------------------------------------------------------------------
-namespace detail
-{
-
-template<int N, typename T>
-struct dec
-{
-    constexpr dec(T const& v)
-    : data_(v)
-    {
-    }
+namespace NS_DEBUG {
 
-    T const& data_;
+    // ------------------------------------------------------------------
+    // format as zero padded int
+    // ------------------------------------------------------------------
+    namespace detail {
 
-    friend std::ostream& operator<<(std::ostream& os, dec<N, T> const& d)
-    {
-        os << std::right << std::setfill('0') << std::setw(N) << std::noshowbase << std::dec
-           << d.data_;
-        return os;
-    }
-};
-} // namespace detail
-
-template<int N = 2, typename T>
-constexpr detail::dec<N, T>
-dec(T const& v)
-{
-    return detail::dec<N, T>(v);
-}
-
-// ------------------------------------------------------------------
-// format as pointer
-// ------------------------------------------------------------------
-struct ptr
-{
-    ptr(void const* v)
-    : data_(v)
-    {
-    }
-    ptr(std::uintptr_t const v)
-    : data_(reinterpret_cast<void const*>(v))
-    {
-    }
-    void const*          data_;
-    friend std::ostream& operator<<(std::ostream& os, ptr const& d)
-    {
-        os << std::right << "0x" << std::setfill('0') << std::setw(12) << std::noshowbase
-           << std::hex << reinterpret_cast<uintptr_t>(d.data_);
-        return os;
-    }
-};
-
-// ------------------------------------------------------------------
-// format as zero padded hex
-// ------------------------------------------------------------------
-namespace detail
-{
-
-template<int N = 4, typename T = int, typename Enable = void>
-struct hex;
-
-template<int N, typename T>
-struct hex<N, T, typename std::enable_if<!std::is_pointer<T>::value>::type>
-{
-    constexpr hex(T const& v)
-    : data_(v)
-    {
-    }
-    T const&             data_;
-    friend std::ostream& operator<<(std::ostream& os, const hex<N, T>& d)
-    {
-        os << std::right << "0x" << std::setfill('0') << std::setw(N) << std::noshowbase << std::hex
-           << d.data_;
-        return os;
-    }
-};
+        template <int N, typename T>
+        struct dec
+        {
+            constexpr dec(T const& v)
+              : data_(v)
+            {
+            }
 
-template<int N, typename T>
-struct hex<N, T, typename std::enable_if<std::is_pointer<T>::value>::type>
-{
-    constexpr hex(T const& v)
-    : data_(v)
-    {
-    }
-    T const&             data_;
-    friend std::ostream& operator<<(std::ostream& os, const hex<N, T>& d)
-    {
-        os << std::right << std::setw(N) << std::noshowbase << std::hex << d.data_;
-        return os;
-    }
-};
-} // namespace detail
-
-template<int N = 4, typename T>
-constexpr detail::hex<N, T>
-hex(T const& v)
-{
-    return detail::hex<N, T>(v);
-}
-
-// ------------------------------------------------------------------
-// format as binary bits
-// ------------------------------------------------------------------
-namespace detail
-{
-
-template<int N = 8, typename T = int>
-struct bin
-{
-    constexpr bin(T const& v)
-    : data_(v)
-    {
-    }
-    T const&             data_;
-    friend std::ostream& operator<<(std::ostream& os, const bin<N, T>& d)
+            T const& data_;
+
+            friend std::ostream& operator<<(std::ostream& os, dec<N, T> const& d)
+            {
+                os << std::right << std::setfill('0') << std::setw(N) << std::noshowbase << std::dec
+                   << d.data_;
+                return os;
+            }
+        };
+    }    // namespace detail
+
+    template <int N = 2, typename T>
+    constexpr detail::dec<N, T> dec(T const& v)
     {
-        os << std::bitset<N>(d.data_);
-        return os;
+        return detail::dec<N, T>(v);
     }
-};
-} // namespace detail
-
-template<int N = 8, typename T>
-constexpr detail::bin<N, T>
-bin(T const& v)
-{
-    return detail::bin<N, T>(v);
-}
-
-// ------------------------------------------------------------------
-// format as padded string
-// ------------------------------------------------------------------
-template<int N = 20>
-struct str
-{
-    constexpr str(char const* v)
-    : data_(v)
+
+    // ------------------------------------------------------------------
+    // format as pointer
+    // ------------------------------------------------------------------
+    struct hptr
     {
-    }
+        hptr(void const* v)
+          : data_(v)
+        {
+        }
+        hptr(std::uintptr_t const v)
+          : data_(reinterpret_cast<void const*>(v))
+        {
+        }
+        void const* data_;
+        friend std::ostream& operator<<(std::ostream& os, hptr const& d)
+        {
+            os << std::right << "0x" << std::setfill('0') << std::setw(12) << std::noshowbase
+               << std::hex << reinterpret_cast<uintptr_t>(d.data_);
+            return os;
+        }
+    };
 
-    char const* data_;
+    // ------------------------------------------------------------------
+    // format as zero padded hex
+    // ------------------------------------------------------------------
+    namespace detail {
 
-    friend std::ostream& operator<<(std::ostream& os, str<N> const& d)
-    {
-        os << std::left << std::setfill(' ') << std::setw(N) << d.data_;
-        return os;
-    }
-};
-
-// ------------------------------------------------------------------
-// format as ip address
-// ------------------------------------------------------------------
-struct ipaddr
-{
-    ipaddr(const void* a)
-    : data_(reinterpret_cast<const uint8_t*>(a))
-    , ipdata_(0)
+        template <int N = 4, typename T = int, typename Enable = void>
+        struct hex;
+
+        template <int N, typename T>
+        struct hex<N, T, typename std::enable_if<!std::is_pointer<T>::value>::type>
+        {
+            constexpr hex(T const& v)
+              : data_(v)
+            {
+            }
+            T const& data_;
+            friend std::ostream& operator<<(std::ostream& os, hex<N, T> const& d)
+            {
+                os << std::right << "0x" << std::setfill('0') << std::setw(N) << std::noshowbase
+                   << std::hex << d.data_;
+                return os;
+            }
+        };
+
+        template <int N, typename T>
+        struct hex<N, T, typename std::enable_if<std::is_pointer<T>::value>::type>
+        {
+            constexpr hex(T const& v)
+              : data_(v)
+            {
+            }
+            T const& data_;
+            friend std::ostream& operator<<(std::ostream& os, hex<N, T> const& d)
+            {
+                os << std::right << std::setw(N) << std::noshowbase << std::hex << d.data_;
+                return os;
+            }
+        };
+    }    // namespace detail
+
+    template <int N = 4, typename T>
+    constexpr detail::hex<N, T> hex(T const& v)
     {
+        return detail::hex<N, T>(v);
     }
-    ipaddr(const uint32_t a)
-    : data_(reinterpret_cast<const uint8_t*>(&ipdata_))
-    , ipdata_(a)
+
+    // ------------------------------------------------------------------
+    // format as binary bits
+    // ------------------------------------------------------------------
+    namespace detail {
+
+        template <int N = 8, typename T = int>
+        struct bin
+        {
+            constexpr bin(T const& v)
+              : data_(v)
+            {
+            }
+            T const& data_;
+            friend std::ostream& operator<<(std::ostream& os, bin<N, T> const& d)
+            {
+                os << std::bitset<N>(d.data_);
+                return os;
+            }
+        };
+    }    // namespace detail
+
+    template <int N = 8, typename T>
+    constexpr detail::bin<N, T> bin(T const& v)
     {
+        return detail::bin<N, T>(v);
     }
-    const uint8_t* data_;
-    const uint32_t ipdata_;
 
-    friend std::ostream& operator<<(std::ostream& os, ipaddr const& p)
+    // ------------------------------------------------------------------
+    // format as padded string
+    // ------------------------------------------------------------------
+    template <int N = 20>
+    struct str
     {
-        os << std::dec << int(p.data_[0]) << "." << int(p.data_[1]) << "." << int(p.data_[2]) << "."
-           << int(p.data_[3]);
-        return os;
-    }
-};
-
-// ------------------------------------------------------------------
-// helper fuction for printing CRC32
-// ------------------------------------------------------------------
-inline uint32_t
-crc32(const void* address, size_t length)
-{
-    boost::crc_32_type result;
-    result.process_bytes(address, length);
-    return result.checksum();
-}
-
-// ------------------------------------------------------------------
-// helper fuction for printing short memory dump and crc32
-// useful for debugging corruptions in buffers during
-// rma or other transfers
-// ------------------------------------------------------------------
-struct mem_crc32
-{
-    mem_crc32(const void* a, std::size_t len, const char* txt)
-    : addr_(reinterpret_cast<const std::uint8_t*>(a))
-    , len_(len)
-    , txt_(txt)
+        constexpr str(char const* v)
+          : data_(v)
+        {
+        }
+
+        char const* data_;
+
+        friend std::ostream& operator<<(std::ostream& os, str<N> const& d)
+        {
+            os << std::left << std::setfill(' ') << std::setw(N) << d.data_;
+            return os;
+        }
+    };
+
+    // ------------------------------------------------------------------
+    // helper function for printing CRC32
+    // ------------------------------------------------------------------
+    inline uint32_t crc32(void const* address, size_t length)
     {
+        boost::crc_32_type result;
+        result.process_bytes(address, length);
+        return result.checksum();
     }
-    const std::uint8_t*  addr_;
-    const std::size_t    len_;
-    const char*          txt_;
-    friend std::ostream& operator<<(std::ostream& os, mem_crc32 const& p)
+
+    // ------------------------------------------------------------------
+    // helper function for printing short memory dump and crc32
+    // useful for debugging corruptions in buffers during
+    // rma or other transfers
+    // ------------------------------------------------------------------
+    struct mem_crc32
     {
-        const std::uint8_t* byte = static_cast<const std::uint8_t*>(p.addr_);
-        os << "Memory:";
-        os << " address " << ptr(p.addr_) << " length " << hex<6, std::size_t>(p.len_)
-           << " CRC32:" << hex<8, std::size_t>(crc32(p.addr_, p.len_)) << "\n";
-        size_t i = 0;
-        while (i < std::min(size_t(128), p.len_))
-        {
-            os << "0x";
-            for (int j = 7; j >= 0; j--)
+        mem_crc32(void const* a, std::size_t len, char const* txt)
+          : addr_(reinterpret_cast<std::uint8_t const*>(a))
+          , len_(len)
+          , txt_(txt)
+        {
+        }
+        std::uint8_t const* addr_;
+        std::size_t const len_;
+        char const* txt_;
+        friend std::ostream& operator<<(std::ostream& os, mem_crc32 const& p)
+        {
+            using namespace NS_DEBUG;
+            std::uint8_t const* byte = static_cast<std::uint8_t const*>(p.addr_);
+            os << "Memory:";
+            os << " address " << hptr(p.addr_) << " length " << hex<6, std::size_t>(p.len_)
+               << " CRC32:" << hex<8, std::size_t>(crc32(p.addr_, p.len_)) << "\n";
+            size_t i = 0;
+            while (i < std::min(size_t(128), p.len_))
             {
-                os << std::hex << std::setfill('0') << std::setw(2)
-                   << (((i + j) > p.len_) ? (int)0 : (int)byte[i + j]);
+                os << "0x";
+                for (int j = 7; j >= 0; j--)    // 8 bytes per word, highest offset first
+                {
+                    os << std::hex << std::setfill('0') << std::setw(2)
+                       << (((i + j) >= p.len_) ? (int) 0 : (int) byte[i + j]);    // pad past end with 00
+                }
+                i += 8;
+                if (i % 32 == 0)
+                    os << std::endl;
+                else
+                    os << " ";
             }
-            i += 8;
-            if (i % 32 == 0) os << std::endl;
-            else
-                os << " ";
+            os << ": " << p.txt_;
+            return os;
         }
-        os << ": " << p.txt_;
-        return os;
-    }
-};
-
-namespace detail
-{
-
-template<typename TupleType, std::size_t... I>
-void
-tuple_print(std::ostream& os, TupleType const& t, std::index_sequence<I...>)
-{
-    (..., (os << (I == 0 ? "" : " ") << std::get<I>(t)));
-}
-
-template<typename... Args>
-void
-tuple_print(std::ostream& os, const std::tuple<Args...>& t)
-{
-    tuple_print(os, t, std::make_index_sequence<sizeof...(Args)>());
-}
-} // namespace detail
-
-namespace detail
-{
-
-// ------------------------------------------------------------------
-// helper class for printing thread ID
-// ------------------------------------------------------------------
-struct current_thread_print_helper
-{
-};
-
-inline std::ostream&
-operator<<(std::ostream& os, current_thread_print_helper const&)
-{
-    os << hex<12, std::thread::id>(std::this_thread::get_id())
+    };
+
+    namespace detail {
+
+        template <typename TupleType, std::size_t... I>
+        void tuple_print(std::ostream& os, TupleType const& t, std::index_sequence<I...>)
+        {
+            (..., (os << (I == 0 ? "" : " ") << std::get<I>(t)));
+        }
+
+        template <typename... Args>
+        void tuple_print(std::ostream& os, std::tuple<Args...> const& t)
+        {
+            tuple_print(os, t, std::make_index_sequence<sizeof...(Args)>());
+        }
+    }    // namespace detail
+
+    namespace detail {
+
+        // ------------------------------------------------------------------
+        // helper class for printing thread ID
+        // ------------------------------------------------------------------
+        struct current_thread_print_helper
+        {
+        };
+
+        inline std::ostream& operator<<(std::ostream& os, current_thread_print_helper const&)
+        {
+            os << hex<12, std::thread::id>(std::this_thread::get_id())
 #ifdef DEBUGGING_PRINT_LINUX
-       << " cpu " << debug::dec<3, int>(sched_getcpu()) << " ";
+               << " cpu " << debug::dec<3, int>(sched_getcpu()) << " ";
 #else
-       << " cpu "
-       << "--- ";
+               << " cpu "
+               << "--- ";
 #endif
-    return os;
-}
-
-// ------------------------------------------------------------------
-// helper class for printing time since start
-// ------------------------------------------------------------------
-struct hostname_print_helper
-{
-    const char* get_hostname() const
-    {
-        static bool initialized = false;
-        static char hostname_[20];
-        if (!initialized)
-        {
-            initialized = true;
-            gethostname(hostname_, std::size_t(12));
-            std::string temp = "(" + std::to_string(guess_rank()) + ")";
-            std::strcat(hostname_, temp.c_str());
+            return os;
         }
-        return hostname_;
-    }
 
-    int guess_rank() const
-    {
-        std::vector<std::string> env_strings{"_RANK=", "_NODEID="};
-        for (char** current = environ; *current; current++)
+        // ------------------------------------------------------------------
+        // helper class for printing the hostname (with a guessed rank appended)
+        // ------------------------------------------------------------------
+        struct hostname_print_helper
         {
-            auto e = std::string(*current);
-            for (auto s : env_strings)
+            char const* get_hostname() const
             {
-                auto pos = e.find(s);
-                if (pos != std::string::npos)
+                static bool initialized = false;
+                static char hostname_[20];
+                if (!initialized)
                 {
-                    //std::cout << "Got a rank string : " << e << std::endl;
-                    return std::stoi(e.substr(pos + s.size(), 5));
+                    initialized = true;
+                    gethostname(hostname_, std::size_t(12));
+                    std::string temp = "(" + std::to_string(guess_rank()) + ")";
+                    std::strcat(hostname_, temp.c_str());
                 }
+                return hostname_;
             }
+
+            int guess_rank() const
+            {
+                // scan the environment for "..._RANK=<n>" / "..._NODEID=<n>"; -1 when none parses
+                std::vector<std::string> env_strings{"_RANK=", "_NODEID="};
+                for (char** current = environ; *current; current++)
+                {
+                    auto e = std::string(*current);
+                    for (auto const& s : env_strings)
+                    {
+                        auto pos = e.find(s);
+                        if (pos == std::string::npos) continue;
+                        // a non-numeric value must not throw out of a logging helper
+                        try { return std::stoi(e.substr(pos + s.size(), 5)); }
+                        catch (std::exception const&) {}
+                    }
+                }
+                return -1;
+            }
+        };
+
+        inline std::ostream& operator<<(std::ostream& os, hostname_print_helper const& h)
+        {
+            os << debug::str<13>(h.get_hostname()) << " ";
+            return os;
         }
-        return -1;
-    }
-};
-
-inline std::ostream&
-operator<<(std::ostream& os, hostname_print_helper const& h)
-{
-    os << debug::str<13>(h.get_hostname()) << " ";
-    return os;
-}
-
-// ------------------------------------------------------------------
-// helper class for printing time since start
-// ------------------------------------------------------------------
-struct current_time_print_helper
-{
-};
-
-inline std::ostream&
-operator<<(std::ostream& os, current_time_print_helper const&)
-{
-    using namespace std::chrono;
-    static steady_clock::time_point log_t_start = steady_clock::now();
-    //
-    auto now = steady_clock::now();
-    auto nowt = duration_cast<microseconds>(now - log_t_start).count();
-    //
-    os << debug::dec<10>(nowt) << " ";
-    return os;
-}
-
-template<typename... Args>
-void
-display(char const* prefix, Args const&... args)
-{
-    // using a temp stream object with a single copy to cout at the end
-    // prevents multiple threads from injecting overlapping text
-    std::stringstream tempstream;
-    tempstream << prefix << detail::current_time_print_helper()
-               << detail::current_thread_print_helper() << detail::hostname_print_helper();
-    ((tempstream << args << " "), ...);
-    tempstream << "\n";
-    std::cout << tempstream.str() << std::flush;
-}
-
-template<typename... Args>
-void
-debug(Args const&... args)
-{
-    display("<DEB> ", args...);
-}
-
-template<typename... Args>
-void
-warning(Args const&... args)
-{
-    display("<WAR> ", args...);
-}
-
-template<typename... Args>
-void
-error(Args const&... args)
-{
-    display("<ERR> ", args...);
-}
-
-template<typename... Args>
-void
-scope(Args const&... args)
-{
-    display("<SCO> ", args...);
-}
-
-template<typename... Args>
-void
-trace(Args const&... args)
-{
-    display("<TRC> ", args...);
-}
-
-template<typename... Args>
-void
-timed(Args const&... args)
-{
-    display("<TIM> ", args...);
-}
-} // namespace detail
-
-template<typename... Args>
-struct scoped_var
-{
-    // capture tuple elements by reference - no temp vars in constructor please
-    char const*                      prefix_;
-    std::tuple<Args const&...> const message_;
-    std::string                      buffered_msg;
-
-    //
-    scoped_var(char const* p, Args const&... args)
-    : prefix_(p)
-    , message_(args...)
-    {
-        std::stringstream tempstream;
-        detail::tuple_print(tempstream, message_);
-        buffered_msg = tempstream.str();
-        detail::display("<SCO> ", prefix_, debug::str<>(">> enter <<"), tempstream.str());
-    }
 
-    ~scoped_var() { detail::display("<SCO> ", prefix_, debug::str<>("<< leave >>"), buffered_msg); }
-};
-
-template<typename... Args>
-struct timed_var
-{
-    mutable std::chrono::steady_clock::time_point time_start_;
-    double const                                  delay_;
-    std::tuple<Args...> const                     message_;
-    //
-    timed_var(double const& delay, Args const&... args)
-    : time_start_(std::chrono::steady_clock::now())
-    , delay_(delay)
-    , message_(args...)
-    {
-    }
+        // ------------------------------------------------------------------
+        // helper class for printing time since start
+        // ------------------------------------------------------------------
+        struct current_time_print_helper
+        {
+        };
 
-    bool elapsed(std::chrono::steady_clock::time_point const& now) const
-    {
-        double elapsed_ =
-            std::chrono::duration_cast<std::chrono::duration<double>>(now - time_start_).count();
+        inline std::ostream& operator<<(std::ostream& os, current_time_print_helper const&)
+        {
+            using namespace std::chrono;
+            static steady_clock::time_point log_t_start = steady_clock::now();
+            //
+            auto now = steady_clock::now();
+            auto nowt = duration_cast<microseconds>(now - log_t_start).count();
+            //
+            os << debug::dec<10>(nowt) << " ";
+            return os;
+        }
 
-        if (elapsed_ > delay_)
+        template <typename... Args>
+        void display(char const* prefix, Args const&... args)
         {
-            time_start_ = now;
-            return true;
+            // using a temp stream object with a single copy to cout at the end
+            // prevents multiple threads from injecting overlapping text
+            std::stringstream tempstream;
+            tempstream << prefix << detail::current_time_print_helper()
+                       << detail::current_thread_print_helper() << detail::hostname_print_helper();
+            ((tempstream << args << " "), ...);
+            tempstream << "\n";
+            std::cout << tempstream.str() << std::flush;
         }
-        return false;
-    }
 
-    friend std::ostream& operator<<(std::ostream& os, timed_var<Args...> const& ti)
-    {
-        detail::tuple_print(os, ti.message_);
-        return os;
-    }
-};
+        template <typename... Args>
+        void debug(Args const&... args)
+        {
+            display("<DEB> ", args...);
+        }
 
-///////////////////////////////////////////////////////////////////////////
-template<bool enable>
-struct enable_print;
+        template <typename... Args>
+        void warning(Args const&... args)
+        {
+            display("<WAR> ", args...);
+        }
 
-// when false, debug statements should produce no code
-template<>
-struct enable_print<false>
-{
-    constexpr enable_print(const char*) {}
+        template <typename... Args>
+        void error(Args const&... args)
+        {
+            display("<ERR> ", args...);
+        }
 
-    constexpr bool is_enabled() const { return false; }
+        template <typename... Args>
+        void scope(Args const&... args)
+        {
+            display("<SCO> ", args...);
+        }
 
-    template<typename... Args>
-    constexpr void debug(Args const&...) const
-    {
-    }
+        template <typename... Args>
+        void trace(Args const&... args)
+        {
+            display("<TRC> ", args...);
+        }
 
-    template<typename... Args>
-    constexpr void warning(Args const&...) const
-    {
-    }
+        template <typename... Args>
+        void timed(Args const&... args)
+        {
+            display("<TIM> ", args...);
+        }
+    }    // namespace detail
 
-    template<typename... Args>
-    constexpr void trace(Args const&...) const
+    template <typename... Args>
+    struct scoped_var
     {
-    }
+        // capture tuple elements by reference - no temp vars in constructor please
+        char const* prefix_;
+        std::tuple<Args const&...> const message_;
+        std::string buffered_msg;
 
-    template<typename... Args>
-    constexpr void error(Args const&...) const
-    {
-    }
+        //
+        scoped_var(char const* p, Args const&... args)
+          : prefix_(p)
+          , message_(args...)
+        {
+            std::stringstream tempstream;
+            detail::tuple_print(tempstream, message_);
+            buffered_msg = tempstream.str();
+            detail::display("<SCO> ", prefix_, debug::str<>(">> enter <<"), tempstream.str());
+        }
 
-    template<typename... Args>
-    constexpr void timed(Args const&...) const
-    {
-    }
+        ~scoped_var()
+        {
+            detail::display("<SCO> ", prefix_, debug::str<>("<< leave >>"), buffered_msg);
+        }
+    };
+
+    template <typename... Args>
+    struct timed_var
+    {
+        mutable std::chrono::steady_clock::time_point time_start_;
+        double const delay_;
+        std::tuple<Args...> const message_;
+        //
+        timed_var(double const& delay, Args const&... args)
+          : time_start_(std::chrono::steady_clock::now())
+          , delay_(delay)
+          , message_(args...)
+        {
+        }
 
-    template<typename T>
-    constexpr void array(std::string const&, std::vector<T> const&) const
-    {
-    }
+        bool elapsed(std::chrono::steady_clock::time_point const& now) const
+        {
+            double elapsed_ =
+                std::chrono::duration_cast<std::chrono::duration<double>>(now - time_start_)
+                    .count();
 
-    template<typename T, std::size_t N>
-    constexpr void array(std::string const&, std::array<T, N> const&) const
-    {
-    }
+            if (elapsed_ > delay_)
+            {
+                time_start_ = now;
+                return true;
+            }
+            return false;
+        }
 
-    template<typename Iter>
-    constexpr void array(std::string const&, Iter, Iter) const
-    {
-    }
+        friend std::ostream& operator<<(std::ostream& os, timed_var<Args...> const& ti)
+        {
+            detail::tuple_print(os, ti.message_);
+            return os;
+        }
+    };
 
-    template<typename... Args>
-    constexpr bool scope(Args const&...)
-    {
-        return true;
-    }
+    ///////////////////////////////////////////////////////////////////////////
+    template <bool enable>
+    struct enable_print;
 
-    template<typename T, typename... Args>
-    constexpr bool declare_variable(Args const&...) const
+    // when false, debug statements should produce no code
+    template <>
+    struct enable_print<false>
     {
-        return true;
-    }
+        constexpr enable_print(char const*) {}
 
-    template<typename T, typename V>
-    constexpr void set(T&, V const&)
-    {
-    }
+        constexpr bool is_enabled() const { return false; }
 
-    // @todo, return void so that timers have zero footprint when disabled
-    template<typename... Args>
-    constexpr int make_timer(const double, Args const&...) const
-    {
-        return 0;
-    }
+        template <typename... Args>
+        constexpr void debug(Args const&...) const
+        {
+        }
 
-    template<typename Expr>
-    constexpr bool eval(Expr const&)
-    {
-        return true;
-    }
-};
-
-// when true, debug statements produce valid output
-template<>
-struct enable_print<true>
-{
-  private:
-    char const* prefix_;
-
-  public:
-    constexpr enable_print()
-    : prefix_("")
-    {
-    }
+        template <typename... Args>
+        constexpr void warning(Args const&...) const
+        {
+        }
 
-    constexpr enable_print(const char* p)
-    : prefix_(p)
-    {
-    }
+        template <typename... Args>
+        constexpr void trace(Args const&...) const
+        {
+        }
 
-    constexpr bool is_enabled() const { return true; }
+        template <typename... Args>
+        constexpr void error(Args const&...) const
+        {
+        }
 
-    template<typename... Args>
-    constexpr void debug(Args const&... args) const
-    {
-        detail::debug(prefix_, args...);
-    }
+        template <typename... Args>
+        constexpr void timed(Args const&...) const
+        {
+        }
 
-    template<typename... Args>
-    constexpr void warning(Args const&... args) const
-    {
-        detail::warning(prefix_, args...);
-    }
+        template <typename T>
+        constexpr void array(std::string const&, std::vector<T> const&) const
+        {
+        }
 
-    template<typename... Args>
-    constexpr void trace(Args const&... args) const
-    {
-        detail::trace(prefix_, args...);
-    }
+        template <typename T, std::size_t N>
+        constexpr void array(std::string const&, std::array<T, N> const&) const
+        {
+        }
 
-    template<typename... Args>
-    constexpr void error(Args const&... args) const
-    {
-        detail::error(prefix_, args...);
-    }
+        template <typename Iter>
+        constexpr void array(std::string const&, Iter, Iter) const
+        {
+        }
 
-    template<typename... Args>
-    scoped_var<Args...> scope(Args const&... args)
-    {
-        return scoped_var<Args...>(prefix_, args...);
-    }
+        template <typename... Args>
+        constexpr bool scope(Args const&...)
+        {
+            return true;
+        }
 
-    template<typename... T, typename... Args>
-    void timed(timed_var<T...> const& init, Args const&... args) const
-    {
-        auto now = std::chrono::steady_clock::now();
-        if (init.elapsed(now)) { detail::timed(prefix_, init, args...); }
-    }
+        template <typename T, typename... Args>
+        constexpr bool declare_variable(Args const&...) const
+        {
+            return true;
+        }
 
-    template<typename T>
-    void array(std::string const& name, std::vector<T> const& v) const
-    {
-        std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : ";
-        std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(std::cout, ", "));
-        std::cout << "\n";
-    }
+        template <typename T, typename V>
+        constexpr void set(T&, V const&)
+        {
+        }
 
-    template<typename T, std::size_t N>
-    void array(std::string const& name, const std::array<T, N>& v) const
-    {
-        std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : ";
-        std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(std::cout, ", "));
-        std::cout << "\n";
-    }
+        // @todo, return void so that timers have zero footprint when disabled
+        template <typename... Args>
+        constexpr int make_timer(double const, Args const&...) const
+        {
+            return 0;
+        }
 
-    template<typename Iter>
-    void array(std::string const& name, Iter begin, Iter end) const
-    {
-        std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(std::distance(begin, end))
-                  << "} : ";
-        std::copy(begin, end,
-            std::ostream_iterator<typename std::iterator_traits<Iter>::value_type>(std::cout,
-                ", "));
-        std::cout << std::endl;
-    }
+        template <typename Expr>
+        constexpr bool eval(Expr const&)
+        {
+            return true;
+        }
+    };
 
-    template<typename T, typename... Args>
-    T declare_variable(Args const&... args) const
+    // when true, debug statements produce valid output
+    template <>
+    struct enable_print<true>
     {
-        return T(args...);
-    }
+    private:
+        char const* prefix_;
 
-    template<typename T, typename V>
-    void set(T& var, V const& val)
-    {
-        var = val;
-    }
+    public:
+        constexpr enable_print()
+          : prefix_("")
+        {
+        }
+
+        constexpr enable_print(char const* p)
+          : prefix_(p)
+        {
+        }
+
+        constexpr bool is_enabled() const { return true; }
+
+        template <typename... Args>
+        constexpr void debug(Args const&... args) const
+        {
+            detail::debug(prefix_, args...);
+        }
+
+        template <typename... Args>
+        constexpr void warning(Args const&... args) const
+        {
+            detail::warning(prefix_, args...);
+        }
 
-    template<typename... Args>
-    timed_var<Args...> make_timer(const double delay, const Args... args) const
+        template <typename... Args>
+        constexpr void trace(Args const&... args) const
+        {
+            detail::trace(prefix_, args...);
+        }
+
+        template <typename... Args>
+        constexpr void error(Args const&... args) const
+        {
+            detail::error(prefix_, args...);
+        }
+
+        template <typename... Args>
+        scoped_var<Args...> scope(Args const&... args)
+        {
+            return scoped_var<Args...>(prefix_, args...);
+        }
+
+        template <typename... T, typename... Args>
+        void timed(timed_var<T...> const& init, Args const&... args) const
+        {
+            auto now = std::chrono::steady_clock::now();
+            if (init.elapsed(now)) { detail::timed(prefix_, init, args...); }
+        }
+
+        template <typename T>
+        void array(std::string const& name, std::vector<T> const& v) const
+        {
+            std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : ";
+            std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(std::cout, ", "));
+            std::cout << "\n";
+        }
+
+        template <typename T, std::size_t N>
+        void array(std::string const& name, std::array<T, N> const& v) const
+        {
+            std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(v.size()) << "} : ";
+            std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(std::cout, ", "));
+            std::cout << "\n";
+        }
+
+        template <typename Iter>
+        void array(std::string const& name, Iter begin, Iter end) const
+        {
+            std::cout << str<20>(name.c_str()) << ": {" << debug::dec<4>(std::distance(begin, end))
+                      << "} : ";
+            std::copy(begin, end,
+                std::ostream_iterator<typename std::iterator_traits<Iter>::value_type>(
+                    std::cout, ", "));
+            std::cout << std::endl;
+        }
+
+        template <typename T, typename... Args>
+        T declare_variable(Args const&... args) const
+        {
+            return T(args...);
+        }
+
+        template <typename T, typename V>
+        void set(T& var, V const& val)
+        {
+            var = val;
+        }
+
+        template <typename... Args>
+        timed_var<Args...> make_timer(double const delay, Args const... args) const
+        {
+            return timed_var<Args...>(delay, args...);
+        }
+
+        template <typename Expr>
+        auto eval(Expr const& e)
+        {
+            return e();
+        }
+    };
+
+    // ------------------------------------------------------------------
+    // helper: value is true when Level <= Threshold, false otherwise
+    // ------------------------------------------------------------------
+    template <int Level, int Threshold>
+    struct check_level : std::integral_constant<bool, Level <= Threshold>
     {
-        return timed_var<Args...>(delay, args...);
-    }
+    };
 
-    template<typename Expr>
-    auto eval(Expr const& e)
+    template <int Level, int Threshold>
+    struct print_threshold : enable_print<check_level<Level, Threshold>::value>
     {
-        return e();
-    }
-};
-
-// ------------------------------------------------------------------
-// helper for N>M true/false
-// ------------------------------------------------------------------
-template<int Level, int Threshold>
-struct check_level : std::integral_constant<bool, Level <= Threshold>
-{
-};
-
-template<int Level, int Threshold>
-struct print_threshold : enable_print<check_level<Level, Threshold>::value>
-{
-    using base_type = enable_print<check_level<Level, Threshold>::value>;
-    // inherit constructor
-    using base_type::base_type;
-};
-
-} // namespace NS_DEBUG
+        using base_type = enable_print<check_level<Level, Threshold>::value>;
+        // inherit constructor
+        using base_type::base_type;
+    };
+
+}    // namespace NS_DEBUG
 /// \endcond
diff --git a/src/libfabric/request_state.hpp b/src/libfabric/request_state.hpp
index d00e0367..74958fc5 100644
--- a/src/libfabric/request_state.hpp
+++ b/src/libfabric/request_state.hpp
@@ -13,90 +13,88 @@
 #include "../request_state_base.hpp"
 #include "./operation_context.hpp"
 
-namespace oomph
-{
-namespace detail
-{
-
-struct request_state
-: public util::enable_shared_from_this<request_state>
-, public request_state_base<false>
-{
-    using base = request_state_base<false>;
-    using shared_ptr_t = util::unsafe_shared_ptr<request_state>;
-    using operation_context = libfabric::operation_context;
-
-    operation_context                      m_operation_context;
-    util::unsafe_shared_ptr<request_state> m_self_ptr;
-
-    request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm, std::size_t* scheduled,
-        rank_type rank, tag_type tag, cb_type&& cb)
-    : base{ctxt, comm, scheduled, rank, tag, std::move(cb)}
-    , m_operation_context{this}
-    {
-    }
-
-    void progress();
-
-    bool cancel();
-
-    void create_self_ref()
-    {
-        // create a self-reference cycle!!
-        // this is useful if we only keep a raw pointer around internally, which still is supposed
-        // to keep the object alive
-        m_self_ptr = shared_from_this();
-    }
-
-    shared_ptr_t release_self_ref() noexcept
-    {
-        assert(((bool)m_self_ptr) && "doesn't own a self-reference!");
-        return std::move(m_self_ptr);
-    }
-};
-
-struct shared_request_state
-: public std::enable_shared_from_this<shared_request_state>
-, public request_state_base<true>
-{
-    using base = request_state_base<true>;
-    using shared_ptr_t = std::shared_ptr<shared_request_state>;
-    using operation_context = libfabric::operation_context;
-
-    operation_context                     m_operation_context;
-    std::shared_ptr<shared_request_state> m_self_ptr;
-
-    shared_request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm,
-        std::atomic<std::size_t>* scheduled, rank_type rank, tag_type tag, cb_type&& cb)
-    : base{ctxt, comm, scheduled, rank, tag, std::move(cb)}
-    , m_operation_context{this}
-    {
-        [[maybe_unused]] auto scp = libfabric::opctx_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-    }
+namespace oomph { namespace detail {
 
-    ~shared_request_state()
+    struct request_state
+      : public util::enable_shared_from_this<request_state>
+      , public request_state_base<false>
     {
-        [[maybe_unused]] auto scp = libfabric::opctx_deb<9>.scope(NS_DEBUG::ptr(this), __func__);
-    }
-
-    void progress();
-
-    bool cancel();
-
-    void create_self_ref()
+        using base = request_state_base<false>;
+        using shared_ptr_t = util::unsafe_shared_ptr<request_state>;
+        using operation_context = libfabric::operation_context;
+
+        operation_context m_operation_context;
+        util::unsafe_shared_ptr<request_state> m_self_ptr;
+
+        request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm,
+            std::size_t* scheduled, rank_type rank, tag_type tag, cb_type&& cb)
+          : base{ctxt, comm, scheduled, rank, tag, std::move(cb)}
+          , m_operation_context{this}
+        {
+        }
+
+        void progress();
+
+        bool cancel();
+
+        void create_self_ref()
+        {
+            // create a self-reference cycle!!
+            // this is useful if we only keep a raw pointer around internally, which still is supposed
+            // to keep the object alive
+            m_self_ptr = shared_from_this();
+        }
+
+        shared_ptr_t release_self_ref() noexcept
+        {
+            assert(((bool) m_self_ptr) && "doesn't own a self-reference!");
+            return std::move(m_self_ptr);
+        }
+    };
+
+    struct shared_request_state
+      : public std::enable_shared_from_this<shared_request_state>
+      , public request_state_base<true>
     {
-        // create a self-reference cycle!!
-        // this is useful if we only keep a raw pointer around internally, which still is supposed
-        // to keep the object alive
-        m_self_ptr = shared_from_this();
-    }
-
-    shared_ptr_t release_self_ref() noexcept
-    {
-        assert(((bool)m_self_ptr) && "doesn't own a self-reference!");
-        return std::move(m_self_ptr);
-    }
-};
-
-} // namespace detail
-} // namespace oomph
+        using base = request_state_base<true>;
+        using shared_ptr_t = std::shared_ptr<shared_request_state>;
+        using operation_context = libfabric::operation_context;
+
+        operation_context m_operation_context;
+        std::shared_ptr<shared_request_state> m_self_ptr;
+
+        shared_request_state(oomph::context_impl* ctxt, oomph::communicator_impl* comm,
+            std::atomic<std::size_t>* scheduled, rank_type rank, tag_type tag, cb_type&& cb)
+          : base{ctxt, comm, scheduled, rank, tag, std::move(cb)}
+          , m_operation_context{this}
+        {
+            [[maybe_unused]] auto scp =
+                libfabric::opctx_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+        }
+
+        ~shared_request_state()
+        {
+            [[maybe_unused]] auto scp =
+                libfabric::opctx_deb<9>.scope(NS_DEBUG::hptr(this), __func__);
+        }
+
+        void progress();
+
+        bool cancel();
+
+        void create_self_ref()
+        {
+            // create a self-reference cycle!!
+            // this is useful if we only keep a raw pointer around internally, which still is supposed
+            // to keep the object alive
+            m_self_ptr = shared_from_this();
+        }
+
+        shared_ptr_t release_self_ref() noexcept
+        {
+            assert(((bool) m_self_ptr) && "doesn't own a self-reference!");
+            return std::move(m_self_ptr);
+        }
+    };
+
+}}    // namespace oomph::detail
diff --git a/src/libfabric/simple_counter.hpp b/src/libfabric/simple_counter.hpp
index f44eac92..26ecf8d5 100644
--- a/src/libfabric/simple_counter.hpp
+++ b/src/libfabric/simple_counter.hpp
@@ -12,13 +12,13 @@
 #include "oomph_libfabric_defines.hpp"
 //
 #include <atomic>
-#include <type_traits>
 #include <iostream>
+#include <type_traits>
 
 #ifdef OOMPH_LIBFABRIC_HAVE_PERFORMANCE_COUNTERS
-#define PERFORMANCE_COUNTER_ENABLED true
+# define PERFORMANCE_COUNTER_ENABLED true
 #else
-#define PERFORMANCE_COUNTER_ENABLED false
+# define PERFORMANCE_COUNTER_ENABLED false
 #endif
 
 //
@@ -29,90 +29,86 @@
 // the performance counter that will simply do nothing when disabled - but
 // still allow code that uses the counters in arithmetic to compile.
 //
-namespace oomph
-{
-namespace libfabric
-{
-template<typename T, bool enabled = PERFORMANCE_COUNTER_ENABLED,
-    typename Enable = std::enable_if_t<std::is_integral<T>::value>>
-struct simple_counter
-{
-};
-
-// --------------------------------------------------------------------
-// specialization for performance counters Enabled
-// we provide an atomic<T> that can be incremented or added/subtracted to
-template<typename T>
-struct simple_counter<T, true>
-{
-    simple_counter()
-    : value_{T()}
+namespace oomph { namespace libfabric {
+    template <typename T, bool enabled = PERFORMANCE_COUNTER_ENABLED,
+        typename Enable = std::enable_if_t<std::is_integral<T>::value>>
+    struct simple_counter
     {
-    }
+    };
 
-    simple_counter(const T& init)
-    : value_{init}
+    // --------------------------------------------------------------------
+    // specialization for performance counters Enabled
+    // we provide an atomic<T> that can be incremented or added/subtracted to
+    template <typename T>
+    struct simple_counter<T, true>
     {
-    }
+        simple_counter()
+          : value_{T()}
+        {
+        }
 
-    inline operator T() const { return value_; }
+        simple_counter(T const& init)
+          : value_{init}
+        {
+        }
 
-    inline T operator=(const T& x) { return value_ = x; }
+        inline operator T() const { return value_; }
 
-    inline T operator++() { return ++value_; }
+        inline T operator=(T const& x) { return value_ = x; }
 
-    inline T operator++(int x) { return (value_ += x); }
+        inline T operator++() { return ++value_; }
 
-    inline T operator+=(const T& rhs) { return (value_ += rhs); }
+        inline T operator++(int) { return value_++; }    // postfix: int is only a tag, add 1
 
-    inline T operator--() { return --value_; }
+        inline T operator+=(T const& rhs) { return (value_ += rhs); }
 
-    inline T operator--(int x) { return (value_ -= x); }
+        inline T operator--() { return --value_; }
 
-    inline T operator-=(const T& rhs) { return (value_ -= rhs); }
+        inline T operator--(int) { return value_--; }    // postfix: int is only a tag, sub 1
 
-    friend std::ostream& operator<<(std::ostream& os, const simple_counter<T, true>& x)
-    {
-        os << x.value_;
-        return os;
-    }
+        inline T operator-=(T const& rhs) { return (value_ -= rhs); }
 
-    std::atomic<T> value_;
-};
+        friend std::ostream& operator<<(std::ostream& os, simple_counter<T, true> const& x)
+        {
+            os << x.value_;
+            return os;
+        }
 
-// --------------------------------------------------------------------
-// specialization for performance counters Disabled
-// just return dummy values so that arithmetic operations compile ok
-template<typename T>
-struct simple_counter<T, false>
-{
-    simple_counter() {}
+        std::atomic<T> value_;
+    };
 
-    simple_counter(const T&) {}
+    // --------------------------------------------------------------------
+    // specialization for performance counters Disabled
+    // just return dummy values so that arithmetic operations compile ok
+    template <typename T>
+    struct simple_counter<T, false>
+    {
+        simple_counter() {}
 
-    inline operator T() const { return 0; }
+        simple_counter(T const&) {}
 
-    //        inline bool operator==(const T&) { return true; }
+        inline operator T() const { return 0; }
 
-    inline T operator=(const T&) { return 0; }
+        //        inline bool operator==(const T&) { return true; }
 
-    inline T operator++() { return 0; }
+        inline T operator=(T const&) { return 0; }
 
-    inline T operator++(int) { return 0; }
+        inline T operator++() { return 0; }
 
-    inline T operator+=(const T&) { return 0; }
+        inline T operator++(int) { return 0; }
 
-    inline T operator--() { return 0; }
+        inline T operator+=(T const&) { return 0; }
 
-    inline T operator--(int) { return 0; }
+        inline T operator--() { return 0; }
 
-    inline T operator-=(const T&) { return 0; }
+        inline T operator--(int) { return 0; }
 
-    friend std::ostream& operator<<(std::ostream& os, const simple_counter<T, false>&)
-    {
-        os << "undefined";
-        return os;
-    }
-};
-} // namespace libfabric
-} // namespace oomph
+        inline T operator-=(T const&) { return 0; }
+
+        friend std::ostream& operator<<(std::ostream& os, simple_counter<T, false> const&)
+        {
+            os << "undefined";
+            return os;
+        }
+    };
+}}    // namespace oomph::libfabric
diff --git a/src/libfabric/test/check_libfabric.cpp b/src/libfabric/test/check_libfabric.cpp
new file mode 100644
index 00000000..11d9788e
--- /dev/null
+++ b/src/libfabric/test/check_libfabric.cpp
@@ -0,0 +1,31 @@
+/*
+ * ghex-org
+ *
+ * Copyright (c) 2014-2023, ETH Zurich
+ * All rights reserved.
+ *
+ * Please, refer to the LICENSE file in the root directory.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <mpi.h>
+#include <oomph/context.hpp>
+#include "../benchmarks/mpi_environment.hpp"
+//
+#include "../communicator.hpp"
+#include "../context.hpp"
+
+#include <hwmalloc/heap_config.hpp>
+
+int main(int argc, char** argv)
+{
+    // Smoke test: set up the MPI environment, then construct a libfabric context_impl.
+    using namespace oomph;
+    // one flag for both objects; assumes context_impl's bool is the thread-safety switch - confirm
+    bool const multi_threaded = true;
+    // NOTE(review): mpi_environment comes from ../benchmarks/mpi_environment.hpp (not
+    // visible here); presumably it initialises MPI for the lifetime of `env` - confirm.
+    mpi_environment env(multi_threaded, argc, argv);
+    hwmalloc::heap_config const& default_heap = hwmalloc::get_default_heap_config();
+    auto ctxt = context_impl(MPI_COMM_WORLD, multi_threaded, default_heap);
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5217bbaf..39affd0e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,6 +1,8 @@
 add_subdirectory(mpi_runner)
 
-set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Daint)")
+set(OOMPH_TEST_LEAK_GPU_MEMORY
+    OFF
+    CACHE BOOL "Do not free memory (bug on Piz Daint)")
 
 # ---------------------------------------------------------------------
 # compile tests
@@ -10,30 +12,57 @@ set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Da
 set(serial_tests test_unique_function test_unsafe_shared_ptr)
 
 # list of parallel tests to be executed
-set(parallel_tests test_context test_send_recv test_send_multi test_cancel test_locality)
-#test_tag_range)
-if (OOMPH_ENABLE_BARRIER)
-    list(APPEND parallel_tests test_barrier)
+set(parallel_tests test_context test_send_recv test_send_multi test_cancel
+                   test_locality)
+
+# list of parallel tests that also have device code variants
+if(HWMALLOC_ENABLE_DEVICE)
+  set(device_tests test_send_recv)
+endif()
+
+# test_tag_range)
+if(OOMPH_ENABLE_BARRIER)
+  list(APPEND parallel_tests test_barrier)
 endif()
 
-# creates an object library (i.e. *.o file)
+# creates an object library (i.e. *.o file), if DEVICE is specified, extra flags
+# are added and the target name has a suffix
 function(compile_test t_)
-    set(t ${t_}_obj)
-    add_library(${t} OBJECT ${t_}.cpp)
-    oomph_target_compile_options(${t})
-    if (OOMPH_TEST_LEAK_GPU_MEMORY)
-        target_compile_definitions(${t} PRIVATE OOMPH_TEST_LEAK_GPU_MEMORY)
-    endif()
-    target_link_libraries(${t} PRIVATE ext-gtest)
-    target_link_libraries(${t} PUBLIC oomph)
+  set(options DEVICE)
+  cmake_parse_arguments(CT "${options}" "" "" ${ARGN})
+  set(source_filename_ "${t_}.cpp")
+  set(suffix_ "")
+  if(CT_DEVICE)
+    # Make a copy of the input source file in the build directory, add a suffix
+    set(suffix_ "_device")
+    cmake_path(REPLACE_EXTENSION source_filename_ LAST_ONLY "${suffix_}.cpp"
+               OUTPUT_VARIABLE src_name_)
+    set(dst_file "${CMAKE_CURRENT_BINARY_DIR}/${src_name_}")
+    configure_file("${source_filename_}" "${dst_file}" COPYONLY)
+    set(source_filename_ "${dst_file}")
+  endif()
+  set(target_ ${t_}${suffix_}_obj) # build the name from the function argument t_
+  add_library(${target_} OBJECT ${source_filename_})
+  oomph_target_compile_options(${target_})
+  target_compile_definitions(
+    ${target_}
+    PRIVATE $<$<BOOL:${OOMPH_TEST_LEAK_GPU_MEMORY}>:OOMPH_TEST_LEAK_GPU_MEMORY>)
+  target_compile_definitions(
+    ${target_} PRIVATE $<$<BOOL:${CT_DEVICE}>:TEST_DEVICE_MODE_ONLY>)
+  target_link_libraries(${target_} PRIVATE ext-gtest)
+  target_link_libraries(${target_} PUBLIC oomph)
 endfunction()
 
-# compile an object library for each test
-# tests will be compiled only once and then linked against all enabled oomph backends
+# compile an object library for each test; tests will be compiled only once and
+# then linked against all enabled oomph backends
 list(APPEND all_tests ${serial_tests} ${parallel_tests})
 list(REMOVE_DUPLICATES all_tests)
 foreach(t ${all_tests})
-    compile_test(${t})
+  compile_test(${t})
+  if(${t} IN_LIST device_tests)
+    # generate a second version of the obj file, but with DEVICE code enabled
+    compile_test(${t} DEVICE)
+  endif()
 endforeach()
 
 # ---------------------------------------------------------------------
@@ -48,10 +77,11 @@ function(reg_serial_test t)
     add_test(
         NAME ${t}
         COMMAND $<TARGET_FILE:${t}>)
+    set_tests_properties(${t} PROPERTIES LABELS "serial")
 endfunction()
 
 foreach(t ${serial_tests})
-    reg_serial_test(${t})
+  reg_serial_test(${t})
 endforeach()
 
 # creates an executable by linking to object file and to selected oomph backend
@@ -61,29 +91,42 @@ function(reg_parallel_test t_ lib n)
     oomph_target_compile_options(${t})
     target_link_libraries(${t} PRIVATE gtest_main_mpi)
     target_link_libraries(${t} PRIVATE oomph_${lib})
-    add_test(
-        NAME ${t}
-        COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS}
-            $<TARGET_FILE:${t}> ${MPIEXEC_POSTFLAGS})
-    set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE)
+    if("${MPIEXEC_EXECUTABLE}" STREQUAL "")
+      add_test(NAME ${t} COMMAND $<TARGET_FILE:${t}>)
+    else()
+      add_test(
+          NAME ${t}
+          COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS}
+              $<TARGET_FILE:${t}> ${MPIEXEC_POSTFLAGS})
+    endif()
+    set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE LABELS "parallel-ranks-${n}")
 endfunction()
 
-if (OOMPH_WITH_MPI)
-    foreach(t ${parallel_tests})
-        reg_parallel_test(${t} mpi 4)
-    endforeach()
+if(OOMPH_WITH_MPI)
+  foreach(t ${parallel_tests})
+    reg_parallel_test(${t} mpi 4)
+  endforeach()
+  foreach(t ${device_tests})
+    reg_parallel_test(${t}_device mpi 4)
+  endforeach()
 endif()
 
-if (OOMPH_WITH_UCX)
-    foreach(t ${parallel_tests})
-        reg_parallel_test(${t} ucx 4)
-    endforeach()
+if(OOMPH_WITH_UCX)
+  foreach(t ${parallel_tests})
+    reg_parallel_test(${t} ucx 4)
+  endforeach()
+  foreach(t ${device_tests})
+    reg_parallel_test(${t}_device ucx 4)
+  endforeach()
 endif()
 
-if (OOMPH_WITH_LIBFABRIC)
-    foreach(t ${parallel_tests})
-        reg_parallel_test(${t} libfabric 4)
-    endforeach()
+if(OOMPH_WITH_LIBFABRIC)
+  foreach(t ${parallel_tests})
+    reg_parallel_test(${t} libfabric 4)
+  endforeach()
+  foreach(t ${device_tests})
+    reg_parallel_test(${t}_device libfabric 4)
+  endforeach()
 endif()
 
 add_subdirectory(bindings)
diff --git a/test/bindings/fortran/CMakeLists.txt b/test/bindings/fortran/CMakeLists.txt
index 974d2f7c..2a5980c5 100644
--- a/test/bindings/fortran/CMakeLists.txt
+++ b/test/bindings/fortran/CMakeLists.txt
@@ -25,12 +25,17 @@ function(reg_parallel_test_f t_ lib n nthr)
         $<INSTALL_INTERFACE:..>
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../..>
         $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/../../..>)
-    add_test(
-        NAME ${t}
-        COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS}
-            $<TARGET_FILE:${t}> ${MPIEXEC_POSTFLAGS})
+    if("${MPIEXEC_EXECUTABLE}" STREQUAL "")
+      add_test(NAME ${t} COMMAND $<TARGET_FILE:${t}>)
+    else()
+      add_test(
+          NAME ${t}
+          COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS}
+              $<TARGET_FILE:${t}> ${MPIEXEC_POSTFLAGS})
+    endif()
     set_tests_properties(${t} PROPERTIES
-        ENVIRONMENT OMP_NUM_THREADS=${nthr})
+        ENVIRONMENT OMP_NUM_THREADS=${nthr}
+        LABELS "parallel-ranks-${n}")
 endfunction()
 
 if (OOMPH_WITH_MPI)
diff --git a/test/test_send_recv.cpp b/test/test_send_recv.cpp
index 0cfd1170..1326eecb 100644
--- a/test/test_send_recv.cpp
+++ b/test/test_send_recv.cpp
@@ -7,16 +7,21 @@
  * Please, refer to the LICENSE file in the root directory.
  * SPDX-License-Identifier: BSD-3-Clause
  */
-#include <oomph/context.hpp>
+#ifdef TEST_DEVICE_MODE_ONLY
+# ifdef HWMALLOC_ENABLE_DEVICE
+#  include <driver_types.h>
+# endif
+#endif
+
 #include <gtest/gtest.h>
-#include "./mpi_runner/mpi_test_fixture.hpp"
-#include <iostream>
-#include <iomanip>
-#include <thread>
+#include <oomph/context.hpp>
+// mpi_test_fixture.hpp is included via ../test/ so the device copy in the build dir finds it
 #include <atomic>
+#include <thread>
+#include "../test/mpi_runner/mpi_test_fixture.hpp"
 
-#define NITERS   50
-#define SIZE     64
+#define NITERS 50
+#define SIZE 64
 #define NTHREADS 4
 
 std::vector<std::atomic<int>> shared_received(NTHREADS);
@@ -33,22 +38,22 @@ struct test_environment_base
     using tag_type = oomph::tag_type;
     using message = oomph::message_buffer<rank_type>;
 
-    oomph::context&     ctxt;
+    oomph::context& ctxt;
     oomph::communicator comm;
-    rank_type           speer_rank;
-    rank_type           rpeer_rank;
-    int                 thread_id;
-    int                 num_threads;
-    tag_type            tag;
+    rank_type speer_rank;
+    rank_type rpeer_rank;
+    int thread_id;
+    int num_threads;
+    tag_type tag;
 
     test_environment_base(oomph::context& c, int tid, int num_t)
-    : ctxt(c)
-    , comm(ctxt.get_communicator())
-    , speer_rank((comm.rank() + 1) % comm.size())
-    , rpeer_rank((comm.rank() + comm.size() - 1) % comm.size())
-    , thread_id(tid)
-    , num_threads(num_t)
-    , tag(tid)
+      : ctxt(c)
+      , comm(ctxt.get_communicator())
+      , speer_rank((comm.rank() + 1) % comm.size())
+      , rpeer_rank((comm.rank() + comm.size() - 1) % comm.size())
+      , thread_id(tid)
+      , num_threads(num_t)
+      , tag(tid)
     {
     }
 };
@@ -57,25 +62,26 @@ struct test_environment : public test_environment_base
 {
     using base = test_environment_base;
 
-    static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc,
-        rank_type* ptr)
+    static auto make_buffer(
+        oomph::communicator& comm, std::size_t size, bool user_alloc, rank_type* ptr)
     {
-        if (user_alloc) return comm.make_buffer<rank_type>(ptr, size);
+        if (user_alloc)
+            return comm.make_buffer<rank_type>(ptr, size);
         else
             return comm.make_buffer<rank_type>(size);
     }
 
     std::vector<rank_type> raw_smsg;
     std::vector<rank_type> raw_rmsg;
-    message                smsg;
-    message                rmsg;
+    message smsg;
+    message rmsg;
 
     test_environment(oomph::context& c, std::size_t size, int tid, int num_t, bool user_alloc)
-    : base(c, tid, num_t)
-    , raw_smsg(user_alloc ? size : 0)
-    , raw_rmsg(user_alloc ? size : 0)
-    , smsg(make_buffer(comm, size, user_alloc, raw_smsg.data()))
-    , rmsg(make_buffer(comm, size, user_alloc, raw_rmsg.data()))
+      : base(c, tid, num_t)
+      , raw_smsg(user_alloc ? size : 0)
+      , raw_rmsg(user_alloc ? size : 0)
+      , smsg(make_buffer(comm, size, user_alloc, raw_smsg.data()))
+      , rmsg(make_buffer(comm, size, user_alloc, raw_rmsg.data()))
     {
         fill_send_buffer();
         fill_recv_buffer();
@@ -104,10 +110,11 @@ struct test_environment_device : public test_environment_base
 {
     using base = test_environment_base;
 
-    static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc,
-        rank_type* device_ptr)
+    static auto make_buffer(
+        oomph::communicator& comm, std::size_t size, bool user_alloc, rank_type* device_ptr)
     {
-        if (user_alloc) return comm.make_device_buffer<rank_type>(device_ptr, size, 0);
+        if (user_alloc)
+            return comm.make_device_buffer<rank_type>(device_ptr, size, 0);
         else
             return comm.make_device_buffer<rank_type>(size, 0);
     }
@@ -120,37 +127,37 @@ struct test_environment_device : public test_environment_base
             if (size) m_ptr = hwmalloc::device_malloc(size * sizeof(rank_type));
         }
         device_allocation(device_allocation&& other)
-        : m_ptr{std::exchange(other.m_ptr, nullptr)}
+          : m_ptr{std::exchange(other.m_ptr, nullptr)}
         {
         }
         ~device_allocation()
         {
-#ifndef OOMPH_TEST_LEAK_GPU_MEMORY
+# ifndef OOMPH_TEST_LEAK_GPU_MEMORY
             if (m_ptr) hwmalloc::device_free(m_ptr);
-#endif
+# endif
         }
-        rank_type* get() const noexcept { return (rank_type*)m_ptr; }
+        rank_type* get() const noexcept { return (rank_type*) m_ptr; }
     };
 
     device_allocation raw_device_smsg;
     device_allocation raw_device_rmsg;
-    message           smsg;
-    message           rmsg;
-
-    test_environment_device(oomph::context& c, std::size_t size, int tid, int num_t,
-        bool user_alloc)
-    : base(c, tid, num_t)
-#ifndef OOMPH_TEST_LEAK_GPU_MEMORY
-    , raw_device_smsg(user_alloc ? size : 0)
-    , raw_device_rmsg(user_alloc ? size : 0)
-    , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get()))
-    , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get()))
-#else
-    , raw_device_smsg(size)
-    , raw_device_rmsg(size)
-    , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get()))
-    , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get()))
-#endif
+    message smsg;
+    message rmsg;
+
+    test_environment_device(
+        oomph::context& c, std::size_t size, int tid, int num_t, bool user_alloc)
+      : base(c, tid, num_t)
+# ifndef OOMPH_TEST_LEAK_GPU_MEMORY
+      , raw_device_smsg(user_alloc ? size : 0)
+      , raw_device_rmsg(user_alloc ? size : 0)
+      , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get()))
+      , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get()))
+# else
+      , raw_device_smsg(size)
+      , raw_device_rmsg(size)
+      , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get()))
+      , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get()))
+# endif
     {
         fill_send_buffer();
         fill_recv_buffer();
@@ -178,9 +185,8 @@ struct test_environment_device : public test_environment_base
 };
 #endif
 
-template<typename Func>
-void
-launch_test(Func f)
+template <typename Func>
+void launch_test(Func f)
 {
     // single threaded
     {
@@ -193,7 +199,7 @@ launch_test(Func f)
 
     // multi threaded
     {
-        oomph::context           ctxt(MPI_COMM_WORLD, true);
+        oomph::context ctxt(MPI_COMM_WORLD, true);
         std::vector<std::thread> threads;
         threads.reserve(NTHREADS);
         reset_counters();
@@ -210,9 +216,9 @@ launch_test(Func f)
 
 // no callback
 // ===========
-template<typename Env>
-void
-test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
+template <typename Env>
+void test_send_recv(
+    oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
 {
     Env env(ctxt, size, tid, num_threads, user_alloc);
 
@@ -221,10 +227,7 @@ test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads,
     {
         auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag);
         auto sreq = env.comm.send(env.smsg, env.speer_rank, env.tag);
-        while (!(rreq.is_ready() && sreq.is_ready())) 
-        { 
-            env.comm.progress(); 
-        };
+        while (!(rreq.is_ready() && sreq.is_ready())) { env.comm.progress(); };
         EXPECT_TRUE(env.check_recv_buffer());
         env.fill_recv_buffer();
     }
@@ -250,19 +253,19 @@ test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads,
     }
 }
 
-TEST_F(mpi_test_fixture, send_recv)
-{
-    launch_test(test_send_recv<test_environment>);
-#if HWMALLOC_ENABLE_DEVICE
-    launch_test(test_send_recv<test_environment_device>);
+#ifndef TEST_DEVICE_MODE_ONLY
+TEST_F(mpi_test_fixture, send_recv) { launch_test(test_send_recv<test_environment>); }
+#else
+# if HWMALLOC_ENABLE_DEVICE
+TEST_F(mpi_test_fixture, send_recv_device) { launch_test(test_send_recv<test_environment_device>); }
+# endif
 #endif
-}
 
 // callback: pass by l-value reference
 // ===================================
-template<typename Env>
-void
-test_send_recv_cb(oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
+template <typename Env>
+void test_send_recv_cb(
+    oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
 {
     using rank_type = test_environment::rank_type;
     using tag_type = test_environment::tag_type;
@@ -270,8 +273,8 @@ test_send_recv_cb(oomph::context& ctxt, std::size_t size, int tid, int num_threa
 
     Env env(ctxt, size, tid, num_threads, user_alloc);
 
-    volatile int received = 0;
-    volatile int sent = 0;
+    int volatile received = 0;
+    int volatile sent = 0;
 
     auto send_callback = [&](message const&, rank_type, tag_type) { ++sent; };
     auto recv_callback = [&](message&, rank_type, tag_type) { ++received; };
@@ -317,20 +320,22 @@ test_send_recv_cb(oomph::context& ctxt, std::size_t size, int tid, int num_threa
     EXPECT_EQ(sent, NITERS);
 }
 
-TEST_F(mpi_test_fixture, send_recv_cb)
+#ifndef TEST_DEVICE_MODE_ONLY
+TEST_F(mpi_test_fixture, send_recv_cb) { launch_test(test_send_recv_cb<test_environment>); }
+#else
+# if HWMALLOC_ENABLE_DEVICE
+TEST_F(mpi_test_fixture, send_recv_cb_device)
 {
-    launch_test(test_send_recv_cb<test_environment>);
-#if HWMALLOC_ENABLE_DEVICE
     launch_test(test_send_recv_cb<test_environment_device>);
-#endif
 }
+# endif
+#endif
 
 // callback: pass by r-value reference (give up ownership)
 // =======================================================
-template<typename Env>
-void
-test_send_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int num_threads,
-    bool user_alloc)
+template <typename Env>
+void test_send_recv_cb_disown(
+    oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
 {
     using rank_type = test_environment::rank_type;
     using tag_type = test_environment::tag_type;
@@ -338,16 +343,14 @@ test_send_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int nu
 
     Env env(ctxt, size, tid, num_threads, user_alloc);
 
-    volatile int received = 0;
-    volatile int sent = 0;
+    int volatile received = 0;
+    int volatile sent = 0;
 
-    auto send_callback = [&](message msg, rank_type, tag_type)
-    {
+    auto send_callback = [&](message msg, rank_type, tag_type) {
         ++sent;
         env.smsg = std::move(msg);
     };
-    auto recv_callback = [&](message msg, rank_type, tag_type)
-    {
+    auto recv_callback = [&](message msg, rank_type, tag_type) {
         ++received;
         env.rmsg = std::move(msg);
     };
@@ -393,20 +396,25 @@ test_send_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int nu
     EXPECT_EQ(sent, NITERS);
 }
 
+#ifndef TEST_DEVICE_MODE_ONLY
 TEST_F(mpi_test_fixture, send_recv_cb_disown)
 {
     launch_test(test_send_recv_cb_disown<test_environment>);
-#if HWMALLOC_ENABLE_DEVICE
+}
+#else
+# if HWMALLOC_ENABLE_DEVICE
+TEST_F(mpi_test_fixture, send_recv_cb_disown_device)
+{
     launch_test(test_send_recv_cb_disown<test_environment_device>);
-#endif
 }
+# endif
+#endif
 
 // callback: pass by r-value reference (give up ownership), shared recv
 // ====================================================================
-template<typename Env>
-void
-test_send_shared_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid, int num_threads,
-    bool user_alloc)
+template <typename Env>
+void test_send_shared_recv_cb_disown(
+    oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
 {
     using rank_type = test_environment::rank_type;
     using tag_type = test_environment::tag_type;
@@ -416,19 +424,18 @@ test_send_shared_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid,
 
     thread_id = env.thread_id;
 
-    //volatile int received = 0;
-    volatile int sent = 0;
+    // volatile int received = 0;
+    int volatile sent = 0;
 
-    auto send_callback = [&](message msg, rank_type, tag_type)
-    {
+    auto send_callback = [&](message msg, rank_type, tag_type) {
         ++sent;
         env.smsg = std::move(msg);
     };
-    auto recv_callback = [&](message msg, rank_type, tag_type)
-    {
-        //std::cout << thread_id << " " << env.thread_id << std::endl;
-        //if (thread_id != env.thread_id) std::cout << "other thread picked up callback" << std::endl;
-        //else std::cout << "my thread picked up callback" << std::endl;
+    auto recv_callback = [&](message msg, rank_type, tag_type) {
+        // std::cout << thread_id << " " << env.thread_id << std::endl;
+        // if (thread_id != env.thread_id)
+        //     std::cout << "other thread picked up callback" << std::endl;
+        // else std::cout << "my thread picked up callback" << std::endl;
         env.rmsg = std::move(msg);
         ++shared_received[env.thread_id];
     };
@@ -475,20 +482,25 @@ test_send_shared_recv_cb_disown(oomph::context& ctxt, std::size_t size, int tid,
     EXPECT_EQ(sent, NITERS);
 }
 
+#ifndef TEST_DEVICE_MODE_ONLY
 TEST_F(mpi_test_fixture, send_shared_recv_cb_disown)
 {
     launch_test(test_send_shared_recv_cb_disown<test_environment>);
-#if HWMALLOC_ENABLE_DEVICE
+}
+#else
+# if HWMALLOC_ENABLE_DEVICE
+TEST_F(mpi_test_fixture, send_shared_recv_cb_disown_device)
+{
     launch_test(test_send_shared_recv_cb_disown<test_environment_device>);
-#endif
 }
+# endif
+#endif
 
 // callback: pass by l-value reference, and resubmit
 // =================================================
-template<typename Env>
-void
-test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int num_threads,
-    bool user_alloc)
+template <typename Env>
+void test_send_recv_cb_resubmit(
+    oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
 {
     using rank_type = test_environment::rank_type;
     using tag_type = test_environment::tag_type;
@@ -496,13 +508,13 @@ test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int
 
     Env env(ctxt, size, tid, num_threads, user_alloc);
 
-    volatile int received = 0;
-    volatile int sent = 0;
+    int volatile received = 0;
+    int volatile sent = 0;
 
     struct recursive_send_callback
     {
-        Env&          env;
-        volatile int& sent;
+        Env& env;
+        int volatile& sent;
 
         void operator()(message& msg, rank_type dst, tag_type tag)
         {
@@ -513,8 +525,8 @@ test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int
 
     struct recursive_recv_callback
     {
-        Env&          env;
-        volatile int& received;
+        Env& env;
+        int volatile& received;
 
         void operator()(message& msg, rank_type src, tag_type tag)
         {
@@ -531,20 +543,25 @@ test_send_recv_cb_resubmit(oomph::context& ctxt, std::size_t size, int tid, int
     while (sent < NITERS || received < NITERS) { env.comm.progress(); };
 }
 
+#ifndef TEST_DEVICE_MODE_ONLY
 TEST_F(mpi_test_fixture, send_recv_cb_resubmit)
 {
     launch_test(test_send_recv_cb_resubmit<test_environment>);
-#if HWMALLOC_ENABLE_DEVICE
+}
+#else
+# if HWMALLOC_ENABLE_DEVICE
+TEST_F(mpi_test_fixture, send_recv_cb_resubmit_device)
+{
     launch_test(test_send_recv_cb_resubmit<test_environment_device>);
-#endif
 }
+# endif
+#endif
 
 // callback: pass by r-value reference (give up ownership), and resubmit
 // =====================================================================
-template<typename Env>
-void
-test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int tid, int num_threads,
-    bool user_alloc)
+template <typename Env>
+void test_send_recv_cb_resubmit_disown(
+    oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
 {
     using rank_type = test_environment::rank_type;
     using tag_type = test_environment::tag_type;
@@ -552,13 +569,13 @@ test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int ti
 
     Env env(ctxt, size, tid, num_threads, user_alloc);
 
-    volatile int received = 0;
-    volatile int sent = 0;
+    int volatile received = 0;
+    int volatile sent = 0;
 
     struct recursive_send_callback
     {
-        Env&          env;
-        volatile int& sent;
+        Env& env;
+        int volatile& sent;
 
         void operator()(message msg, rank_type dst, tag_type tag)
         {
@@ -570,8 +587,8 @@ test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int ti
 
     struct recursive_recv_callback
     {
-        Env&          env;
-        volatile int& received;
+        Env& env;
+        int volatile& received;
 
         void operator()(message msg, rank_type src, tag_type tag)
         {
@@ -590,10 +607,16 @@ test_send_recv_cb_resubmit_disown(oomph::context& ctxt, std::size_t size, int ti
     while (sent < NITERS || received < NITERS) { env.comm.progress(); };
 }
 
+#ifndef TEST_DEVICE_MODE_ONLY
 TEST_F(mpi_test_fixture, send_recv_cb_resubmit_disown)
 {
     launch_test(test_send_recv_cb_resubmit_disown<test_environment>);
-#if HWMALLOC_ENABLE_DEVICE
+}
+#else
+# if HWMALLOC_ENABLE_DEVICE
+TEST_F(mpi_test_fixture, send_recv_cb_resubmit_disown_device)
+{
     launch_test(test_send_recv_cb_resubmit_disown<test_environment_device>);
-#endif
 }
+# endif
+#endif