From 040514f4e96d699c1e2751df6f9c09aeccb3cfbe Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 10:53:03 +0100 Subject: [PATCH 01/51] Add cicd-ext CI configuration --- .cscs-ci/container/build.Containerfile | 8 ++ .cscs-ci/container/deps.Containerfile | 24 ++++ .cscs-ci/default.yaml | 191 +++++++++++++++++++++++++ .cscs-ci/spack/libfabric.yaml | 6 + .cscs-ci/spack/mpi.yaml | 6 + .cscs-ci/spack/nccl.yaml | 6 + .cscs-ci/spack/ucx.yaml | 6 + test/CMakeLists.txt | 3 +- test/bindings/fortran/CMakeLists.txt | 3 +- 9 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 .cscs-ci/container/build.Containerfile create mode 100644 .cscs-ci/container/deps.Containerfile create mode 100644 .cscs-ci/default.yaml create mode 100644 .cscs-ci/spack/libfabric.yaml create mode 100644 .cscs-ci/spack/mpi.yaml create mode 100644 .cscs-ci/spack/nccl.yaml create mode 100644 .cscs-ci/spack/ucx.yaml diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile new file mode 100644 index 00000000..784221cb --- /dev/null +++ b/.cscs-ci/container/build.Containerfile @@ -0,0 +1,8 @@ +ARG DEPS_IMAGE +FROM $DEPS_IMAGE + +COPY . 
/oomph +WORKDIR /oomph + +RUN spack -e ci build-env oomph -- cmake -B build -DOOMPH_WITH_TESTING=ON -DMPIEXEC_EXECUTABLE="" -DMPIEXEC_NUMPROC_FLAG="" -DMPIEXEC_PREFLAGS="" -DMPIEXEC_POSTFLAGS="" && \ + spack -e ci build-env oomph -- cmake --build build -j$(nproc) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile new file mode 100644 index 00000000..73c225c1 --- /dev/null +++ b/.cscs-ci/container/deps.Containerfile @@ -0,0 +1,24 @@ +FROM ghcr.io/eth-cscs/alps-images:py26.01-alps3-base + +ARG SPACK_SHA=develop +ARG SPACK_PACKAGES_SHA=main +ARG SPACK_ENV_FILE + +ENV DEBIAN_FRONTEND=noninteractive + +RUN mkdir -p /opt/spack && \ + curl -Ls "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack + +ENV PATH="/opt/spack/bin:$PATH" + +RUN mkdir -p /opt/spack-packages && \ + curl -Ls "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages + +RUN spack repo remove --scope defaults:base builtin && \ + spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin + +COPY $SPACK_ENV_FILE /spack_environment/spack.yaml + +RUN spack env create ci /spack_environment/spack.yaml && \ + spack -e ci concretize -f && \ + spack -e ci install --jobs $(nproc) --fail-fast --only=dependencies diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml new file mode 100644 index 00000000..959ba3d4 --- /dev/null +++ b/.cscs-ci/default.yaml @@ -0,0 +1,191 @@ +include: + - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' + +stages: + - build_deps + - build + - test + +variables: + # The base image is the py26.01 alps3 image from docs.cscs.ch + BASE_IMAGE: ghcr.io/eth-cscs/alps-images:py26.01-alps3-base + SPACK_SHA: develop + SPACK_PACKAGES_SHA: main + FF_TIMESTAMPS: true + +.build_deps_template: + stage: build_deps + timeout: 1 hours + before_script: + - echo $DOCKERHUB_TOKEN | podman 
login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true + - export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16` + - export ENV_FILE_SHA=`sha256sum ${SPACK_ENV_FILE} | head -c 16` + - export CONFIG_TAG=`echo $DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA | sha256sum - | head -c 16` + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-spack-deps-$BACKEND:$CONFIG_TAG + - echo -e "CONFIG_TAG=$CONFIG_TAG" >> base-${BACKEND}.env + - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env + variables: + DOCKERFILE: .cscs-ci/container/deps.Containerfile + DOCKER_BUILD_ARGS: '["SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + artifacts: + reports: + dotenv: base-${BACKEND}.env + +build_deps_nccl: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: nccl + SPACK_ENV_FILE: .cscs-ci/spack/nccl.yaml + +build_deps_mpi: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: mpi + SPACK_ENV_FILE: .cscs-ci/spack/mpi.yaml + +build_deps_ucx: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: ucx + SPACK_ENV_FILE: .cscs-ci/spack/ucx.yaml + +build_deps_libfabric: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: libfabric + SPACK_ENV_FILE: .cscs-ci/spack/libfabric.yaml + +.build_template: + stage: build + extends: .container-builder-cscs-gh200 + timeout: 1 hours + before_script: + - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-build-$BACKEND:$CI_COMMIT_SHA + - echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env + variables: + DOCKERFILE: .cscs-ci/container/build.Containerfile + DOCKER_BUILD_ARGS: '["DEPS_IMAGE"]' + artifacts: + reports: + dotenv: build-${BACKEND}.env + +build_nccl: + extends: .build_template + needs: + - 
job: build_deps_nccl + artifacts: true + variables: + BACKEND: nccl + +build_mpi: + extends: .build_template + needs: + - job: build_deps_mpi + artifacts: true + variables: + BACKEND: mpi + +build_ucx: + extends: .build_template + needs: + - job: build_deps_ucx + artifacts: true + variables: + BACKEND: ucx + +build_libfabric: + extends: .build_template + needs: + - job: build_deps_libfabric + artifacts: true + variables: + BACKEND: libfabric + +.test_serial_template: + stage: test + extends: .container-runner-clariden-gh200 + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_NTASKS: 1 + SLURM_TIMELIMIT: '00:15:00' + SLURM_PARTITION: normal + script: + - ctest --test-dir build -L "serial" --output-on-failure + +.test_parallel_template: + stage: test + extends: .container-runner-clariden-gh200 + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_NTASKS: 4 + SLURM_TIMELIMIT: '00:15:00' + SLURM_PARTITION: normal + SLURM_MPI: pmix + MPICH_GPU_SUPPORT_ENABLED: 1 + script: + - srun -n 4 ctest --test-dir build -L "parallel-ranks-4" --output-on-failure + +test_serial_nccl: + extends: .test_serial_template + needs: + - job: build_nccl + artifacts: true + image: $BUILD_IMAGE + +test_parallel_nccl: + extends: .test_parallel_template + needs: + - job: build_nccl + artifacts: true + image: $BUILD_IMAGE + +test_serial_mpi: + extends: .test_serial_template + needs: + - job: build_mpi + artifacts: true + image: $BUILD_IMAGE + +test_parallel_mpi: + extends: .test_parallel_template + needs: + - job: build_mpi + artifacts: true + image: $BUILD_IMAGE + +test_serial_ucx: + extends: .test_serial_template + needs: + - job: build_ucx + artifacts: true + image: $BUILD_IMAGE + +test_parallel_ucx: + extends: .test_parallel_template + needs: + - job: build_ucx + artifacts: true + image: $BUILD_IMAGE + +test_serial_libfabric: + extends: .test_serial_template + needs: + - job: build_libfabric + artifacts: true + image: $BUILD_IMAGE + +test_parallel_libfabric: + extends: .test_parallel_template + needs: + - 
job: build_libfabric + artifacts: true + image: $BUILD_IMAGE diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml new file mode 100644 index 00000000..f659f278 --- /dev/null +++ b/.cscs-ci/spack/libfabric.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=libfabric +cuda +python + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml new file mode 100644 index 00000000..696d894d --- /dev/null +++ b/.cscs-ci/spack/mpi.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=mpi +cuda +python + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml new file mode 100644 index 00000000..2dc59834 --- /dev/null +++ b/.cscs-ci/spack/nccl.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=nccl +cuda +python + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml new file mode 100644 index 00000000..76100e29 --- /dev/null +++ b/.cscs-ci/spack/ucx.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=ucx +cuda +python + view: false + concretizer: + unify: true diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5217bbaf..31fea066 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -48,6 +48,7 @@ function(reg_serial_test t) add_test( NAME ${t} COMMAND $) + set_tests_properties(${t} PROPERTIES LABELS "serial") endfunction() foreach(t ${serial_tests}) @@ -65,7 +66,7 @@ function(reg_parallel_test t_ lib n) NAME ${t} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} $ ${MPIEXEC_POSTFLAGS}) - set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE LABELS "parallel-ranks-${n}") endfunction() if (OOMPH_WITH_MPI) diff --git a/test/bindings/fortran/CMakeLists.txt b/test/bindings/fortran/CMakeLists.txt index 974d2f7c..10e69e15 100644 --- a/test/bindings/fortran/CMakeLists.txt +++ 
b/test/bindings/fortran/CMakeLists.txt @@ -30,7 +30,8 @@ function(reg_parallel_test_f t_ lib n nthr) COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} $ ${MPIEXEC_POSTFLAGS}) set_tests_properties(${t} PROPERTIES - ENVIRONMENT OMP_NUM_THREADS=${nthr}) + ENVIRONMENT OMP_NUM_THREADS=${nthr} + LABELS "parallel-ranks-${n}") endfunction() if (OOMPH_WITH_MPI) From ba1784c191cc247422c4d6c2a7ae159482f5af9c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 16:45:03 +0100 Subject: [PATCH 02/51] Apply suggestions from code review Co-authored-by: Mikael Simberg --- .cscs-ci/container/deps.Containerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 73c225c1..a07e4797 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -1,7 +1,7 @@ FROM ghcr.io/eth-cscs/alps-images:py26.01-alps3-base -ARG SPACK_SHA=develop -ARG SPACK_PACKAGES_SHA=main +ARG SPACK_SHA=v1.1.1 +ARG SPACK_PACKAGES_SHA=bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 ARG SPACK_ENV_FILE ENV DEBIAN_FRONTEND=noninteractive From 8cf50769349d3f432e602a44f2c48a94e56d769d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 16:45:54 +0100 Subject: [PATCH 03/51] Apply suggestion from @msimberg --- .cscs-ci/container/deps.Containerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index a07e4797..32920e3d 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -19,6 +19,7 @@ RUN spack repo remove --scope defaults:base builtin && \ COPY $SPACK_ENV_FILE /spack_environment/spack.yaml -RUN spack env create ci /spack_environment/spack.yaml && \ +RUN spack external find --all && \ + spack env create ci /spack_environment/spack.yaml && \ spack -e ci concretize -f && 
\ spack -e ci install --jobs $(nproc) --fail-fast --only=dependencies From 5423771993a2b6bec66453337d9ba61a9bd3e0a1 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 21:59:35 +0100 Subject: [PATCH 04/51] Fix CI container build args --- .cscs-ci/container/deps.Containerfile | 12 +++++------- .cscs-ci/default.yaml | 7 +++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 32920e3d..bcba848f 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -1,22 +1,20 @@ -FROM ghcr.io/eth-cscs/alps-images:py26.01-alps3-base - -ARG SPACK_SHA=v1.1.1 -ARG SPACK_PACKAGES_SHA=bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 -ARG SPACK_ENV_FILE - -ENV DEBIAN_FRONTEND=noninteractive +ARG BASE_IMAGE +FROM BASE_IMAGE +ARG SPACK_SHA RUN mkdir -p /opt/spack && \ curl -Ls "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack ENV PATH="/opt/spack/bin:$PATH" +ARG SPACK_PACKAGES_SHA RUN mkdir -p /opt/spack-packages && \ curl -Ls "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages RUN spack repo remove --scope defaults:base builtin && \ spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin +ARG SPACK_ENV_FILE COPY $SPACK_ENV_FILE /spack_environment/spack.yaml RUN spack external find --all && \ diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 959ba3d4..cc1818b4 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -7,10 +7,9 @@ stages: - test variables: - # The base image is the py26.01 alps3 image from docs.cscs.ch - BASE_IMAGE: ghcr.io/eth-cscs/alps-images:py26.01-alps3-base - SPACK_SHA: develop - SPACK_PACKAGES_SHA: main + BASE_IMAGE: jfrog.svc.ccs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 + SPACK_SHA: v1.1.1 + 
SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 FF_TIMESTAMPS: true .build_deps_template: From 3f15f1f207e5a5c4c73a80f267255c9fd88e54ab Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:01:56 +0100 Subject: [PATCH 05/51] Specify oomph@main in spack environments --- .cscs-ci/spack/libfabric.yaml | 2 +- .cscs-ci/spack/mpi.yaml | 2 +- .cscs-ci/spack/nccl.yaml | 2 +- .cscs-ci/spack/ucx.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml index f659f278..27fdfb08 100644 --- a/.cscs-ci/spack/libfabric.yaml +++ b/.cscs-ci/spack/libfabric.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=libfabric +cuda +python + - oomph@main backend=libfabric +cuda +python view: false concretizer: unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml index 696d894d..90e45ff8 100644 --- a/.cscs-ci/spack/mpi.yaml +++ b/.cscs-ci/spack/mpi.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=mpi +cuda +python + - oomph@main backend=mpi +cuda +python view: false concretizer: unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml index 2dc59834..4c08a383 100644 --- a/.cscs-ci/spack/nccl.yaml +++ b/.cscs-ci/spack/nccl.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=nccl +cuda +python + - oomph@main backend=nccl +cuda +python view: false concretizer: unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml index 76100e29..251a4ec9 100644 --- a/.cscs-ci/spack/ucx.yaml +++ b/.cscs-ci/spack/ucx.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=ucx +cuda +python + - oomph@main backend=ucx +cuda +python view: false concretizer: unify: true From b832fe622e928f849e6a132b2ac191880eac4acb Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:02:26 +0100 Subject: [PATCH 06/51] Remove +python from spack specs --- .cscs-ci/spack/libfabric.yaml | 2 +- .cscs-ci/spack/mpi.yaml | 2 +- 
.cscs-ci/spack/nccl.yaml | 2 +- .cscs-ci/spack/ucx.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml index 27fdfb08..fac7f88f 100644 --- a/.cscs-ci/spack/libfabric.yaml +++ b/.cscs-ci/spack/libfabric.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=libfabric +cuda +python + - oomph@main backend=libfabric +cuda view: false concretizer: unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml index 90e45ff8..d59aab13 100644 --- a/.cscs-ci/spack/mpi.yaml +++ b/.cscs-ci/spack/mpi.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=mpi +cuda +python + - oomph@main backend=mpi +cuda view: false concretizer: unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml index 4c08a383..94f0dd31 100644 --- a/.cscs-ci/spack/nccl.yaml +++ b/.cscs-ci/spack/nccl.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=nccl +cuda +python + - oomph@main backend=nccl +cuda view: false concretizer: unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml index 251a4ec9..51377dd8 100644 --- a/.cscs-ci/spack/ucx.yaml +++ b/.cscs-ci/spack/ucx.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=ucx +cuda +python + - oomph@main backend=ucx +cuda view: false concretizer: unify: true From 1e851df257cb97ba664ec392f0c472bcb9c3a615 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:10:23 +0100 Subject: [PATCH 07/51] Remove stages --- .cscs-ci/default.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index cc1818b4..c3e80df3 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -1,11 +1,6 @@ include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' -stages: - - build_deps - - build - - test - variables: BASE_IMAGE: jfrog.svc.ccs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 SPACK_SHA: v1.1.1 
@@ -13,7 +8,6 @@ variables: FF_TIMESTAMPS: true .build_deps_template: - stage: build_deps timeout: 1 hours before_script: - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true @@ -63,7 +57,6 @@ build_deps_libfabric: SPACK_ENV_FILE: .cscs-ci/spack/libfabric.yaml .build_template: - stage: build extends: .container-builder-cscs-gh200 timeout: 1 hours before_script: @@ -110,7 +103,6 @@ build_libfabric: BACKEND: libfabric .test_serial_template: - stage: test extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 @@ -121,7 +113,6 @@ build_libfabric: - ctest --test-dir build -L "serial" --output-on-failure .test_parallel_template: - stage: test extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 From 0398d59869d80c6a002ceb37497820bc384b50a9 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:13:40 +0100 Subject: [PATCH 08/51] Refactor ci config --- .cscs-ci/default.yaml | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c3e80df3..2a0bc830 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -19,42 +19,39 @@ variables: - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env variables: DOCKERFILE: .cscs-ci/container/deps.Containerfile - DOCKER_BUILD_ARGS: '["SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + SPACK_ENV_FILE: .cscs./spack/$BACKEND.yaml artifacts: reports: dotenv: base-${BACKEND}.env build_deps_nccl: + variables: + BACKEND: nccl extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: nccl - SPACK_ENV_FILE: .cscs-ci/spack/nccl.yaml build_deps_mpi: + variables: + BACKEND: mpi extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: mpi - SPACK_ENV_FILE: 
.cscs-ci/spack/mpi.yaml build_deps_ucx: + variables: + BACKEND: ucx extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: ucx - SPACK_ENV_FILE: .cscs-ci/spack/ucx.yaml build_deps_libfabric: + variables: + BACKEND: libfabric extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: libfabric - SPACK_ENV_FILE: .cscs-ci/spack/libfabric.yaml .build_template: extends: .container-builder-cscs-gh200 @@ -71,36 +68,36 @@ build_deps_libfabric: dotenv: build-${BACKEND}.env build_nccl: + variables: + BACKEND: nccl extends: .build_template needs: - job: build_deps_nccl artifacts: true - variables: - BACKEND: nccl build_mpi: + variables: + BACKEND: mpi extends: .build_template needs: - job: build_deps_mpi artifacts: true - variables: - BACKEND: mpi build_ucx: + variables: + BACKEND: ucx extends: .build_template needs: - job: build_deps_ucx artifacts: true - variables: - BACKEND: ucx build_libfabric: + variables: + BACKEND: libfabric extends: .build_template needs: - job: build_deps_libfabric artifacts: true - variables: - BACKEND: libfabric .test_serial_template: extends: .container-runner-clariden-gh200 From b1297aca28222bdc614d04851e4d5d3f6bd129e8 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:13:10 +0100 Subject: [PATCH 09/51] Fix base image --- .cscs-ci/container/deps.Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index bcba848f..50570529 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -1,5 +1,5 @@ ARG BASE_IMAGE -FROM BASE_IMAGE +FROM $BASE_IMAGE ARG SPACK_SHA RUN mkdir -p /opt/spack && \ From 34847222c671735ee2ec057d9fad716162bb866b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:26:27 +0100 Subject: [PATCH 10/51] Fix typo --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 2a0bc830..2f3e3db2 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -2,7 +2,7 @@ include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' variables: - BASE_IMAGE: jfrog.svc.ccs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 + BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 SPACK_SHA: v1.1.1 SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 FF_TIMESTAMPS: true From 33624ef1261b5d0ef305f77e015d6ac6d2f68212 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:31:11 +0100 Subject: [PATCH 11/51] Fix env file path --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 2f3e3db2..14baa148 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -20,7 +20,7 @@ variables: variables: DOCKERFILE: .cscs-ci/container/deps.Containerfile DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' - SPACK_ENV_FILE: .cscs./spack/$BACKEND.yaml + SPACK_ENV_FILE: .cscs-ci/spack/$BACKEND.yaml artifacts: reports: dotenv: base-${BACKEND}.env From a3e950dfbf5e5cc4340cb1af96ee3809778f4801 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:45:54 +0100 Subject: [PATCH 12/51] Update cmake config in CI --- .cscs-ci/container/build.Containerfile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index 784221cb..1f010a87 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -4,5 +4,16 @@ FROM $DEPS_IMAGE COPY . 
/oomph WORKDIR /oomph -RUN spack -e ci build-env oomph -- cmake -B build -DOOMPH_WITH_TESTING=ON -DMPIEXEC_EXECUTABLE="" -DMPIEXEC_NUMPROC_FLAG="" -DMPIEXEC_PREFLAGS="" -DMPIEXEC_POSTFLAGS="" && \ +ARG BACKEND +RUN spack -e ci build-env oomph -- \ + cmake -G Ninja -B build \ + -DOOMPH_WITH_TESTING=ON \ + # Converte BACKEND to uppercase + -DOOMPH_WITH_$(echo $BACKEND | tr '[:lower:]' '[:upper:]')=ON \ + -DOOMPH_USE_BUNDLED_LIBS=ON \ + -DOOMPH_USE_BUNDLED_HWMALLOC=OFF \ + -DMPIEXEC_EXECUTABLE="" \ + -DMPIEXEC_NUMPROC_FLAG="" \ + -DMPIEXEC_PREFLAGS="" \ + -DMPIEXEC_POSTFLAGS="" && \ spack -e ci build-env oomph -- cmake --build build -j$(nproc) From 9871e880437958381c6d0280aeee276f66f5a760 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 10:18:52 +0100 Subject: [PATCH 13/51] Use NUM_PROCS instead of nproc --- .cscs-ci/container/build.Containerfile | 3 ++- .cscs-ci/container/deps.Containerfile | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index 1f010a87..c16ce28d 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -5,6 +5,7 @@ COPY . 
/oomph WORKDIR /oomph ARG BACKEND +ARG NUM_PROCS RUN spack -e ci build-env oomph -- \ cmake -G Ninja -B build \ -DOOMPH_WITH_TESTING=ON \ @@ -16,4 +17,4 @@ RUN spack -e ci build-env oomph -- \ -DMPIEXEC_NUMPROC_FLAG="" \ -DMPIEXEC_PREFLAGS="" \ -DMPIEXEC_POSTFLAGS="" && \ - spack -e ci build-env oomph -- cmake --build build -j$(nproc) + spack -e ci build-env oomph -- cmake --build build $NUM_PROCS diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 50570529..5fc530bd 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -17,7 +17,8 @@ RUN spack repo remove --scope defaults:base builtin && \ ARG SPACK_ENV_FILE COPY $SPACK_ENV_FILE /spack_environment/spack.yaml +ARG NUM_PROCS RUN spack external find --all && \ spack env create ci /spack_environment/spack.yaml && \ spack -e ci concretize -f && \ - spack -e ci install --jobs $(nproc) --fail-fast --only=dependencies + spack -e ci install --jobs $NUM_PROCS --fail-fast --only=dependencies From 177592ddd10994449c066a517852f1a94295798c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 10:47:21 +0100 Subject: [PATCH 14/51] Fix num procs --- .cscs-ci/container/build.Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index c16ce28d..66a8ae69 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -17,4 +17,4 @@ RUN spack -e ci build-env oomph -- \ -DMPIEXEC_NUMPROC_FLAG="" \ -DMPIEXEC_PREFLAGS="" \ -DMPIEXEC_POSTFLAGS="" && \ - spack -e ci build-env oomph -- cmake --build build $NUM_PROCS + spack -e ci build-env oomph -- cmake --build build -j$NUM_PROCS From 110d4eb869544c8d1e2efda08083a8f80443fe47 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 13:01:55 +0100 Subject: [PATCH 15/51] Update test job config --- .cscs-ci/default.yaml | 6 ++++-- 1 file 
changed, 4 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 14baa148..65467789 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -106,8 +106,9 @@ build_libfabric: SLURM_NTASKS: 1 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal + SLURM_NETWORK=disable_rdzv_get script: - - ctest --test-dir build -L "serial" --output-on-failure + - ctest --test-dir /oomph/build -L "serial" --output-on-failure .test_parallel_template: extends: .container-runner-clariden-gh200 @@ -117,9 +118,10 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_MPI: pmix + SLURM_NETWORK=disable_rdzv_get MPICH_GPU_SUPPORT_ENABLED: 1 script: - - srun -n 4 ctest --test-dir build -L "parallel-ranks-4" --output-on-failure + - srun -n 4 ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure test_serial_nccl: extends: .test_serial_template From 95183544bfbe219d59d1e1eb0b7337c5e7acfd8b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 13:04:31 +0100 Subject: [PATCH 16/51] Fix syntax --- .cscs-ci/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 65467789..3b649519 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -106,7 +106,7 @@ build_libfabric: SLURM_NTASKS: 1 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal - SLURM_NETWORK=disable_rdzv_get + SLURM_NETWORK: disable_rdzv_get script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -118,7 +118,7 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_MPI: pmix - SLURM_NETWORK=disable_rdzv_get + SLURM_NETWORK: disable_rdzv_get MPICH_GPU_SUPPORT_ENABLED: 1 script: - srun -n 4 ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure From 5419c5be656be431f5658ebd3072bf0990869cd2 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 13:39:34 +0100 Subject: [PATCH 17/51] 
Fix parallel testing --- .cscs-ci/default.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 3b649519..c09ce896 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -119,9 +119,10 @@ build_libfabric: SLURM_PARTITION: normal SLURM_MPI: pmix SLURM_NETWORK: disable_rdzv_get + SLURM_LABELIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - - srun -n 4 ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure test_serial_nccl: extends: .test_serial_template From ffd02a45ce35f0da1ba76d38001415fee5b070b2 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:22:49 +0100 Subject: [PATCH 18/51] Explicitly ask for one gpu per task --- .cscs-ci/default.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c09ce896..55780273 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -114,6 +114,7 @@ build_libfabric: extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 + SLURM_GPUS_PER_TASK: 1 SLURM_NTASKS: 4 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal From a81f37f4b47d0f5e3e338c1c5c5e7417aad838e4 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:22:58 +0100 Subject: [PATCH 19/51] Verbose ctest output --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 55780273..bcab0e30 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -123,7 +123,7 @@ build_libfabric: SLURM_LABELIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose test_serial_nccl: extends: .test_serial_template From 7b35569ffdc33c346d4ea0578dd8689990babb21 Mon Sep 17 00:00:00 2001 From: Mikael 
Simberg Date: Thu, 26 Mar 2026 14:48:05 +0100 Subject: [PATCH 20/51] Explicitly set debug build for CI --- .cscs-ci/container/build.Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index 66a8ae69..fe3e707f 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -8,8 +8,8 @@ ARG BACKEND ARG NUM_PROCS RUN spack -e ci build-env oomph -- \ cmake -G Ninja -B build \ + -DCMAKE_BUILD_TYPE=Debug \ -DOOMPH_WITH_TESTING=ON \ - # Converte BACKEND to uppercase -DOOMPH_WITH_$(echo $BACKEND | tr '[:lower:]' '[:upper:]')=ON \ -DOOMPH_USE_BUNDLED_LIBS=ON \ -DOOMPH_USE_BUNDLED_HWMALLOC=OFF \ From 60e0e25cf9f1c3b3df23fb097184bc5c3af3a67f Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:50:50 +0100 Subject: [PATCH 21/51] Don't set any mpiexec options if MPIEXEC_EXECUTABLE is empty --- test/CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 31fea066..e645a636 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -62,10 +62,15 @@ function(reg_parallel_test t_ lib n) oomph_target_compile_options(${t}) target_link_libraries(${t} PRIVATE gtest_main_mpi) target_link_libraries(${t} PRIVATE oomph_${lib}) - add_test( - NAME ${t} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} - $ ${MPIEXEC_POSTFLAGS}) + # If not empty + if("${MPIEXEC_EXECUTABLE}" STREQUAL "") + add_test(NAME ${t} COMMAND $) + else() + add_test( + NAME ${t} + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} + $ ${MPIEXEC_POSTFLAGS}) + endif() set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE LABELS "parallel-ranks-${n}") endfunction() From c3ea5689c8a4e1c3ea7ce06d0aea3a8500749e01 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:37:53 +0100 Subject: [PATCH 22/51] 
Don't buffer test output --- .cscs-ci/default.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index bcab0e30..3b93bf25 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -107,6 +107,7 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get + SLURM_UNBUFFERED: 1 script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -121,6 +122,7 @@ build_libfabric: SLURM_MPI: pmix SLURM_NETWORK: disable_rdzv_get SLURM_LABELIO: 1 + SLURM_UNBUFFERED: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose From 521011e320f203e8cfc5427f24d00360fefa9ca3 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:38:18 +0100 Subject: [PATCH 23/51] Skip cancel test --- .cscs-ci/default.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 3b93bf25..7110ce50 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -125,7 +125,8 @@ build_libfabric: SLURM_UNBUFFERED: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose + # TODO: test_cancel hanging? 
+ - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose -E test_cancel test_serial_nccl: extends: .test_serial_template From abb418899d225730b6394df8c9fe3a9327f26466 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:46:42 +0100 Subject: [PATCH 24/51] Fix slurm variables --- .cscs-ci/default.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 7110ce50..9f2b60d7 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -107,7 +107,7 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get - SLURM_UNBUFFERED: 1 + SLURM_UNBUFFEREDIO: 1 script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -119,10 +119,10 @@ build_libfabric: SLURM_NTASKS: 4 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal - SLURM_MPI: pmix + SLURM_MPI_TYPE: pmix SLURM_NETWORK: disable_rdzv_get SLURM_LABELIO: 1 - SLURM_UNBUFFERED: 1 + SLURM_UNBUFFEREDIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: # TODO: test_cancel hanging? 
From 3689ebee5882b7e08f81c51bc508245ee61f72ad Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:49:39 +0100 Subject: [PATCH 25/51] Shorten timeouts --- .cscs-ci/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 9f2b60d7..2d831960 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -55,7 +55,7 @@ build_deps_libfabric: .build_template: extends: .container-builder-cscs-gh200 - timeout: 1 hours + timeout: 15 minutes before_script: - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-build-$BACKEND:$CI_COMMIT_SHA @@ -117,7 +117,7 @@ build_libfabric: SLURM_JOB_NUM_NODES: 1 SLURM_GPUS_PER_TASK: 1 SLURM_NTASKS: 4 - SLURM_TIMELIMIT: '00:15:00' + SLURM_TIMELIMIT: '5:00' SLURM_PARTITION: normal SLURM_MPI_TYPE: pmix SLURM_NETWORK: disable_rdzv_get From 841d97bc758a1fe532b03700f17ae118a66ae55b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:56:37 +0100 Subject: [PATCH 26/51] Don't load cxi hooks in CI --- .cscs-ci/default.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 2d831960..0f48134c 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -108,6 +108,7 @@ build_libfabric: SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get SLURM_UNBUFFEREDIO: 1 + USE_MPI: NO script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -124,6 +125,7 @@ build_libfabric: SLURM_LABELIO: 1 SLURM_UNBUFFEREDIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 + USE_MPI: NO script: # TODO: test_cancel hanging? 
- ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose -E test_cancel From d7995af0c2fe34da458e3192434050ed06a79464 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 17:05:17 +0100 Subject: [PATCH 27/51] Update slurm and ctest options --- .cscs-ci/default.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 0f48134c..98ccacff 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -108,9 +108,11 @@ build_libfabric: SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get SLURM_UNBUFFEREDIO: 1 + PMIX_MCA_psec: native + PMIX_MCA_gds: "^shmem2" USE_MPI: NO script: - - ctest --test-dir /oomph/build -L "serial" --output-on-failure + - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 .test_parallel_template: extends: .container-runner-clariden-gh200 @@ -125,10 +127,12 @@ build_libfabric: SLURM_LABELIO: 1 SLURM_UNBUFFEREDIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 + PMIX_MCA_psec: native + PMIX_MCA_gds: "^shmem2" USE_MPI: NO script: # TODO: test_cancel hanging? - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose -E test_cancel + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose --timeout 60 test_serial_nccl: extends: .test_serial_template From c91ae1cfa92c42ef332d867a31b9e9796b79dd99 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 9 Apr 2026 15:03:20 +0200 Subject: [PATCH 28/51] List libfabric and ucx info in CI --- .cscs-ci/default.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 98ccacff..06165d12 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -132,6 +132,8 @@ build_libfabric: USE_MPI: NO script: # TODO: test_cancel hanging? 
+ - fi_info + - ucx_info -d - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose --timeout 60 test_serial_nccl: From b74e96d5742d1628e325679a1f087ae4e9795dd3 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 9 Apr 2026 15:14:47 +0200 Subject: [PATCH 29/51] Clean up test templates --- .cscs-ci/default.yaml | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 06165d12..c8dccdc7 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -99,41 +99,37 @@ build_libfabric: - job: build_deps_libfabric artifacts: true -.test_serial_template: +.test_template_base: extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 - SLURM_NTASKS: 1 - SLURM_TIMELIMIT: '00:15:00' + SLURM_GPUS_PER_TASK: 1 + SLURM_TIMELIMIT: '5:00' SLURM_PARTITION: normal + SLURM_MPI_TYPE: pmix SLURM_NETWORK: disable_rdzv_get + SLURM_LABELIO: 1 SLURM_UNBUFFEREDIO: 1 PMIX_MCA_psec: native PMIX_MCA_gds: "^shmem2" USE_MPI: NO + +.test_serial_template: + extends: + - .container-runner-clariden-gh200 + - .test_template_base + variables: + SLURM_NTASKS: 1 script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 .test_parallel_template: - extends: .container-runner-clariden-gh200 + extends: + - .container-runner-clariden-gh200 + - .test_template_base variables: - SLURM_JOB_NUM_NODES: 1 - SLURM_GPUS_PER_TASK: 1 SLURM_NTASKS: 4 - SLURM_TIMELIMIT: '5:00' - SLURM_PARTITION: normal - SLURM_MPI_TYPE: pmix - SLURM_NETWORK: disable_rdzv_get - SLURM_LABELIO: 1 - SLURM_UNBUFFEREDIO: 1 - MPICH_GPU_SUPPORT_ENABLED: 1 - PMIX_MCA_psec: native - PMIX_MCA_gds: "^shmem2" - USE_MPI: NO script: - # TODO: test_cancel hanging? 
- - fi_info - - ucx_info -d - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose --timeout 60 test_serial_nccl: From 80ce06cec9e6ede1d88db5c60afbf66566696065 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 9 Apr 2026 15:31:59 +0200 Subject: [PATCH 30/51] Disable NCCL CI pipelines since it's not yet supported --- .cscs-ci/default.yaml | 55 +++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c8dccdc7..321b1b7c 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -25,12 +25,13 @@ variables: reports: dotenv: base-${BACKEND}.env -build_deps_nccl: - variables: - BACKEND: nccl - extends: - - .container-builder-cscs-gh200 - - .build_deps_template +# TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 +# build_deps_nccl: +# variables: +# BACKEND: nccl +# extends: +# - .container-builder-cscs-gh200 +# - .build_deps_template build_deps_mpi: variables: @@ -67,13 +68,14 @@ build_deps_libfabric: reports: dotenv: build-${BACKEND}.env -build_nccl: - variables: - BACKEND: nccl - extends: .build_template - needs: - - job: build_deps_nccl - artifacts: true +# TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 +# build_nccl: +# variables: +# BACKEND: nccl +# extends: .build_template +# needs: +# - job: build_deps_nccl +# artifacts: true build_mpi: variables: @@ -132,19 +134,20 @@ build_libfabric: script: - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose --timeout 60 -test_serial_nccl: - extends: .test_serial_template - needs: - - job: build_nccl - artifacts: true - image: $BUILD_IMAGE - -test_parallel_nccl: - extends: .test_parallel_template - needs: - - job: build_nccl - artifacts: true - image: $BUILD_IMAGE +# TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 +# test_serial_nccl: +# extends: .test_serial_template +# needs: +# - job: build_nccl +# artifacts: 
true +# image: $BUILD_IMAGE + +# test_parallel_nccl: +# extends: .test_parallel_template +# needs: +# - job: build_nccl +# artifacts: true +# image: $BUILD_IMAGE test_serial_mpi: extends: .test_serial_template From 3fed3780f1d5d746f229a969299897ba421e503d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 9 Apr 2026 16:49:07 +0200 Subject: [PATCH 31/51] Small cleanup and parallel non-distributed tests in CI --- .cscs-ci/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 321b1b7c..1212d17e 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -123,7 +123,7 @@ build_libfabric: variables: SLURM_NTASKS: 1 script: - - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 + - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 .test_parallel_template: extends: @@ -132,7 +132,7 @@ build_libfabric: variables: SLURM_NTASKS: 4 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose --timeout 60 + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: From 38112f2696fca1abe6673cb2004741e36a1f33ba Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 08:02:57 +0200 Subject: [PATCH 32/51] strace ctest call --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 1212d17e..5dc3eea4 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -132,7 +132,7 @@ build_libfabric: variables: SLURM_NTASKS: 4 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 + - strace ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 # 
test_serial_nccl: From f1e54aadb20ca737e5d070c72fc7ebc54717a2ef Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 09:05:52 +0200 Subject: [PATCH 33/51] Verbose CI tests --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 5dc3eea4..372ccb3c 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -132,7 +132,7 @@ build_libfabric: variables: SLURM_NTASKS: 4 script: - - strace ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 + - strace ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 --verbose # TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: From faf58b8df99b53bffeeb089cb9be534c56dbc504 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 16:26:39 +0200 Subject: [PATCH 34/51] Remove verbose parallel tests in CI --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 372ccb3c..1212d17e 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -132,7 +132,7 @@ build_libfabric: variables: SLURM_NTASKS: 4 script: - - strace ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 --verbose + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: From e2e7d9c4357ed44a2c8509391a845a96befede82 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 16:45:04 +0200 Subject: [PATCH 35/51] Fix fortran parallel tests when MPIEXEC_EXECUTABLE is empty --- test/CMakeLists.txt | 1 - test/bindings/fortran/CMakeLists.txt | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e645a636..cb4e6f0e 100644 
--- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -62,7 +62,6 @@ function(reg_parallel_test t_ lib n) oomph_target_compile_options(${t}) target_link_libraries(${t} PRIVATE gtest_main_mpi) target_link_libraries(${t} PRIVATE oomph_${lib}) - # If not empty if("${MPIEXEC_EXECUTABLE}" STREQUAL "") add_test(NAME ${t} COMMAND $<TARGET_FILE:${t}>) else() diff --git a/test/bindings/fortran/CMakeLists.txt b/test/bindings/fortran/CMakeLists.txt index 10e69e15..2a5980c5 100644 --- a/test/bindings/fortran/CMakeLists.txt +++ b/test/bindings/fortran/CMakeLists.txt @@ -25,10 +25,14 @@ function(reg_parallel_test_f t_ lib n nthr) $ $ $) - add_test( - NAME ${t} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} - $<TARGET_FILE:${t}> ${MPIEXEC_POSTFLAGS}) + if("${MPIEXEC_EXECUTABLE}" STREQUAL "") + add_test(NAME ${t} COMMAND $<TARGET_FILE:${t}>) + else() + add_test( + NAME ${t} + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} + $<TARGET_FILE:${t}> ${MPIEXEC_POSTFLAGS}) + endif() set_tests_properties(${t} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=${nthr} LABELS "parallel-ranks-${n}") From d83dbaaea959940a822c7cafcc9c61564174471d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 16:51:06 +0200 Subject: [PATCH 36/51] Add missing BACKEND build arg to build step in CI --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 1212d17e..f4dc498f 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -63,7 +63,7 @@ build_deps_libfabric: - echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env variables: DOCKERFILE: .cscs-ci/container/build.Containerfile - DOCKER_BUILD_ARGS: '["DEPS_IMAGE"]' + DOCKER_BUILD_ARGS: '["DEPS_IMAGE", "BACKEND"]' artifacts: reports: dotenv: build-${BACKEND}.env From a21ac784602e07b4736c95deb783120a24c11b3c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 20:01:16 +0200 Subject: [PATCH 37/51] Singular hour --- .cscs-ci/default.yaml
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index f4dc498f..f5fb0a64 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -8,7 +8,7 @@ variables: FF_TIMESTAMPS: true .build_deps_template: - timeout: 1 hours + timeout: 1 hour before_script: - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true - export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16` From d2ae1f0db7419ddfcf010c9295f85e2cbd9d77bb Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 20:01:57 +0200 Subject: [PATCH 38/51] Double word --- .cscs-ci/default.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index f5fb0a64..be8be743 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -25,7 +25,7 @@ variables: reports: dotenv: base-${BACKEND}.env -# TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 +# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 # build_deps_nccl: # variables: # BACKEND: nccl @@ -68,7 +68,7 @@ build_deps_libfabric: reports: dotenv: build-${BACKEND}.env -# TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 +# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 # build_nccl: # variables: # BACKEND: nccl @@ -134,7 +134,7 @@ build_libfabric: script: - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 -# TODO: NCCL will be be enabled in https://github.com/ghex-org/oomph/pull/55 +# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: # extends: .test_serial_template # needs: From bdd9374414373cc7b4a814c184315af113e3058a Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 20:02:44 +0200 Subject: [PATCH 39/51] Remove unnecessary extends --- .cscs-ci/default.yaml | 8 
++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index be8be743..6c7f56c7 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -117,18 +117,14 @@ build_libfabric: USE_MPI: NO .test_serial_template: - extends: - - .container-runner-clariden-gh200 - - .test_template_base + extends: .test_template_base variables: SLURM_NTASKS: 1 script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 .test_parallel_template: - extends: - - .container-runner-clariden-gh200 - - .test_template_base + extends: .test_template_base variables: SLURM_NTASKS: 4 script: From d89bf20c0e5bfcbd71222046b0b03f5b212687f4 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 10 Apr 2026 20:38:07 +0200 Subject: [PATCH 40/51] More curl flags --- .cscs-ci/container/deps.Containerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 5fc530bd..f5867ac5 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -3,13 +3,13 @@ FROM $BASE_IMAGE ARG SPACK_SHA RUN mkdir -p /opt/spack && \ - curl -Ls "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack + curl -fLsS "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack ENV PATH="/opt/spack/bin:$PATH" ARG SPACK_PACKAGES_SHA RUN mkdir -p /opt/spack-packages && \ - curl -Ls "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages + curl -fLsS "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages RUN spack repo remove --scope defaults:base builtin && \ spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin From 
45d69924fd643a4db017e9bb8ec2f44b748fb6f7 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 09:33:12 +0200 Subject: [PATCH 41/51] Use prerelease base image in CI --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 6c7f56c7..e14c5e47 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -2,7 +2,7 @@ include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' variables: - BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 + BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps4-dev SPACK_SHA: v1.1.1 SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 FF_TIMESTAMPS: true From 841b9bfc27a9daaf7b8f0a5e3bd809440f9f06c6 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 16:06:28 +0200 Subject: [PATCH 42/51] Use separate Testing directory for ctest per process --- .cscs-ci/default.yaml | 2 +- .cscs-ci/scripts/ctest-bind-testing-dir.sh | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100755 .cscs-ci/scripts/ctest-bind-testing-dir.sh diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index e14c5e47..9a9eeaa4 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -121,7 +121,7 @@ build_libfabric: variables: SLURM_NTASKS: 1 script: - - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 + - /oomph/.cscs-ci/scripts/ctest-bind-testing-dir.sh ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 .test_parallel_template: extends: .test_template_base diff --git a/.cscs-ci/scripts/ctest-bind-testing-dir.sh b/.cscs-ci/scripts/ctest-bind-testing-dir.sh new file mode 100755 index 00000000..04462b75 --- /dev/null +++ b/.cscs-ci/scripts/ctest-bind-testing-dir.sh @@ -0,0 +1,9 @@ +#/usr/bin/env bash 
+# +# Helper script to mount a separate directory for the Testing/Temporary +# directory for each process when running ctest within slurm. + +set -x +unshare --mount --map-root-user \ + bash -c \ + "mount --bind /tmp/Testing/Temporary-${SLURM_PROCID} $PWD/Testing/Temporary && exec \"$@\"" From c80510f79ebe3e8dac58a1596e0f4ae3a2d23e76 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 16:19:05 +0200 Subject: [PATCH 43/51] Try to fix ctest wrapper --- .cscs-ci/default.yaml | 4 ++-- .cscs-ci/scripts/ctest-bind-testing-dir.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 9a9eeaa4..fdd27dcd 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -121,14 +121,14 @@ build_libfabric: variables: SLURM_NTASKS: 1 script: - - /oomph/.cscs-ci/scripts/ctest-bind-testing-dir.sh ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 + - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8 .test_parallel_template: extends: .test_template_base variables: SLURM_NTASKS: 4 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 + - /oomph/.cscs-ci/scripts/ctest-bind-testing-dir.sh ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: diff --git a/.cscs-ci/scripts/ctest-bind-testing-dir.sh b/.cscs-ci/scripts/ctest-bind-testing-dir.sh index 04462b75..12ce5f7d 100755 --- a/.cscs-ci/scripts/ctest-bind-testing-dir.sh +++ b/.cscs-ci/scripts/ctest-bind-testing-dir.sh @@ -6,4 +6,4 @@ set -x unshare --mount --map-root-user \ bash -c \ - "mount --bind /tmp/Testing/Temporary-${SLURM_PROCID} $PWD/Testing/Temporary && exec \"$@\"" + "mount --bind /tmp/Testing/Temporary-${SLURM_PROCID} $PWD/Testing/Temporary && $@" From 5fe5a13b83b5d9ff1c1d2bc13d1f0c49f0c87895 Mon Sep 
17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 16:42:48 +0200 Subject: [PATCH 44/51] cd into build directory --- .cscs-ci/default.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index fdd27dcd..f2a095b8 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -128,7 +128,8 @@ build_libfabric: variables: SLURM_NTASKS: 4 script: - - /oomph/.cscs-ci/scripts/ctest-bind-testing-dir.sh ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 + - cd /oomph/build + - /oomph/.cscs-ci/scripts/ctest-bind-testing-dir.sh ctest -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: From e55f1ad2244718cd44b3d9db98bc2020b3ad97c7 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 17:07:18 +0200 Subject: [PATCH 45/51] Fix testing path --- .cscs-ci/scripts/ctest-bind-testing-dir.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.cscs-ci/scripts/ctest-bind-testing-dir.sh b/.cscs-ci/scripts/ctest-bind-testing-dir.sh index 12ce5f7d..10b9712f 100755 --- a/.cscs-ci/scripts/ctest-bind-testing-dir.sh +++ b/.cscs-ci/scripts/ctest-bind-testing-dir.sh @@ -4,6 +4,7 @@ # directory for each process when running ctest within slurm. 
set -x +mkdir -p "/tmp/Testing/Temporary-${SLURM_PROCID}" unshare --mount --map-root-user \ bash -c \ "mount --bind /tmp/Testing/Temporary-${SLURM_PROCID} $PWD/Testing/Temporary && $@" From 2ce53ed4a58c70d87183857619dfe7386737bb55 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 17:20:43 +0200 Subject: [PATCH 46/51] Try without --map-root-user --- .cscs-ci/scripts/ctest-bind-testing-dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/scripts/ctest-bind-testing-dir.sh b/.cscs-ci/scripts/ctest-bind-testing-dir.sh index 10b9712f..bd310aed 100755 --- a/.cscs-ci/scripts/ctest-bind-testing-dir.sh +++ b/.cscs-ci/scripts/ctest-bind-testing-dir.sh @@ -5,6 +5,6 @@ set -x mkdir -p "/tmp/Testing/Temporary-${SLURM_PROCID}" -unshare --mount --map-root-user \ +unshare --mount \ bash -c \ "mount --bind /tmp/Testing/Temporary-${SLURM_PROCID} $PWD/Testing/Temporary && $@" From 9c711025ed12d9614c8e0a16d1fef086b6dbf820 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 19:14:58 +0200 Subject: [PATCH 47/51] Try something else for ctest deadlocks --- .cscs-ci/default.yaml | 6 ++++-- .cscs-ci/scripts/ctest-bind-testing-dir.sh | 10 ---------- 2 files changed, 4 insertions(+), 12 deletions(-) delete mode 100755 .cscs-ci/scripts/ctest-bind-testing-dir.sh diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index f2a095b8..c0a53a11 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -128,8 +128,10 @@ build_libfabric: variables: SLURM_NTASKS: 4 script: - - cd /oomph/build - - /oomph/.cscs-ci/scripts/ctest-bind-testing-dir.sh ctest -L "parallel-ranks-4" --output-on-failure --timeout 60 + # All ranks write to ctest files in Testing, but this can deadlock when + # writing inside the container. 
+ - if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; done + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 # test_serial_nccl: diff --git a/.cscs-ci/scripts/ctest-bind-testing-dir.sh b/.cscs-ci/scripts/ctest-bind-testing-dir.sh deleted file mode 100755 index bd310aed..00000000 --- a/.cscs-ci/scripts/ctest-bind-testing-dir.sh +++ /dev/null @@ -1,10 +0,0 @@ -#/usr/bin/env bash -# -# Helper script to mount a separate directory for the Testing/Temporary -# directory for each process when running ctest within slurm. - -set -x -mkdir -p "/tmp/Testing/Temporary-${SLURM_PROCID}" -unshare --mount \ - bash -c \ - "mount --bind /tmp/Testing/Temporary-${SLURM_PROCID} $PWD/Testing/Temporary && $@" From 5f3f6bb1287a61e36f646e4ac7efe761522e5af8 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 13 Apr 2026 19:50:36 +0200 Subject: [PATCH 48/51] Fix syntax error --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c0a53a11..05fd359b 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -130,7 +130,7 @@ build_libfabric: script: # All ranks write to ctest files in Testing, but this can deadlock when # writing inside the container. 
- - if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; done + - if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; fi - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 From e33fd3d597294bf1176d975f1a0bd95a2df712b2 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 14 Apr 2026 11:08:46 +0200 Subject: [PATCH 49/51] Add sleep just to be safe when symlinking testing directory --- .cscs-ci/default.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 05fd359b..7bdc0567 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -131,6 +131,7 @@ build_libfabric: # All ranks write to ctest files in Testing, but this can deadlock when # writing inside the container. - if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; fi + - sleep 1 - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60 # TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55 From a00c372a8e1d74b2ad197788ab47bfa506b86043 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 14 Apr 2026 13:44:19 +0200 Subject: [PATCH 50/51] Use public path for base images --- .cscs-ci/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 7bdc0567..c88a4522 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -14,7 +14,7 @@ variables: - export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16` - export ENV_FILE_SHA=`sha256sum ${SPACK_ENV_FILE} | head -c 16` - export CONFIG_TAG=`echo $DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA | 
sha256sum - | head -c 16` - - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-spack-deps-$BACKEND:$CONFIG_TAG + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-spack-deps-$BACKEND:$CONFIG_TAG - echo -e "CONFIG_TAG=$CONFIG_TAG" >> base-${BACKEND}.env - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env variables: @@ -59,7 +59,7 @@ build_deps_libfabric: timeout: 15 minutes before_script: - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true - - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-build-$BACKEND:$CI_COMMIT_SHA + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-build-$BACKEND:$CI_COMMIT_SHA - echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env variables: DOCKERFILE: .cscs-ci/container/build.Containerfile From f9b528d05ea8893ca2089cedbfbac76fc92c401c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 16 Apr 2026 20:15:20 +0200 Subject: [PATCH 51/51] Disable libfabric tests in cicd-ext --- .cscs-ci/default.yaml | 53 +++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c88a4522..0d6ba1fc 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -47,12 +47,13 @@ build_deps_ucx: - .container-builder-cscs-gh200 - .build_deps_template -build_deps_libfabric: - variables: - BACKEND: libfabric - extends: - - .container-builder-cscs-gh200 - - .build_deps_template +# TODO: Libfabric tests are currently failing on Alps and need to be fixed. 
+# build_deps_libfabric: +# variables: +# BACKEND: libfabric +# extends: +# - .container-builder-cscs-gh200 +# - .build_deps_template .build_template: extends: .container-builder-cscs-gh200 @@ -93,13 +94,14 @@ build_ucx: - job: build_deps_ucx artifacts: true -build_libfabric: - variables: - BACKEND: libfabric - extends: .build_template - needs: - - job: build_deps_libfabric - artifacts: true +# TODO: Libfabric tests are currently failing on Alps and need to be fixed. +# build_libfabric: +# variables: +# BACKEND: libfabric +# extends: .build_template +# needs: +# - job: build_deps_libfabric +# artifacts: true .test_template_base: extends: .container-runner-clariden-gh200 @@ -177,16 +179,17 @@ test_parallel_ucx: artifacts: true image: $BUILD_IMAGE -test_serial_libfabric: - extends: .test_serial_template - needs: - - job: build_libfabric - artifacts: true - image: $BUILD_IMAGE +# TODO: Libfabric tests are currently failing on Alps and need to be fixed. +# test_serial_libfabric: +# extends: .test_serial_template +# needs: +# - job: build_libfabric +# artifacts: true +# image: $BUILD_IMAGE -test_parallel_libfabric: - extends: .test_parallel_template - needs: - - job: build_libfabric - artifacts: true - image: $BUILD_IMAGE +# test_parallel_libfabric: +# extends: .test_parallel_template +# needs: +# - job: build_libfabric +# artifacts: true +# image: $BUILD_IMAGE