From 209c386c3b6e0782dd1769de171e65fbcc4b5b3a Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 8 Jan 2026 13:08:31 -0600
Subject: [PATCH 1/4] Add weekly compute-sanitizer checks for racecheck and synccheck

Add GitHub Actions workflows to run compute-sanitizer racecheck and synccheck
tools on librapidsmpf tests weekly. Refactor test_cpp_memcheck.sh into
test_cpp_sanitizer.sh which accepts tool name and test name arguments.

- compute-sanitizer-trigger.yaml: Weekly Saturday trigger for racecheck/synccheck
- compute-sanitizer-run.yaml: Reusable workflow for running sanitizer tools
- test_cpp_sanitizer.sh: Environment setup and orchestration script
- run_compute_sanitizer_test.sh: Runs compute-sanitizer on specified test

Temporary racecheck/synccheck jobs added to pr.yaml for testing.
---
 .github/workflows/compute-sanitizer-run.yaml  | 38 ++++++++++++
 .../workflows/compute-sanitizer-trigger.yaml  | 22 +++++++
 .github/workflows/pr.yaml                     | 16 ++++-
 .github/workflows/test.yaml                   |  2 +-
 ci/run_compute_sanitizer_test.sh              | 58 +++++++++++++++++++
 ci/test_cpp_memcheck.sh                       | 55 ------------------
 ci/test_cpp_sanitizer.sh                      | 55 ++++++++++++++++++
 7 files changed, 189 insertions(+), 57 deletions(-)
 create mode 100644 .github/workflows/compute-sanitizer-run.yaml
 create mode 100644 .github/workflows/compute-sanitizer-trigger.yaml
 create mode 100755 ci/run_compute_sanitizer_test.sh
 delete mode 100755 ci/test_cpp_memcheck.sh
 create mode 100755 ci/test_cpp_sanitizer.sh

diff --git a/.github/workflows/compute-sanitizer-run.yaml b/.github/workflows/compute-sanitizer-run.yaml
new file mode 100644
index 000000000..bacc6bf81
--- /dev/null
+++ b/.github/workflows/compute-sanitizer-run.yaml
@@ -0,0 +1,38 @@
+name: Compute Sanitizer Run
+
+on:
+  workflow_call:
+    inputs:
+      tool_name:
+        required: true
+        type: string
+        description: "Compute sanitizer tool to run (memcheck, racecheck, initcheck, synccheck)"
+  workflow_dispatch:
+    inputs:
+      tool_name:
+        required: true
+        type: choice
+        description: "Compute sanitizer tool to run"
+        options:
+          - memcheck
+          - racecheck
+          - initcheck
+          - synccheck
+
+jobs:
+  run-sanitizer-tests:
+    name: Run ${{ inputs.tool_name }} on single_tests
+    runs-on: linux-amd64-gpu-l4-latest-1
+    container:
+      image: rapidsai/ci-conda:26.02-latest
+      options: --gpus all
+    continue-on-error: true
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+      - name: Run compute-sanitizer ${{ inputs.tool_name }} on single_tests
+        shell: bash
+        env:
+          TOOL_NAME: "${{ inputs.tool_name }}"
+        run: |
+          ./ci/test_cpp_sanitizer.sh "${TOOL_NAME}" single_tests
diff --git a/.github/workflows/compute-sanitizer-trigger.yaml b/.github/workflows/compute-sanitizer-trigger.yaml
new file mode 100644
index 000000000..f57f4f4db
--- /dev/null
+++ b/.github/workflows/compute-sanitizer-trigger.yaml
@@ -0,0 +1,22 @@
+name: Compute Sanitizer Trigger
+
+# This workflow runs compute-sanitizer tools (racecheck, synccheck) on librapidsmpf tests weekly.
+# memcheck is run in the nightly test.yaml workflow.
+# For targeted testing, manually trigger compute-sanitizer-run.yaml with specific tool_name.
+
+on:
+  schedule:
+    - cron: '0 10 * * 6'  # Weekly on Saturday at 10:00 UTC
+  workflow_dispatch:
+
+jobs:
+  run-sanitizer-tests-racecheck:
+    name: compute-sanitizer racecheck tests
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "racecheck"
+  run-sanitizer-tests-synccheck:
+    name: compute-sanitizer synccheck tests
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "synccheck"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 0c78caa96..7298ebdf5 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,6 +18,8 @@ jobs:
       - conda-cpp-linters
       - conda-cpp-tests
       - conda-cpp-memcheck
+      - conda-cpp-racecheck
+      - conda-cpp-synccheck
       - conda-python-build
       - conda-python-tests
       - docs-build
@@ -139,8 +141,20 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     with:
       build_type: pull-request
-      script: "ci/test_cpp_memcheck.sh"
+      script: "ci/test_cpp_sanitizer.sh memcheck single_tests"
       node_type: "gpu-l4-latest-1"
+  # Temporary for testing
+  conda-cpp-racecheck:
+    needs: conda-cpp-build
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "racecheck"
+  # Temporary for testing
+  conda-cpp-synccheck:
+    needs: conda-cpp-build
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "synccheck"
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 8027e28fe..66a244616 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -43,7 +43,7 @@ jobs:
       container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
       date: ${{ inputs.date }}
       node_type: "gpu-l4-latest-1"
-      script: ci/test_cpp_memcheck.sh
+      script: ci/test_cpp_sanitizer.sh memcheck single_tests
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
diff --git a/ci/run_compute_sanitizer_test.sh b/ci/run_compute_sanitizer_test.sh
new file mode 100755
index 000000000..6edbf0097
--- /dev/null
+++ b/ci/run_compute_sanitizer_test.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+# This script runs compute-sanitizer on a single librapidsmpf test executable
+# Usage: ./run_compute_sanitizer_test.sh TOOL_NAME TEST_NAME [additional gtest args...]
+# Example: ./run_compute_sanitizer_test.sh memcheck single_tests
+# Example: ./run_compute_sanitizer_test.sh racecheck single_tests --gtest_filter=ShufflerTest.*
+
+if [ $# -lt 2 ]; then
+    echo "Error: Tool and test name required"
+    echo "Usage: $0 TOOL_NAME TEST_NAME [additional gtest args...]"
+    echo " TOOL_NAME: compute-sanitizer tool (memcheck, racecheck, initcheck, synccheck)"
+    echo " TEST_NAME: librapidsmpf test name (e.g., single_tests)"
+    exit 1
+fi
+
+TOOL_NAME="${1}"
+shift
+TEST_NAME="${1}"
+shift
+
+rapids-logger "Running compute-sanitizer --tool ${TOOL_NAME} on ${TEST_NAME}"
+
+# Navigate to test installation directory
+TEST_DIR="${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/tests/librapidsmpf"
+TEST_EXECUTABLE="${TEST_DIR}/gtests/${TEST_NAME}"
+
+if [ ! -x "${TEST_EXECUTABLE}" ]; then
+    rapids-logger "Error: Test executable ${TEST_EXECUTABLE} not found or not executable"
+    exit 1
+fi
+
+# Build compute-sanitizer arguments based on tool
+SANITIZER_ARGS=(
+    --tool "${TOOL_NAME}"
+    --force-blocking-launches
+    --error-exitcode=1
+)
+
+# Add tool-specific arguments
+if [ "${TOOL_NAME}" = "memcheck" ]; then
+    SANITIZER_ARGS+=(--track-stream-ordered-races=all)
+fi
+
+# Run compute-sanitizer on the specified test, excluding CuptiMonitorTest.
+# errexit is on: record a failing status via `||` so the final log line still runs.
+EXITCODE=0
+compute-sanitizer \
+    "${SANITIZER_ARGS[@]}" \
+    "${TEST_EXECUTABLE}" \
+    "--gtest_filter=-CuptiMonitorTest.*" \
+    "$@" || EXITCODE=$?
+
+rapids-logger "compute-sanitizer --tool ${TOOL_NAME} on ${TEST_NAME} exiting with value: $EXITCODE"
+exit $EXITCODE
diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh
deleted file mode 100755
index d84e6bafc..000000000
--- a/ci/test_cpp_memcheck.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xeuo pipefail
-
-. /opt/conda/etc/profile.d/conda.sh
-
-rapids-logger "Configuring conda strict channel priority"
-conda config --set channel_priority strict
-
-CPP_CHANNEL=$(rapids-download-conda-from-github cpp)
-
-rapids-logger "Generate C++ testing dependencies"
-rapids-dependency-file-generator \
-  --output conda \
-  --file-key test_cpp \
-  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" \
-  --prepend-channel "${CPP_CHANNEL}" \
-  | tee env.yaml
-
-rapids-mamba-retry env create --yes -f env.yaml -n test
-
-# Temporarily allow unbound variables for conda activation.
-set +u
-conda activate test
-set -u
-
-RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/
-mkdir -p "${RAPIDS_TESTS_DIR}"
-
-rapids-print-env
-
-rapids-logger "Check GPU usage"
-nvidia-smi
-
-# Trap ERR so that `EXITCODE` is printed when a command fails and the script
-# exits with error status
-EXITCODE=0
-# shellcheck disable=SC2317
-set_exit_code() {
-    EXITCODE=$?
-    rapids-logger "Test failed with exit code ${EXITCODE}"
-}
-trap set_exit_code ERR
-set +e
-
-# Support customizing the ctests' install location
-cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/tests/librapidsmpf/"
-
-rapids-logger "Run librapidsmpf gtests with compute-sanitizer (Single Node)"
-compute-sanitizer --tool memcheck --track-stream-ordered-races=all gtests/single_tests --gtest_filter=-CuptiMonitorTest.*
-
-rapids-logger "Test script exiting with exit code: $EXITCODE"
-exit ${EXITCODE}
diff --git a/ci/test_cpp_sanitizer.sh b/ci/test_cpp_sanitizer.sh
new file mode 100755
index 000000000..6b0f8a865
--- /dev/null
+++ b/ci/test_cpp_sanitizer.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+. /opt/conda/etc/profile.d/conda.sh
+
+# This script sets up the test environment and runs compute-sanitizer on librapidsmpf tests
+# Usage: ./test_cpp_sanitizer.sh TOOL_NAME TEST_NAME [additional gtest args...]
+# Example: ./test_cpp_sanitizer.sh memcheck single_tests
+# Example: ./test_cpp_sanitizer.sh racecheck single_tests --gtest_filter=ShufflerTest.*
+
+if [ $# -lt 2 ]; then
+    echo "Error: Tool and test name required"
+    echo "Usage: $0 TOOL_NAME TEST_NAME [additional gtest args...]"
+    echo " TOOL_NAME: compute-sanitizer tool (memcheck, racecheck, initcheck, synccheck)"
+    echo " TEST_NAME: librapidsmpf test name (e.g., single_tests)"
+    exit 1
+fi
+
+TOOL_NAME="${1}"
+shift
+TEST_NAME="${1}"
+shift
+
+rapids-logger "Configuring conda strict channel priority"
+conda config --set channel_priority strict
+
+CPP_CHANNEL=$(rapids-download-conda-from-github cpp)
+
+rapids-logger "Generate C++ testing dependencies"
+rapids-dependency-file-generator \
+  --output conda \
+  --file-key test_cpp \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" \
+  --prepend-channel "${CPP_CHANNEL}" \
+  | tee env.yaml
+
+rapids-mamba-retry env create --yes -f env.yaml -n test
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate test
+set -u
+
+rapids-print-env
+
+rapids-logger "Check GPU usage"
+nvidia-smi
+
+# Support invoking test_cpp_sanitizer.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+
+./run_compute_sanitizer_test.sh "${TOOL_NAME}" "${TEST_NAME}" "$@"

From e2a8b5d5482921d9fa945fe2421c31792794f3d6 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 8 Jan 2026 13:26:31 -0600
Subject: [PATCH 2/4] Add devcontainer fallback location to run_compute_sanitizer_test.sh

---
 ci/run_compute_sanitizer_test.sh | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/ci/run_compute_sanitizer_test.sh b/ci/run_compute_sanitizer_test.sh
index 6edbf0097..edaa26f1d 100755
--- a/ci/run_compute_sanitizer_test.sh
+++ b/ci/run_compute_sanitizer_test.sh
@@ -24,8 +24,23 @@ shift
 
 rapids-logger "Running compute-sanitizer --tool ${TOOL_NAME} on ${TEST_NAME}"
 
-# Navigate to test installation directory
-TEST_DIR="${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/tests/librapidsmpf"
+# Support customizing the ctests' install location
+# First, try the installed location (CI/conda environments)
+installed_test_location="${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/tests/librapidsmpf"
+# Fall back to the build directory (devcontainer environments)
+devcontainers_test_location="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../cpp/build/latest"
+
+if [[ -d "${installed_test_location}" ]]; then
+    TEST_DIR="${installed_test_location}"
+elif [[ -d "${devcontainers_test_location}" ]]; then
+    TEST_DIR="${devcontainers_test_location}"
+else
+    echo "Error: Test location not found. Searched:" >&2
+    echo " - ${installed_test_location}" >&2
+    echo " - ${devcontainers_test_location}" >&2
+    exit 1
+fi
+
 TEST_EXECUTABLE="${TEST_DIR}/gtests/${TEST_NAME}"
 
 if [ ! -x "${TEST_EXECUTABLE}" ]; then

From d2f98fc9da184a7d9775de0364678cc63a3824cd Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 8 Jan 2026 13:57:09 -0600
Subject: [PATCH 3/4] Use custom-job shared workflow for compute-sanitizer runs

---
 .github/workflows/compute-sanitizer-run.yaml | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/compute-sanitizer-run.yaml b/.github/workflows/compute-sanitizer-run.yaml
index bacc6bf81..f5e7b3287 100644
--- a/.github/workflows/compute-sanitizer-run.yaml
+++ b/.github/workflows/compute-sanitizer-run.yaml
@@ -22,17 +22,9 @@ on:
 jobs:
   run-sanitizer-tests:
     name: Run ${{ inputs.tool_name }} on single_tests
-    runs-on: linux-amd64-gpu-l4-latest-1
-    container:
-      image: rapidsai/ci-conda:26.02-latest
-      options: --gpus all
-    continue-on-error: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-      - name: Run compute-sanitizer ${{ inputs.tool_name }} on single_tests
-        shell: bash
-        env:
-          TOOL_NAME: "${{ inputs.tool_name }}"
-        run: |
-          ./ci/test_cpp_sanitizer.sh "${TOOL_NAME}" single_tests
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.1.0
+    with:
+      build_type: pull-request
+      node_type: "gpu-l4-latest-1"
+      script: "ci/test_cpp_sanitizer.sh ${{ inputs.tool_name }} single_tests"

From 026d8ff006744b1efdb874d4256321aeaee2c8c6 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 8 Jan 2026 15:02:47 -0600
Subject: [PATCH 4/4] Use same action for memcheck.

---
 .github/workflows/pr.yaml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 434ba5eff..bc58aa10e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -136,13 +136,10 @@ jobs:
       script: ci/test_cpp.sh
       sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN
   conda-cpp-memcheck:
-    secrets: inherit
     needs: conda-cpp-build
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.1.0
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
     with:
-      build_type: pull-request
-      script: "ci/test_cpp_sanitizer.sh memcheck single_tests"
-      node_type: "gpu-l4-latest-1"
+      tool_name: "memcheck"
   # Temporary for testing
   conda-cpp-racecheck:
     needs: conda-cpp-build