diff --git a/.github/workflows/compute-sanitizer-run.yaml b/.github/workflows/compute-sanitizer-run.yaml
new file mode 100644
index 000000000..f5e7b3287
--- /dev/null
+++ b/.github/workflows/compute-sanitizer-run.yaml
@@ -0,0 +1,34 @@
+name: Compute Sanitizer Run
+
+# Reusable workflow that runs one compute-sanitizer tool on the single_tests
+# gtest binary via ci/test_cpp_sanitizer.sh. Callers must pass
+# `secrets: inherit`, otherwise the nested `secrets: inherit` below forwards nothing.
+
+on:
+  workflow_call:
+    inputs:
+      tool_name:
+        required: true
+        type: string
+        description: "Compute sanitizer tool to run (memcheck, racecheck, initcheck, synccheck)"
+  workflow_dispatch:
+    inputs:
+      tool_name:
+        required: true
+        type: choice
+        description: "Compute sanitizer tool to run"
+        options:
+          - memcheck
+          - racecheck
+          - initcheck
+          - synccheck
+
+jobs:
+  run-sanitizer-tests:
+    name: Run ${{ inputs.tool_name }} on single_tests
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.1.0
+    with:
+      build_type: pull-request
+      node_type: "gpu-l4-latest-1"
+      script: "ci/test_cpp_sanitizer.sh ${{ inputs.tool_name }} single_tests"
diff --git a/.github/workflows/compute-sanitizer-trigger.yaml b/.github/workflows/compute-sanitizer-trigger.yaml
new file mode 100644
index 000000000..f57f4f4db
--- /dev/null
+++ b/.github/workflows/compute-sanitizer-trigger.yaml
@@ -0,0 +1,26 @@
+name: Compute Sanitizer Trigger
+
+# This workflow runs compute-sanitizer tools (racecheck, synccheck) on librapidsmpf tests weekly.
+# memcheck is run in the nightly test.yaml workflow.
+# For targeted testing, manually trigger compute-sanitizer-run.yaml with specific tool_name.
+
+on:
+  schedule:
+    - cron: '0 10 * * 6'  # Weekly on Saturday at 10:00 UTC
+  workflow_dispatch:
+
+jobs:
+  # Each caller must pass `secrets: inherit`; otherwise the nested
+  # `secrets: inherit` inside compute-sanitizer-run.yaml has nothing to forward.
+  run-sanitizer-tests-racecheck:
+    name: compute-sanitizer racecheck tests
+    secrets: inherit
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "racecheck"
+  run-sanitizer-tests-synccheck:
+    name: compute-sanitizer synccheck tests
+    secrets: inherit
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "synccheck"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 94f03cf1c..bc58aa10e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,6 +18,8 @@ jobs:
       - conda-cpp-linters
       - conda-cpp-tests
       - conda-cpp-memcheck
+      - conda-cpp-racecheck
+      - conda-cpp-synccheck
       - conda-python-build
       - conda-python-tests
       - docs-build
@@ -134,13 +136,25 @@ jobs:
       script: ci/test_cpp.sh
       sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN
   conda-cpp-memcheck:
     secrets: inherit
     needs: conda-cpp-build
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.1.0
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
     with:
-      build_type: pull-request
-      script: "ci/test_cpp_memcheck.sh"
-      node_type: "gpu-l4-latest-1"
+      tool_name: "memcheck"
+  # Temporary for testing
+  conda-cpp-racecheck:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "racecheck"
+  # Temporary for testing
+  conda-cpp-synccheck:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: ./.github/workflows/compute-sanitizer-run.yaml
+    with:
+      tool_name: "synccheck"
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2b764a3df..47eb0fd2c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -43,7 +43,7 @@ jobs:
       container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
       date: ${{ inputs.date }}
       node_type: "gpu-l4-latest-1"
-      script: ci/test_cpp_memcheck.sh
+      script: ci/test_cpp_sanitizer.sh memcheck single_tests
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
diff --git a/ci/run_compute_sanitizer_test.sh b/ci/run_compute_sanitizer_test.sh
new file mode 100755
index 000000000..edaa26f1d
--- /dev/null
+++ b/ci/run_compute_sanitizer_test.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+# This script runs compute-sanitizer on a single librapidsmpf test executable
+# Usage: ./run_compute_sanitizer_test.sh TOOL_NAME TEST_NAME [additional gtest args...]
+# Example: ./run_compute_sanitizer_test.sh memcheck single_tests
+# Example: ./run_compute_sanitizer_test.sh racecheck single_tests --gtest_filter=ShufflerTest.*
+
+if [ $# -lt 2 ]; then
+  echo "Error: Tool and test name required" >&2
+  echo "Usage: $0 TOOL_NAME TEST_NAME [additional gtest args...]" >&2
+  echo " TOOL_NAME: compute-sanitizer tool (memcheck, racecheck, initcheck, synccheck)" >&2
+  echo " TEST_NAME: librapidsmpf test name (e.g., single_tests)" >&2
+  exit 1
+fi
+
+TOOL_NAME="${1}"
+shift
+TEST_NAME="${1}"
+shift
+
+rapids-logger "Running compute-sanitizer --tool ${TOOL_NAME} on ${TEST_NAME}"
+
+# Support customizing the ctests' install location
+# First, try the installed location (CI/conda environments)
+installed_test_location="${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/tests/librapidsmpf"
+# Fall back to the build directory (devcontainer environments)
+devcontainers_test_location="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../cpp/build/latest"
+
+if [[ -d "${installed_test_location}" ]]; then
+  TEST_DIR="${installed_test_location}"
+elif [[ -d "${devcontainers_test_location}" ]]; then
+  TEST_DIR="${devcontainers_test_location}"
+else
+  echo "Error: Test location not found. Searched:" >&2
+  echo " - ${installed_test_location}" >&2
+  echo " - ${devcontainers_test_location}" >&2
+  exit 1
+fi
+
+TEST_EXECUTABLE="${TEST_DIR}/gtests/${TEST_NAME}"
+
+if [ ! -x "${TEST_EXECUTABLE}" ]; then
+  rapids-logger "Error: Test executable ${TEST_EXECUTABLE} not found or not executable"
+  exit 1
+fi
+
+# Build compute-sanitizer arguments based on tool
+SANITIZER_ARGS=(
+  --tool "${TOOL_NAME}"
+  --force-blocking-launches
+  --error-exitcode=1
+)
+
+# Add tool-specific arguments
+if [ "${TOOL_NAME}" = "memcheck" ]; then
+  SANITIZER_ARGS+=(--track-stream-ordered-races=all)
+fi
+
+# Run compute-sanitizer on the specified test, excluding CuptiMonitorTest.
+# `set -e` would abort the script on a non-zero exit before the exit code could
+# be logged, so capture the status with `||` instead of reading `$?` afterwards.
+EXITCODE=0
+compute-sanitizer \
+  "${SANITIZER_ARGS[@]}" \
+  "${TEST_EXECUTABLE}" \
+  '--gtest_filter=-CuptiMonitorTest.*' \
+  "$@" || EXITCODE=$?
+
+rapids-logger "compute-sanitizer --tool ${TOOL_NAME} on ${TEST_NAME} exiting with value: $EXITCODE"
+exit "${EXITCODE}"
diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh
deleted file mode 100755
index d84e6bafc..000000000
--- a/ci/test_cpp_memcheck.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xeuo pipefail
-
-. /opt/conda/etc/profile.d/conda.sh
-
-rapids-logger "Configuring conda strict channel priority"
-conda config --set channel_priority strict
-
-CPP_CHANNEL=$(rapids-download-conda-from-github cpp)
-
-rapids-logger "Generate C++ testing dependencies"
-rapids-dependency-file-generator \
-  --output conda \
-  --file-key test_cpp \
-  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" \
-  --prepend-channel "${CPP_CHANNEL}" \
-  | tee env.yaml
-
-rapids-mamba-retry env create --yes -f env.yaml -n test
-
-# Temporarily allow unbound variables for conda activation.
-set +u
-conda activate test
-set -u
-
-RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/
-mkdir -p "${RAPIDS_TESTS_DIR}"
-
-rapids-print-env
-
-rapids-logger "Check GPU usage"
-nvidia-smi
-
-# Trap ERR so that `EXITCODE` is printed when a command fails and the script
-# exits with error status
-EXITCODE=0
-# shellcheck disable=SC2317
-set_exit_code() {
-    EXITCODE=$?
-    rapids-logger "Test failed with exit code ${EXITCODE}"
-}
-trap set_exit_code ERR
-set +e
-
-# Support customizing the ctests' install location
-cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/tests/librapidsmpf/"
-
-rapids-logger "Run librapidsmpf gtests with compute-sanitizer (Single Node)"
-compute-sanitizer --tool memcheck --track-stream-ordered-races=all gtests/single_tests --gtest_filter=-CuptiMonitorTest.*
-
-rapids-logger "Test script exiting with exit code: $EXITCODE"
-exit ${EXITCODE}
diff --git a/ci/test_cpp_sanitizer.sh b/ci/test_cpp_sanitizer.sh
new file mode 100755
index 000000000..6b0f8a865
--- /dev/null
+++ b/ci/test_cpp_sanitizer.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+. /opt/conda/etc/profile.d/conda.sh
+
+# This script sets up the test environment and runs compute-sanitizer on librapidsmpf tests
+# Usage: ./test_cpp_sanitizer.sh TOOL_NAME TEST_NAME [additional gtest args...]
+# Example: ./test_cpp_sanitizer.sh memcheck single_tests
+# Example: ./test_cpp_sanitizer.sh racecheck single_tests --gtest_filter=ShufflerTest.*
+
+if [ $# -lt 2 ]; then
+  echo "Error: Tool and test name required" >&2
+  echo "Usage: $0 TOOL_NAME TEST_NAME [additional gtest args...]" >&2
+  echo " TOOL_NAME: compute-sanitizer tool (memcheck, racecheck, initcheck, synccheck)" >&2
+  echo " TEST_NAME: librapidsmpf test name (e.g., single_tests)" >&2
+  exit 1
+fi
+
+TOOL_NAME="${1}"
+shift
+TEST_NAME="${1}"
+shift
+
+rapids-logger "Configuring conda strict channel priority"
+conda config --set channel_priority strict
+
+CPP_CHANNEL=$(rapids-download-conda-from-github cpp)
+
+rapids-logger "Generate C++ testing dependencies"
+rapids-dependency-file-generator \
+  --output conda \
+  --file-key test_cpp \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" \
+  --prepend-channel "${CPP_CHANNEL}" \
+  | tee env.yaml
+
+rapids-mamba-retry env create --yes -f env.yaml -n test
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate test
+set -u
+
+rapids-print-env
+
+rapids-logger "Check GPU usage"
+nvidia-smi
+
+# Support invoking test_cpp_sanitizer.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+
+./run_compute_sanitizer_test.sh "${TOOL_NAME}" "${TEST_NAME}" "$@"