diff --git a/.github/workflows/tests_link_nvidia_host_libraries.yml b/.github/workflows/tests_link_nvidia_host_libraries.yml
index d15f4cf3..d30aced1 100644
--- a/.github/workflows/tests_link_nvidia_host_libraries.yml
+++ b/.github/workflows/tests_link_nvidia_host_libraries.yml
@@ -19,7 +19,18 @@ jobs:
 
       - name: Initialize EESSI
        uses: eessi/github-action-eessi@v3
-
+
+      - name: Test function nvidia_gpu_available before setup of libraries
+        run: |
+          source scripts/utils.sh
+          if nvidia_gpu_available; then
+            echo "Error: Found NVIDIA libraries before the mock libraries were set up."
+            exit 1
+          else
+            echo "NVIDIA libraries were not found."
+            echo "Proceeding to setting up the mock NVIDIA libraries."
+          fi
+
       - name: Setup mock NVIDIA libraries
        run: |
          # Run the script to create mock libraries
@@ -48,6 +59,16 @@ jobs:
          echo "Updating PATH"
          echo "PATH=/tmp/nvidia-bin:$PATH" >> $GITHUB_ENV
 
+      - name: Test nvidia_gpu_available after setup of mock libraries
+        run: |
+          source scripts/utils.sh
+          if nvidia_gpu_available; then
+            echo "mock NVIDIA libraries and nvidia-smi were set up"
+          else
+            echo "Error: mock nvidia-smi is not available."
+            exit 1
+          fi
+
       - name: Test LD_PRELOAD mode
        run: |
          echo ">>> Testing LD_PRELOAD mode"
diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh
index d90b3253..bf5c59ca 100755
--- a/EESSI-install-software.sh
+++ b/EESSI-install-software.sh
@@ -17,11 +17,6 @@ display_help() {
   echo "  --skip-cuda-install      -  disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
 }
 
-# Function to check if a command exists
-function command_exists() {
-    command -v "$1" >/dev/null 2>&1
-}
-
 function copy_build_log() {
     # copy specified build log to specified directory, with some context added
     build_log=${1}
@@ -315,18 +310,9 @@ else
 fi
 
 # Install NVIDIA drivers in host_injections (if they exist)
-if command_exists "nvidia-smi"; then
-    export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}"
-    nvidia-smi --version
-    ec=$?
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
-        ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
-    else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
-    fi
+if nvidia_gpu_available; then
+    echo "Installing NVIDIA drivers for use in prefix shell..."
+    ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
 fi
 
 if [ ! -z "${shared_fs_path}" ]; then
diff --git a/bot/build.sh b/bot/build.sh
index 11b8ec41..290444f1 100755
--- a/bot/build.sh
+++ b/bot/build.sh
@@ -220,23 +220,9 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
 BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
 
 # add options required to handle NVIDIA support
-if command_exists "nvidia-smi"; then
-    # Accept that this may fail
-    set +e
-    nvidia-smi --version
-    ec=$?
-    set -e
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found, using available GPU"
-        BUILD_STEP_ARGS+=("--nvidia" "all")
-    else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
-        BUILD_STEP_ARGS+=("--nvidia" "install")
-    fi
+if nvidia_gpu_available; then
+    BUILD_STEP_ARGS+=("--nvidia" "all")
 else
-    echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
     BUILD_STEP_ARGS+=("--nvidia" "install")
 fi
 
diff --git a/bot/test.sh b/bot/test.sh
index 9ab49866..93907de5 100755
--- a/bot/test.sh
+++ b/bot/test.sh
@@ -225,20 +225,8 @@ fi
 TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
 
 # add options required to handle NVIDIA support
-if command_exists "nvidia-smi"; then
-    # Accept that this may fail
-    set +e
-    nvidia-smi --version
-    ec=$?
-    set -e
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found, using available GPU"
-        TEST_STEP_ARGS+=("--nvidia" "run")
-    else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
-    fi
+if nvidia_gpu_available; then
+    TEST_STEP_ARGS+=("--nvidia" "run")
 fi
 
 # prepare arguments to test_suite.sh (specific to test step)
diff --git a/scripts/utils.sh b/scripts/utils.sh
index 962decd2..51fb2155 100644
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -147,3 +147,22 @@ function get_ipv4_address {
     echo "${hipv4}"
     return 0
 }
+
+function nvidia_gpu_available {
+    if command_exists "nvidia-smi"; then
+        # We are careful here in case we are running in a container and LD_LIBRARY_PATH has been wiped
+        LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" nvidia-smi --version
+        ec=$?
+        if [ ${ec} -eq 0 ]; then
+            echo "Command 'nvidia-smi' found."
+            return 0
+        else
+            echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
+            echo "This script now assumes this is NOT a GPU node."
+            return 1
+        fi
+    else
+        echo "No 'nvidia-smi' found, no available GPU."
+        return 2
+    fi
+}