diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 7a8f5fa4..db942c57 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -51,7 +51,7 @@ jobs: # can't test with EasyBuild versions older than v4.5.2 when using EESSI 2023.06, # since Python in compat layer is Python 3.11.x; # testing with a single EasyBuild version takes a while in GitHub Actions, so stick to a single sensible version - for EB_VERSION in '4.6.0'; do + for EB_VERSION in '5.1.0'; do # Create script that uses load_easybuild_module.sh which we can run in compat layer environment # note: Be careful with single vs double quotes below! # ${EB_VERSION} should be expanded, so use double quotes; @@ -113,7 +113,7 @@ jobs: # scripts need to be copied to /tmp, # since create_directory_tarballs.sh must be accessible from within build container - ./eessi_container.sh --mode run --verbose /software-layer-scripts/create_directory_tarballs.sh 2023.06 + ./eessi_container.sh --mode run --verbose /software-layer-scripts/create_directory_tarballs.sh "${{matrix.EESSI_VERSION}}" # check if tarballs have been produced ls -l *.tar.gz diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index d90b3253..c76745cc 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -17,11 +17,6 @@ display_help() { echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)" } -# Function to check if a command exists -function command_exists() { - command -v "$1" >/dev/null 2>&1 -} - function copy_build_log() { # copy specified build log to specified directory, with some context added build_log=${1} @@ -315,19 +310,17 @@ else fi # Install NVIDIA drivers in host_injections (if they exist) -if command_exists "nvidia-smi"; then +# Accept that this may fail +set +e +verify_nvidia-smi +ec=$? 
+if [ ${ec} -eq 0 ]; then + export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +elif [ ${ec} -eq 1 ]; then export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" - nvidia-smi --version - ec=$? - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh - else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." - fi fi +set -e if [ ! -z "${shared_fs_path}" ]; then shared_eb_sourcepath=${shared_fs_path}/easybuild/sources diff --git a/bot/build.sh b/bot/build.sh index 11b8ec41..6a08c7f2 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -220,25 +220,18 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -if command_exists "nvidia-smi"; then - # Accept that this may fail - set +e - nvidia-smi --version - ec=$? - set -e - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found, using available GPU" - BUILD_STEP_ARGS+=("--nvidia" "all") - else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." - BUILD_STEP_ARGS+=("--nvidia" "install") - fi -else - echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" +# Accept that this may fail +set +e +verify_nvidia-smi +ec=$? 
+if [ ${ec} -eq 0 ]; then + BUILD_STEP_ARGS+=("--nvidia" "all") +elif [ ${ec} -eq 1 ]; then + BUILD_STEP_ARGS+=("--nvidia" "install") +elif [ ${ec} -eq 2 ]; then BUILD_STEP_ARGS+=("--nvidia" "install") fi +set -e # Retain location for host injections so we don't reinstall CUDA # (Always need to run the driver installation as available driver may change) diff --git a/bot/test.sh b/bot/test.sh index 9ab49866..0cb10174 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -225,21 +225,14 @@ fi TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support -if command_exists "nvidia-smi"; then - # Accept that this may fail - set +e - nvidia-smi --version - ec=$? - set -e - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found, using available GPU" - TEST_STEP_ARGS+=("--nvidia" "run") - else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." - fi +# Accept that this may fail +set +e +verify_nvidia-smi +ec=$? 
+if [ ${ec} -eq 0 ]; then
+    TEST_STEP_ARGS+=("--nvidia" "run")
 fi
+set -e
 
 # prepare arguments to test_suite.sh (specific to test step)
 declare -a TEST_SUITE_ARGS=()
diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml
new file mode 100644
index 00000000..0bf49b79
--- /dev/null
+++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml
@@ -0,0 +1,2 @@
+easyconfigs:
+  - pmt-1.2.0-GCCcore-12.3.0-CUDA-12.1.1.eb
diff --git a/scripts/utils.sh b/scripts/utils.sh
index 962decd2..2adc1a0d 100644
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -147,3 +147,22 @@ function get_ipv4_address {
     echo "${hipv4}"
     return 0
 }
+
+function verify_nvidia-smi {
+    if command_exists "nvidia-smi"; then
+        nvidia-smi --version
+        ec=$?
+        if [ ${ec} -eq 0 ]; then
+            echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
+            return 0
+        else
+            echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
+            echo "This script now assumes this is NOT a GPU node."
+            echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
+            return 1
+        fi
+    else
+        echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
+        return 2
+    fi
+}