Skip to content

Commit f3fe933

Browse files
authored
Merge pull request #910 from casparvl/fix_behavior_for_present_but_failing_nvidiasmi
Fix behavior for present but failing nvidiasmi
2 parents 2081cd7 + e6f89cc commit f3fe933

File tree

3 files changed

+39
-7
lines changed

3 files changed

+39
-7
lines changed

EESSI-install-software.sh

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,11 +271,18 @@ fi
271271

272272
# Install NVIDIA drivers in host_injections (if they exist)
273273
if command_exists "nvidia-smi"; then
274-
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
275-
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
274+
nvidia-smi --version
275+
ec=$?
276+
if [ ${ec} -eq 0 ]; then
277+
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
278+
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
279+
else
280+
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
281+
echo "This script now assumes this is NOT a GPU node."
282+
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
283+
fi
276284
fi
277285

278-
279286
if [ ! -z "${shared_fs_path}" ]; then
280287
shared_eb_sourcepath=${shared_fs_path}/easybuild/sources
281288
echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path"

bot/build.sh

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,14 +244,28 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
244244
# prepare arguments to eessi_container.sh specific to build step
245245
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
246246
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
247+
247248
# add options required to handle NVIDIA support
248249
if command_exists "nvidia-smi"; then
249-
echo "Command 'nvidia-smi' found, using available GPU"
250-
BUILD_STEP_ARGS+=("--nvidia" "all")
250+
# Accept that this may fail
251+
set +e
252+
nvidia-smi --version
253+
ec=$?
254+
set -e
255+
if [ ${ec} -eq 0 ]; then
256+
echo "Command 'nvidia-smi' found, using available GPU"
257+
BUILD_STEP_ARGS+=("--nvidia" "all")
258+
else
259+
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
260+
echo "This script now assumes this is NOT a GPU node."
261+
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
262+
BUILD_STEP_ARGS+=("--nvidia" "install")
263+
fi
251264
else
252265
echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
253266
BUILD_STEP_ARGS+=("--nvidia" "install")
254267
fi
268+
255269
# Retain location for host injections so we don't reinstall CUDA
256270
# (Always need to run the driver installation as available driver may change)
257271
if [[ ! -z ${SHARED_FS_PATH} ]]; then

bot/test.sh

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,19 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
215215

216216
# add options required to handle NVIDIA support
217217
if command_exists "nvidia-smi"; then
218-
echo "Command 'nvidia-smi' found, using available GPU"
219-
TEST_STEP_ARGS+=("--nvidia" "run")
218+
# Accept that this may fail
219+
set +e
220+
nvidia-smi --version
221+
ec=$?
222+
set -e
223+
if [ ${ec} -eq 0 ]; then
224+
echo "Command 'nvidia-smi' found, using available GPU"
225+
TEST_STEP_ARGS+=("--nvidia" "run")
226+
else
227+
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
228+
echo "This script now assumes this is NOT a GPU node."
229+
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
230+
fi
220231
fi
221232

222233
# prepare arguments to test_suite.sh (specific to test step)

0 commit comments

Comments
 (0)