-
Notifications
You must be signed in to change notification settings - Fork 41
Description
root@ipn1:/mnt/models/stable_diffusion_nvidia_training_sa/training_results_v5.0/NVIDIA/benchmarks/stable_diffusion/implementations/nyx_n1_ngc25.04_nemo# srun --no-container-entrypoint --container-image=evuedsoacr.azurecr.io/dc-ecosys-appl-eng/mlperf-nvidia:stable_diffusion-pyt --no-container-mount-home --container-remap-root --container-writable true bash -c 'echo Hello from inside container'
pyxis: importing docker image: evuedsoacr.azurecr.io/dc-ecosys-appl-eng/mlperf-nvidia:stable_diffusion-pyt
slurmstepd: error: pyxis: child 1022281 failed with error code: 1
slurmstepd: error: pyxis: failed to import docker image
slurmstepd: error: pyxis: printing enroot log file:
slurmstepd: error: pyxis: [INFO] Querying registry for permission grant
slurmstepd: error: pyxis: [INFO] Authenticating with user:
slurmstepd: error: pyxis: [INFO] Authentication succeeded
slurmstepd: error: pyxis: [INFO] Fetching image manifest list
slurmstepd: error: pyxis: [INFO] Fetching image manifest
slurmstepd: error: pyxis: [ERROR] URL https://registry-1.docker.io/v2/evuedsoacr.azurecr.io/dc-ecosys-appl-eng/mlperf-nvidia/manifests/stable_diffusion-pyt returned error code: 401 Unauthorized
slurmstepd: error: pyxis: couldn't start container
slurmstepd: error: spank: required plugin task_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: pyxis: child 1022333 failed with error code: 1
srun: error: localhost: task 0: Exited with exit code 1