Skip to content

Segfault when generating CDI specs with driver version < 570 #1555

@NeonSludge

Description

@NeonSludge

NVIDIA k8s device plugin 0.18.1 using the cdi-annotations strategy crashes after trying to generate CDI specs with NVIDIA driver version < 570.
It's the same issue as NVIDIA/nvidia-container-toolkit#1398, but in this case it seems that the "disable nvsandbox utils" workaround doesn't work (it does for the NVIDIA Container Toolkit, NCT).
We've tried passing the disable-nvsandbox-utils and disable-nvsandboxutils feature flags via the new cdi.featureFlags option in the 0.18.1 chart, but they seemingly had no effect.
We have a driver upgrade planned but I just wanted to see if there was any way to use the latest version of the device plugin with older drivers (again, the feature flag workaround works for the NCT).

Here are the logs:

I1205 08:11:20.943256     221 main.go:244] "Starting NVIDIA Device Plugin" version=<
    dfcf7283
    commit: dfcf72838b8839c7a0fd9bc9d21fe8d46bed2111
 >
...
I1205 08:11:20.947962     221 main.go:363] 
Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": true,
    "mpsRoot": "/run/nvidia/mps",
    "nvidiaDriverRoot": "/",
    "nvidiaDevRoot": "/",
    "gdrcopyEnabled": false,
    "gdsEnabled": false,
    "mofedEnabled": false,
    "useNodeFeatureAPI": null,
    "deviceDiscoveryStrategy": "nvml",
    "plugin": {
      "passDeviceSpecs": true,
      "deviceListStrategy": [
        "cdi-annotations"
      ],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-cdi-hook",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {
      "renameByDefault": true,
      "resources": [
        {
          "name": "nvidia.com/gpu",
          "rename": "nvidia.com/gpu.shared",
          "devices": "all",
          "replicas": 10
        }
      ]
    }
  },
  "imex": {}
}
...
time="2025-12-05T08:11:20Z" level=info msg="Using /driver-root/usr/lib64/libnvidia-sandboxutils.so.565.57.01"
time="2025-12-05T08:11:20Z" level=info msg="Auto-detected mode as 'nvml'"
time="2025-12-05T08:11:21Z" level=info msg="Generating CDI spec for resource: k8s.device-plugin.nvidia.com/gpu"
SIGSEGV: segmentation violation
PC=0x7fda549efefa m=7 sigcode=128 addr=0x0
signal arrived during cgo execution

goroutine 1 gp=0xc000002380 m=7 mp=0xc0000e6008 [syscall]:
runtime.cgocall(0xd33490, 0xc0001b0f28)
    /usr/local/go/src/runtime/cgocall.go:167 +0x4b fp=0xc0001b0f00 sp=0xc0001b0ec8 pc=0x48a56b
github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils._Cfunc_nvSandboxUtilsShutdown()
    _cgo_gotypes.go:171 +0x45 fp=0xc0001b0f28 sp=0xc0001b0f00 pc=0xc59285
github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils.nvSandboxUtilsShutdown(...)
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils/nvsandboxutils.go:42
github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils.(*library).Shutdown(0xc000388a80)
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils/impl.go:36 +0x19 fp=0xc0001b0f40 sp=0xc0001b0f28 pc=0xc59899
github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi.(*nvmllib).tryShutdown(0xc0002e2680)
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go:229 +0x33 fp=0xc0001b0f90 sp=0xc0001b0f40 pc=0xc8b193
github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi.(*nvmllib).DeviceSpecGenerators.deferwrap1()
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go:57 +0x25 fp=0xc0001b0fa8 sp=0xc0001b0f90 pc=0xc8a045
github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi.(*nvmllib).DeviceSpecGenerators(0xc0002e2680, {0xc0004d3cb0, 0x1, 0x1})
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/lib-nvml.go:63 +0x126 fp=0xc0001b1028 sp=0xc0001b0fa8 pc=0xc89f46
github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi.(*wrapper).GetDeviceSpecsByID(0x0?, {0xc0004d3cb0?, 0xc0000e6008?, 0xc0001b10b0?})
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/wrapper.go:79 +0x22 fp=0xc0001b1070 sp=0xc0001b1028 pc=0xc90542
github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi.(*wrapper).GetSpec(0xc0002f2280, {0x0?, 0xf8c1b8?, 0x27?})
    /build/vendor/github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/wrapper.go:56 +0x8e fp=0xc0001b1278 sp=0xc0001b1070 pc=0xc900ce
github.com/NVIDIA/k8s-device-plugin/internal/cdi.(*cdiHandler).CreateSpecFile(0xc0002cec60)
    /build/internal/cdi/cdi.go:190 +0x278 fp=0xc0001b1458 sp=0xc0001b1278 pc=0xcb58d8
main.GetPlugins({0x10b0a30, 0x186bea0}, {0x10b5530, 0xc0002d4e80}, {0x10cb398, 0xc000388030}, {0x10b7e20, 0xc0002de180}, 0xc0002b42d0)
    /build/cmd/nvidia-device-plugin/plugin-manager.go:77 +0x738 fp=0xc0001b1720 sp=0xc0001b1458 pc=0xcd18f8
main.startPlugins(0xc000137b80, 0xc000137ac0)
    /build/cmd/nvidia-device-plugin/main.go:367 +0x5e5 fp=0xc0001b18c0 sp=0xc0001b1720 pc=0xcd09c5
main.start(0xc000137b80, 0xc000137ac0)
    /build/cmd/nvidia-device-plugin/main.go:270 +0x4ea fp=0xc0001b1b18 sp=0xc0001b18c0 pc=0xccfbea
main.main.func1(0xc000137b80?)
    /build/cmd/nvidia-device-plugin/main.go:56 +0x17 fp=0xc0001b1b38 sp=0xc0001b1b18 pc=0xccf077
github.com/urfave/cli/v2.(*Command).Run(0xc00017d600, 0xc000137b80, {0xc000128030, 0x1, 0x1})
    /build/vendor/github.com/urfave/cli/v2/command.go:276 +0x7c2 fp=0xc0001b1da8 sp=0xc0001b1b38 pc=0x703fc2
github.com/urfave/cli/v2.(*App).RunContext(0xc00019e400, {0x10b0a30, 0x186bea0}, {0xc000128030, 0x1, 0x1})
    /build/vendor/github.com/urfave/cli/v2/app.go:333 +0x5a5 fp=0xc0001b1e08 sp=0xc0001b1da8 pc=0x700985
github.com/urfave/cli/v2.(*App).Run(...)
    /build/vendor/github.com/urfave/cli/v2/app.go:307
main.main()
    /build/cmd/nvidia-device-plugin/main.go:180 +0x139c fp=0xc0001b1f50 sp=0xc0001b1e08 pc=0xccef9c
runtime.main()
    /usr/local/go/src/runtime/proc.go:285 +0x29d fp=0xc0001b1fe0 sp=0xc0001b1f50 pc=0x45923d
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0001b1fe8 sp=0xc0001b1fe0 pc=0x495481

goroutine 2 gp=0xc000002e00 m=nil [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000acfa8 sp=0xc0000acf88 pc=0x48d86e
runtime.goparkunlock(...)
    /usr/local/go/src/runtime/proc.go:466
runtime.forcegchelper()
    /usr/local/go/src/runtime/proc.go:373 +0xb3 fp=0xc0000acfe0 sp=0xc0000acfa8 pc=0x459573
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000acfe8 sp=0xc0000acfe0 pc=0x495481
created by runtime.init.7 in goroutine 1
    /usr/local/go/src/runtime/proc.go:361 +0x1a

goroutine 3 gp=0xc000003340 m=nil [GC sweep wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000ad780 sp=0xc0000ad760 pc=0x48d86e
runtime.goparkunlock(...)
    /usr/local/go/src/runtime/proc.go:466
runtime.bgsweep(0xc0000d8000)
    /usr/local/go/src/runtime/mgcsweep.go:279 +0x94 fp=0xc0000ad7c8 sp=0xc0000ad780 pc=0x4433f4
runtime.gcenable.gowrap1()
    /usr/local/go/src/runtime/mgc.go:212 +0x25 fp=0xc0000ad7e0 sp=0xc0000ad7c8 pc=0x437445
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000ad7e8 sp=0xc0000ad7e0 pc=0x495481
created by runtime.gcenable in goroutine 1
    /usr/local/go/src/runtime/mgc.go:212 +0x66

goroutine 4 gp=0xc000003500 m=nil [GC scavenge wait]:
runtime.gopark(0xc0000d8000?, 0x10a0c40?, 0x1?, 0x0?, 0xc000003500?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000adf78 sp=0xc0000adf58 pc=0x48d86e
runtime.goparkunlock(...)
    /usr/local/go/src/runtime/proc.go:466
runtime.(*scavengerState).park(0x1849da0)
    /usr/local/go/src/runtime/mgcscavenge.go:425 +0x49 fp=0xc0000adfa8 sp=0xc0000adf78 pc=0x440ec9
runtime.bgscavenge(0xc0000d8000)
    /usr/local/go/src/runtime/mgcscavenge.go:653 +0x3c fp=0xc0000adfc8 sp=0xc0000adfa8 pc=0x44145c
runtime.gcenable.gowrap2()
    /usr/local/go/src/runtime/mgc.go:213 +0x25 fp=0xc0000adfe0 sp=0xc0000adfc8 pc=0x4373e5
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000adfe8 sp=0xc0000adfe0 pc=0x495481
created by runtime.gcenable in goroutine 1
    /usr/local/go/src/runtime/mgc.go:213 +0xa5

goroutine 18 gp=0xc000102380 m=nil [GOMAXPROCS updater (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000a8788 sp=0xc0000a8768 pc=0x48d86e
runtime.goparkunlock(...)
    /usr/local/go/src/runtime/proc.go:466
runtime.updateMaxProcsGoroutine()
    /usr/local/go/src/runtime/proc.go:6720 +0xe7 fp=0xc0000a87e0 sp=0xc0000a8788 pc=0x467547
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000a87e8 sp=0xc0000a87e0 pc=0x495481
created by runtime.defaultGOMAXPROCSUpdateEnable in goroutine 1
    /usr/local/go/src/runtime/proc.go:6708 +0x37

goroutine 19 gp=0xc0001028c0 m=nil [finalizer wait]:
runtime.gopark(0x468515?, 0x42eda5?, 0xb8?, 0x1?, 0xc000002380?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000ac620 sp=0xc0000ac600 pc=0x48d86e
runtime.runFinalizers()
    /usr/local/go/src/runtime/mfinal.go:210 +0x107 fp=0xc0000ac7e0 sp=0xc0000ac620 pc=0x436347
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000ac7e8 sp=0xc0000ac7e0 pc=0x495481
created by runtime.createfing in goroutine 1
    /usr/local/go/src/runtime/mfinal.go:172 +0x3d

goroutine 20 gp=0xc0001036c0 m=nil [cleanup wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000a8f68 sp=0xc0000a8f48 pc=0x48d86e
runtime.goparkunlock(...)
    /usr/local/go/src/runtime/proc.go:466
runtime.(*cleanupQueue).dequeue(0x184a280)
    /usr/local/go/src/runtime/mcleanup.go:439 +0xc5 fp=0xc0000a8fa0 sp=0xc0000a8f68 pc=0x433565
runtime.runCleanups()
    /usr/local/go/src/runtime/mcleanup.go:635 +0x45 fp=0xc0000a8fe0 sp=0xc0000a8fa0 pc=0x433c25
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000a8fe8 sp=0xc0000a8fe0 pc=0x495481
created by runtime.(*cleanupQueue).createGs in goroutine 1
    /usr/local/go/src/runtime/mcleanup.go:589 +0xa5

goroutine 21 gp=0xc00025c000 m=nil [IO wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0xb?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc00028fcc8 sp=0xc00028fca8 pc=0x48d86e
runtime.netpollblock(0x4ed538?, 0x423be6?, 0x0?)
    /usr/local/go/src/runtime/netpoll.go:575 +0xf7 fp=0xc00028fd00 sp=0xc00028fcc8 pc=0x451b17
internal/poll.runtime_pollWait(0x7fda0cce8e00, 0x72)
    /usr/local/go/src/runtime/netpoll.go:351 +0x85 fp=0xc00028fd20 sp=0xc00028fd00 pc=0x48ca45
internal/poll.(*pollDesc).wait(0xc000134660?, 0xc00028fe53?, 0x1)
    /usr/local/go/src/internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00028fd48 sp=0xc00028fd20 pc=0x50dba7
internal/poll.(*pollDesc).waitRead(...)
    /usr/local/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc000134660, {0xc00028fe53, 0x10000, 0x10000})
    /usr/local/go/src/internal/poll/fd_unix.go:165 +0x279 fp=0xc00028fde0 sp=0xc00028fd48 pc=0x50ee99
os.(*File).read(...)
    /usr/local/go/src/os/file_posix.go:29
os.(*File).Read(0xc00011c5c8, {0xc00028fe53?, 0x0?, 0x0?})
    /usr/local/go/src/os/file.go:144 +0x4f fp=0xc00028fe20 sp=0xc00028fde0 pc=0x51aaaf
github.com/fsnotify/fsnotify.(*inotify).readEvents(0xc000217900)
    /build/vendor/github.com/fsnotify/fsnotify/backend_inotify.go:357 +0xcf fp=0xc00029ffc8 sp=0xc00028fe20 pc=0x6919af
github.com/fsnotify/fsnotify.newBackend.gowrap1()
    /build/vendor/github.com/fsnotify/fsnotify/backend_inotify.go:155 +0x25 fp=0xc00029ffe0 sp=0xc00029ffc8 pc=0x6904e5
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc00029ffe8 sp=0xc00029ffe0 pc=0x495481
created by github.com/fsnotify/fsnotify.newBackend in goroutine 1
    /build/vendor/github.com/fsnotify/fsnotify/backend_inotify.go:155 +0x1f6

goroutine 22 gp=0xc00025c1c0 m=nil [select, locked to thread]:
runtime.gopark(0xc0000a9fa8?, 0x2?, 0x58?, 0x0?, 0xc0000a9f94?)
    /usr/local/go/src/runtime/proc.go:460 +0xce fp=0xc0000a9e18 sp=0xc0000a9df8 pc=0x48d86e
runtime.selectgo(0xc0000a9fa8, 0xc0000a9f90, 0x0?, 0x0, 0x0?, 0x1)
    /usr/local/go/src/runtime/select.go:351 +0x8b7 fp=0xc0000a9f58 sp=0xc0000a9e18 pc=0x46be57
runtime.ensureSigM.func1()
    /usr/local/go/src/runtime/signal_unix.go:1085 +0x194 fp=0xc0000a9fe0 sp=0xc0000a9f58 pc=0x488174
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000a9fe8 sp=0xc0000a9fe0 pc=0x495481
created by runtime.ensureSigM in goroutine 1
    /usr/local/go/src/runtime/signal_unix.go:1068 +0xc5

goroutine 5 gp=0xc000003dc0 m=4 mp=0xc0000b3808 [syscall]:
runtime.notetsleepg(0x186cba0, 0xffffffffffffffff)
    /usr/local/go/src/runtime/lock_futex.go:123 +0x29 fp=0xc0000ae7a0 sp=0xc0000ae778 pc=0x42c0e9
os/signal.signal_recv()
    /usr/local/go/src/runtime/sigqueue.go:152 +0x29 fp=0xc0000ae7c0 sp=0xc0000ae7a0 pc=0x48f8e9
os/signal.loop()
    /usr/local/go/src/os/signal/signal_unix.go:23 +0x13 fp=0xc0000ae7e0 sp=0xc0000ae7c0 pc=0xccda53
runtime.goexit({})
    /usr/local/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc0000ae7e8 sp=0xc0000ae7e0 pc=0x495481
created by os/signal.Notify.func1.1 in goroutine 1
    /usr/local/go/src/os/signal/signal.go:152 +0x1f

rax    0x0
rbx    0xffffffffffffff78
rcx    0x3c91
rdx    0x7fd9f401bce0
rdi    0x2d7265766972642f
rsi    0x2d7265766972641f
rbp    0x7fd9f401dfa8
rsp    0x7fda06ffcd80
r8     0x7
r9     0x7fd9f401a590
r10    0x59a52e57ba581785
r11    0x18
r12    0x7fda05e0d860
r13    0xc000485638
r14    0xc000002380
r15    0xffffffffffffffff
rip    0x7fda549efefa
rflags 0x10202
cs     0x33
fs     0x0
gs     0x0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions