From cf03181e60ebdcc810d284c3fa63f47e12b705ae Mon Sep 17 00:00:00 2001
From: Hengqi Chen
Date: Fri, 17 Oct 2025 14:35:50 +0800
Subject: [PATCH] Fix DeviceGetRunningProcessDetailList API

The DeviceGetRunningProcessDetailList API constantly returns
ERROR_INSUFFICIENT_SIZE since we did not set up ProcessDetailList
properly. Let's allocate memory for ProcessDetailList::ProcArray
before invoking nvmlDeviceGetRunningProcessDetailList() so that we
can retrieve the process list. Note that the memory allocation has
to be done in cgo, otherwise we will hit a Golang runtime error.

Signed-off-by: Hengqi Chen
---
 pkg/nvml/cgo_helpers_static.go | 14 ++++++++++
 pkg/nvml/device.go             | 49 ++++++++++++++++++++++++++++++++--
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/pkg/nvml/cgo_helpers_static.go b/pkg/nvml/cgo_helpers_static.go
index 1f30eaa..6a3126b 100644
--- a/pkg/nvml/cgo_helpers_static.go
+++ b/pkg/nvml/cgo_helpers_static.go
@@ -18,6 +18,9 @@ import (
 	"unsafe"
 )
 
+/*
+#include <stdlib.h>
+*/
 import "C"
 
 var cgoAllocsUnknown = new(struct{})
@@ -73,3 +76,14 @@ func unpackPCharString(str string) (*C.char, *struct{}) {
 	h := (*stringHeader)(unsafe.Pointer(&str))
 	return (*C.char)(h.Data), cgoAllocsUnknown
 }
+
+// malloc allocates size bytes with C.malloc. The memory is not managed by the
+// Go runtime and must be released with free.
+func malloc(size uintptr) unsafe.Pointer {
+	return C.malloc(C.size_t(size))
+}
+
+// free releases memory previously obtained from malloc by calling C.free.
+func free(ptr unsafe.Pointer) {
+	C.free(ptr)
+}
diff --git a/pkg/nvml/device.go b/pkg/nvml/device.go
index d341e15..74f62b0 100644
--- a/pkg/nvml/device.go
+++ b/pkg/nvml/device.go
@@ -2931,10 +2931,55 @@ func (l *library) DeviceGetRunningProcessDetailList(device Device) (ProcessDetai
 }
 
 func (device nvmlDevice) GetRunningProcessDetailList() (ProcessDetailList, Return) {
+	return deviceGetRunningProcessDetailList(device)
+}
+
+// deviceGetRunningProcessDetailList calls nvmlDeviceGetRunningProcessDetailList
+// with a C-allocated ProcArray, doubling its capacity while NVML reports
+// ERROR_INSUFFICIENT_SIZE. On success ProcArray points at a Go-owned copy of
+// the entries (nil if there are none); on any error ProcArray is nil.
+func deviceGetRunningProcessDetailList(device nvmlDevice) (ProcessDetailList, Return) {
 	var plist ProcessDetailList
 	plist.Version = STRUCT_VERSION(plist, 1)
-	ret := nvmlDeviceGetRunningProcessDetailList(device, &plist)
-	return plist, ret
+	plist.NumProcArrayEntries = 1
+
+	for {
+		// Allocate memory in cgo for ProcessDetailList::ProcArray.
+		// We can't simply use an unsafe.Pointer to a Go slice here,
+		// otherwise it will trigger the following error:
+		//   runtime error: cgo argument has Go pointer to unpinned Go pointer
+		count := plist.NumProcArrayEntries
+		cptr := malloc(uintptr(count) * unsafe.Sizeof(ProcessDetail_v1{}))
+		if cptr == nil {
+			return plist, ERROR_MEMORY
+		}
+
+		plist.ProcArray = (*ProcessDetail_v1)(cptr)
+		ret := nvmlDeviceGetRunningProcessDetailList(device, &plist)
+
+		// The C buffer is freed below, so never leave ProcArray pointing at
+		// it: reset the pointer and, on success, copy the entries into Go
+		// memory first.
+		plist.ProcArray = nil
+		if ret == SUCCESS && plist.NumProcArrayEntries > 0 {
+			out := make([]ProcessDetail_v1, plist.NumProcArrayEntries)
+			copy(out, unsafe.Slice((*ProcessDetail_v1)(cptr), plist.NumProcArrayEntries))
+			plist.ProcArray = &out[0]
+		}
+		free(cptr)
+
+		if ret != ERROR_INSUFFICIENT_SIZE {
+			return plist, ret
+		}
+
+		// Increase capacity and retry. Stop if doubling cannot grow the
+		// count (zero or wrap-around) instead of looping forever.
+		next := plist.NumProcArrayEntries * 2
+		if next <= plist.NumProcArrayEntries {
+			return plist, ret
+		}
+		plist.NumProcArrayEntries = next
+	}
 }
 
 // nvml.DeviceGetConfComputeMemSizeInfo()