Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pkg/nvml/cgo_helpers_static.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ import (
"unsafe"
)

/*
#include <stdlib.h>
*/
import "C"

var cgoAllocsUnknown = new(struct{})
Expand Down Expand Up @@ -73,3 +76,11 @@ func unpackPCharString(str string) (*C.char, *struct{}) {
h := (*stringHeader)(unsafe.Pointer(&str))
return (*C.char)(h.Data), cgoAllocsUnknown
}

// malloc allocates size bytes through C.malloc and returns the resulting
// pointer. The memory comes from the C allocator rather than the Go heap, so
// it is not garbage collected; pass the pointer to free when finished.
func malloc(size uintptr) unsafe.Pointer {
	nbytes := C.size_t(size)
	return C.malloc(nbytes)
}

// free releases the memory at ptr by handing it to C.free.
func free(ptr unsafe.Pointer) {
C.free(ptr)
}
50 changes: 48 additions & 2 deletions pkg/nvml/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -2931,10 +2931,56 @@ func (l *library) DeviceGetRunningProcessDetailList(device Device) (ProcessDetai
}

// GetRunningProcessDetailList returns the process detail list for this device
// together with the NVML return code, delegating to
// deviceGetRunningProcessDetailList.
func (device nvmlDevice) GetRunningProcessDetailList() (ProcessDetailList, Return) {
	plist, ret := deviceGetRunningProcessDetailList(device)
	return plist, ret
}

// deviceGetRunningProcessDetailList queries NVML for the running-process
// detail list of device. It sets the struct version, starts with a capacity of
// one entry, and then calls nvmlDeviceGetRunningProcessDetailList in a loop
// with a C-allocated array, retrying with a larger capacity while the call
// returns ERROR_INSUFFICIENT_SIZE.
func deviceGetRunningProcessDetailList(device nvmlDevice) (ProcessDetailList, Return) {
var plist ProcessDetailList
plist.Version = STRUCT_VERSION(plist, 1)
// NOTE(review): the next two lines look like the pre-change body carried over
// from the diff view (they are the removed lines of this hunk) — confirm they
// are absent from the final file, otherwise everything below is unreachable.
ret := nvmlDeviceGetRunningProcessDetailList(device, &plist)
return plist, ret
// Initial capacity, in entries, for the first attempt of the retry loop.
plist.NumProcArrayEntries = 1

Comment on lines +2937 to +2941
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it not sufficient to just do this:

var plist ProcessDetailList
	plist.Version = STRUCT_VERSION(plist, 1)
	plist.NumProcArrayEntries = 1
	plist.ProcArray = &(make([]ProcessDetail_v1, plist.NumProcArrayEntries)[0])
	...

Does NVML realloc the space that plist.ProcArray points to inside the call?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I just looked it up -- it says this:

To determine the size of the plist->procArray array to allocate, call the function with plist->numProcArrayEntries set to zero and plist->procArray set to NULL. The return code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type plist->mode to report on, in which case the plist->numProcArrayEntries field will indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type plist->mode exist).

Which means we should be following the same pattern as e.g.:
https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/device.go#L993

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With no need to introduce all of this c-level malloc wrapper, etc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am aware of this pattern and do actually follow it. The point is that ProcessDetailList::ProcArray is a pointer field inside a struct, and according to my tests it has to be a C pointer.

Copy link
Collaborator

@klueska klueska Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean like this:

diff --git a/pkg/nvml/device.go b/pkg/nvml/device.go
index d341e15..f907a64 100644
--- a/pkg/nvml/device.go
+++ b/pkg/nvml/device.go
@@ -2931,10 +2931,24 @@ func (l *library) DeviceGetRunningProcessDetailList(device Device) (ProcessDetai
 }

 func (device nvmlDevice) GetRunningProcessDetailList() (ProcessDetailList, Return) {
-   var plist ProcessDetailList
-   plist.Version = STRUCT_VERSION(plist, 1)
-   ret := nvmlDeviceGetRunningProcessDetailList(device, &plist)
-   return plist, ret
+   numProcArrayEntries := uint32(1)
+   procArray := make([]ProcessDetail_v1, numProcArrayEntries)
+   for {
+       var plist ProcessDetailList
+       plist.Version = STRUCT_VERSION(plist, 1)
+       plist.NumProcArrayEntries = numProcArrayEntries
+       plist.ProcArray = &procArray[0]
+       ret := nvmlDeviceGetRunningProcessDetailList(device, &plist)
+       if ret == SUCCESS {
+           return plist, ret
+       }
+       if ret != ERROR_INSUFFICIENT_SIZE {
+           return ProcessDetailList{}, ret
+       }
+       numProcArrayEntries = plist.NumProcArrayEntries
+       procArray = make([]ProcessDetail_v1, numProcArrayEntries)
+   }
 }

 // nvml.DeviceGetConfComputeMemSizeInfo()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will trigger a runtime error, no?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe so. Which part are you worried about?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you run it locally? I did encounter a runtime error.

for {
// Allocate memory in cgo for ProcessDetailList::ProcArray
// We can't simply use a unsafe.Pointer of Go slice here
// otherwise it will trigger the following error:
// runtime error: cgo argument has Go pointer to unpinned Go pointer
count := plist.NumProcArrayEntries
cptr := malloc(uintptr(count) * unsafe.Sizeof(ProcessDetail_v1{}))
if cptr == nil {
return plist, ERROR_MEMORY
}

plist.ProcArray = (*ProcessDetail_v1)(cptr)
ret := nvmlDeviceGetRunningProcessDetailList(device, &plist)
if ret == SUCCESS {
out := make([]ProcessDetail_v1, plist.NumProcArrayEntries)
src := unsafe.Slice((*ProcessDetail_v1)(cptr), plist.NumProcArrayEntries)
copy(out, src)

if plist.NumProcArrayEntries > 0 {
plist.ProcArray = &out[0]
} else {
plist.ProcArray = nil
}

// Clean up C memory before return
free(cptr)

return plist, ret
}

// Clean up C memory before retry/return
if cptr != nil {
free(cptr)
}

if ret != ERROR_INSUFFICIENT_SIZE {
return plist, ret
}

// Increase capacity and retry
plist.NumProcArrayEntries *= 2
}
}

// nvml.DeviceGetConfComputeMemSizeInfo()
Expand Down