diff --git a/pkg/dcgm/api.go b/pkg/dcgm/api.go
index 7d81688..ab92dbc 100644
--- a/pkg/dcgm/api.go
+++ b/pkg/dcgm/api.go
@@ -85,6 +85,11 @@ func GetDeviceInfo(gpuID uint) (Device, error) {
 	return getDeviceInfo(gpuID)
 }
 
+// GetGPUStatus returns the EntityStatus of the GPU identified by gpuID; it yields EntityStatusUnknown when the underlying DCGM status query does not succeed.
+func GetGPUStatus(gpuID uint) EntityStatus {
+	return getGPUStatus(gpuID)
+}
+
 // GetDeviceStatus returns current status information about the specified GPU
 func GetDeviceStatus(gpuID uint) (DeviceStatus, error) {
 	return latestValuesForDevice(gpuID)
diff --git a/pkg/dcgm/device_info.go b/pkg/dcgm/device_info.go
index 92f01b4..f13f902 100644
--- a/pkg/dcgm/device_info.go
+++ b/pkg/dcgm/device_info.go
@@ -233,21 +233,26 @@ func getDeviceInfo(gpuID uint) (deviceInfo Device, err error) {
 			break
 		}
 	}
+	status := getGPUStatus(gpuID) // EntityStatusUnknown when the status query itself fails
+	if status != EntityStatusOk { // any non-OK status overrides the earlier "supported" decision
+		supported = "No"
+	}
 
 	busid := *stringPtr(&device.identifiers.pciBusId[0])
 
-	cpuAffinity, err := getCPUAffinity(gpuID)
-	if err != nil {
-		return
-	}
-
 	var (
 		topology []P2PLink
 		bandwidth int64
+		cpuAffinity string // assigned only inside the supported branch below; otherwise stays ""
 	)
 
 	// get device topology and bandwidth only if its a DCGM supported device
 	if supported == "Yes" {
+		cpuAffinity, err = getCPUAffinity(gpuID) // plain assignment: writes the named result err, no shadowing
+		if err != nil {
+			return
+		}
+
 		topology, err = getDeviceTopology(gpuID)
 		if err != nil {
 			return
diff --git a/pkg/dcgm/device_status.go b/pkg/dcgm/device_status.go
index b791ecb..6e45266 100644
--- a/pkg/dcgm/device_status.go
+++ b/pkg/dcgm/device_status.go
@@ -11,6 +11,52 @@ import (
 	"math/rand"
 )
 
+// EntityStatus represents the status of a GPU entity. getGPUStatus produces it by a plain numeric conversion from C.DcgmEntityStatus_t, so the constants below presumably mirror DCGM's enum values (TODO confirm against the DCGM headers).
+type EntityStatus uint
+
+const (
+	// EntityStatusUnknown - Entity has not been referenced yet. Also returned by getGPUStatus when dcgmGetGpuStatus does not return DCGM_ST_OK.
+	EntityStatusUnknown EntityStatus = 0
+	// EntityStatusOk - Entity is known and OK. getDeviceInfo marks a GPU with any other status as unsupported.
+	EntityStatusOk EntityStatus = 1
+	// EntityStatusUnsupported - Entity is unsupported by DCGM.
+	EntityStatusUnsupported EntityStatus = 2
+	// EntityStatusInaccessible - Entity is inaccessible, usually due to cgroups.
+	EntityStatusInaccessible EntityStatus = 3
+	// EntityStatusLost - Entity has been lost. Usually set from NVML returning NVML_ERROR_GPU_IS_LOST.
+	EntityStatusLost EntityStatus = 4
+	// EntityStatusFake - Entity is a fake, injection-only entity for testing.
+	EntityStatusFake EntityStatus = 5
+	// EntityStatusDisabled - Don't collect values from this GPU.
+	EntityStatusDisabled EntityStatus = 6
+	// EntityStatusDetached - Entity is detached, not good for any uses.
+	EntityStatusDetached EntityStatus = 7
+)
+
+// String returns a human-readable name for the entity status; values outside the declared constants are rendered as "Unknown(<n>)".
+func (e EntityStatus) String() string {
+	switch e {
+	case EntityStatusUnknown:
+		return "Unknown"
+	case EntityStatusOk:
+		return "OK"
+	case EntityStatusUnsupported:
+		return "Unsupported"
+	case EntityStatusInaccessible:
+		return "Inaccessible"
+	case EntityStatusLost:
+		return "Lost"
+	case EntityStatusFake:
+		return "Fake"
+	case EntityStatusDisabled:
+		return "Disabled"
+	case EntityStatusDetached:
+		return "Detached"
+	default:
+		return fmt.Sprintf("Unknown(%d)", e) // %d formats the numeric value, so String is not re-entered
+	}
+}
+
 // PerfState represents the performance state (P-state) of a GPU
 type PerfState uint
 
@@ -85,6 +131,15 @@ type DeviceStatus struct {
 	FanSpeed int64 // %
 }
 
+func getGPUStatus(gpuID uint) EntityStatus {
+	var status C.DcgmEntityStatus_t
+	result := C.dcgmGetGpuStatus(handle.handle, C.uint(gpuID), &status)
+	if result != C.DCGM_ST_OK {
+		return EntityStatusUnknown // the DCGM return code is not surfaced to the caller
+	}
+	return EntityStatus(status) // direct numeric conversion of the C enum value
+}
+
 func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) {
 	const (
 		pwr int = iota