Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/dcgm/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ func GetDeviceInfo(gpuID uint) (Device, error) {
return getDeviceInfo(gpuID)
}

// GetGPUStatus returns the entity status of the specified GPU.
//
// It is the exported entry point for getGPUStatus, which reports
// EntityStatusUnknown when the underlying DCGM call does not return
// DCGM_ST_OK; callers therefore cannot distinguish a failed query from a GPU
// whose status really is unknown.
func GetGPUStatus(gpuID uint) EntityStatus {
return getGPUStatus(gpuID)
}

// GetDeviceStatus returns current status information about the specified GPU
func GetDeviceStatus(gpuID uint) (DeviceStatus, error) {
return latestValuesForDevice(gpuID)
Expand Down
15 changes: 10 additions & 5 deletions pkg/dcgm/device_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,26 @@ func getDeviceInfo(gpuID uint) (deviceInfo Device, err error) {
break
}
}
status := getGPUStatus(gpuID)
if status != EntityStatusOk {
supported = "No"
}

busid := *stringPtr(&device.identifiers.pciBusId[0])

cpuAffinity, err := getCPUAffinity(gpuID)
if err != nil {
return
}

var (
topology []P2PLink
bandwidth int64
cpuAffinity string
)

// get device topology and bandwidth only if it's a DCGM-supported device
if supported == "Yes" {
cpuAffinity, err = getCPUAffinity(gpuID)
if err != nil {
return
}

topology, err = getDeviceTopology(gpuID)
if err != nil {
return
Expand Down
55 changes: 55 additions & 0 deletions pkg/dcgm/device_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,52 @@ import (
"math/rand"
)

// EntityStatus describes the state DCGM reports for a GPU entity.
//
// getGPUStatus converts the C status value straight into this type, so the
// numeric values (and therefore the declaration order below) must not change.
type EntityStatus uint

const (
	// EntityStatusUnknown - Entity has not been referenced yet (value 0).
	EntityStatusUnknown EntityStatus = iota
	// EntityStatusOk - Entity is known and OK (value 1).
	EntityStatusOk
	// EntityStatusUnsupported - Entity is unsupported by DCGM (value 2).
	EntityStatusUnsupported
	// EntityStatusInaccessible - Entity is inaccessible, usually due to cgroups (value 3).
	EntityStatusInaccessible
	// EntityStatusLost - Entity has been lost. Usually set from NVML returning
	// NVML_ERROR_GPU_IS_LOST (value 4).
	EntityStatusLost
	// EntityStatusFake - Entity is a fake, injection-only entity for testing (value 5).
	EntityStatusFake
	// EntityStatusDisabled - Don't collect values from this GPU (value 6).
	EntityStatusDisabled
	// EntityStatusDetached - Entity is detached, not good for any uses (value 7).
	EntityStatusDetached
)

// entityStatusNames holds the display name of every known EntityStatus,
// indexed by the status value itself.
var entityStatusNames = [...]string{
	EntityStatusUnknown:      "Unknown",
	EntityStatusOk:           "OK",
	EntityStatusUnsupported:  "Unsupported",
	EntityStatusInaccessible: "Inaccessible",
	EntityStatusLost:         "Lost",
	EntityStatusFake:         "Fake",
	EntityStatusDisabled:     "Disabled",
	EntityStatusDetached:     "Detached",
}

// String returns the human-readable name of the entity status. Values outside
// the known range are rendered as "Unknown(<n>)" so they remain identifiable.
func (e EntityStatus) String() string {
	if e < EntityStatus(len(entityStatusNames)) {
		return entityStatusNames[e]
	}
	return fmt.Sprintf("Unknown(%d)", e)
}

// PerfState represents the performance state (P-state) of a GPU
type PerfState uint

Expand Down Expand Up @@ -85,6 +131,15 @@ type DeviceStatus struct {
FanSpeed int64 // %
}

// getGPUStatus asks DCGM for the entity status of the GPU identified by gpuID
// and converts the C result into an EntityStatus.
//
// Any return code other than DCGM_ST_OK is collapsed into
// EntityStatusUnknown; the error itself is not surfaced to the caller.
func getGPUStatus(gpuID uint) EntityStatus {
	var cStatus C.DcgmEntityStatus_t
	if rc := C.dcgmGetGpuStatus(handle.handle, C.uint(gpuID), &cStatus); rc != C.DCGM_ST_OK {
		return EntityStatusUnknown
	}
	return EntityStatus(cStatus)
}

func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) {
const (
pwr int = iota
Expand Down