Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions pkg/nvlib/device/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@ type Interface interface {
}

// devicelib is the concrete implementation of Interface.
//
// Its fields are populated by the Option functions passed to New.
type devicelib struct {
	// nvmllib is the NVML binding used to look up device handles.
	nvmllib nvml.Interface

	// ignoreVisitErrors, when true, makes VisitDevices move on to the next
	// device instead of returning when visiting a device fails. It is set
	// through WithIgnoreVisitDevicesErrors.
	ignoreVisitErrors bool

	// skippedDevices is a set of device identifiers.
	// NOTE(review): presumably keyed by device name and consulted by
	// isSkipped — confirm against the rest of the package.
	skippedDevices map[string]struct{}

	// verifySymbols toggles whether select symbols are verified to exist in
	// dynamic libraries before they are called (see WithVerifySymbols).
	// NOTE(review): a nil pointer presumably means "not configured" — confirm.
	verifySymbols *bool

	// migProfiles holds MIG profile descriptions.
	// NOTE(review): how and when this is populated is not visible here.
	migProfiles []MigProfile
}

// Compile-time assertion that *devicelib satisfies Interface.
var _ Interface = &devicelib{}
Expand All @@ -67,6 +68,16 @@ func New(nvmllib nvml.Interface, opts ...Option) Interface {
return d
}

// WithIgnoreVisitDevicesErrors returns an Option that controls whether errors
// raised while visiting devices are ignored.
//
// Passing true is useful when a single device may be unhealthy and the caller
// still expects the remaining devices to be visited.
func WithIgnoreVisitDevicesErrors(ignoreVisitErrors bool) Option {
	apply := func(lib *devicelib) {
		lib.ignoreVisitErrors = ignoreVisitErrors
	}
	return apply
}

// WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them.
func WithVerifySymbols(verify bool) Option {
return func(d *devicelib) {
Expand Down
42 changes: 24 additions & 18 deletions pkg/nvlib/device/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -464,29 +464,35 @@ func (d *devicelib) VisitDevices(visit func(int, Device) error) error {
}

for i := 0; i < count; i++ {
device, ret := d.nvmllib.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS {
return fmt.Errorf("error getting device handle for index '%v': %v", i, ret)
}
dev, err := d.newDevice(device)
err := d.visitDevice(i, visit)
if err != nil {
return fmt.Errorf("error creating new device wrapper: %v", err)
if d.ignoreVisitErrors {
continue
}
return fmt.Errorf("error visiting device '%d': %w", i, err)
}
}
return nil
}

isSkipped, err := dev.isSkipped()
if err != nil {
return fmt.Errorf("error checking whether device is skipped: %v", err)
}
if isSkipped {
continue
}
// visitDevice resolves the device at index i and invokes visit on it, unless
// the device is skipped.
//
// It returns an error when the NVML handle cannot be obtained, when the device
// wrapper cannot be created, or when the skip check itself fails. A skipped
// device yields nil without the callback being run. Otherwise the error
// returned by visit (if any) is passed through unchanged; the caller
// (VisitDevices) adds the device index when it wraps the error.
func (d *devicelib) visitDevice(i int, visit func(int, Device) error) error {
	handle, ret := d.nvmllib.DeviceGetHandleByIndex(i)
	if ret != nvml.SUCCESS {
		return fmt.Errorf("error getting device handle: %v", ret)
	}

	dev, err := d.newDevice(handle)
	if err != nil {
		return fmt.Errorf("error creating new device wrapper: %w", err)
	}

	// The skip check has to run before the callback so that skipped devices
	// are never handed to visit.
	isSkipped, err := dev.isSkipped()
	if err != nil {
		return fmt.Errorf("error checking whether device is skipped: %w", err)
	}
	if isSkipped {
		return nil
	}

	return visit(i, dev)
}

// VisitMigDevices walks a top-level device and invokes a callback function for each MIG device configured on it.
Expand Down