diff --git a/pkg/nvlib/device/api.go b/pkg/nvlib/device/api.go
index c2a6517..01a5307 100644
--- a/pkg/nvlib/device/api.go
+++ b/pkg/nvlib/device/api.go
@@ -38,10 +38,11 @@ type Interface interface {
 }
 
 type devicelib struct {
-	nvmllib        nvml.Interface
-	skippedDevices map[string]struct{}
-	verifySymbols  *bool
-	migProfiles    []MigProfile
+	nvmllib           nvml.Interface
+	ignoreVisitErrors bool // when true, VisitDevices moves on to the next device instead of returning a per-device error
+	skippedDevices    map[string]struct{}
+	verifySymbols     *bool
+	migProfiles       []MigProfile
 }
 
 var _ Interface = &devicelib{}
@@ -67,6 +68,16 @@ func New(nvmllib nvml.Interface, opts ...Option) Interface {
 	return d
 }
 
+// WithIgnoreVisitDevicesErrors controls whether VisitDevices continues with the
+// next device when visiting one fails, instead of returning the error. Failures
+// to look up, wrap, or inspect a device and errors returned by the visit callback
+// are all dropped. Useful when one unhealthy device should not abort the walk.
+func WithIgnoreVisitDevicesErrors(ignoreVisitErrors bool) Option {
+	return func(d *devicelib) {
+		d.ignoreVisitErrors = ignoreVisitErrors
+	}
+}
+
 // WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them.
func WithVerifySymbols(verify bool) Option { return func(d *devicelib) { diff --git a/pkg/nvlib/device/device.go b/pkg/nvlib/device/device.go index a67ce3c..06d347b 100644 --- a/pkg/nvlib/device/device.go +++ b/pkg/nvlib/device/device.go @@ -464,29 +464,35 @@ func (d *devicelib) VisitDevices(visit func(int, Device) error) error { } for i := 0; i < count; i++ { - device, ret := d.nvmllib.DeviceGetHandleByIndex(i) - if ret != nvml.SUCCESS { - return fmt.Errorf("error getting device handle for index '%v': %v", i, ret) - } - dev, err := d.newDevice(device) + err := d.visitDevice(i, visit) if err != nil { - return fmt.Errorf("error creating new device wrapper: %v", err) + if d.ignoreVisitErrors { + continue + } + return fmt.Errorf("error visiting device '%d': %w", i, err) } + } + return nil +} - isSkipped, err := dev.isSkipped() - if err != nil { - return fmt.Errorf("error checking whether device is skipped: %v", err) - } - if isSkipped { - continue - } +func (d *devicelib) visitDevice(i int, visit func(int, Device) error) error { + device, ret := d.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + return fmt.Errorf("error getting device handle: %v", ret) + } + dev, err := d.newDevice(device) + if err != nil { + return fmt.Errorf("error creating new device wrapper: %v", err) + } - err = visit(i, dev) - if err != nil { - return fmt.Errorf("error visiting device: %v", err) - } + isSkipped, err := dev.isSkipped() + if err != nil { + return fmt.Errorf("error checking whether device is skipped: %v", err) } - return nil + if isSkipped { + return nil + } + return visit(i, dev) } // VisitMigDevices walks a top-level device and invokes a callback function for each MIG device configured on it.