Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 44 additions & 33 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,44 +180,13 @@ func start(c *cli.Context, cfg *Config) error {
klog.Infof("\nRunning with config:\n%v", string(configJSON))

nvmllib := nvml.New()
devicelib := device.New(nvmllib)
infolib := nvinfo.New(
nvinfo.WithNvmlLib(nvmllib),
nvinfo.WithDeviceLib(devicelib),
)

manager, err := resource.NewManager(infolib, nvmllib, devicelib, config)
if err != nil {
return fmt.Errorf("failed to create resource manager: %w", err)

}
vgpul := vgpu.NewVGPULib(vgpu.NewNvidiaPCILib())

var clientSets flags.ClientSets
if config.Flags.UseNodeFeatureAPI != nil && *config.Flags.UseNodeFeatureAPI {
cs, err := cfg.kubeClientConfig.NewClientSets()
if err != nil {
return fmt.Errorf("failed to create clientsets: %w", err)
}
clientSets = cs
}

labelOutputer, err := lm.NewOutputer(
config,
cfg.nodeConfig,
clientSets,
)
d, err := newGFDRunner(cfg, nvmllib, vgpul, config)
if err != nil {
return fmt.Errorf("failed to create label outputer: %w", err)
return err
}

klog.Info("Start running")
d := &gfd{
manager: manager,
vgpu: vgpul,
config: config,
labelOutputer: labelOutputer,
}
restart, err := d.run(sigs)
if err != nil {
return err
Expand All @@ -237,6 +206,48 @@ type gfd struct {
labelOutputer lm.Outputer
}

// newGFDRunner assembles a *gfd from the supplied NVML library, vGPU library
// and loaded config.
//
// It builds the device and info libraries on top of nvmllib, creates the
// resource manager, creates clientsets only when
// config.Flags.UseNodeFeatureAPI is set and true, and constructs the label
// outputer. If any of these steps fails, a nil *gfd is returned together with
// the wrapped error.
func newGFDRunner(cfg *Config, nvmllib nvml.Interface, vgpul vgpu.Interface, config *spec.Config) (*gfd, error) {
devicelib := device.New(nvmllib,
// TODO: Do we want to expose this as a config option?
Copy link
Contributor

@klueska klueska Dec 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean as a flag / envvar to the CLI / in the config file?

// NOTE(review): presumably this lets device enumeration continue when an
// individual device cannot be visited — confirm against go-nvlib.
device.WithIgnoreVisitDevicesErrors(true),
)
// The info library shares the same NVML and device libraries.
infolib := nvinfo.New(
nvinfo.WithNvmlLib(nvmllib),
nvinfo.WithDeviceLib(devicelib),
)

manager, err := resource.NewManager(infolib, nvmllib, devicelib, config)
if err != nil {
return nil, fmt.Errorf("failed to create resource manager: %w", err)

}

// clientSets keeps its zero value unless the NodeFeature API flag is set
// and true.
var clientSets flags.ClientSets
if config.Flags.UseNodeFeatureAPI != nil && *config.Flags.UseNodeFeatureAPI {
cs, err := cfg.kubeClientConfig.NewClientSets()
if err != nil {
return nil, fmt.Errorf("failed to create clientsets: %w", err)
}
clientSets = cs
}

labelOutputer, err := lm.NewOutputer(
config,
cfg.nodeConfig,
clientSets,
)
if err != nil {
return nil, fmt.Errorf("failed to create label outputer: %w", err)
}
// The vGPU library is stored as passed in; it is not used during
// construction.
d := &gfd{
manager: manager,
vgpu: vgpul,
config: config,
labelOutputer: labelOutputer,
}
return d, nil
}

func (d *gfd) run(sigs chan os.Signal) (bool, error) {
defer func() {
if d.config.Flags.UseNodeFeatureAPI != nil && *d.config.Flags.UseNodeFeatureAPI {
Expand Down
83 changes: 83 additions & 0 deletions cmd/gpu-feature-discovery/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import (
"testing"
"time"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
"github.com/stretchr/testify/require"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
Expand Down Expand Up @@ -420,6 +422,87 @@ func TestFailOnNVMLInitError(t *testing.T) {
}
}

// TODO: This should be extended to a more representative test.
func TestGFDLabellers(t *testing.T) {
vgpul := &vgpu.InterfaceMock{
DevicesFunc: func() ([]*vgpu.Device, error) {
return nil, nil
},
}

nvmllib := dgxa100.New()

for _, d := range nvmllib.Devices {
// TODO: This is not implemented in the mock.
(d.(*dgxa100.Device)).GetGpuFabricInfoFunc = func() (nvml.GpuFabricInfo, nvml.Return) {
return nvml.GpuFabricInfo{}, nvml.ERROR_NOT_SUPPORTED
}
}

// Force one of the devices to have errors when enumerating the device.
workingDevices := nvmllib.DeviceGetHandleByIndexFunc
nvmllib.DeviceGetHandleByIndexFunc = func(n int) (nvml.Device, nvml.Return) {
if n == 0 {
return nil, nvml.ERROR_INVALID_ARGUMENT
}
return workingDevices(n)
}

cfg := &Config{}
config := &spec.Config{
Flags: spec.Flags{
CommandLineFlags: spec.CommandLineFlags{
DeviceDiscoveryStrategy: ptr("nvml"),
FailOnInitError: ptr(true),
MigStrategy: ptr("none"),
GFD: &spec.GFDCommandLineFlags{
MachineTypeFile: ptr(""),
OutputFile: ptr(""),
},
},
},
}
d, err := newGFDRunner(cfg, nvmllib, vgpul, config)
require.NoError(t, err)

loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
require.NoError(t, err)

labels, err := loopLabelers.Labels()
require.NoError(t, err)

expectedLabels := map[string]string{
"nvidia.com/cuda.driver-version.full": "550.54.15",
"nvidia.com/cuda.driver-version.major": "550",
"nvidia.com/cuda.driver-version.minor": "54",
"nvidia.com/cuda.driver-version.revision": "15",
"nvidia.com/cuda.driver.major": "550",
"nvidia.com/cuda.driver.minor": "54",
"nvidia.com/cuda.driver.rev": "15",
"nvidia.com/cuda.runtime-version.full": "12.4",
"nvidia.com/cuda.runtime-version.major": "12",
"nvidia.com/cuda.runtime-version.minor": "4",
"nvidia.com/cuda.runtime.major": "12",
"nvidia.com/cuda.runtime.minor": "4",
"nvidia.com/gpu.compute.major": "8",
"nvidia.com/gpu.compute.minor": "0",
"nvidia.com/gpu.count": "7",
"nvidia.com/gpu.family": "ampere",
"nvidia.com/gpu.machine": "unknown",
"nvidia.com/gpu.memory": "40960",
"nvidia.com/gpu.mode": "unknown",
"nvidia.com/gpu.product": "Mock-NVIDIA-A100-SXM4-40GB",
"nvidia.com/gpu.replicas": "1",
"nvidia.com/gpu.sharing-strategy": "none",
"nvidia.com/mig.capable": "true",
"nvidia.com/mps.capable": "false",
"nvidia.com/vgpu.present": "false",
}

require.EqualValues(t, expectedLabels, (map[string]string)(labels))

}

func buildLabelMapFromOutput(output []byte) (map[string]string, error) {
labels := make(map[string]string)

Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@ require (
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
)

replace github.com/NVIDIA/go-nvlib => ../go-nvlib
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
github.com/NVIDIA/go-gpuallocator v0.6.0 h1:2PA2swx59gJYREPkZNTGtyCP6Pnz3WEgnYsXlRkyvkk=
github.com/NVIDIA/go-gpuallocator v0.6.0/go.mod h1:c+Yspg+/QxWOmoSQeuI48Z/7nS+mMPtxyj1NYUTwewY=
github.com/NVIDIA/go-nvlib v0.9.0 h1:GKLIvLJ0uhCtTLLZp2Q8QIDRxOYH45MM4Y5OO3U5Rho=
github.com/NVIDIA/go-nvlib v0.9.0/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c=
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/NVIDIA/nvidia-container-toolkit v1.18.1 h1:525Y921X2TwKwBvFfMNqyXF3QWlYbsemQfpd8YB2lHs=
Expand Down
18 changes: 13 additions & 5 deletions internal/lm/labeler.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,23 @@ type Labeler interface {

// NewLabelers constructs the required labelers from the specified config
//
// The machine-type labeler is only constructed when
// config.Flags.GFD.MachineTypeFile is non-nil; the device labeler and the vGPU
// labeler are always constructed. A failure to construct the machine-type or
// device labeler is returned as an error with a nil Labeler.
//
// NOTE(review): the body below has two consecutive return statements
// (`return l, nil` followed by `return Merge(labellers...), nil`). As written,
// the second is unreachable and the labellers slice is never returned. This
// looks like removed and added diff lines shown together — confirm which
// return is intended.
func NewLabelers(manager resource.Manager, vgpu vgpu.Interface, config *spec.Config) (Labeler, error) {
var labellers []Labeler

// NOTE(review): config.Flags.GFD is dereferenced without a nil check —
// presumably it is always populated by the caller; TODO confirm.
if config.Flags.GFD.MachineTypeFile != nil {
machineTypeLabeler, err := newMachineTypeLabeler(*config.Flags.GFD.MachineTypeFile)
if err != nil {
return nil, fmt.Errorf("failed to construct machine type labeler: %v", err)
}
labellers = append(labellers, machineTypeLabeler)
}

deviceLabeler, err := NewDeviceLabeler(manager, config)
if err != nil {
return nil, fmt.Errorf("error creating labeler: %v", err)
}
labellers = append(labellers, deviceLabeler)

// Merges only the device and vGPU labelers; the machine-type labeler is not
// part of this value.
l := Merge(
deviceLabeler,
NewVGPULabeler(vgpu),
)
labellers = append(labellers, NewVGPULabeler(vgpu))

return l, nil
return Merge(labellers...), nil
}
6 changes: 0 additions & 6 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,6 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return empty{}, nil
}

machineTypeLabeler, err := newMachineTypeLabeler(*config.Flags.GFD.MachineTypeFile)
if err != nil {
return nil, fmt.Errorf("failed to construct machine type labeler: %v", err)
}

versionLabeler, err := newVersionLabeler(manager)
if err != nil {
return nil, fmt.Errorf("failed to construct version labeler: %v", err)
Expand Down Expand Up @@ -86,7 +81,6 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
Expand Down
47 changes: 21 additions & 26 deletions internal/vgpu/pciutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
)

// NvidiaPCI interface allows us to get a list of all NVIDIA PCI devices
// The go:generate directive below runs moq to write a mock of this interface
// to pciutil_mock.go.
//
//go:generate moq -rm -fmt=goimports -out pciutil_mock.go . NvidiaPCI
type NvidiaPCI interface {
// Devices returns the PCI devices as a slice of *PCIDevice, or an error.
Devices() ([]*PCIDevice, error)
}
Expand Down Expand Up @@ -166,39 +168,32 @@ func GetLong(buffer []byte, pos int) uint32 {
uint32(buffer[pos+3])<<24
}

// MockNvidiaPCI is a mock of the NvidiaPCI interface that serves a fixed,
// preconfigured list of PCI devices.
type MockNvidiaPCI struct {
	devices []*PCIDevice
}

// Devices hands back the mocked PCI devices stored on the receiver; the
// returned error is always nil.
func (m *MockNvidiaPCI) Devices() ([]*PCIDevice, error) {
	mocked := m.devices
	return mocked, nil
}

// NewMockNvidiaPCI initializes and returns mock PCI interface type
func NewMockNvidiaPCI() NvidiaPCI {
var (
gpuPassThroughConfig = []byte{0xde, 0x10, 0x8a, 0x11, 0x07, 0x04, 0x10, 0x00, 0xa1, 0x00, 0x00, 0x03, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0xec, 0x0c, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x01, 0xc1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x10, 0x14, 0x10, 0x00, 0x00, 0x00, 0xee, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x01, 0x00, 0x00, 0xde, 0x10, 0x14, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xce, 0xd6, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x68, 0x03, 0x00, 0x08, 0x00, 0x00, 0x00, 0x05, 0x78, 0x81, 0x00, 0x00, 0x70, 0xe6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x10, 0xb4, 0x02, 0x00, 0xe1, 0x8d, 0x64, 0x00, 0x10, 0x29, 0x00, 0x00, 0x03, 0x3d, 0x45, 0x10, 0x00, 0x00, 0x01, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x14, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
vgpuConfig = []byte{0xde, 0x10, 0xb8, 0x1e, 0x02, 0x05, 0xff, 0x06, 0xa1, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x0c, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x10, 0x0f, 0x13, 0x00, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xce, 0xd6, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x81, 0x00, 0x00, 0x00, 0xe0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x4e, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x68, 0x1b, 0x56, 0x46, 0x00, 0x16, 0x34, 0x36, 0x30, 0x2e, 0x31, 0x36, 0x00, 0x00, 0x00, 0x00, 0x72, 0x34, 0x36, 0x30, 0x5f, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
)

return &MockNvidiaPCI{
devices: []*PCIDevice{
{
Path: "",
Address: "passthrough",
Vendor: "0x10de",
Class: "300",
Config: gpuPassThroughConfig,
},
{
Path: "",
Address: "vgpu",
Vendor: "0x10de",
Class: "300",
Config: vgpuConfig,
},
return &NvidiaPCIMock{
DevicesFunc: func() ([]*PCIDevice, error) {
return []*PCIDevice{
{
Path: "",
Address: "passthrough",
Vendor: "0x10de",
Class: "300",
Config: gpuPassThroughConfig,
},
{
Path: "",
Address: "vgpu",
Vendor: "0x10de",
Class: "300",
Config: vgpuConfig,
},
}, nil
},
}

}
67 changes: 67 additions & 0 deletions internal/vgpu/pciutil_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading