Skip to content

Commit 518a860

Browse files
tkatilauniemimu
authored andcommitted
gpu: add levelzero sidecar support for plugin and the deployment files
In addition to the levelzero's health data use, this adds support to scan devices in WSL. Scanning happens by retrieving Intel device indices from the Level-Zero API. Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
1 parent 2df9443 commit 518a860

File tree

16 files changed

+1293
-39
lines changed

16 files changed

+1293
-39
lines changed

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 189 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"path/filepath"
2424
"regexp"
2525
"sort"
26+
"strconv"
2627
"strings"
2728
"time"
2829

@@ -31,15 +32,19 @@ import (
3132
"k8s.io/klog/v2"
3233
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
3334

35+
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
3436
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
3537
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
38+
gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
3639
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
3740
cdispec "tags.cncf.io/container-device-interface/specs-go"
3841
)
3942

4043
const (
4144
sysfsDrmDirectory = "/sys/class/drm"
4245
devfsDriDirectory = "/dev/dri"
46+
wslDxgPath = "/dev/dxg"
47+
wslLibPath = "/usr/lib/wsl"
4348
nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d"
4449
resourceFilename = "intel-gpu-resources.txt"
4550
gpuDeviceRE = `^card[0-9]+$`
@@ -51,6 +56,7 @@ const (
5156
namespace = "gpu.intel.com"
5257
deviceTypeI915 = "i915"
5358
deviceTypeXe = "xe"
59+
deviceTypeDxg = "dxg"
5460
deviceTypeDefault = deviceTypeI915
5561

5662
// telemetry resource settings.
@@ -67,8 +73,11 @@ const (
6773
type cliOptions struct {
6874
preferredAllocationPolicy string
6975
sharedDevNum int
76+
temperatureLimit int
7077
enableMonitoring bool
7178
resourceManagement bool
79+
wslScan bool
80+
healthManagement bool
7281
}
7382

7483
type rmWithMultipleDriversErr struct {
@@ -274,11 +283,13 @@ type devicePlugin struct {
274283
scanDone chan bool
275284
scanResources chan bool
276285

277-
resMan rm.ResourceManager
286+
resMan rm.ResourceManager
287+
levelzeroService levelzeroservice.LevelzeroService
278288

279-
sysfsDir string
280-
devfsDir string
281-
bypathDir string
289+
sysfsDir string
290+
devfsDir string
291+
bypathDir string
292+
healthStatuses map[string]string
282293

283294
// Note: If restarting the plugin with a new policy, the allocations for existing pods remain with old policy.
284295
policy preferredAllocationPolicyFunc
@@ -300,6 +311,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
300311
scanDone: make(chan bool, 1), // buffered as we may send to it before Scan starts receiving from it
301312
bypathFound: true,
302313
scanResources: make(chan bool, 1),
314+
healthStatuses: make(map[string]string),
303315
}
304316

305317
if options.resourceManagement {
@@ -325,15 +337,85 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
325337
dp.policy = nonePolicy
326338
}
327339

328-
if _, err := os.ReadDir(dp.bypathDir); err != nil {
329-
klog.Warningf("failed to read by-path dir: %+v", err)
340+
if !options.wslScan {
341+
if _, err := os.ReadDir(dp.bypathDir); err != nil {
342+
klog.Warningf("failed to read by-path dir: %+v", err)
330343

331-
dp.bypathFound = false
344+
dp.bypathFound = false
345+
}
332346
}
333347

334348
return dp
335349
}
336350

351+
func logHealthStatusChange(card, newStatus string, statuses map[string]string) {
352+
prevState, found := statuses[card]
353+
if !found {
354+
klog.V(2).Infof("%s: new => %s", card, newStatus)
355+
356+
statuses[card] = newStatus
357+
} else if prevState != newStatus {
358+
klog.V(2).Infof("%s: %s => %s", card, prevState, newStatus)
359+
360+
statuses[card] = newStatus
361+
}
362+
}
363+
364+
func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
365+
if dp.levelzeroService == nil {
366+
return pluginapi.Healthy
367+
}
368+
369+
link, err := os.Readlink(filepath.Join(cardPath, "device"))
370+
if err != nil {
371+
klog.Warning("couldn't read device link for", cardPath)
372+
373+
return pluginapi.Healthy
374+
}
375+
376+
health := pluginapi.Healthy
377+
378+
// Check status changes after the function exits
379+
defer func() { logHealthStatusChange(cardPath, health, dp.healthStatuses) }()
380+
381+
bdfAddr := filepath.Base(link)
382+
383+
dh, err := dp.levelzeroService.GetDeviceHealth(bdfAddr)
384+
if err != nil {
385+
klog.Warningf("Device health retrieval failed: %v", err)
386+
387+
return health
388+
}
389+
390+
// Direct Health indicators
391+
klog.V(4).Infof("Health indicators: Memory=%t, Bus=%t, SoC=%t", dh.Memory, dh.Bus, dh.SoC)
392+
393+
if !dh.Memory || !dh.Bus || !dh.SoC {
394+
health = pluginapi.Unhealthy
395+
396+
return health
397+
}
398+
399+
dt, err := dp.levelzeroService.GetDeviceTemperature(bdfAddr)
400+
// In case of any errors, return the current health status
401+
if err != nil {
402+
klog.Warningf("Device temperature retrieval failed: %v", err)
403+
404+
return health
405+
}
406+
407+
limit := float64(dp.options.temperatureLimit)
408+
409+
// Temperatures for different areas
410+
klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", dh.MemoryTemperature, dh.GPUTemperature, dh.GlobalTemperature)
411+
412+
if dt.GPU > limit || dt.Global > limit || dt.Memory > limit {
413+
health = pluginapi.Unhealthy
414+
}
415+
416+
return health
417+
}
418+
337419
// Implement the PreferredAllocator interface.
338420
func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
339421
if dp.resMan != nil {
@@ -369,6 +451,68 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio
369451
}
370452

371453
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
454+
if dp.options.wslScan {
455+
return dp.wslGpuScan(notifier)
456+
} else {
457+
return dp.sysFsGpuScan(notifier)
458+
}
459+
}
460+
461+
func (dp *devicePlugin) wslGpuScan(notifier dpapi.Notifier) error {
462+
defer dp.scanTicker.Stop()
463+
464+
klog.V(1).Infof("GPU (%s) resource share count = %d", deviceTypeDxg, dp.options.sharedDevNum)
465+
466+
devSpecs := []pluginapi.DeviceSpec{
467+
{
468+
HostPath: wslDxgPath,
469+
ContainerPath: wslDxgPath,
470+
Permissions: "rw",
471+
},
472+
}
473+
474+
mounts := []pluginapi.Mount{
475+
{
476+
ContainerPath: wslLibPath,
477+
HostPath: wslLibPath,
478+
ReadOnly: true,
479+
},
480+
}
481+
482+
for {
483+
indices, err := dp.levelzeroService.GetIntelIndices()
484+
if err == nil {
485+
klog.V(4).Info("Intel Level-Zero indices: ", indices)
486+
487+
devTree := dpapi.NewDeviceTree()
488+
489+
for _, index := range indices {
490+
envs := map[string]string{
491+
rm.LevelzeroAffinityMaskEnvVar: strconv.Itoa(int(index)),
492+
}
493+
494+
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, envs, nil, nil)
495+
496+
for i := 0; i < dp.options.sharedDevNum; i++ {
497+
devID := fmt.Sprintf("card%d-%d", index, i)
498+
devTree.AddDevice(deviceTypeDxg, devID, deviceInfo)
499+
}
500+
}
501+
502+
notifier.Notify(devTree)
503+
} else {
504+
klog.Warning("Failed to get Intel indices from Level-Zero")
505+
}
506+
507+
select {
508+
case <-dp.scanDone:
509+
return nil
510+
case <-dp.scanTicker.C:
511+
}
512+
}
513+
}
514+
515+
func (dp *devicePlugin) sysFsGpuScan(notifier dpapi.Notifier) error {
372516
defer dp.scanTicker.Stop()
373517

374518
klog.V(1).Infof("GPU (%s/%s) resource share count = %d", deviceTypeI915, deviceTypeXe, dp.options.sharedDevNum)
@@ -566,7 +710,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
566710

567711
mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs)
568712

569-
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices)
713+
health := dp.healthStatusForCard(cardPath)
714+
715+
deviceInfo := dpapi.NewDeviceInfo(health, devSpecs, mounts, nil, nil, cdiDevices)
570716

571717
for i := 0; i < dp.options.sharedDevNum; i++ {
572718
devID := fmt.Sprintf("%s-%d", name, i)
@@ -627,7 +773,10 @@ func main() {
627773
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
628774
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource")
629775
flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management")
776+
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
777+
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
630778
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
779+
flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy")
631780
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")
632781
flag.Parse()
633782

@@ -651,6 +800,34 @@ func main() {
651800

652801
plugin := newDevicePlugin(prefix+sysfsDrmDirectory, prefix+devfsDriDirectory, opts)
653802

803+
if plugin.options.wslScan {
804+
klog.Info("WSL mode requested")
805+
806+
if plugin.options.resourceManagement {
807+
klog.Error("Resource management is not supported within WSL. Please disable resource management.")
808+
809+
os.Exit(1)
810+
}
811+
812+
if plugin.options.enableMonitoring {
813+
klog.Error("Monitoring is not supported within WSL. Please disable monitoring.")
814+
815+
os.Exit(1)
816+
}
817+
818+
if plugin.options.healthManagement {
819+
klog.Error("Health management is not supported within WSL. Please disable health management.")
820+
821+
os.Exit(1)
822+
}
823+
}
824+
825+
if plugin.options.healthManagement || plugin.options.wslScan {
826+
plugin.levelzeroService = levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath)
827+
828+
go plugin.levelzeroService.Run(true)
829+
}
830+
654831
if plugin.options.resourceManagement {
655832
// Start labeler to export labels file for NFD.
656833
nfdFeatureFile := path.Join(nfdFeatureDir, resourceFilename)
@@ -659,7 +836,10 @@ func main() {
659836

660837
// Labeler catches OS signals and calls os.Exit() after receiving any.
661838
go labeler.Run(prefix+sysfsDrmDirectory, nfdFeatureFile,
662-
labelerMaxInterval, plugin.scanResources)
839+
labelerMaxInterval, plugin.scanResources, plugin.levelzeroService, func() {
840+
// Exit the whole app when labeler exits
841+
os.Exit(0)
842+
})
663843
}
664844

665845
manager := dpapi.NewManager(namespace, plugin)

0 commit comments

Comments
 (0)