@@ -23,6 +23,7 @@ import (
2323 "path/filepath"
2424 "regexp"
2525 "sort"
26+ "strconv"
2627 "strings"
2728 "time"
2829
@@ -31,15 +32,19 @@ import (
3132 "k8s.io/klog/v2"
3233 pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
3334
35+ "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
3436 "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
3537 "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
38+ gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
3639 dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
3740 cdispec "tags.cncf.io/container-device-interface/specs-go"
3841)
3942
4043const (
4144 sysfsDrmDirectory = "/sys/class/drm"
4245 devfsDriDirectory = "/dev/dri"
46+ wslDxgPath = "/dev/dxg"
47+ wslLibPath = "/usr/lib/wsl"
4348 nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d"
4449 resourceFilename = "intel-gpu-resources.txt"
4550 gpuDeviceRE = `^card[0-9]+$`
@@ -51,6 +56,7 @@ const (
5156 namespace = "gpu.intel.com"
5257 deviceTypeI915 = "i915"
5358 deviceTypeXe = "xe"
59+ deviceTypeDxg = "dxg"
5460 deviceTypeDefault = deviceTypeI915
5561
5662 // telemetry resource settings.
@@ -67,8 +73,11 @@ const (
6773type cliOptions struct {
6874 preferredAllocationPolicy string
6975 sharedDevNum int
76+ temperatureLimit int
7077 enableMonitoring bool
7178 resourceManagement bool
79+ wslScan bool
80+ healthManagement bool
7281}
7382
7483type rmWithMultipleDriversErr struct {
@@ -274,11 +283,13 @@ type devicePlugin struct {
274283 scanDone chan bool
275284 scanResources chan bool
276285
277- resMan rm.ResourceManager
286+ resMan rm.ResourceManager
287+ levelzeroService levelzeroservice.LevelzeroService
278288
279- sysfsDir string
280- devfsDir string
281- bypathDir string
289+ sysfsDir string
290+ devfsDir string
291+ bypathDir string
292+ healthStatuses map [string ]string
282293
283294 // Note: If restarting the plugin with a new policy, the allocations for existing pods remain with old policy.
284295 policy preferredAllocationPolicyFunc
@@ -300,6 +311,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
300311 scanDone : make (chan bool , 1 ), // buffered as we may send to it before Scan starts receiving from it
301312 bypathFound : true ,
302313 scanResources : make (chan bool , 1 ),
314+ healthStatuses : make (map [string ]string ),
303315 }
304316
305317 if options .resourceManagement {
@@ -325,15 +337,85 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
325337 dp .policy = nonePolicy
326338 }
327339
328- if _ , err := os .ReadDir (dp .bypathDir ); err != nil {
329- klog .Warningf ("failed to read by-path dir: %+v" , err )
340+ if ! options .wslScan {
341+ if _ , err := os .ReadDir (dp .bypathDir ); err != nil {
342+ klog .Warningf ("failed to read by-path dir: %+v" , err )
330343
331- dp .bypathFound = false
344+ dp .bypathFound = false
345+ }
332346 }
333347
334348 return dp
335349}
336350
351+ func logHealthStatusChange (card , newStatus string , statuses map [string ]string ) {
352+ prevState , found := statuses [card ]
353+ if ! found {
354+ klog .V (2 ).Infof ("%s: new => %s" , card , newStatus )
355+
356+ statuses [card ] = newStatus
357+ } else if prevState != newStatus {
358+ klog .V (2 ).Infof ("%s: %s => %s" , card , prevState , newStatus )
359+
360+ statuses [card ] = newStatus
361+ }
362+ }
363+
364+ func (dp * devicePlugin ) healthStatusForCard (cardPath string ) string {
365+ if dp .levelzeroService == nil {
366+ return pluginapi .Healthy
367+ }
368+
369+ link , err := os .Readlink (filepath .Join (cardPath , "device" ))
370+ if err != nil {
371+ klog .Warning ("couldn't read device link for" , cardPath )
372+
373+ return pluginapi .Healthy
374+ }
375+
376+ health := pluginapi .Healthy
377+
378+ // Check status changes after the function exits
379+ defer func () { logHealthStatusChange (cardPath , health , dp .healthStatuses ) }()
380+
381+ bdfAddr := filepath .Base (link )
382+
383+ dh , err := dp .levelzeroService .GetDeviceHealth (bdfAddr )
384+ if err != nil {
385+ klog .Warningf ("Device health retrieval failed: %v" , err )
386+
387+ return health
388+ }
389+
390+ // Direct Health indicators
391+ klog .V (4 ).Infof ("Health indicators: Memory=%t, Bus=%t, SoC=%t" , dh .Memory , dh .Bus , dh .SoC )
392+
393+ if ! dh .Memory || ! dh .Bus || ! dh .SoC {
394+ health = pluginapi .Unhealthy
395+
396+ return health
397+ }
398+
399+ dt , err := dp .levelzeroService .GetDeviceTemperature (bdfAddr )
400+ // In case of any errors, return the current health status
401+ if err != nil {
402+ klog .Warningf ("Device temperature retrieval failed: %v" , err )
403+
404+ return health
405+ }
406+
407+ limit := float64 (dp .options .temperatureLimit )
408+
409+ // Temperatures for different areas
410+ klog .V (4 ).Infof ("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC" , dh .MemoryTemperature , dh .GPUTemperature , dh .GlobalTemperature )
411+
412+ if dt .GPU > limit || dt .Global > limit || dt .Memory > limit {
413+ health = pluginapi .Unhealthy
414+ }
415+
416+ return health
417+ }
418+
337419// Implement the PreferredAllocator interface.
338420func (dp * devicePlugin ) GetPreferredAllocation (rqt * pluginapi.PreferredAllocationRequest ) (* pluginapi.PreferredAllocationResponse , error ) {
339421 if dp .resMan != nil {
@@ -369,6 +451,68 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio
369451}
370452
371453func (dp * devicePlugin ) Scan (notifier dpapi.Notifier ) error {
454+ if dp .options .wslScan {
455+ return dp .wslGpuScan (notifier )
456+ } else {
457+ return dp .sysFsGpuScan (notifier )
458+ }
459+ }
460+
461+ func (dp * devicePlugin ) wslGpuScan (notifier dpapi.Notifier ) error {
462+ defer dp .scanTicker .Stop ()
463+
464+ klog .V (1 ).Infof ("GPU (%s) resource share count = %d" , deviceTypeDxg , dp .options .sharedDevNum )
465+
466+ devSpecs := []pluginapi.DeviceSpec {
467+ {
468+ HostPath : wslDxgPath ,
469+ ContainerPath : wslDxgPath ,
470+ Permissions : "rw" ,
471+ },
472+ }
473+
474+ mounts := []pluginapi.Mount {
475+ {
476+ ContainerPath : wslLibPath ,
477+ HostPath : wslLibPath ,
478+ ReadOnly : true ,
479+ },
480+ }
481+
482+ for {
483+ indices , err := dp .levelzeroService .GetIntelIndices ()
484+ if err == nil {
485+ klog .V (4 ).Info ("Intel Level-Zero indices: " , indices )
486+
487+ devTree := dpapi .NewDeviceTree ()
488+
489+ for _ , index := range indices {
490+ envs := map [string ]string {
491+ rm .LevelzeroAffinityMaskEnvVar : strconv .Itoa (int (index )),
492+ }
493+
494+ deviceInfo := dpapi .NewDeviceInfo (pluginapi .Healthy , devSpecs , mounts , envs , nil , nil )
495+
496+ for i := 0 ; i < dp .options .sharedDevNum ; i ++ {
497+ devID := fmt .Sprintf ("card%d-%d" , index , i )
498+ devTree .AddDevice (deviceTypeDxg , devID , deviceInfo )
499+ }
500+ }
501+
502+ notifier .Notify (devTree )
503+ } else {
504+ klog .Warning ("Failed to get Intel indices from Level-Zero" )
505+ }
506+
507+ select {
508+ case <- dp .scanDone :
509+ return nil
510+ case <- dp .scanTicker .C :
511+ }
512+ }
513+ }
514+
515+ func (dp * devicePlugin ) sysFsGpuScan (notifier dpapi.Notifier ) error {
372516 defer dp .scanTicker .Stop ()
373517
374518 klog .V (1 ).Infof ("GPU (%s/%s) resource share count = %d" , deviceTypeI915 , deviceTypeXe , dp .options .sharedDevNum )
@@ -566,7 +710,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
566710
567711 mounts , cdiDevices := dp .createMountsAndCDIDevices (cardPath , name , devSpecs )
568712
569- deviceInfo := dpapi .NewDeviceInfo (pluginapi .Healthy , devSpecs , mounts , nil , nil , cdiDevices )
713+ health := dp .healthStatusForCard (cardPath )
714+
715+ deviceInfo := dpapi .NewDeviceInfo (health , devSpecs , mounts , nil , nil , cdiDevices )
570716
571717 for i := 0 ; i < dp .options .sharedDevNum ; i ++ {
572718 devID := fmt .Sprintf ("%s-%d" , name , i )
@@ -627,7 +773,10 @@ func main() {
627773 flag .StringVar (& prefix , "prefix" , "" , "Prefix for devfs & sysfs paths" )
628774 flag .BoolVar (& opts .enableMonitoring , "enable-monitoring" , false , "whether to enable '*_monitoring' (= all GPUs) resource" )
629775 flag .BoolVar (& opts .resourceManagement , "resource-manager" , false , "fractional GPU resource management" )
776+ flag .BoolVar (& opts .healthManagement , "health-management" , false , "enable GPU health management" )
777+ flag .BoolVar (& opts .wslScan , "wsl" , false , "scan for / use WSL devices" )
630778 flag .IntVar (& opts .sharedDevNum , "shared-dev-num" , 1 , "number of containers sharing the same GPU device" )
779+ flag .IntVar (& opts .temperatureLimit , "temp-limit" , 100 , "temperature limit at which device is marked unhealthy" )
631780 flag .StringVar (& opts .preferredAllocationPolicy , "allocation-policy" , "none" , "modes of allocating GPU devices: balanced, packed and none" )
632781 flag .Parse ()
633782
@@ -651,6 +800,34 @@ func main() {
651800
652801 plugin := newDevicePlugin (prefix + sysfsDrmDirectory , prefix + devfsDriDirectory , opts )
653802
803+ if plugin .options .wslScan {
804+ klog .Info ("WSL mode requested" )
805+
806+ if plugin .options .resourceManagement {
807+ klog .Error ("Resource management is not supported within WSL. Please disable resource management." )
808+
809+ os .Exit (1 )
810+ }
811+
812+ if plugin .options .enableMonitoring {
813+ klog .Error ("Monitoring is not supported within WSL. Please disable monitoring." )
814+
815+ os .Exit (1 )
816+ }
817+
818+ if plugin .options .healthManagement {
819+ klog .Error ("Health management is not supported within WSL. Please disable health management." )
820+
821+ os .Exit (1 )
822+ }
823+ }
824+
825+ if plugin .options .healthManagement || plugin .options .wslScan {
826+ plugin .levelzeroService = levelzeroservice .NewLevelzero (gpulevelzero .DefaultUnixSocketPath )
827+
828+ go plugin .levelzeroService .Run (true )
829+ }
830+
654831 if plugin .options .resourceManagement {
655832 // Start labeler to export labels file for NFD.
656833 nfdFeatureFile := path .Join (nfdFeatureDir , resourceFilename )
@@ -659,7 +836,10 @@ func main() {
659836
660837 // Labeler catches OS signals and calls os.Exit() after receiving any.
661838 go labeler .Run (prefix + sysfsDrmDirectory , nfdFeatureFile ,
662- labelerMaxInterval , plugin .scanResources )
839+ labelerMaxInterval , plugin .scanResources , plugin .levelzeroService , func () {
840+ // Exit the whole app when labeler exits
841+ os .Exit (0 )
842+ })
663843 }
664844
665845 manager := dpapi .NewManager (namespace , plugin )
0 commit comments