[Feature] Make GPU utility functions neutral with respect to vendor resource names

yansun1996 · yansun1996 · commit 3a75de89e50d · 2025-10-14T18:30:47.000Z
Signed-off-by: yansun1996 <Yan.Sun3@amd.com>
diff --git a/cluster-autoscaler/utils/gpu/gpu.go b/cluster-autoscaler/utils/gpu/gpu.go
@@ -26,6 +26,8 @@ import (
 )
 
 const (
+	// ResourceAMDGPU is the name of the AMD GPU resource.
+	ResourceAMDGPU = "amd.com/gpu"
 	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
 	ResourceNvidiaGPU = "nvidia.com/gpu"
 	// ResourceDirectX is the name of the DirectX resource on windows.
@@ -35,6 +37,14 @@ const (
 	DefaultGPUType = "nvidia-tesla-k80"
 )
 
+// GPUVendorResourceNames is the centralized list of all known GPU vendor extended resource names.
+// Extend this slice for new vendors; order matters, as DetectNodeGPUResourceName returns the first match.
+var GPUVendorResourceNames = []apiv1.ResourceName{
+	ResourceNvidiaGPU,
+	ResourceAMDGPU,
+	ResourceDirectX,
+}
+
 const (
 	// MetricsGenericGPU - for when there is no information about GPU type
 	MetricsGenericGPU = "generic"
@@ -109,23 +119,53 @@ func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {
 // if the drivers are installed and GPU is ready to use.
 func NodeHasGpu(GPULabel string, node *apiv1.Node) bool {
 	_, hasGpuLabel := node.Labels[GPULabel]
-	gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
-	return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero())
+	if hasGpuLabel {
+		return true
+	}
+	// Check for extended resources as well
+	for _, gpuVendorResourceName := range GPUVendorResourceNames {
+		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
+		if hasGpuAllocatable && !gpuAllocatable.IsZero() {
+			return true
+		}
+	}
+	return false
 }
 
 // PodRequestsGpu returns true if a given pod has GPU request.
 func PodRequestsGpu(pod *apiv1.Pod) bool {
 	podRequests := podutils.PodRequests(pod)
-	_, gpuFound := podRequests[ResourceNvidiaGPU]
-	return gpuFound
+	for _, gpuVendorResourceName := range GPUVendorResourceNames {
+		if _, found := podRequests[gpuVendorResourceName]; found {
+			return true
+		}
+	}
+	return false
+}
+
+// DetectNodeGPUResourceName inspects the node's allocatable resources and returns the first
+// known GPU extended resource name that has non-zero allocatable. If none is found, it returns
+// ResourceNvidiaGPU unconditionally (node labels are not checked), keeping the previous default.
+func DetectNodeGPUResourceName(node *apiv1.Node) apiv1.ResourceName {
+	for _, rn := range GPUVendorResourceNames {
+		if qty, ok := node.Status.Allocatable[rn]; ok && !qty.IsZero() {
+			return rn
+		}
+	}
+	// Fallback: default to Nvidia as before; no label check happens here (GetNodeGPUFromCloudProvider gates on NodeHasGpu).
+	return ResourceNvidiaGPU
 }
 
 // GetNodeGPUFromCloudProvider returns the GPU the node has. Returned GPU has the GPU label of the
 // passed in cloud provider. If the node doesn't have a GPU, returns nil.
 func GetNodeGPUFromCloudProvider(provider cloudprovider.CloudProvider, node *apiv1.Node) *cloudprovider.GpuConfig {
 	gpuLabel := provider.GPULabel()
 	if NodeHasGpu(gpuLabel, node) {
-		return &cloudprovider.GpuConfig{Label: gpuLabel, Type: node.Labels[gpuLabel], ExtendedResourceName: ResourceNvidiaGPU}
+		return &cloudprovider.GpuConfig{
+			Label:                gpuLabel,
+			Type:                 node.Labels[gpuLabel],
+			ExtendedResourceName: DetectNodeGPUResourceName(node),
+		}
 	}
 	return nil
 }
diff --git a/cluster-autoscaler/utils/gpu/gpu_test.go b/cluster-autoscaler/utils/gpu/gpu_test.go
@@ -236,3 +236,70 @@ func TestGetGpuInfoForMetrics(t *testing.T) {
 		})
 	}
 }
+
+func TestDetectNodeGPUResourceName(t *testing.T) {
+	testCases := []struct {
+		name                 string
+		node                 *apiv1.Node
+		expectedResourceName apiv1.ResourceName
+	}{
+		{
+			name: "nvidia gpu",
+			node: &apiv1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "node-with-nvidia-gpu",
+					Labels: map[string]string{},
+				},
+				Status: apiv1.NodeStatus{
+					Capacity: apiv1.ResourceList{
+						gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
+					},
+					Allocatable: apiv1.ResourceList{
+						gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
+					},
+				},
+			},
+			expectedResourceName: gpu.ResourceNvidiaGPU,
+		},
+		{
+			name: "amd gpu",
+			node: &apiv1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "node-with-amd-gpu",
+					Labels: map[string]string{},
+				},
+				Status: apiv1.NodeStatus{
+					Capacity: apiv1.ResourceList{
+						gpu.ResourceAMDGPU: *resource.NewQuantity(8, resource.DecimalSI),
+					},
+					Allocatable: apiv1.ResourceList{
+						gpu.ResourceAMDGPU: *resource.NewQuantity(8, resource.DecimalSI),
+					},
+				},
+			},
+			expectedResourceName: gpu.ResourceAMDGPU,
+		},
+		{
+			name: "test default gpu resource name",
+			node: &apiv1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "node-without-gpu",
+					Labels: map[string]string{},
+				},
+				Status: apiv1.NodeStatus{
+					Capacity:    apiv1.ResourceList{},
+					Allocatable: apiv1.ResourceList{},
+				},
+			},
+			expectedResourceName: gpu.ResourceNvidiaGPU,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			resourceName := gpu.DetectNodeGPUResourceName(tc.node)
+			if resourceName != tc.expectedResourceName {
+				t.Errorf("expected resource name %s but got %s", tc.expectedResourceName, resourceName)
+			}
+		})
+	}
+}