Skip to content

Commit 3a75de8

Browse files
committed
[Feature] Make GPU utils function neutral for vendor's resource names
Signed-off-by: yansun1996 <Yan.Sun3@amd.com>
1 parent 2944dcb commit 3a75de8

File tree

2 files changed

+112
-5
lines changed

2 files changed

+112
-5
lines changed

cluster-autoscaler/utils/gpu/gpu.go

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
)
2727

2828
const (
29+
// ResourceAMDGPU is the name of the AMD GPU resource.
30+
ResourceAMDGPU = "amd.com/gpu"
2931
// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
3032
ResourceNvidiaGPU = "nvidia.com/gpu"
3133
// ResourceDirectX is the name of the DirectX resource on windows.
@@ -35,6 +37,14 @@ const (
3537
DefaultGPUType = "nvidia-tesla-k80"
3638
)
3739

40+
// GPUVendorResourceNames is the centralized list of all known GPU vendor extended resource names.
41+
// Extend this slice if new vendor resource names are added.
42+
var GPUVendorResourceNames = []apiv1.ResourceName{
43+
ResourceNvidiaGPU,
44+
ResourceAMDGPU,
45+
ResourceDirectX,
46+
}
47+
3848
const (
3949
// MetricsGenericGPU - for when there is no information about GPU type
4050
MetricsGenericGPU = "generic"
@@ -109,23 +119,53 @@ func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {
109119
// if the drivers are installed and GPU is ready to use.
110120
func NodeHasGpu(GPULabel string, node *apiv1.Node) bool {
111121
_, hasGpuLabel := node.Labels[GPULabel]
112-
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
113-
return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero())
122+
if hasGpuLabel {
123+
return true
124+
}
125+
// Check for extended resources as well
126+
for _, gpuVendorResourceName := range GPUVendorResourceNames {
127+
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
128+
if hasGpuAllocatable && !gpuAllocatable.IsZero() {
129+
return true
130+
}
131+
}
132+
return false
114133
}
115134

116135
// PodRequestsGpu returns true if a given pod has GPU request.
117136
func PodRequestsGpu(pod *apiv1.Pod) bool {
118137
podRequests := podutils.PodRequests(pod)
119-
_, gpuFound := podRequests[ResourceNvidiaGPU]
120-
return gpuFound
138+
for _, gpuVendorResourceName := range GPUVendorResourceNames {
139+
if _, found := podRequests[gpuVendorResourceName]; found {
140+
return true
141+
}
142+
}
143+
return false
144+
}
145+
146+
// DetectNodeGPUResourceName inspects the node's allocatable resources and returns the first
147+
// known GPU extended resource name that has non-zero allocatable. Falls back to Nvidia for
148+
// backward compatibility whenever none are found; node labels are not checked by this function.
149+
func DetectNodeGPUResourceName(node *apiv1.Node) apiv1.ResourceName {
150+
for _, rn := range GPUVendorResourceNames {
151+
if qty, ok := node.Status.Allocatable[rn]; ok && !qty.IsZero() {
152+
return rn
153+
}
154+
}
155+
// Fallback: preserve previous behavior (defaulting to Nvidia), e.g. when the caller matched the node only by its GPU label and no GPU resource is allocatable yet.
156+
return ResourceNvidiaGPU
121157
}
122158

123159
// GetNodeGPUFromCloudProvider returns the GPU the node has. Returned GPU has the GPU label of the
124160
// passed in cloud provider. If the node doesn't have a GPU, returns nil.
125161
func GetNodeGPUFromCloudProvider(provider cloudprovider.CloudProvider, node *apiv1.Node) *cloudprovider.GpuConfig {
126162
gpuLabel := provider.GPULabel()
127163
if NodeHasGpu(gpuLabel, node) {
128-
return &cloudprovider.GpuConfig{Label: gpuLabel, Type: node.Labels[gpuLabel], ExtendedResourceName: ResourceNvidiaGPU}
164+
return &cloudprovider.GpuConfig{
165+
Label: gpuLabel,
166+
Type: node.Labels[gpuLabel],
167+
ExtendedResourceName: DetectNodeGPUResourceName(node),
168+
}
129169
}
130170
return nil
131171
}

cluster-autoscaler/utils/gpu/gpu_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,70 @@ func TestGetGpuInfoForMetrics(t *testing.T) {
236236
})
237237
}
238238
}
239+
240+
func TestDetectNodeGPUResourceName(t *testing.T) {
241+
testCases := []struct {
242+
name string
243+
node *apiv1.Node
244+
expectedResourceName apiv1.ResourceName
245+
}{
246+
{
247+
name: "nvidia gpu",
248+
node: &apiv1.Node{
249+
ObjectMeta: metav1.ObjectMeta{
250+
Name: "node-with-nvidia-gpu",
251+
Labels: map[string]string{},
252+
},
253+
Status: apiv1.NodeStatus{
254+
Capacity: apiv1.ResourceList{
255+
gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
256+
},
257+
Allocatable: apiv1.ResourceList{
258+
gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
259+
},
260+
},
261+
},
262+
expectedResourceName: gpu.ResourceNvidiaGPU,
263+
},
264+
{
265+
name: "amd gpu",
266+
node: &apiv1.Node{
267+
ObjectMeta: metav1.ObjectMeta{
268+
Name: "node-with-amd-gpu",
269+
Labels: map[string]string{},
270+
},
271+
Status: apiv1.NodeStatus{
272+
Capacity: apiv1.ResourceList{
273+
gpu.ResourceAMDGPU: *resource.NewQuantity(8, resource.DecimalSI),
274+
},
275+
Allocatable: apiv1.ResourceList{
276+
gpu.ResourceAMDGPU: *resource.NewQuantity(8, resource.DecimalSI),
277+
},
278+
},
279+
},
280+
expectedResourceName: gpu.ResourceAMDGPU,
281+
},
282+
{
283+
name: "test default gpu resource name",
284+
node: &apiv1.Node{
285+
ObjectMeta: metav1.ObjectMeta{
286+
Name: "node-without-gpu",
287+
Labels: map[string]string{},
288+
},
289+
Status: apiv1.NodeStatus{
290+
Capacity: apiv1.ResourceList{},
291+
Allocatable: apiv1.ResourceList{},
292+
},
293+
},
294+
expectedResourceName: gpu.ResourceNvidiaGPU,
295+
},
296+
}
297+
for _, tc := range testCases {
298+
t.Run(tc.name, func(t *testing.T) {
299+
resourceName := gpu.DetectNodeGPUResourceName(tc.node)
300+
if resourceName != tc.expectedResourceName {
301+
t.Errorf("expected resource name %s but got %s", tc.expectedResourceName, resourceName)
302+
}
303+
})
304+
}
305+
}

0 commit comments

Comments
 (0)