Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion pkg/device-plugin/nvidiadevice/nvinternal/plugin/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,27 @@ func GetNextDeviceRequest(dtype string, p corev1.Pod) (corev1.Container, device.
if !ok {
return corev1.Container{}, res, errors.New("device request not found")
}

// The annotation format follows the order: init containers first, then regular containers
// Index mapping:
// 0 to len(InitContainers)-1: init containers
// len(InitContainers) to len(InitContainers)+len(Containers)-1: regular containers
initContainerCount := len(p.Spec.InitContainers)

for ctridx, ctrDevice := range pd {
if len(ctrDevice) > 0 {
return p.Spec.Containers[ctridx], ctrDevice, nil
if ctridx < initContainerCount {
// This is an init container
klog.Infof("Found device request in init container at index %d, name: %s", ctridx, p.Spec.InitContainers[ctridx].Name)
return p.Spec.InitContainers[ctridx], ctrDevice, nil
} else {
// This is a regular container
regularContainerIdx := ctridx - initContainerCount
if regularContainerIdx < len(p.Spec.Containers) {
klog.Infof("Found device request in container at index %d (original idx: %d), name: %s", regularContainerIdx, ctridx, p.Spec.Containers[regularContainerIdx].Name)
return p.Spec.Containers[regularContainerIdx], ctrDevice, nil
}
}
}
}
return corev1.Container{}, res, errors.New("device request not found")
Expand Down
206 changes: 205 additions & 1 deletion pkg/device-plugin/nvidiadevice/nvinternal/plugin/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ func TestGenerateMigTemplate(t *testing.T) {
expectedPos: 1,
expectedReset: true,
expectedMig: map[string]int32{
"1g.5gb": 1,
"1g.5gb": 1,
"2g.10gb": 3,
},
},
Expand Down Expand Up @@ -161,6 +161,210 @@ func TestGenerateMigTemplate(t *testing.T) {
}
}

// TestGetNextDeviceRequest_DeviceInRegularContainer covers a pod that has no
// init containers: the device listed in the first annotation entry must be
// returned together with the pod's first (and only) regular container.
func TestGetNextDeviceRequest_DeviceInRegularContainer(t *testing.T) {
	// Pin the package-level device.InRequestDevices mapping for the duration
	// of the test and put the previous value back on exit.
	saved := device.InRequestDevices
	defer func() { device.InRequestDevices = saved }()

	const annoKey = "hami.io/vgpu-devices-to-allocate"
	device.InRequestDevices = map[string]string{
		"NVIDIA": annoKey,
	}

	// Annotation layout: "UUID,Type,mem,cores:;".
	// Splitting on ";" gives ["UUID,Type,mem,cores:", ""]; with no init
	// containers, entry 0 corresponds to regular container 0.
	meta := metav1.ObjectMeta{
		Name:      "test-pod",
		Namespace: "default",
		Annotations: map[string]string{
			annoKey: "GPU-abc123,NVIDIA,1000,30:;",
		},
	}
	spec := corev1.PodSpec{
		Containers: []corev1.Container{
			{Name: "main-container"},
		},
	}
	pod := corev1.Pod{ObjectMeta: meta, Spec: spec}

	gotCtr, gotDevs, err := GetNextDeviceRequest("NVIDIA", pod)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if name := gotCtr.Name; name != "main-container" {
		t.Errorf("expected container name 'main-container', got '%s'", name)
	}
	// Fatal here: the UUID check below indexes element 0.
	if n := len(gotDevs); n != 1 {
		t.Fatalf("expected 1 device, got %d", n)
	}
	if uuid := gotDevs[0].UUID; uuid != "GPU-abc123" {
		t.Errorf("expected UUID 'GPU-abc123', got '%s'", uuid)
	}
}

// TestGetNextDeviceRequest_DeviceInInitContainer covers a pod whose only
// device request sits in an init container: the init container itself, not
// the regular container, must be returned along with its device.
func TestGetNextDeviceRequest_DeviceInInitContainer(t *testing.T) {
	// Pin the package-level device.InRequestDevices mapping for the duration
	// of the test and put the previous value back on exit.
	saved := device.InRequestDevices
	defer func() { device.InRequestDevices = saved }()

	const annoKey = "hami.io/vgpu-devices-to-allocate"
	device.InRequestDevices = map[string]string{
		"NVIDIA": annoKey,
	}

	// One init container (with a device) followed by one regular container
	// (without one).
	// Annotation: "GPU-init1,NVIDIA,500,10:;;"
	// Split on ";": ["GPU-init1,NVIDIA,500,10:", "", ""]
	//   entry 0 -> init container 0 (has the device)
	//   entry 1 -> regular container 0 (empty)
	meta := metav1.ObjectMeta{
		Name:      "test-pod-init",
		Namespace: "default",
		Annotations: map[string]string{
			annoKey: "GPU-init1,NVIDIA,500,10:;;",
		},
	}
	spec := corev1.PodSpec{
		InitContainers: []corev1.Container{
			{Name: "init-with-gpu"},
		},
		Containers: []corev1.Container{
			{Name: "main-no-gpu"},
		},
	}
	pod := corev1.Pod{ObjectMeta: meta, Spec: spec}

	gotCtr, gotDevs, err := GetNextDeviceRequest("NVIDIA", pod)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if name := gotCtr.Name; name != "init-with-gpu" {
		t.Errorf("expected container name 'init-with-gpu', got '%s'", name)
	}
	// Fatal here: the UUID check below indexes element 0.
	if n := len(gotDevs); n != 1 {
		t.Fatalf("expected 1 device, got %d", n)
	}
	if uuid := gotDevs[0].UUID; uuid != "GPU-init1" {
		t.Errorf("expected UUID 'GPU-init1', got '%s'", uuid)
	}
}

// TestGetNextDeviceRequest_DeviceInRegularContainerWithInitOffset covers a pod
// with two device-less init containers ahead of a regular container that
// holds the device: the annotation index must be shifted by the number of
// init containers before it is used to pick the regular container.
func TestGetNextDeviceRequest_DeviceInRegularContainerWithInitOffset(t *testing.T) {
	// Pin the package-level device.InRequestDevices mapping for the duration
	// of the test and put the previous value back on exit.
	saved := device.InRequestDevices
	defer func() { device.InRequestDevices = saved }()

	const annoKey = "hami.io/vgpu-devices-to-allocate"
	device.InRequestDevices = map[string]string{
		"NVIDIA": annoKey,
	}

	// Two init containers (no device) followed by one regular container
	// (with a device).
	// Annotation: ";;GPU-main1,NVIDIA,2000,50:;"
	// Split on ";": ["", "", "GPU-main1,NVIDIA,2000,50:", ""]
	//   entry 0 -> init container 0 (empty)
	//   entry 1 -> init container 1 (empty)
	//   entry 2 -> regular container 0 (has the device; 2 - 2 = 0)
	meta := metav1.ObjectMeta{
		Name:      "test-pod-offset",
		Namespace: "default",
		Annotations: map[string]string{
			annoKey: ";;GPU-main1,NVIDIA,2000,50:;",
		},
	}
	spec := corev1.PodSpec{
		InitContainers: []corev1.Container{
			{Name: "init1-no-gpu"},
			{Name: "init2-no-gpu"},
		},
		Containers: []corev1.Container{
			{Name: "main-with-gpu"},
		},
	}
	pod := corev1.Pod{ObjectMeta: meta, Spec: spec}

	gotCtr, gotDevs, err := GetNextDeviceRequest("NVIDIA", pod)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if name := gotCtr.Name; name != "main-with-gpu" {
		t.Errorf("expected container name 'main-with-gpu', got '%s'", name)
	}
	// Fatal here: the UUID check below indexes element 0.
	if n := len(gotDevs); n != 1 {
		t.Fatalf("expected 1 device, got %d", n)
	}
	if uuid := gotDevs[0].UUID; uuid != "GPU-main1" {
		t.Errorf("expected UUID 'GPU-main1', got '%s'", uuid)
	}
}

// TestGetNextDeviceRequest_NoDeviceFound covers a pod whose annotation is
// present but lists no device for any container: the call must fail with the
// "device request not found" error.
func TestGetNextDeviceRequest_NoDeviceFound(t *testing.T) {
	// Pin the package-level device.InRequestDevices mapping for the duration
	// of the test and put the previous value back on exit.
	saved := device.InRequestDevices
	defer func() { device.InRequestDevices = saved }()

	const annoKey = "hami.io/vgpu-devices-to-allocate"
	device.InRequestDevices = map[string]string{
		"NVIDIA": annoKey,
	}

	// Every annotation entry is empty.
	// Annotation: ";;"
	// Split on ";": ["", "", ""]
	meta := metav1.ObjectMeta{
		Name:      "test-pod-empty",
		Namespace: "default",
		Annotations: map[string]string{
			annoKey: ";;",
		},
	}
	spec := corev1.PodSpec{
		InitContainers: []corev1.Container{
			{Name: "init1"},
		},
		Containers: []corev1.Container{
			{Name: "main1"},
		},
	}
	pod := corev1.Pod{ObjectMeta: meta, Spec: spec}

	// A nil error aborts the test; a wrong message is only reported.
	switch _, _, err := GetNextDeviceRequest("NVIDIA", pod); {
	case err == nil:
		t.Fatal("expected error 'device request not found', got nil")
	case err.Error() != "device request not found":
		t.Errorf("expected error 'device request not found', got '%s'", err.Error())
	}
}

// TestGetNextDeviceRequest_DeviceTypeNotFound covers a request for a device
// type that has no entry in the decoded pod devices: the pod only carries an
// NVIDIA annotation, so asking for "AMD" must fail with the
// "device request not found" error.
func TestGetNextDeviceRequest_DeviceTypeNotFound(t *testing.T) {
	// Pin the package-level device.InRequestDevices mapping for the duration
	// of the test and put the previous value back on exit.
	oldInRequestDevices := device.InRequestDevices
	defer func() { device.InRequestDevices = oldInRequestDevices }()

	device.InRequestDevices = map[string]string{
		"NVIDIA": "hami.io/vgpu-devices-to-allocate",
	}

	// Pod with annotation for NVIDIA, but we ask for a non-existent device type
	pod := corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-pod-notype",
			Namespace: "default",
			Annotations: map[string]string{
				"hami.io/vgpu-devices-to-allocate": "GPU-abc,NVIDIA,1000,30:;",
			},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{
				{Name: "main"},
			},
		},
	}

	_, _, err := GetNextDeviceRequest("AMD", pod)
	if err == nil {
		t.Fatal("expected error 'device request not found', got nil")
	}
	// Check the message as well, matching TestGetNextDeviceRequest_NoDeviceFound,
	// so that an unrelated failure cannot satisfy this test.
	if err.Error() != "device request not found" {
		t.Errorf("expected error 'device request not found', got '%s'", err.Error())
	}
}

func Test_PodAllocationTrySuccess(t *testing.T) {
// Initialize fake clientset and pre-load test data
client.KubeClient = fake.NewSimpleClientset()
Expand Down
42 changes: 34 additions & 8 deletions pkg/device/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,9 +386,11 @@ func DecodePodDevices(checklist map[string]string, annos map[string]string) (Pod
if err != nil {
return PodDevices{}, nil
}
if len(cd) == 0 {
continue
}
// IMPORTANT: Do NOT skip empty ContainerDevices!
// We must preserve the index mapping between annotation entries and pod containers.
// The annotation format is: "dev1:;dev2:;;dev3:;" where ; separates containers
// If we skip empty entries, the index mapping will be broken for multi-container pods
// (especially pods with init containers where some containers don't use devices)
pd[devID] = append(pd[devID], cd)
}
}
Expand Down Expand Up @@ -484,24 +486,48 @@ func ExtractMigTemplatesFromUUID(uuid string) (int, int, error) {
}

func Resourcereqs(pod *corev1.Pod) (counts PodDeviceRequests) {
counts = make(PodDeviceRequests, len(pod.Spec.Containers))
// Total containers = init containers + regular containers
totalContainers := len(pod.Spec.InitContainers) + len(pod.Spec.Containers)
counts = make(PodDeviceRequests, totalContainers)
klog.V(4).InfoS("Processing resource requirements",
"pod", klog.KObj(pod),
"containerCount", len(pod.Spec.Containers))
"initContainerCount", len(pod.Spec.InitContainers),
"containerCount", len(pod.Spec.Containers),
"totalContainers", totalContainers)
//Count Nvidia GPU
cnt := int32(0)
for i := range pod.Spec.Containers {

// Process init containers first (indices 0 to len(InitContainers)-1)
for i := range pod.Spec.InitContainers {
devices := GetDevices()
counts[i] = make(ContainerDeviceRequests)
klog.V(5).InfoS("Processing container resources",
klog.V(5).InfoS("Processing init container resources",
"pod", klog.KObj(pod),
"containerIndex", i,
"containerName", pod.Spec.InitContainers[i].Name)
for idx, val := range devices {
request := val.GenerateResourceRequests(&pod.Spec.InitContainers[i])
if request.Nums > 0 {
cnt += request.Nums
counts[i][idx] = request
}
}
}

// Process regular containers (indices len(InitContainers) to totalContainers-1)
initContainerOffset := len(pod.Spec.InitContainers)
for i := range pod.Spec.Containers {
devices := GetDevices()
counts[initContainerOffset+i] = make(ContainerDeviceRequests)
klog.V(5).InfoS("Processing container resources",
"pod", klog.KObj(pod),
"containerIndex", initContainerOffset+i,
"containerName", pod.Spec.Containers[i].Name)
for idx, val := range devices {
request := val.GenerateResourceRequests(&pod.Spec.Containers[i])
if request.Nums > 0 {
cnt += request.Nums
counts[i][idx] = request
counts[initContainerOffset+i][idx] = request
}
}
}
Expand Down
Loading
Loading