From 3d8ee4cde542659bab005f80991fd0aa8e120e0b Mon Sep 17 00:00:00 2001 From: ronin Date: Mon, 17 Jul 2023 23:00:02 +0800 Subject: [PATCH 1/4] Fix bug causing potential pod error by using patch for annotations This commit switches from using 'update' to 'patch' when setting pod annotations. The previous 'update' approach was found to potentially cause pod errors. By using 'patch', we now only modify the specific fields that have changed, preventing the previous issue. Signed-off-by: ronin --- pkg/plugin/nvidia/utils.go | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pkg/plugin/nvidia/utils.go b/pkg/plugin/nvidia/utils.go index 330223fc6..f90f3a76f 100644 --- a/pkg/plugin/nvidia/utils.go +++ b/pkg/plugin/nvidia/utils.go @@ -18,7 +18,9 @@ package nvidia import ( "context" + "encoding/json" "fmt" + "k8s.io/apimachinery/pkg/types" "math" "strconv" "strings" @@ -188,6 +190,14 @@ func GetGPUIDsFromPodAnnotation(pod *v1.Pod) []int { return nil } +type objectForAddAnnotations struct { + Metadata objectMetaForPatch `json:"metadata"` +} + +type objectMetaForPatch struct { + Annotations map[string]string `json:"annotations,omitempty" protobuf:"bytes,12,rep,name=annotations"` +} + func UpdatePodAnnotations(kubeClient *kubernetes.Clientset, pod *v1.Pod) error { pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) if err != nil { @@ -196,10 +206,14 @@ func UpdatePodAnnotations(kubeClient *kubernetes.Clientset, pod *v1.Pod) error { if len(pod.Annotations) == 0 { pod.Annotations = map[string]string{} } - pod.Annotations[GPUAssigned] = "true" - // TODO(@hzxuzhonghu): use patch instead - _, err = kubeClient.CoreV1().Pods(pod.Namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}) + addAnnotationsPath := objectForAddAnnotations{ + Metadata: objectMetaForPatch{ + Annotations: pod.Annotations, + }, + } + patchBytes, err := json.Marshal(&addAnnotationsPath) + _, err = kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) return err } From a022a3862a720e741eb86b3263b2cbac284674b4 Mon Sep 17 00:00:00 2001 From: ronin Date: Mon, 17 Jul 2023 23:10:43 +0800 Subject: [PATCH 2/4] Grant pod patch permission to clusterRole for annotation updates This commit adds 'patch' permission to the clusterRole associated with our plugin. This change allows the plugin to patch pod annotations more safely, reducing the chance of errors and conflicts during updates. Signed-off-by: ronin --- volcano-device-plugin-GKE.yml | 2 +- volcano-device-plugin.yml | 2 +- volcano-vgpu-device-plugin.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/volcano-device-plugin-GKE.yml b/volcano-device-plugin-GKE.yml index 07940673e..5b77641cc 100644 --- a/volcano-device-plugin-GKE.yml +++ b/volcano-device-plugin-GKE.yml @@ -32,7 +32,7 @@ rules: verbs: ["patch"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update"] + verbs: ["get", "list", "update", "patch"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 diff --git a/volcano-device-plugin.yml b/volcano-device-plugin.yml index 1879e7664..f675bfb57 100644 --- a/volcano-device-plugin.yml +++ b/volcano-device-plugin.yml @@ -32,7 +32,7 @@ rules: verbs: ["patch"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update","patch"] + verbs: ["get", "list", "update", "patch"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 diff --git a/volcano-vgpu-device-plugin.yml b/volcano-vgpu-device-plugin.yml index e71b42d88..52fb2e3ad 100644 --- a/volcano-vgpu-device-plugin.yml +++ b/volcano-vgpu-device-plugin.yml @@ -32,7 +32,7 @@ rules: verbs: ["patch"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update","patch"] + verbs: ["get", "list", "update", "patch"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 From de46f0519597572a2ee2a3029ad02ee1d43b1af7 Mon Sep 17 00:00:00 2001 From: ronin Date: Mon, 17 Jul 2023 23:13:26 +0800 Subject: [PATCH 3/4] Revert "Grant pod patch permission to clusterRole for annotation updates" This reverts commit 053d614bb71e0f30206e176b848915274f4705c8. Signed-off-by: ronin --- volcano-device-plugin-GKE.yml | 2 +- volcano-device-plugin.yml | 2 +- volcano-vgpu-device-plugin.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/volcano-device-plugin-GKE.yml b/volcano-device-plugin-GKE.yml index 5b77641cc..07940673e 100644 --- a/volcano-device-plugin-GKE.yml +++ b/volcano-device-plugin-GKE.yml @@ -32,7 +32,7 @@ rules: verbs: ["patch"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "patch"] + verbs: ["get", "list", "update"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 diff --git a/volcano-device-plugin.yml b/volcano-device-plugin.yml index f675bfb57..1879e7664 100644 --- a/volcano-device-plugin.yml +++ b/volcano-device-plugin.yml @@ -32,7 +32,7 @@ rules: verbs: ["patch"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "patch"] + verbs: ["get", "list", "update","patch"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 diff --git a/volcano-vgpu-device-plugin.yml b/volcano-vgpu-device-plugin.yml index 52fb2e3ad..e71b42d88 100644 --- a/volcano-vgpu-device-plugin.yml +++ b/volcano-vgpu-device-plugin.yml @@ -32,7 +32,7 @@ rules: verbs: ["patch"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "patch"] + verbs: ["get", "list", "update","patch"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 From a8807c7cfdbb2889aef702052d4e72148a20ed8f Mon Sep 17 00:00:00 2001 From: ronin Date: Mon, 17 Jul 2023 23:18:49 +0800 Subject: [PATCH 4/4] Introduce error handling for potential JSON encoding errors This commit adds an error handler to catch and manage potential JSON encoding errors. By handling these errors explicitly, we can prevent crashes and improve the stability of the application. Signed-off-by: ronin --- pkg/plugin/nvidia/utils.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/plugin/nvidia/utils.go b/pkg/plugin/nvidia/utils.go index f90f3a76f..074a93975 100644 --- a/pkg/plugin/nvidia/utils.go +++ b/pkg/plugin/nvidia/utils.go @@ -213,7 +213,12 @@ func UpdatePodAnnotations(kubeClient *kubernetes.Clientset, pod *v1.Pod) error { Annotations: pod.Annotations, }, } + patchBytes, err := json.Marshal(&addAnnotationsPath) + if err != nil { + return err + } + _, err = kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) return err }