From a29703ca59490ec6b0650a639be2457ead189a17 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 11 Mar 2026 06:53:35 +0000 Subject: [PATCH 1/6] npd --- commands.go | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++ go.mod | 4 ++ go.sum | 8 ++++ 3 files changed, 132 insertions(+) diff --git a/commands.go b/commands.go index d3274fef..6fab2399 100644 --- a/commands.go +++ b/commands.go @@ -4,7 +4,10 @@ import ( "context" "fmt" "os" + "os/exec" "path/filepath" + "strconv" + "strings" "sync" "sync/atomic" "time" @@ -12,6 +15,9 @@ import ( "github.com/sirupsen/logrus" "github.com/spf13/cobra" "google.golang.org/grpc" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" _ "github.com/Azure/AKSFlexNode/components" "github.com/Azure/AKSFlexNode/components/services/inmem" @@ -231,6 +237,7 @@ func startDaemonLoops( if driftEnabled { startNodeDriftDetectionAndRemediationLoop(ctx, cfg, conn, logger, cfgMu, bootstrapInProgress, detectors, wg) } + startNodeConditionLoop(ctx, cfg, logger, wg) } func snapshotConfig(cfg *config.Config, cfgMu *sync.RWMutex) *config.Config { @@ -460,3 +467,116 @@ func handleExecutionResult(result *bootstrapper.ExecutionResult, operation strin // For bootstrap, return error on failure return fmt.Errorf("%s failed: %s", operation, result.Error) } + +func getBootTime() (time.Time, error) { + data, err := os.ReadFile("/proc/uptime") + if err != nil { + return time.Time{}, fmt.Errorf("failed to read /proc/uptime: %w", err) + } + + // /proc/uptime contains two numbers: uptime in seconds and idle time + // We only need the first number + fields := strings.Fields(string(data)) + if len(fields) < 1 { + return time.Time{}, fmt.Errorf("invalid /proc/uptime format") + } + + uptimeSeconds, err := strconv.ParseFloat(fields[0], 64) + if err != nil { + return time.Time{}, fmt.Errorf("failed to parse uptime: %w", err) + } + + // Calculate boot time: current time - uptime + bootTime := time.Now().Add(-time.Duration(uptimeSeconds * float64(time.Second))) + return bootTime, nil +} + +func getNodeName() (string, error) { + data, err := os.ReadFile("/etc/hostname") + if err != nil { + return "", fmt.Errorf("failed to read /etc/hostname: %w", err) + } + + nodeName := strings.TrimSpace(string(data)) + if nodeName == "" { + return "", fmt.Errorf("node name is empty in /etc/hostname") + } + + return nodeName, nil +} + +func rebootNode() error { + rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", + "/bin/bash", "-c", "echo b > /proc/sysrq-trigger") + + return rebootCmd.Run() +} + +func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *logrus.Logger, wg *sync.WaitGroup) { + wg.Add(1) + + go func() { + defer wg.Done() + ticker := time.NewTicker(1 * time.Minute) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + now := time.Now() + logger.Infof("Starting node condition check at %s...", now.Format("2006-01-02 15:04:05")) + + // Load kubeconfig + config, err := clientcmd.BuildConfigFromFlags("", "/var/lib/kubelet/kubelet/kubeconfig") + if err != nil { + logger.Errorf("failed to load kubeconfig: %s", err.Error()) + return + } + + // Create Kubernetes clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + logger.Errorf("failed to create clientset: %s", err.Error()) + return + } + + nodeName, err := getNodeName() + if err != nil { + logger.Errorf("failed to get node name: %s", err.Error()) + return + } + + // Get the node + node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + if err != nil { + logger.Errorf("failed to get node %s: %s", nodeName, err.Error()) + } + + hostBootTime, err := getBootTime() + if err != nil { + logger.Errorf("failed to get host boot time: %s", err.Error()) + return + } + + for _, condition := range node.Status.Conditions { + switch condition.Type { + case "KernelDeadlock": + if condition.Status == "True" && condition.LastTransitionTime.Time.After(hostBootTime) { + logger.Infof("Node has a kernel deadlock since %s, rebooting...", + condition.LastTransitionTime.Time.Format("2006-01-02 15:04:05")) + + // Reboot the node + err := rebootNode() + if err != nil { + logger.Errorf("failed to reboot node: %s", err.Error()) + } + } + } + } + + logger.Infof("Node condition check completed successfully at %s", time.Now().Format("2006-01-02 15:04:05")) + } + } + }() +} diff --git a/go.mod b/go.mod index f0acdaf3..90f5c499 100644 --- a/go.mod +++ b/go.mod @@ -41,6 +41,7 @@ require ( github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect @@ -85,10 +86,13 @@ require ( go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.47.0 // indirect golang.org/x/net v0.49.0 // indirect + golang.org/x/oauth2 v0.33.0 // indirect golang.org/x/sys v0.40.0 // indirect golang.org/x/term v0.39.0 // indirect golang.org/x/text v0.33.0 // indirect + golang.org/x/time v0.9.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.34.3 // indirect diff --git a/go.sum b/go.sum index 9a807769..e9b74df3 100644 --- a/go.sum +++ b/go.sum @@ -187,6 +187,8 @@ github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjb github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -243,6 +245,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo= +golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -274,6 +278,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= +golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -296,6 +302,8 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/dnaeon/go-vcr.v4 v4.0.2 h1:7T5VYf2ifyK01ETHbJPl5A6XTpUljD4Trw3GEDcdedk= gopkg.in/dnaeon/go-vcr.v4 v4.0.2/go.mod h1:65yxh9goQVrudqofKtHA4JNFWd6XZRkWfKN4YpMx7KI= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= From 9c0d5f54763251c290a862c39317f4bd3a9329c4 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Thu, 12 Mar 2026 14:17:39 -0700 Subject: [PATCH 2/6] Update commands.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- commands.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/commands.go b/commands.go index 6fab2399..5e3f5949 100644 --- a/commands.go +++ b/commands.go @@ -492,14 +492,14 @@ func getBootTime() (time.Time, error) { } func getNodeName() (string, error) { - data, err := os.ReadFile("/etc/hostname") + host, err := os.Hostname() if err != nil { - return "", fmt.Errorf("failed to read /etc/hostname: %w", err) + return "", fmt.Errorf("failed to get hostname: %w", err) } - nodeName := strings.TrimSpace(string(data)) + nodeName := strings.TrimSpace(host) if nodeName == "" { - return "", fmt.Errorf("node name is empty in /etc/hostname") + return "", fmt.Errorf("node name is empty") } return nodeName, nil From 369dea15854e1e366f6fa3679e892983faa9f2aa Mon Sep 17 00:00:00 2001 From: Runzhen Date: Thu, 12 Mar 2026 14:18:09 -0700 Subject: [PATCH 3/6] Update commands.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- commands.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commands.go b/commands.go index 5e3f5949..a505d0f5 100644 --- a/commands.go +++ b/commands.go @@ -528,14 +528,14 @@ func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *log logger.Infof("Starting node condition check at %s...", now.Format("2006-01-02 15:04:05")) // Load kubeconfig - config, err := clientcmd.BuildConfigFromFlags("", "/var/lib/kubelet/kubelet/kubeconfig") + kubeConfig, err := clientcmd.BuildConfigFromFlags("", config.KubeletKubeconfigPath) if err != nil { logger.Errorf("failed to load kubeconfig: %s", err.Error()) return } // Create Kubernetes clientset - clientset, err := kubernetes.NewForConfig(config) + clientset, err := kubernetes.NewForConfig(kubeConfig) if err != nil { logger.Errorf("failed to create clientset: %s", err.Error()) return From 5e09cf86e45735e81848ac8aa9321646b1f6d65b Mon Sep 17 00:00:00 2001 From: Runzhen Date: Thu, 12 Mar 2026 14:23:54 -0700 Subject: [PATCH 4/6] Update commands.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- commands.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/commands.go b/commands.go index a505d0f5..708652b0 100644 --- a/commands.go +++ b/commands.go @@ -548,7 +548,9 @@ func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *log } // Get the node - node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + ctxWithTimeout, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + node, err := clientset.CoreV1().Nodes().Get(ctxWithTimeout, nodeName, metav1.GetOptions{}) if err != nil { logger.Errorf("failed to get node %s: %s", nodeName, err.Error()) } From 8ec560ea7fe4df0e58ea147d23b49610b544adbe Mon Sep 17 00:00:00 2001 From: Runzhen Date: Thu, 12 Mar 2026 14:25:24 -0700 Subject: [PATCH 5/6] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- commands.go | 1 + 1 file changed, 1 insertion(+) diff --git a/commands.go b/commands.go index 708652b0..84d8fb6f 100644 --- a/commands.go +++ b/commands.go @@ -553,6 +553,7 @@ func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *log node, err := clientset.CoreV1().Nodes().Get(ctxWithTimeout, nodeName, metav1.GetOptions{}) if err != nil { logger.Errorf("failed to get node %s: %s", nodeName, err.Error()) + continue } hostBootTime, err := getBootTime() From 1417ecb363eb48eed7fdf3b113e1f27d4b99f2bd Mon Sep 17 00:00:00 2001 From: Runzhen Date: Thu, 12 Mar 2026 14:25:43 -0700 Subject: [PATCH 6/6] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- commands.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/commands.go b/commands.go index 84d8fb6f..61ef00fa 100644 --- a/commands.go +++ b/commands.go @@ -534,23 +534,24 @@ func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *log return } + continue + } + // Create Kubernetes clientset clientset, err := kubernetes.NewForConfig(kubeConfig) if err != nil { logger.Errorf("failed to create clientset: %s", err.Error()) - return + continue } nodeName, err := getNodeName() if err != nil { logger.Errorf("failed to get node name: %s", err.Error()) - return + continue } // Get the node - ctxWithTimeout, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - node, err := clientset.CoreV1().Nodes().Get(ctxWithTimeout, nodeName, metav1.GetOptions{}) + node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) if err != nil { logger.Errorf("failed to get node %s: %s", nodeName, err.Error()) continue @@ -559,7 +560,7 @@ func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *log hostBootTime, err := getBootTime() if err != nil { logger.Errorf("failed to get host boot time: %s", err.Error()) - return + continue } for _, condition := range node.Status.Conditions {