diff --git a/commands.go b/commands.go
index d3274fe..61ef00f 100644
--- a/commands.go
+++ b/commands.go
@@ -4,7 +4,10 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"os/exec"
 	"path/filepath"
+	"strconv"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -12,6 +15,9 @@ import (
 	"github.com/sirupsen/logrus"
 	"github.com/spf13/cobra"
 	"google.golang.org/grpc"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
 
 	_ "github.com/Azure/AKSFlexNode/components"
 	"github.com/Azure/AKSFlexNode/components/services/inmem"
@@ -231,6 +237,7 @@ func startDaemonLoops(
 	if driftEnabled {
 		startNodeDriftDetectionAndRemediationLoop(ctx, cfg, conn, logger, cfgMu, bootstrapInProgress, detectors, wg)
 	}
+	startNodeConditionLoop(ctx, cfg, logger, wg)
 }
 
 func snapshotConfig(cfg *config.Config, cfgMu *sync.RWMutex) *config.Config {
@@ -460,3 +467,132 @@ func handleExecutionResult(result *bootstrapper.ExecutionResult, operation strin
 	// For bootstrap, return error on failure
 	return fmt.Errorf("%s failed: %s", operation, result.Error)
 }
+
+// getBootTime estimates when the host booted by subtracting the uptime
+// reported in /proc/uptime from the current time.
+func getBootTime() (time.Time, error) {
+	data, err := os.ReadFile("/proc/uptime")
+	if err != nil {
+		return time.Time{}, fmt.Errorf("failed to read /proc/uptime: %w", err)
+	}
+
+	// /proc/uptime contains two numbers: uptime in seconds and idle time
+	// We only need the first number
+	fields := strings.Fields(string(data))
+	if len(fields) < 1 {
+		return time.Time{}, fmt.Errorf("invalid /proc/uptime format")
+	}
+
+	uptimeSeconds, err := strconv.ParseFloat(fields[0], 64)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("failed to parse uptime: %w", err)
+	}
+
+	// Calculate boot time: current time - uptime
+	bootTime := time.Now().Add(-time.Duration(uptimeSeconds * float64(time.Second)))
+	return bootTime, nil
+}
+
+// getNodeName returns the whitespace-trimmed OS hostname; the condition loop
+// uses it as the name of the Node object to fetch. An empty name is an error.
+func getNodeName() (string, error) {
+	host, err := os.Hostname()
+	if err != nil {
+		return "", fmt.Errorf("failed to get hostname: %w", err)
+	}
+
+	nodeName := strings.TrimSpace(host)
+	if nodeName == "" {
+		return "", fmt.Errorf("node name is empty")
+	}
+
+	return nodeName, nil
+}
+
+// rebootNode runs bash inside the mount namespace of PID 1 (via nsenter) and
+// writes "b" to /proc/sysrq-trigger.
+// NOTE(review): sysrq "b" presumably reboots immediately without syncing or
+// unmounting filesystems - confirm that is the intended behavior.
+func rebootNode() error {
+	rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt",
+		"/bin/bash", "-c", "echo b > /proc/sysrq-trigger")
+
+	return rebootCmd.Run()
+}
+
+// startNodeConditionLoop starts a goroutine, tracked by wg, that checks the
+// local node's conditions once a minute until ctx is cancelled. When the
+// KernelDeadlock condition is "True" and transitioned after the host booted,
+// the node is rebooted. A failed check is logged and retried on the next tick.
+func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *logrus.Logger, wg *sync.WaitGroup) {
+	wg.Add(1)
+
+	go func() {
+		defer wg.Done()
+		ticker := time.NewTicker(1 * time.Minute)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				now := time.Now()
+				logger.Infof("Starting node condition check at %s...", now.Format("2006-01-02 15:04:05"))
+
+				// Load kubeconfig. Any failure below skips this tick only; the
+				// loop must keep running so the check is retried a minute later.
+				kubeConfig, err := clientcmd.BuildConfigFromFlags("", config.KubeletKubeconfigPath)
+				if err != nil {
+					logger.Errorf("failed to load kubeconfig: %s", err.Error())
+					continue
+				}
+
+				// Create Kubernetes clientset
+				clientset, err := kubernetes.NewForConfig(kubeConfig)
+				if err != nil {
+					logger.Errorf("failed to create clientset: %s", err.Error())
+					continue
+				}
+
+				nodeName, err := getNodeName()
+				if err != nil {
+					logger.Errorf("failed to get node name: %s", err.Error())
+					continue
+				}
+
+				// Get the node, bound to ctx so shutdown cancels the request
+				node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+				if err != nil {
+					logger.Errorf("failed to get node %s: %s", nodeName, err.Error())
+					continue
+				}
+
+				hostBootTime, err := getBootTime()
+				if err != nil {
+					logger.Errorf("failed to get host boot time: %s", err.Error())
+					continue
+				}
+
+				for _, condition := range node.Status.Conditions {
+					switch condition.Type {
+					case "KernelDeadlock":
+						// Only act on a deadlock reported after the last boot, presumably
+						// so a condition left over from before a reboot is ignored.
+						if condition.Status == "True" && condition.LastTransitionTime.Time.After(hostBootTime) {
+							logger.Infof("Node has a kernel deadlock since %s, rebooting...",
+								condition.LastTransitionTime.Time.Format("2006-01-02 15:04:05"))
+
+							// Reboot the node
+							err := rebootNode()
+							if err != nil {
+								logger.Errorf("failed to reboot node: %s", err.Error())
+							}
+						}
+					}
+				}
+
+				logger.Infof("Node condition check completed successfully at %s", time.Now().Format("2006-01-02 15:04:05"))
+			}
+		}
+	}()
+}
diff --git a/go.mod b/go.mod
index f0acdaf..90f5c49 100644
--- a/go.mod
+++ b/go.mod
@@ -41,6 +41,7 @@ require (
 	github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/emicklei/go-restful/v3 v3.13.0 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
@@ -85,10 +86,13 @@ require (
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
 	golang.org/x/crypto v0.47.0 // indirect
 	golang.org/x/net v0.49.0 // indirect
+	golang.org/x/oauth2 v0.33.0 // indirect
 	golang.org/x/sys v0.40.0 // indirect
 	golang.org/x/term v0.39.0 // indirect
 	golang.org/x/text v0.33.0 // indirect
+	golang.org/x/time v0.9.0 // indirect
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect
+	gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 	k8s.io/apiextensions-apiserver v0.34.3 // indirect
diff --git a/go.sum b/go.sum
index 9a80776..e9b74df 100644
--- a/go.sum
+++ b/go.sum
@@ -187,6 +187,8 @@ github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjb
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -243,6 +245,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug
 golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
 golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
 golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
+golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
+golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -274,6 +278,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
 golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
+golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
+golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
@@ -296,6 +302,8 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
 gopkg.in/dnaeon/go-vcr.v4 v4.0.2 h1:7T5VYf2ifyK01ETHbJPl5A6XTpUljD4Trw3GEDcdedk=
 gopkg.in/dnaeon/go-vcr.v4 v4.0.2/go.mod h1:65yxh9goQVrudqofKtHA4JNFWd6XZRkWfKN4YpMx7KI=
+gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
+gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
 gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
 gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=