Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,20 @@
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"google.golang.org/grpc"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"

_ "github.com/Azure/AKSFlexNode/components"
"github.com/Azure/AKSFlexNode/components/services/inmem"
Expand Down Expand Up @@ -231,6 +237,7 @@
if driftEnabled {
startNodeDriftDetectionAndRemediationLoop(ctx, cfg, conn, logger, cfgMu, bootstrapInProgress, detectors, wg)
}
startNodeConditionLoop(ctx, cfg, logger, wg)
}

func snapshotConfig(cfg *config.Config, cfgMu *sync.RWMutex) *config.Config {
Expand Down Expand Up @@ -460,3 +467,120 @@
// For bootstrap, return error on failure
return fmt.Errorf("%s failed: %s", operation, result.Error)
}

// getBootTime estimates the host's boot time by subtracting the uptime
// reported in /proc/uptime from the current wall-clock time.
//
// It returns the zero time.Time and a non-nil error when /proc/uptime cannot
// be read, is empty, or its first field is not a valid floating-point number.
func getBootTime() (time.Time, error) {
	raw, readErr := os.ReadFile("/proc/uptime")
	if readErr != nil {
		return time.Time{}, fmt.Errorf("failed to read /proc/uptime: %w", readErr)
	}

	// The file holds two numbers (uptime seconds, then idle seconds);
	// only the first one is needed here.
	tokens := strings.Fields(string(raw))
	if len(tokens) == 0 {
		return time.Time{}, fmt.Errorf("invalid /proc/uptime format")
	}

	seconds, parseErr := strconv.ParseFloat(tokens[0], 64)
	if parseErr != nil {
		return time.Time{}, fmt.Errorf("failed to parse uptime: %w", parseErr)
	}

	// Boot time is "now" moved back by the time elapsed since boot.
	sinceBoot := time.Duration(seconds * float64(time.Second))
	return time.Now().Add(-sinceBoot), nil
}

// getNodeName returns the operating system's hostname with surrounding
// whitespace trimmed, for use as the node name.
//
// It returns an error when the hostname cannot be obtained or is empty after
// trimming.
func getNodeName() (string, error) {
	hostname, lookupErr := os.Hostname()
	if lookupErr != nil {
		return "", fmt.Errorf("failed to get hostname: %w", lookupErr)
	}

	if trimmed := strings.TrimSpace(hostname); trimmed != "" {
		return trimmed, nil
	}

	return "", fmt.Errorf("node name is empty")
}

// rebootNode forces an immediate reboot of the host by running a shell inside
// PID 1's mount namespace (via nsenter) that writes "b" to /proc/sysrq-trigger.
//
// The command is bounded by a 30-second timeout so a wedged nsenter or shell
// cannot block the caller indefinitely, and its combined stdout/stderr is
// attached to the returned error to make failures diagnosable.
//
// On success the host normally goes down before this function can return;
// a nil return therefore only means the trigger was written without error.
// A non-nil error means the command could not be run, exited non-zero, or
// was killed by the timeout.
func rebootNode() error {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// #nosec G204 -- command and arguments are constant literals; no user input is interpolated.
	rebootCmd := exec.CommandContext(ctx, "/usr/bin/nsenter", "-m/proc/1/ns/mnt",
		"/bin/bash", "-c", "echo b > /proc/sysrq-trigger")

	output, err := rebootCmd.CombinedOutput()
	if err == nil {
		return nil
	}

	// Check err before the deadline so a nil error is never wrapped with %w.
	trimmed := strings.TrimSpace(string(output))
	if ctx.Err() == context.DeadlineExceeded {
		return fmt.Errorf("reboot command timed out: %w; output: %s", err, trimmed)
	}
	return fmt.Errorf("reboot command failed: %w; output: %s", err, trimmed)
}
Comment on lines +508 to +513
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This introduces an unconditional host reboot action (via sysrq-trigger) in the main daemon loop. Consider gating this behavior behind an explicit config/feature flag (similar to EnableDriftDetectionAndRemediation) and adding rate limiting/guardrails to reduce the risk of reboot loops or unexpected reboots in environments that don’t want automated power actions.

Copilot uses AI. Check for mistakes.

func startNodeConditionLoop(ctx context.Context, cfg *config.Config, logger *logrus.Logger, wg *sync.WaitGroup) {
wg.Add(1)

Comment on lines +516 to +517
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

startDaemonLoops pre-increments the WaitGroup counter for the loops it starts, but startNodeConditionLoop also calls wg.Add(1) internally. This hidden increment is inconsistent with the other loops and makes it easier to accidentally introduce a WaitGroup misuse/panic later; consider moving the Add(1) into startDaemonLoops and removing it from here.

Suggested change
wg.Add(1)

Copilot uses AI. Check for mistakes.
go func() {
defer wg.Done()
ticker := time.NewTicker(1 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
now := time.Now()
logger.Infof("Starting node condition check at %s...", now.Format("2006-01-02 15:04:05"))

// Load kubeconfig
kubeConfig, err := clientcmd.BuildConfigFromFlags("", config.KubeletKubeconfigPath)
if err != nil {
Comment on lines +530 to +532
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

config, err := clientcmd.BuildConfigFromFlags(..., "/var/lib/kubelet/kubelet/kubeconfig") hardcodes a path that already exists as config.KubeletKubeconfigPath and also introduces a local variable named config that shadows the imported pkg/config identifier. Use the shared constant and rename the local to something like kubeConfig to avoid confusion.

Copilot uses AI. Check for mistakes.
logger.Errorf("failed to load kubeconfig: %s", err.Error())
return
}

Comment on lines +534 to +536
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this loop, errors when loading kubeconfig cause a return, which stops the goroutine permanently and prevents any future node-condition checks. Consider logging the error and continue to the next tick (or implement a backoff) so transient failures don’t disable remediation for the lifetime of the agent process.

Suggested change
return
}

Copilot uses AI. Check for mistakes.
continue
}

// Create Kubernetes clientset
clientset, err := kubernetes.NewForConfig(kubeConfig)
if err != nil {
logger.Errorf("failed to create clientset: %s", err.Error())
continue
}

nodeName, err := getNodeName()
if err != nil {
logger.Errorf("failed to get node name: %s", err.Error())
continue
}

// Get the node
node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
Comment on lines +553 to +554
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

defer cancel() is inside a long-running for loop, so the cancels will be deferred until the goroutine exits (potentially never), leaking per-iteration resources. Scope the timeout context to a small inner function/block and call cancel() at the end of each iteration instead of deferring in the outer loop.

Suggested change
// Get the node
node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
// Get the node with a per-call context to avoid leaking resources across iterations
ctx, cancel := context.WithCancel(context.Background())
node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
cancel()

Copilot uses AI. Check for mistakes.
if err != nil {
logger.Errorf("failed to get node %s: %s", nodeName, err.Error())
Comment on lines +553 to +556
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the Node GET fails, the code logs the error but then continues and dereferences node.Status.Conditions, which will panic when node is nil. This should return/continue on error (and avoid attempting remediation) so the agent doesn’t crash the loop on API failures.

Copilot uses AI. Check for mistakes.
continue
}

hostBootTime, err := getBootTime()
if err != nil {
logger.Errorf("failed to get host boot time: %s", err.Error())
continue
}

for _, condition := range node.Status.Conditions {
switch condition.Type {
case "KernelDeadlock":
if condition.Status == "True" && condition.LastTransitionTime.Time.After(hostBootTime) {
logger.Infof("Node has a kernel deadlock since %s, rebooting...",
condition.LastTransitionTime.Time.Format("2006-01-02 15:04:05"))

// Reboot the node
err := rebootNode()
if err != nil {
logger.Errorf("failed to reboot node: %s", err.Error())
}
}
}
}

logger.Infof("Node condition check completed successfully at %s", time.Now().Format("2006-01-02 15:04:05"))
}
}

Check failure on line 584 in commands.go

View workflow job for this annotation

GitHub Actions / Code Quality Checks

expression in go must be function call

Check failure on line 584 in commands.go

View workflow job for this annotation

GitHub Actions / Lint

expression in go must be function call (typecheck)
}()

Check failure on line 585 in commands.go

View workflow job for this annotation

GitHub Actions / E2E Tests (MSI + Token)

syntax error: unexpected ( after top level declaration

Check failure on line 585 in commands.go

View workflow job for this annotation

GitHub Actions / Code Quality Checks

expected ';', found '('

Check failure on line 585 in commands.go

View workflow job for this annotation

GitHub Actions / Lint

expected ';', found '(' (typecheck)

Check failure on line 585 in commands.go

View workflow job for this annotation

GitHub Actions / Lint

syntax error: unexpected ( after top level declaration (typecheck)

Check failure on line 585 in commands.go

View workflow job for this annotation

GitHub Actions / Build

syntax error: unexpected ( after top level declaration
}
4 changes: 4 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ require (
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
Expand Down Expand Up @@ -85,10 +86,13 @@ require (
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/crypto v0.47.0 // indirect
golang.org/x/net v0.49.0 // indirect
golang.org/x/oauth2 v0.33.0 // indirect
golang.org/x/sys v0.40.0 // indirect
golang.org/x/term v0.39.0 // indirect
golang.org/x/text v0.33.0 // indirect
golang.org/x/time v0.9.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.34.3 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjb
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
Expand Down Expand Up @@ -243,6 +245,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand Down Expand Up @@ -274,6 +278,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
Expand All @@ -296,6 +302,8 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/dnaeon/go-vcr.v4 v4.0.2 h1:7T5VYf2ifyK01ETHbJPl5A6XTpUljD4Trw3GEDcdedk=
gopkg.in/dnaeon/go-vcr.v4 v4.0.2/go.mod h1:65yxh9goQVrudqofKtHA4JNFWd6XZRkWfKN4YpMx7KI=
gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
Expand Down
Loading