diff --git a/Makefile b/Makefile index 32a99749e..abea05a81 100644 --- a/Makefile +++ b/Makefile @@ -165,7 +165,7 @@ docker-build: generate docker-build-ci docker-build-check-ci # For local use .PHONY: docker-build-ci docker-build-ci: - docker build . -t ${IMG} --build-arg FIPS_ENABLED="${FIPS_ENABLED}" --build-arg LDFLAGS="${LDFLAGS}" --build-arg GOARCH="${GOARCH}" + docker build . -t ${IMG} --build-arg FIPS_ENABLED="${FIPS_ENABLED}" --build-arg LDFLAGS="${LDFLAGS}" --build-arg GOARCH="${GOARCH}" --platform=linux/${GOARCH} # For local use .PHONY: docker-build-check-ci diff --git a/cmd/main.go b/cmd/main.go index d808d8e85..14b60f388 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -140,6 +140,7 @@ type options struct { remoteConfigEnabled bool datadogDashboardEnabled bool datadogGenericResourceEnabled bool + untaintControllerEnabled bool // Secret Backend options secretBackendCommand string @@ -177,6 +178,7 @@ func (opts *options) Parse() { flag.BoolVar(&opts.remoteConfigEnabled, "remoteConfigEnabled", false, "Enable RemoteConfig capabilities in the Operator (beta)") flag.BoolVar(&opts.datadogDashboardEnabled, "datadogDashboardEnabled", false, "Enable the DatadogDashboard controller") flag.BoolVar(&opts.datadogGenericResourceEnabled, "datadogGenericResourceEnabled", false, "Enable the DatadogGenericResource controller") + flag.BoolVar(&opts.untaintControllerEnabled, "untaintControllerEnabled", false, "Enable the Untaint controller") // DatadogAgentInternal flag.BoolVar(&opts.datadogAgentInternalEnabled, "datadogAgentInternalEnabled", true, "Enable the DatadogAgentInternal controller") @@ -288,6 +290,7 @@ func run(opts *options) error { IntrospectionEnabled: opts.introspectionEnabled, DatadogDashboardEnabled: opts.datadogDashboardEnabled, DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled, + UntaintControllerEnabled: opts.untaintControllerEnabled, }), }) if err != nil { @@ -352,20 +355,22 @@ func run(opts *options) error { CanaryAutoPauseMaxSlowStartDuration: opts.edsCanaryAutoPauseMaxSlowStartDuration, MaxPodSchedulerFailure: opts.edsMaxPodSchedulerFailure, }, - SupportCilium: opts.supportCilium, - CredsManager: credsManager, - Creds: creds, - SecretRefreshInterval: opts.secretRefreshInterval, - DatadogAgentEnabled: opts.datadogAgentEnabled, - DatadogAgentInternalEnabled: opts.datadogAgentInternalEnabled, - DatadogMonitorEnabled: opts.datadogMonitorEnabled, - DatadogSLOEnabled: opts.datadogSLOEnabled, - OperatorMetricsEnabled: opts.operatorMetricsEnabled, - V2APIEnabled: true, - IntrospectionEnabled: opts.introspectionEnabled, - DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled, - DatadogDashboardEnabled: opts.datadogDashboardEnabled, - DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled, + SupportCilium: opts.supportCilium, + CredsManager: credsManager, + Creds: creds, + SecretRefreshInterval: opts.secretRefreshInterval, + DatadogAgentEnabled: opts.datadogAgentEnabled, + DatadogAgentInternalEnabled: opts.datadogAgentInternalEnabled, + DatadogMonitorEnabled: opts.datadogMonitorEnabled, + DatadogSLOEnabled: opts.datadogSLOEnabled, + OperatorMetricsEnabled: opts.operatorMetricsEnabled, + V2APIEnabled: true, + IntrospectionEnabled: opts.introspectionEnabled, + DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled, + DatadogDashboardEnabled: opts.datadogDashboardEnabled, + DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled, + UntaintControllerEnabled: opts.untaintControllerEnabled, + UntaintControllerEventsEnabled: os.Getenv("DD_UNTAINT_CONTROLLER_EVENTS_ENABLED") == "true", } versionInfo, platformInfo, err := getVersionAndPlatformInfo(rest.CopyConfig(mgr.GetConfig())) diff --git a/internal/controller/metrics/const.go b/internal/controller/metrics/const.go index 88c16b82d..9cad2c8a7 100644 --- a/internal/controller/metrics/const.go +++ b/internal/controller/metrics/const.go @@ -8,6 +8,7 @@ package metrics const ( datadogAgentSubsystem = "datadogagent" datadogAgentProfileSubsystem = "datadogagentprofile" + untaintSubsystem = "untaint" TrueValue = 1.0 FalseValue = 0.0 diff --git a/internal/controller/metrics/untaint.go b/internal/controller/metrics/untaint.go new file mode 100644 index 000000000..09deb2196 --- /dev/null +++ b/internal/controller/metrics/untaint.go @@ -0,0 +1,37 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // TaintRemovalsTotal is the total number of taints removed from nodes. + TaintRemovalsTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Subsystem: untaintSubsystem, + Name: "taint_removals_total", + Help: "Total number of taints removed from nodes", + }, + ) + + // TaintRemovalLatency is the time between agent pod becoming Ready and taint removal. + TaintRemovalLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Subsystem: untaintSubsystem, + Name: "taint_removal_latency_seconds", + Help: "Time between agent pod becoming Ready and taint removal from the node", + Buckets: prometheus.DefBuckets, + }, + ) +) + +func init() { + metrics.Registry.MustRegister(TaintRemovalsTotal) + metrics.Registry.MustRegister(TaintRemovalLatency) +} diff --git a/internal/controller/setup.go b/internal/controller/setup.go index c8a8bc6c9..f8365ddb7 100644 --- a/internal/controller/setup.go +++ b/internal/controller/setup.go @@ -41,22 +41,24 @@ const ( // SetupOptions defines options for setting up controllers to ease testing type SetupOptions struct { - SupportExtendedDaemonset ExtendedDaemonsetOptions - SupportCilium bool - CredsManager *config.CredentialManager - Creds config.Creds - SecretRefreshInterval time.Duration - DatadogAgentEnabled bool - DatadogAgentInternalEnabled bool - DatadogMonitorEnabled bool - DatadogSLOEnabled bool - OperatorMetricsEnabled bool - V2APIEnabled bool - IntrospectionEnabled bool - DatadogAgentProfileEnabled bool - OtelAgentEnabled bool - DatadogDashboardEnabled bool - DatadogGenericResourceEnabled bool + SupportExtendedDaemonset ExtendedDaemonsetOptions + SupportCilium bool + CredsManager *config.CredentialManager + Creds config.Creds + SecretRefreshInterval time.Duration + DatadogAgentEnabled bool + DatadogAgentInternalEnabled bool + DatadogMonitorEnabled bool + DatadogSLOEnabled bool + OperatorMetricsEnabled bool + V2APIEnabled bool + IntrospectionEnabled bool + DatadogAgentProfileEnabled bool + OtelAgentEnabled bool + DatadogDashboardEnabled bool + DatadogGenericResourceEnabled bool + UntaintControllerEnabled bool + UntaintControllerEventsEnabled bool } // ExtendedDaemonsetOptions defines ExtendedDaemonset options @@ -85,6 +87,7 @@ var controllerStarters = map[string]starterFunc{ profileControllerName: startDatadogAgentProfiles, dashboardControllerName: startDatadogDashboard, genericResourceControllerName: startDatadogGenericResource, + untaintControllerName: startUntaint, } // SetupControllers starts all controllers (also used by e2e tests) @@ -257,6 +260,22 @@ func startDatadogSLO(logger logr.Logger, mgr manager.Manager, pInfo kubernetes.P return sloReconciler.SetupWithManager(mgr) } +func startUntaint(logger logr.Logger, mgr manager.Manager, _ kubernetes.PlatformInfo, options SetupOptions, _ datadog.MetricsForwardersManager) error { + if !options.UntaintControllerEnabled { + logger.Info("Feature disabled, not starting the controller", "controller", untaintControllerName) + return nil + } + + logger.Info("untaint controller enabled", "controller", untaintControllerName, "eventsEnabled", options.UntaintControllerEventsEnabled) + + return (&UntaintReconciler{ + client: mgr.GetClient(), + log: ctrl.Log.WithName("controllers").WithName(untaintControllerName), + recorder: mgr.GetEventRecorderFor(untaintControllerName), + eventsEnabled: options.UntaintControllerEventsEnabled, + }).SetupWithManager(mgr) +} + func startDatadogAgentProfiles(logger logr.Logger, mgr manager.Manager, pInfo kubernetes.PlatformInfo, options SetupOptions, metricForwardersMgr datadog.MetricsForwardersManager) error { if !options.DatadogAgentProfileEnabled { logger.Info("Feature disabled, not starting the controller", "controller", profileControllerName) diff --git a/internal/controller/untaint_controller.go b/internal/controller/untaint_controller.go new file mode 100644 index 000000000..d144fd69a --- /dev/null +++ b/internal/controller/untaint_controller.go @@ -0,0 +1,255 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package controller + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/internal/controller/metrics" + "github.com/DataDog/datadog-operator/pkg/constants" +) + +const ( + untaintControllerName = "Untaint" + + // Fixed taint to remove + agentNotReadyTaintKey = "agent.datadoghq.com/not-ready" + agentNotReadyTaintValue = "presence" + agentNotReadyTaintEffect = corev1.TaintEffectNoSchedule +) + +// UntaintReconciler watches agent pods and removes the taint +// agent.datadoghq.com/not-ready=presence:NoSchedule from their nodes once Ready. +type UntaintReconciler struct { + client client.Client + log logr.Logger + recorder record.EventRecorder + eventsEnabled bool +} + +// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;patch +// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch + +// Reconcile is called with the name of a Node and removes the agent-not-ready taint +// if a Ready agent pod exists on that node. +func (r *UntaintReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := r.log.WithValues("node", req.Name) + + // 1. Get the Node from cache + node := &corev1.Node{} + if err := r.client.Get(ctx, req.NamespacedName, node); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed to get node %s: %w", req.Name, err) + } + + // 2. Check if the taint we care about is present + if !hasTaint(node) { + return ctrl.Result{}, nil + } + + // 3. List agent pods on this node + podList := &corev1.PodList{} + labelSelector := labels.SelectorFromSet(map[string]string{ + common.AgentDeploymentComponentLabelKey: constants.DefaultAgentResourceSuffix, + }) + if err := r.client.List(ctx, podList, + client.MatchingLabelsSelector{Selector: labelSelector}, + client.MatchingFields{"spec.nodeName": req.Name}, + ); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list pods on node %s: %w", req.Name, err) + } + + // 4. Check if any agent pod is Ready; record its Ready transition time for the latency metric + var readyTransitionTime *time.Time + anyReady := false + for i := range podList.Items { + pod := &podList.Items[i] + for _, c := range pod.Status.Conditions { + if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { + anyReady = true + if !c.LastTransitionTime.IsZero() { + t := c.LastTransitionTime.Time + readyTransitionTime = &t + } + break + } + } + if anyReady { + break + } + } + + if !anyReady { + log.V(1).Info("No ready agent pod found, skipping taint removal") + return ctrl.Result{}, nil + } + + // 5. Remove the taint via JSON patch (test-and-set for optimistic concurrency) + if err := r.removeTaint(ctx, node); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to remove taint from node %s: %w", req.Name, err) + } + + log.Info("Removed agent-not-ready taint from node") + + // 6. Record metrics + metrics.TaintRemovalsTotal.Inc() + if readyTransitionTime != nil { + metrics.TaintRemovalLatency.Observe(time.Since(*readyTransitionTime).Seconds()) + } + + // 7. Optionally emit Kubernetes event + if r.eventsEnabled { + r.recorder.Eventf(node, corev1.EventTypeNormal, "TaintRemoved", + "Removed taint %s from node %s after agent became ready", agentNotReadyTaintKey, node.Name) + } + + return ctrl.Result{}, nil +} + +// hasTaint returns true if the node has the agent-not-ready taint. +func hasTaint(node *corev1.Node) bool { + for _, t := range node.Spec.Taints { + if t.Key == agentNotReadyTaintKey && t.Value == agentNotReadyTaintValue && t.Effect == agentNotReadyTaintEffect { + return true + } + } + return false +} + +type jsonPatchOp struct { + Op string `json:"op"` + Path string `json:"path"` + Value interface{} `json:"value"` +} + +// removeTaint patches the node to remove the agent-not-ready taint using a JSON test-and-set patch. +func (r *UntaintReconciler) removeTaint(ctx context.Context, node *corev1.Node) error { + newTaints := make([]corev1.Taint, 0, len(node.Spec.Taints)) + for _, t := range node.Spec.Taints { + if t.Key == agentNotReadyTaintKey && t.Value == agentNotReadyTaintValue && t.Effect == agentNotReadyTaintEffect { + continue + } + newTaints = append(newTaints, t) + } + + if len(newTaints) == len(node.Spec.Taints) { + return nil // taint not present + } + + // Use JSON patch with test precondition for optimistic concurrency. + patch := []jsonPatchOp{ + {Op: "test", Path: "/spec/taints", Value: node.Spec.Taints}, + {Op: "replace", Path: "/spec/taints", Value: newTaints}, + } + + patchBytes, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal patch: %w", err) + } + + // TODO: If this fails due to conflict with optimistic concurrency, we should requeue + // the reconciliation instead of returning an error. + return r.client.Patch(ctx, node, client.RawPatch(types.JSONPatchType, patchBytes)) +} + +// SetupWithManager sets up the controller to watch agent pods and map them to their nodes. +func (r *UntaintReconciler) SetupWithManager(mgr ctrl.Manager) error { + // Index pods by spec.nodeName so we can list pods on a specific node efficiently. + if err := mgr.GetFieldIndexer().IndexField(context.Background(), &corev1.Pod{}, "spec.nodeName", func(obj client.Object) []string { + pod, ok := obj.(*corev1.Pod) + if !ok || pod.Spec.NodeName == "" { + return nil + } + return []string{pod.Spec.NodeName} + }); err != nil { + return fmt.Errorf("failed to index pods by spec.nodeName: %w", err) + } + + return ctrl.NewControllerManagedBy(mgr). + Named(untaintControllerName). + Watches( + &corev1.Pod{}, + handler.EnqueueRequestsFromMapFunc(r.podToNodeRequest), + builder.WithPredicates(r.agentPodReadinessPredicate()), + ). + Complete(r) +} + +// podToNodeRequest maps a Pod event to a reconcile.Request for the pod's node. +func (r *UntaintReconciler) podToNodeRequest(ctx context.Context, obj client.Object) []reconcile.Request { + pod, ok := obj.(*corev1.Pod) + if !ok || pod.Spec.NodeName == "" { + return nil + } + return []reconcile.Request{ + {NamespacedName: types.NamespacedName{Name: pod.Spec.NodeName}}, + } +} + +// agentPodReadinessPredicate returns a predicate that only processes agent pods +// when they transition to Ready. +func (r *UntaintReconciler) agentPodReadinessPredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return r.isAgentPod(e.Object) && isPodReady(e.Object) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + if !r.isAgentPod(e.ObjectNew) { + return false + } + wasReady := isPodReady(e.ObjectOld) + isReady := isPodReady(e.ObjectNew) + return !wasReady && isReady + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return false + }, + GenericFunc: func(e event.GenericEvent) bool { + return false + }, + } +} + +// isAgentPod returns true if the object has the agent component label. +func (r *UntaintReconciler) isAgentPod(obj client.Object) bool { + lbls := obj.GetLabels() + return lbls[common.AgentDeploymentComponentLabelKey] == constants.DefaultAgentResourceSuffix +} + +// isPodReady returns true if the pod has the Ready condition set to True. +func isPodReady(obj client.Object) bool { + pod, ok := obj.(*corev1.Pod) + if !ok { + return false + } + for _, c := range pod.Status.Conditions { + if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { + return true + } + } + return false +} diff --git a/internal/controller/untaint_controller_integration_test.go b/internal/controller/untaint_controller_integration_test.go new file mode 100644 index 000000000..478bb12c6 --- /dev/null +++ b/internal/controller/untaint_controller_integration_test.go @@ -0,0 +1,196 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +//go:build integration +// +build integration + +package controller + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/pkg/constants" +) + +const ( + untaintTimeout = 15 * time.Second + untaintInterval = 200 * time.Millisecond +) + +var _ = Describe("Untaint Controller", func() { + ctx := context.Background() + nodeName := "tainted-node-1" + + Context("When agent pod becomes Ready on a tainted node", func() { + agentPodName := "agent-pod-integration" + agentPodNamespace := "default" + + AfterEach(func() { + // Clean up agent pod + pod := &corev1.Pod{} + if err := k8sClient.Get(ctx, types.NamespacedName{Namespace: agentPodNamespace, Name: agentPodName}, pod); err == nil { + _ = k8sClient.Delete(ctx, pod) + } + // Restore taint on node for next test + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, node)).Should(Succeed()) + if !hasTaint(node) { + node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ + Key: agentNotReadyTaintKey, + Value: agentNotReadyTaintValue, + Effect: agentNotReadyTaintEffect, + }) + Expect(k8sClient.Update(ctx, node)).Should(Succeed()) + } + }) + + It("Should remove the taint when an agent pod transitions to Ready", func() { + // 1. Create agent pod (not ready) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: agentPodName, + Namespace: agentPodNamespace, + Labels: map[string]string{ + common.AgentDeploymentComponentLabelKey: constants.DefaultAgentResourceSuffix, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + Containers: []corev1.Container{{Name: "agent", Image: "fake"}}, + }, + } + Expect(k8sClient.Create(ctx, pod)).Should(Succeed()) + + // Set pod status to not-ready + pod.Status.Conditions = []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse}, + } + Expect(k8sClient.Status().Update(ctx, pod)).Should(Succeed()) + + // 2. Verify taint is still present + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, node)).Should(Succeed()) + Expect(hasTaint(node)).To(BeTrue(), "taint should still be present before agent is ready") + + // 3. Update pod to Ready + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: agentPodNamespace, Name: agentPodName}, pod)).Should(Succeed()) + pod.Status.Conditions = []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + LastTransitionTime: metav1.Now(), + }, + } + Expect(k8sClient.Status().Update(ctx, pod)).Should(Succeed()) + + // 4. Eventually: taint should be removed + Eventually(func() bool { + fresh := &corev1.Node{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, fresh); err != nil { + return false + } + return !hasTaint(fresh) + }, untaintTimeout, untaintInterval).Should(BeTrue(), "taint should be removed after agent becomes ready") + }) + + It("Should not remove the taint while the agent pod is not Ready", func() { + // Create agent pod, never make it ready + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: agentPodName, + Namespace: agentPodNamespace, + Labels: map[string]string{ + common.AgentDeploymentComponentLabelKey: constants.DefaultAgentResourceSuffix, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + Containers: []corev1.Container{{Name: "agent", Image: "fake"}}, + }, + } + Expect(k8sClient.Create(ctx, pod)).Should(Succeed()) + pod.Status.Conditions = []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse}, + } + Expect(k8sClient.Status().Update(ctx, pod)).Should(Succeed()) + + // Wait a moment; taint must remain + Consistently(func() bool { + fresh := &corev1.Node{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, fresh); err != nil { + return false + } + return hasTaint(fresh) + }, 3*time.Second, untaintInterval).Should(BeTrue(), "taint should remain while agent is not ready") + }) + }) + + Context("When agent pod is already Ready at startup (startup catch-up)", func() { + agentPodName := "agent-pod-catchup" + agentPodNamespace := "default" + + AfterEach(func() { + pod := &corev1.Pod{} + if err := k8sClient.Get(ctx, types.NamespacedName{Namespace: agentPodNamespace, Name: agentPodName}, pod); err == nil { + _ = k8sClient.Delete(ctx, pod) + } + // Restore taint + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, node)).Should(Succeed()) + if !hasTaint(node) { + node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ + Key: agentNotReadyTaintKey, + Value: agentNotReadyTaintValue, + Effect: agentNotReadyTaintEffect, + }) + Expect(k8sClient.Update(ctx, node)).Should(Succeed()) + } + }) + + It("Should remove the taint when a Ready pod is created (cache sync)", func() { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: agentPodName, + Namespace: agentPodNamespace, + Labels: map[string]string{ + common.AgentDeploymentComponentLabelKey: constants.DefaultAgentResourceSuffix, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + Containers: []corev1.Container{{Name: "agent", Image: "fake"}}, + }, + } + Expect(k8sClient.Create(ctx, pod)).Should(Succeed()) + + // Immediately set to Ready (simulates pod being ready on create) + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: agentPodNamespace, Name: agentPodName}, pod)).Should(Succeed()) + pod.Status.Conditions = []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + LastTransitionTime: metav1.Now(), + }, + } + Expect(k8sClient.Status().Update(ctx, pod)).Should(Succeed()) + + Eventually(func() bool { + fresh := &corev1.Node{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, fresh); err != nil { + return false + } + return !hasTaint(fresh) + }, untaintTimeout, untaintInterval).Should(BeTrue()) + }) + }) +}) diff --git a/pkg/config/config.go b/pkg/config/config.go index af3515acc..ea57c8e61 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -68,6 +68,7 @@ type WatchOptions struct { IntrospectionEnabled bool DatadogDashboardEnabled bool DatadogGenericResourceEnabled bool + UntaintControllerEnabled bool } // CacheOptions function configures Controller Runtime cache options on a resource level (supported in v0.16+). @@ -121,14 +122,14 @@ func CacheOptions(logger logr.Logger, opts WatchOptions) cache.Options { byObject[profileObj] = cache.ByObject{ Namespaces: agentProfileNamespaces, } + } - // It is very important to reduce memory usage when profiles are used. - // For the profiles feature we need to list the agent pods, but we're only - // interested in the node name and the labels. This function removes all the - // rest of fields to reduce memory usage. - // Pods are watched in DatadogAgent namespace(s) since that's where Agent pods are running. + if opts.DatadogAgentProfileEnabled || opts.UntaintControllerEnabled { + // For the profiles feature and untaint controller we need to list agent pods. + // The profiles feature needs node name and labels; the untaint controller also needs + // Status.Conditions to check readiness. Pods are watched in DatadogAgent namespace(s). agentNamespaces := GetWatchNamespacesFromEnv(logger, AgentWatchNamespaceEnvVar) - logger.Info("DatadogAgentProfile Enabled", "watching Pods in namespaces", slices.Collect(maps.Keys(agentNamespaces))) + logger.Info("Pod cache enabled", "watching Pods in namespaces", slices.Collect(maps.Keys(agentNamespaces))) byObject[podObj] = cache.ByObject{ Namespaces: agentNamespaces, @@ -151,14 +152,19 @@ func CacheOptions(logger logr.Logger, opts WatchOptions) cache.Options { }, } + // Preserve conditions for the untaint controller (readiness check) + if opts.UntaintControllerEnabled { + newPod.Status.Conditions = pod.Status.Conditions + } + return newPod, nil }, } } - if opts.DatadogAgentProfileEnabled || opts.IntrospectionEnabled { - // Also for the profiles feature, we need to list the nodes, but we're only - // interested in the node name and the labels. + if opts.DatadogAgentProfileEnabled || opts.IntrospectionEnabled || opts.UntaintControllerEnabled { + // Also for the profiles feature, introspection and untaint controller, we need to list the + // nodes. The untaint controller additionally needs Spec.Taints to check for the target taint. // Note that if in the future we need to list or get pods or nodes and use other // fields we'll need to modify this function. // @@ -175,6 +181,11 @@ func CacheOptions(logger logr.Logger, opts WatchOptions) cache.Options { }, } + // Preserve taints for the untaint controller + if opts.UntaintControllerEnabled { + newNode.Spec.Taints = node.Spec.Taints + } + return newNode, nil }, }