From d0adaa711377c69debcd7b0b23dd2807d0dc0f6a Mon Sep 17 00:00:00 2001 From: elmiko Date: Wed, 22 Oct 2025 11:28:09 -0400 Subject: [PATCH 1/2] pass allNodes to node info provider Process This change passes all the nodes to the mixed node info provider processor that is called from `RunOnce`. The change is to allow unschedulable and unready nodes to be processed as bad candidates during the node info template generation. The Process function has been updated to separate nodes into good and bad candidates to make the filtering match the original intent. --- cluster-autoscaler/core/static_autoscaler.go | 2 +- .../mixed_nodeinfos_processor.go | 24 +++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 86d4ab9826a4..9b089a319c71 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -348,7 +348,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return typedErr.AddPrefix("failed to initialize RemainingPdbTracker: ") } - nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, readyNodes, daemonsets, a.taintConfig, currentTime) + nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, allNodes, daemonsets, a.taintConfig, currentTime) if autoscalerError != nil { klog.Errorf("Failed to get node infos for groups: %v", autoscalerError) return autoscalerError.AddPrefix("failed to build node infos for node groups: ") diff --git a/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go b/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go index 4b297372e7b1..304a05039e39 100644 --- a/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go +++ 
b/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go @@ -78,6 +78,18 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos result := make(map[string]*framework.NodeInfo) seenGroups := make(map[string]bool) + // sort nodes into those good and bad candidates for templates. the bad candidates will be processed + // at the end of this function as a last resort for a node info template. + goodCandidates := make([]*apiv1.Node, 0) + badCandidates := make([]*apiv1.Node, 0) + for _, node := range nodes { + if isNodeGoodTemplateCandidate(node, now) { + goodCandidates = append(goodCandidates, node) + } else { + badCandidates = append(badCandidates, node) + } + } + // processNode returns information whether the nodeTemplate was generated and if there was an error. processNode := func(node *apiv1.Node) (bool, string, caerror.AutoscalerError) { nodeGroup, err := autoscalingCtx.CloudProvider.NodeGroupForNode(node) @@ -103,11 +115,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos return false, "", nil } - for _, node := range nodes { - // Broken nodes might have some stuff missing. Skipping. - if !isNodeGoodTemplateCandidate(node, now) { - continue - } + for _, node := range goodCandidates { added, id, typedErr := processNode(node) if typedErr != nil { return map[string]*framework.NodeInfo{}, typedErr @@ -156,11 +164,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos } // Last resort - unready/unschedulable nodes. 
- for _, node := range nodes { - // Allowing broken nodes - if isNodeGoodTemplateCandidate(node, now) { - continue - } + for _, node := range badCandidates { added, _, typedErr := processNode(node) if typedErr != nil { return map[string]*framework.NodeInfo{}, typedErr From 4c4511b63281578e17283b56616d58a5dbbf7e00 Mon Sep 17 00:00:00 2001 From: elmiko Date: Wed, 22 Oct 2025 11:06:49 -0400 Subject: [PATCH 2/2] add --scale-from-unschedulable flag This change introduces a flag which will instruct the CA to ignore a node's `.spec.unschedulable` field when creating node template for considering which node group to scale. --- cluster-autoscaler/FAQ.md | 1 + cluster-autoscaler/config/autoscaling_options.go | 3 +++ cluster-autoscaler/config/flags/flags.go | 2 ++ cluster-autoscaler/simulator/node_info_utils.go | 6 ++++++ cluster-autoscaler/utils/taints/taints.go | 11 +++++++++++ 5 files changed, 23 insertions(+) diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md index 62c235c27458..36951ee18773 100644 --- a/cluster-autoscaler/FAQ.md +++ b/cluster-autoscaler/FAQ.md @@ -1104,6 +1104,7 @@ The following startup parameters are supported for cluster autoscaler: | `scale-down-unready-enabled` | Should CA scale down unready nodes of the cluster | true | | `scale-down-unready-time` | How long an unready node should be unneeded before it is eligible for scale down | 20m0s | | `scale-down-utilization-threshold` | The maximum value between the sum of cpu requests and sum of memory requests of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down | 0.5 | +| `scale-from-unschedulable` | Should CA ignore a node's .spec.unschedulable field when creating a node template for considering to scale a node group. | false | | `scale-up-from-zero` | Should CA scale up when there are 0 ready nodes. 
| true | | `scan-interval` | How often cluster is reevaluated for scale up or down | 10s | | `scheduler-config-file` | scheduler-config allows changing configuration of in-tree scheduler plugins acting on PreFilter and Filter extension points | | diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index a144580989cd..d910447eb12e 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -230,6 +230,9 @@ type AutoscalingOptions struct { BalancingLabels []string // AWSUseStaticInstanceList tells if AWS cloud provider use static instance type list or dynamically fetch from remote APIs. AWSUseStaticInstanceList bool + // ScaleFromUnschedulable tells the autoscaler to ignore a node's .spec.unschedulable field when creating a node template. + // Specifically, this will cause the autoscaler to set the node template's .spec.unschedulable field to false. + ScaleFromUnschedulable bool // GCEOptions contain autoscaling options specific to GCE cloud provider. GCEOptions GCEOptions // KubeClientOpts specify options for kube client diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go index 0f7209ebbb1b..71b328b32117 100644 --- a/cluster-autoscaler/config/flags/flags.go +++ b/cluster-autoscaler/config/flags/flags.go @@ -167,6 +167,7 @@ var ( balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar") balancingLabelsFlag = multiStringFlag("balancing-label", "Specifies a label to use for comparing if two node groups are similar, rather than the built in heuristics. 
Setting this flag disables all other comparison logic, and cannot be combined with --balancing-ignore-label.") awsUseStaticInstanceList = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. AWS only") + scaleFromUnschedulable = flag.Bool("scale-from-unschedulable", false, "Specifies that the CA should ignore a node's .spec.unschedulable field in node templates when considering to scale a node group.") // GCE specific flags concurrentGceRefreshes = flag.Int("gce-concurrent-refreshes", 1, "Maximum number of concurrent refreshes per cloud object type.") @@ -351,6 +352,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { }, NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout, AWSUseStaticInstanceList: *awsUseStaticInstanceList, + ScaleFromUnschedulable: *scaleFromUnschedulable, GCEOptions: config.GCEOptions{ ConcurrentRefreshes: *concurrentGceRefreshes, MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime, diff --git a/cluster-autoscaler/simulator/node_info_utils.go b/cluster-autoscaler/simulator/node_info_utils.go index ffa8f33be53a..40251c333b9a 100644 --- a/cluster-autoscaler/simulator/node_info_utils.go +++ b/cluster-autoscaler/simulator/node_info_utils.go @@ -115,6 +115,12 @@ func createSanitizedNode(node *apiv1.Node, newName string, taintConfig *taints.T } newNode.Labels[apiv1.LabelHostname] = newName + if taintConfig != nil { + if taintConfig.ShouldScaleFromUnschedulable() { + newNode.Spec.Unschedulable = false + } + } + if taintConfig != nil { newNode.Spec.Taints = taints.SanitizeTaints(newNode.Spec.Taints, *taintConfig) } diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go index acae356089d0..70881bd559af 100644 --- a/cluster-autoscaler/utils/taints/taints.go +++ b/cluster-autoscaler/utils/taints/taints.go @@ -97,6 +97,11 @@ type TaintConfig struct { startupTaintPrefixes []string statusTaintPrefixes []string 
explicitlyReportedTaints TaintKeySet + // The scaleFromUnschedulable field helps to inform the CA when + // to ignore .spec.unschedulable for a node. It is being added to this + // struct for convenience as it will be used in similar places that check + // for taints to ignore. + scaleFromUnschedulable bool } // NewTaintConfig returns the taint config extracted from options @@ -128,6 +133,7 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig { startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, statusTaintPrefixes: []string{StatusTaintPrefix}, explicitlyReportedTaints: explicitlyReportedTaints, + scaleFromUnschedulable: opts.ScaleFromUnschedulable, } } @@ -147,6 +153,11 @@ func (tc TaintConfig) IsStatusTaint(taint string) bool { return matchesAnyPrefix(tc.statusTaintPrefixes, taint) } +// ShouldScaleFromUnschedulable returns whether a node's .spec.unschedulable field should be ignored. +func (tc TaintConfig) ShouldScaleFromUnschedulable() bool { + return tc.scaleFromUnschedulable +} + func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool { _, ok := tc.explicitlyReportedTaints[taint] return ok