Skip to content

Commit 2cd7445

Browse files
authored
update node info processors to include unschedulable nodes (#8520)
* pass allNodes to node info provider Process This change passes all the nodes to the mixed node info provider processor that is called from `RunOnce`. The change is to allow unschedulable and unready nodes to be processed as bad candidates during the node info template generation. The Process function has been updated to separate nodes into good and bad candidates to make the filtering match the original intent. * add --scale-from-unschedulable flag This change introduces a flag which will instruct the CA to ignore a node's `.spec.unschedulable` field when creating a node template for considering which node group to scale.
1 parent 57e9a05 commit 2cd7445

File tree

7 files changed

+38
-11
lines changed

7 files changed

+38
-11
lines changed

cluster-autoscaler/FAQ.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,7 @@ The following startup parameters are supported for cluster autoscaler:
11041104
| `scale-down-unready-enabled` | Should CA scale down unready nodes of the cluster | true |
11051105
| `scale-down-unready-time` | How long an unready node should be unneeded before it is eligible for scale down | 20m0s |
11061106
| `scale-down-utilization-threshold` | The maximum value between the sum of cpu requests and sum of memory requests of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down | 0.5 |
1107+
| `scale-from-unschedulable` | Should CA ignore a node's .spec.unschedulable field when creating a node template for considering to scale a node group. | false |
11071108
| `scale-up-from-zero` | Should CA scale up when there are 0 ready nodes. | true |
11081109
| `scan-interval` | How often cluster is reevaluated for scale up or down | 10s |
11091110
| `scheduler-config-file` | scheduler-config allows changing configuration of in-tree scheduler plugins acting on PreFilter and Filter extension points | |

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ type AutoscalingOptions struct {
230230
BalancingLabels []string
231231
// AWSUseStaticInstanceList tells if AWS cloud provider use static instance type list or dynamically fetch from remote APIs.
232232
AWSUseStaticInstanceList bool
233+
// ScaleFromUnschedulable tells the autoscaler to ignore a node's .spec.unschedulable field when creating a node template.
234+
// Specifically, this will cause the autoscaler to set the node template's .spec.unschedulable field to false.
235+
ScaleFromUnschedulable bool
233236
// GCEOptions contain autoscaling options specific to GCE cloud provider.
234237
GCEOptions GCEOptions
235238
// KubeClientOpts specify options for kube client

cluster-autoscaler/config/flags/flags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ var (
167167
balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar")
168168
balancingLabelsFlag = multiStringFlag("balancing-label", "Specifies a label to use for comparing if two node groups are similar, rather than the built in heuristics. Setting this flag disables all other comparison logic, and cannot be combined with --balancing-ignore-label.")
169169
awsUseStaticInstanceList = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. AWS only")
170+
scaleFromUnschedulable = flag.Bool("scale-from-unschedulable", false, "Specifies that the CA should ignore a node's .spec.unschedulable field in node templates when considering to scale a node group.")
170171

171172
// GCE specific flags
172173
concurrentGceRefreshes = flag.Int("gce-concurrent-refreshes", 1, "Maximum number of concurrent refreshes per cloud object type.")
@@ -351,6 +352,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
351352
},
352353
NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout,
353354
AWSUseStaticInstanceList: *awsUseStaticInstanceList,
355+
ScaleFromUnschedulable: *scaleFromUnschedulable,
354356
GCEOptions: config.GCEOptions{
355357
ConcurrentRefreshes: *concurrentGceRefreshes,
356358
MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime,

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
348348
return typedErr.AddPrefix("failed to initialize RemainingPdbTracker: ")
349349
}
350350

351-
nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, readyNodes, daemonsets, a.taintConfig, currentTime)
351+
nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, allNodes, daemonsets, a.taintConfig, currentTime)
352352
if autoscalerError != nil {
353353
klog.Errorf("Failed to get node infos for groups: %v", autoscalerError)
354354
return autoscalerError.AddPrefix("failed to build node infos for node groups: ")

cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos
7878
result := make(map[string]*framework.NodeInfo)
7979
seenGroups := make(map[string]bool)
8080

81+
// sort nodes into those good and bad candidates for templates. the bad candidates will be processed
82+
// at the end of this function as a last resort for a node info template.
83+
goodCandidates := make([]*apiv1.Node, 0)
84+
badCandidates := make([]*apiv1.Node, 0)
85+
for _, node := range nodes {
86+
if isNodeGoodTemplateCandidate(node, now) {
87+
goodCandidates = append(goodCandidates, node)
88+
} else {
89+
badCandidates = append(badCandidates, node)
90+
}
91+
}
92+
8193
// processNode returns information whether the nodeTemplate was generated and if there was an error.
8294
processNode := func(node *apiv1.Node) (bool, string, caerror.AutoscalerError) {
8395
nodeGroup, err := autoscalingCtx.CloudProvider.NodeGroupForNode(node)
@@ -103,11 +115,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos
103115
return false, "", nil
104116
}
105117

106-
for _, node := range nodes {
107-
// Broken nodes might have some stuff missing. Skipping.
108-
if !isNodeGoodTemplateCandidate(node, now) {
109-
continue
110-
}
118+
for _, node := range goodCandidates {
111119
added, id, typedErr := processNode(node)
112120
if typedErr != nil {
113121
return map[string]*framework.NodeInfo{}, typedErr
@@ -156,11 +164,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos
156164
}
157165

158166
// Last resort - unready/unschedulable nodes.
159-
for _, node := range nodes {
160-
// Allowing broken nodes
161-
if isNodeGoodTemplateCandidate(node, now) {
162-
continue
163-
}
167+
for _, node := range badCandidates {
164168
added, _, typedErr := processNode(node)
165169
if typedErr != nil {
166170
return map[string]*framework.NodeInfo{}, typedErr

cluster-autoscaler/simulator/node_info_utils.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ func createSanitizedNode(node *apiv1.Node, newName string, taintConfig *taints.T
115115
}
116116
newNode.Labels[apiv1.LabelHostname] = newName
117117

118+
if taintConfig != nil {
119+
if taintConfig.ShouldScaleFromUnschedulable() {
120+
newNode.Spec.Unschedulable = false
121+
}
122+
}
123+
118124
if taintConfig != nil {
119125
newNode.Spec.Taints = taints.SanitizeTaints(newNode.Spec.Taints, *taintConfig)
120126
}

cluster-autoscaler/utils/taints/taints.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ type TaintConfig struct {
9797
startupTaintPrefixes []string
9898
statusTaintPrefixes []string
9999
explicitlyReportedTaints TaintKeySet
100+
// The scaleFromUnschedulable field helps to inform the CA when
101+
// to ignore .spec.unschedulable for a node. It is being added to this
102+
// struct for convenience as it will be used in similar places that check
103+
// for taints to ignore.
104+
scaleFromUnschedulable bool
100105
}
101106

102107
// NewTaintConfig returns the taint config extracted from options
@@ -128,6 +133,7 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig {
128133
startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
129134
statusTaintPrefixes: []string{StatusTaintPrefix},
130135
explicitlyReportedTaints: explicitlyReportedTaints,
136+
scaleFromUnschedulable: opts.ScaleFromUnschedulable,
131137
}
132138
}
133139

@@ -147,6 +153,11 @@ func (tc TaintConfig) IsStatusTaint(taint string) bool {
147153
return matchesAnyPrefix(tc.statusTaintPrefixes, taint)
148154
}
149155

156+
// ShouldScaleFromUnschedulable returns whether a node's .spec.unschedulable field should be ignored.
157+
func (tc TaintConfig) ShouldScaleFromUnschedulable() bool {
158+
return tc.scaleFromUnschedulable
159+
}
160+
150161
func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool {
151162
_, ok := tc.explicitlyReportedTaints[taint]
152163
return ok

0 commit comments

Comments
 (0)