UPSTREAM: 8520: update node info processors to include unschedulable nodes (kubernetes#8520)

elmiko · elmiko · commit 4e426506d0f2 · 2025-10-29T10:14:47.000-04:00
* pass allNodes to node info provider Process

This change passes all the nodes to the mixed node info provider
processor that is called from `RunOnce`. The change is to allow
unschedulable and unready nodes to be processed as bad candidates during
the node info template generation.

The Process function has been updated to separate nodes into good and
bad candidates to make the filtering match the original intent.

* add --scale-from-unschedulable flag

This change introduces a flag which will instruct the CA to ignore a
node's `.spec.unschedulable` field when creating a node template for
deciding which node group to scale.
diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
@@ -1028,6 +1028,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `debugging-snapshot-enabled` | Whether the debugging snapshot of cluster autoscaler feature is enabled. | false
 | `node-delete-delay-after-taint` | How long to wait before deleting a node after tainting it. | 5 seconds
 | `enable-provisioning-requests` | Whether the clusterautoscaler will be handling the ProvisioningRequest CRs. | false
+| `scale-from-unschedulable` | Should CA ignore a node's .spec.unschedulable field when creating a node template for considering to scale a node group. | false |
 
 # Troubleshooting
 
diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
@@ -234,6 +234,9 @@ type AutoscalingOptions struct {
 	BalancingLabels []string
 	// AWSUseStaticInstanceList tells if AWS cloud provider use static instance type list or dynamically fetch from remote APIs.
 	AWSUseStaticInstanceList bool
+	// ScaleFromUnschedulable tells the autoscaler to ignore a node's .spec.unschedulable field when creating a node template.
+	// Specifically, this will cause the autoscaler to set the node template's .spec.unschedulable field to false.
+	ScaleFromUnschedulable bool
 	// GCEOptions contain autoscaling options specific to GCE cloud provider.
 	GCEOptions GCEOptions
 	// KubeClientOpts specify options for kube client
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
@@ -358,7 +358,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		return typedErr.AddPrefix("failed to initialize RemainingPdbTracker: ")
 	}
 
-	nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingContext, readyNodes, daemonsets, a.taintConfig, currentTime)
+	nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingContext, allNodes, daemonsets, a.taintConfig, currentTime)
 	if autoscalerError != nil {
 		klog.Errorf("Failed to get node infos for groups: %v", autoscalerError)
 		return autoscalerError.AddPrefix("failed to build node infos for node groups: ")
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
@@ -225,6 +225,7 @@ var (
 	balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar")
 	balancingLabelsFlag       = multiStringFlag("balancing-label", "Specifies a label to use for comparing if two node groups are similar, rather than the built in heuristics. Setting this flag disables all other comparison logic, and cannot be combined with --balancing-ignore-label.")
 	awsUseStaticInstanceList  = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. AWS only")
+	scaleFromUnschedulable    = flag.Bool("scale-from-unschedulable", false, "Specifies that the CA should ignore a node's .spec.unschedulable field in node templates when considering to scale a node group.")
 
 	// GCE specific flags
 	concurrentGceRefreshes             = flag.Int("gce-concurrent-refreshes", 1, "Maximum number of concurrent refreshes per cloud object type.")
@@ -418,6 +419,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		},
 		NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout,
 		AWSUseStaticInstanceList: *awsUseStaticInstanceList,
+		ScaleFromUnschedulable:   *scaleFromUnschedulable,
 		GCEOptions: config.GCEOptions{
 			ConcurrentRefreshes:            *concurrentGceRefreshes,
 			MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime,
diff --git a/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go b/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go
@@ -78,6 +78,18 @@ func (p *MixedTemplateNodeInfoProvider) Process(ctx *context.AutoscalingContext,
 	result := make(map[string]*framework.NodeInfo)
 	seenGroups := make(map[string]bool)
 
+	// sort nodes into those good and bad candidates for templates. the bad candidates will be processed
+	// at the end of this function as a last resort for a node info template.
+	goodCandidates := make([]*apiv1.Node, 0)
+	badCandidates := make([]*apiv1.Node, 0)
+	for _, node := range nodes {
+		if isNodeGoodTemplateCandidate(node, now) {
+			goodCandidates = append(goodCandidates, node)
+		} else {
+			badCandidates = append(badCandidates, node)
+		}
+	}
+
 	// processNode returns information whether the nodeTemplate was generated and if there was an error.
 	processNode := func(node *apiv1.Node) (bool, string, caerror.AutoscalerError) {
 		nodeGroup, err := ctx.CloudProvider.NodeGroupForNode(node)
@@ -103,11 +115,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(ctx *context.AutoscalingContext,
 		return false, "", nil
 	}
 
-	for _, node := range nodes {
-		// Broken nodes might have some stuff missing. Skipping.
-		if !isNodeGoodTemplateCandidate(node, now) {
-			continue
-		}
+	for _, node := range goodCandidates {
 		added, id, typedErr := processNode(node)
 		if typedErr != nil {
 			return map[string]*framework.NodeInfo{}, typedErr
@@ -158,11 +166,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(ctx *context.AutoscalingContext,
 	}
 
 	// Last resort - unready/unschedulable nodes.
-	for _, node := range nodes {
-		// Allowing broken nodes
-		if isNodeGoodTemplateCandidate(node, now) {
-			continue
-		}
+	for _, node := range badCandidates {
 		added, _, typedErr := processNode(node)
 		if typedErr != nil {
 			return map[string]*framework.NodeInfo{}, typedErr
diff --git a/cluster-autoscaler/simulator/node_info_utils.go b/cluster-autoscaler/simulator/node_info_utils.go
@@ -112,6 +112,12 @@ func createSanitizedNode(node *apiv1.Node, newName string, taintConfig *taints.T
 	}
 	newNode.Labels[apiv1.LabelHostname] = newName
 
+	if taintConfig != nil {
+		if taintConfig.ShouldScaleFromUnschedulable() {
+			newNode.Spec.Unschedulable = false
+		}
+	}
+
 	if taintConfig != nil {
 		newNode.Spec.Taints = taints.SanitizeTaints(newNode.Spec.Taints, *taintConfig)
 	}
diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go
@@ -96,6 +96,11 @@ type TaintConfig struct {
 	startupTaintPrefixes     []string
 	statusTaintPrefixes      []string
 	explicitlyReportedTaints TaintKeySet
+	// The scaleFromUnschedulable field helps to inform the CA when
+	// to ignore .spec.unschedulable for a node. It is being added to this
+	// struct for convenience as it will be used in similar places that check
+	// for taints to ignore.
+	scaleFromUnschedulable bool
 }
 
 // NewTaintConfig returns the taint config extracted from options
@@ -127,6 +132,7 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig {
 		startupTaintPrefixes:     []string{IgnoreTaintPrefix, StartupTaintPrefix},
 		statusTaintPrefixes:      []string{StatusTaintPrefix},
 		explicitlyReportedTaints: explicitlyReportedTaints,
+		scaleFromUnschedulable:   opts.ScaleFromUnschedulable,
 	}
 }
 
@@ -146,6 +152,11 @@ func (tc TaintConfig) IsStatusTaint(taint string) bool {
 	return matchesAnyPrefix(tc.statusTaintPrefixes, taint)
 }
 
+// ShouldScaleFromUnschedulable returns whether a node's .spec.unschedulable field should be ignored.
+func (tc TaintConfig) ShouldScaleFromUnschedulable() bool {
+	return tc.scaleFromUnschedulable
+}
+
 func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool {
 	_, ok := tc.explicitlyReportedTaints[taint]
 	return ok

Original file line number	Diff line number	Diff line change
`@@ -358,7 +358,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr`
`358`	`358`	`return typedErr.AddPrefix("failed to initialize RemainingPdbTracker: ")`
`359`	`359`	`}`
`360`	`360`
`361`		`- nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingContext, readyNodes, daemonsets, a.taintConfig, currentTime)`
	`361`	`+ nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingContext, allNodes, daemonsets, a.taintConfig, currentTime)`
`362`	`362`	`if autoscalerError != nil {`
`363`	`363`	`klog.Errorf("Failed to get node infos for groups: %v", autoscalerError)`
`364`	`364`	`return autoscalerError.AddPrefix("failed to build node infos for node groups: ")`
Original file line number	Diff line number	Diff line change
`@@ -112,6 +112,12 @@ func createSanitizedNode(node *apiv1.Node, newName string, taintConfig *taints.T`
`112`	`112`	`}`
`113`	`113`	`newNode.Labels[apiv1.LabelHostname] = newName`
`114`	`114`
	`115`	`+ if taintConfig != nil {`
	`116`	`+ if taintConfig.ShouldScaleFromUnschedulable() {`
	`117`	`+ newNode.Spec.Unschedulable = false`
	`118`	`+ }`
	`119`	`+ }`
	`120`	`+`
`115`	`121`	`if taintConfig != nil {`
`116`	`122`	`newNode.Spec.Taints = taints.SanitizeTaints(newNode.Spec.Taints, *taintConfig)`
`117`	`123`	`}`