From 56126abf2ef6b224cffca11b4a49531fb2b16b01 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 24 Feb 2026 16:39:13 +0100 Subject: [PATCH 1/5] style: refactor filterByTeinted --- scheduler/reconciler/filters.go | 337 +++++++++++++++++++++----------- 1 file changed, 227 insertions(+), 110 deletions(-) diff --git a/scheduler/reconciler/filters.go b/scheduler/reconciler/filters.go index 5ae45618418..3ce16d0c747 100644 --- a/scheduler/reconciler/filters.go +++ b/scheduler/reconciler/filters.go @@ -99,6 +99,200 @@ func (set allocSet) filterOldTerminalAllocs(a ReconcilerState) (remain, ignore a return remain, ignored } +// allocCategory represents the classification category for an allocation +type allocCategory string + +const ( + categoryUntainted allocCategory = "untainted" + categoryMigrate allocCategory = "migrate" + categoryLost allocCategory = "lost" + categoryDisconnecting allocCategory = "disconnecting" + categoryReconnecting allocCategory = "reconnecting" + categoryIgnore allocCategory = "ignore" + categoryExpiring allocCategory = "expiring" +) + +// allocContext holds all the contextual information needed to classify an allocation +type allocContext struct { + alloc *structs.Allocation + shouldReconnect bool + taintedNode *structs.Node + nodeIsTainted bool + now time.Time +} + +// classificationRule defines a single decision rule in the classification table +type classificationRule struct { + name string + condition func(ctx allocContext) bool + category allocCategory +} + +// getClassificationRules returns the ordered list of classification rules. +// Rules are evaluated in order, first match wins. +// To add new cases, simply add new rules to this list in the appropriate priority order. +var getClassificationRules = func() []classificationRule { + return []classificationRule{ + // Priority 1: Failed reconnect cases + { + name: "failed-reconnect-still-running", + condition: func(aCtx allocContext) bool { + return aCtx.shouldReconnect && + aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusRun && + aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed + }, + category: categoryReconnecting, + }, + // Priority 2: Server-terminal allocations (stopped replacements) + { + name: "server-terminal-no-reconnect", + condition: func(aCtx allocContext) bool { + return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && + aCtx.alloc.ServerTerminalStatus() + }, + category: categoryIgnore, + }, + // Priority 3: Terminal canaries that need migration + { + name: "terminal-canary-migrate", + condition: func(aCtx allocContext) bool { + return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && + aCtx.alloc.DeploymentStatus.IsCanary() && + aCtx.alloc.DesiredTransition.ShouldMigrate() + }, + category: categoryMigrate, + }, + // Priority 4: Other terminal allocations + { + name: "terminal-no-reconnect", + condition: func(aCtx allocContext) bool { + return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect + }, + category: categoryUntainted, + }, + // Priority 5: Expired allocations + { + name: "expired", + condition: func(aCtx allocContext) bool { + return aCtx.alloc.Expired(aCtx.now) + }, + category: categoryExpiring, + }, + // Priority 6: Failed reconnect marked to stop + { + name: "failed-reconnect-stopped", + condition: func(aCtx allocContext) bool { + return aCtx.shouldReconnect && + aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed && + aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusStop + }, + category: categoryIgnore, + }, + // Priority 7: Disconnected node - unknown alloc + { + name: "disconnected-node-unknown-alloc", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected && + aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown + }, + category: categoryUntainted, + }, + // Priority 8: Disconnected node - pending alloc + { + name: "disconnected-node-pending-alloc", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected && + aCtx.alloc.ClientStatus == structs.AllocClientStatusPending + }, + category: categoryLost, + }, + // Priority 9: Disconnected node - no disconnect timeout + { + name: "disconnected-node-no-timeout", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected && + aCtx.alloc.DisconnectTimeout(aCtx.now) == aCtx.now + }, + category: categoryLost, + }, + // Priority 10: Disconnected node - within grace period + { + name: "disconnected-node-grace-period", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected + }, + category: categoryDisconnecting, + }, + // Priority 11: Migrate flag set + { + name: "migrate-flag-set", + condition: func(aCtx allocContext) bool { + return aCtx.alloc.DesiredTransition.ShouldMigrate() + }, + category: categoryMigrate, + }, + // Priority 12: Untainted/ready node with reconnect + { + name: "untainted-or-ready-reconnect", + condition: func(aCtx allocContext) bool { + return (!aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady)) && + aCtx.shouldReconnect + }, + category: categoryReconnecting, + }, + // Priority 13: Untainted/ready node + { + name: "untainted-or-ready-node", + condition: func(aCtx allocContext) bool { + return !aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady) + }, + category: categoryUntainted, + }, + // Priority 14: Node GC'd (nil) + { + name: "node-gcd", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode == nil + }, + category: categoryLost, + }, + // Priority 15: Terminal node, no replace, unknown alloc + { + name: "terminal-node-no-replace-unknown", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.TerminalStatus() && + !aCtx.alloc.ReplaceOnDisconnect() && + aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown + }, + category: categoryUntainted, + }, + // Priority 16: Terminal node, no replace, running alloc + { + name: "terminal-node-no-replace-running", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.TerminalStatus() && + !aCtx.alloc.ReplaceOnDisconnect() && + aCtx.alloc.ClientStatus == structs.AllocClientStatusRunning + }, + category: categoryDisconnecting, + }, + // Priority 17: Terminal node (all other cases) + { + name: "terminal-node", + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && aCtx.taintedNode.TerminalStatus() + }, + category: categoryLost, + }, + } +} + // filterByTainted takes a set of tainted nodes and filters the allocation set // into the following groups: // 1. Those that exist on untainted nodes @@ -117,128 +311,51 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los ignore = make(allocSet) expiring = make(allocSet) + // Create a map for quick category assignment + categoryMap := map[allocCategory]*allocSet{ + categoryUntainted: &untainted, + categoryMigrate: &migrate, + categoryLost: &lost, + categoryDisconnecting: &disconnecting, + categoryReconnecting: &reconnecting, + categoryIgnore: &ignore, + categoryExpiring: &expiring, + } + + rules := getClassificationRules() + for _, alloc := range set { - shouldReconnect := false + // Build the context for classification + ctx := allocContext{ + alloc: alloc, + now: state.Now, + } - // Only compute reconnect for unknown, running, and failed since they - // need to go through the reconnect logic. + // Compute shouldReconnect - only for unknown, running, and failed allocs if alloc.ClientStatus == structs.AllocClientStatusUnknown || alloc.ClientStatus == structs.AllocClientStatusRunning || alloc.ClientStatus == structs.AllocClientStatusFailed { - shouldReconnect = alloc.NeedsToReconnect() - } - - // Failed allocs that need to be reconnected must be added to - // reconnecting so that they can be handled as a failed reconnect. - if shouldReconnect && - alloc.DesiredStatus == structs.AllocDesiredStatusRun && - alloc.ClientStatus == structs.AllocClientStatusFailed { - reconnecting[alloc.ID] = alloc - continue - } - - if alloc.TerminalStatus() && !shouldReconnect { - // Server-terminal allocs, if they should not reconnect, - // are probably stopped replacements and should be ignored - if alloc.ServerTerminalStatus() { - ignore[alloc.ID] = alloc - continue - } - - // Terminal canaries that have been marked for migration need to be - // migrated, otherwise we block deployments from progressing by - // counting them as running canaries. - if alloc.DeploymentStatus.IsCanary() && alloc.DesiredTransition.ShouldMigrate() { - migrate[alloc.ID] = alloc - continue - } - - // Terminal allocs, if not reconnect, are always untainted as they - // should never be migrated. - untainted[alloc.ID] = alloc - continue + ctx.shouldReconnect = alloc.NeedsToReconnect() } - // The alloc is expired. These end up getting added to "lost" but this - // skips having to evaluate the reconnecting logic for these allocs. - if alloc.Expired(state.Now) { - expiring[alloc.ID] = alloc - continue - } - - // Ignore failed allocs that need to be reconnected and that have been - // marked to stop by the server. - if shouldReconnect && - alloc.ClientStatus == structs.AllocClientStatusFailed && - alloc.DesiredStatus == structs.AllocDesiredStatusStop { - ignore[alloc.ID] = alloc - continue - } - - taintedNode, nodeIsTainted := state.TaintedNodes[alloc.NodeID] - if taintedNode != nil && taintedNode.Status == structs.NodeStatusDisconnected { - // ignore allocs already marked unknown, they should already have a followup eval - if alloc.ClientStatus == structs.AllocClientStatusUnknown { - untainted[alloc.ID] = alloc - continue - } - // If the alloc is pending mark it lost so it is replaced immediately. - if alloc.ClientStatus == structs.AllocClientStatusPending { - lost[alloc.ID] = alloc - continue + // Get node taint information + ctx.taintedNode, ctx.nodeIsTainted = state.TaintedNodes[alloc.NodeID] + + // Apply classification rules in order (first match wins) + classified := false + for _, rule := range rules { + if rule.condition(ctx) { + targetSet := categoryMap[rule.category] + (*targetSet)[alloc.ID] = alloc + classified = true + break } - // Allocs with nil disconnect blocks or disconnect.lost_after == 0 are lost - if alloc.DisconnectTimeout(state.Now) == state.Now { - lost[alloc.ID] = alloc - continue - } - disconnecting[alloc.ID] = alloc - continue - } - - // Non-terminal allocs that should migrate should always migrate - if alloc.DesiredTransition.ShouldMigrate() { - migrate[alloc.ID] = alloc - continue } - if !nodeIsTainted || (taintedNode != nil && taintedNode.Status == structs.NodeStatusReady) { - // Filter allocs on a node that is now re-connected to be resumed. - if shouldReconnect { - reconnecting[alloc.ID] = alloc - continue - } - - // Otherwise, Node is untainted so alloc is untainted + // Default: untainted (if no rule matched) + if !classified { untainted[alloc.ID] = alloc - continue - } - - // Allocs on GC'd (nil) or lost nodes are Lost - if taintedNode == nil { - lost[alloc.ID] = alloc - continue - } - - // Allocs on terminal nodes that can't be rescheduled need to be treated - // differently than those that can. - if taintedNode.TerminalStatus() { - if !alloc.ReplaceOnDisconnect() { - if alloc.ClientStatus == structs.AllocClientStatusUnknown { - untainted[alloc.ID] = alloc - continue - } else if alloc.ClientStatus == structs.AllocClientStatusRunning { - disconnecting[alloc.ID] = alloc - continue - } - } - - lost[alloc.ID] = alloc - continue } - - // All other allocs are untainted - untainted[alloc.ID] = alloc } return From 3280bf46550e3be180a3d607f8add101904b0530 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 24 Feb 2026 17:43:25 +0100 Subject: [PATCH 2/5] style: remove name --- scheduler/reconciler/filters.go | 271 +++++++++++++++----------------- 1 file changed, 125 insertions(+), 146 deletions(-) diff --git a/scheduler/reconciler/filters.go b/scheduler/reconciler/filters.go index 3ce16d0c747..da35d572bab 100644 --- a/scheduler/reconciler/filters.go +++ b/scheduler/reconciler/filters.go @@ -131,166 +131,147 @@ type classificationRule struct { // getClassificationRules returns the ordered list of classification rules. // Rules are evaluated in order, first match wins. // To add new cases, simply add new rules to this list in the appropriate priority order. -var getClassificationRules = func() []classificationRule { - return []classificationRule{ - // Priority 1: Failed reconnect cases - { - name: "failed-reconnect-still-running", - condition: func(aCtx allocContext) bool { - return aCtx.shouldReconnect && - aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusRun && - aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed - }, - category: categoryReconnecting, +var classificationRules = []classificationRule{ + // Priority 1: Failed reconnect cases + { + condition: func(aCtx allocContext) bool { + return aCtx.shouldReconnect && + aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusRun && + aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed }, - // Priority 2: Server-terminal allocations (stopped replacements) - { - name: "server-terminal-no-reconnect", - condition: func(aCtx allocContext) bool { - return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && - aCtx.alloc.ServerTerminalStatus() - }, - category: categoryIgnore, + category: categoryReconnecting, + }, + // Priority 2: Server-terminal allocations (stopped replacements) + { + condition: func(aCtx allocContext) bool { + return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && + aCtx.alloc.ServerTerminalStatus() }, - // Priority 3: Terminal canaries that need migration - { - name: "terminal-canary-migrate", - condition: func(aCtx allocContext) bool { - return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && - aCtx.alloc.DeploymentStatus.IsCanary() && - aCtx.alloc.DesiredTransition.ShouldMigrate() - }, - category: categoryMigrate, + category: categoryIgnore, + }, + // Priority 3: Terminal canaries that need migration + { + condition: func(aCtx allocContext) bool { + return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && + aCtx.alloc.DeploymentStatus.IsCanary() && + aCtx.alloc.DesiredTransition.ShouldMigrate() }, - // Priority 4: Other terminal allocations - { - name: "terminal-no-reconnect", - condition: func(aCtx allocContext) bool { - return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect - }, - category: categoryUntainted, + category: categoryMigrate, + }, + // Priority 4: Other terminal allocations + { + condition: func(aCtx allocContext) bool { + return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect }, - // Priority 5: Expired allocations - { - name: "expired", - condition: func(aCtx allocContext) bool { - return aCtx.alloc.Expired(aCtx.now) - }, - category: categoryExpiring, + category: categoryUntainted, + }, + // Priority 5: Expired allocations + { + condition: func(aCtx allocContext) bool { + return aCtx.alloc.Expired(aCtx.now) }, - // Priority 6: Failed reconnect marked to stop - { - name: "failed-reconnect-stopped", - condition: func(aCtx allocContext) bool { - return aCtx.shouldReconnect && - aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed && - aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusStop - }, - category: categoryIgnore, + category: categoryExpiring, + }, + // Priority 6: Failed reconnect marked to stop + { + condition: func(aCtx allocContext) bool { + return aCtx.shouldReconnect && + aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed && + aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusStop }, - // Priority 7: Disconnected node - unknown alloc - { - name: "disconnected-node-unknown-alloc", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected && - aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown - }, - category: categoryUntainted, + category: categoryIgnore, + }, + // Priority 7: Disconnected node - unknown alloc + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected && + aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown }, - // Priority 8: Disconnected node - pending alloc - { - name: "disconnected-node-pending-alloc", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected && - aCtx.alloc.ClientStatus == structs.AllocClientStatusPending - }, - category: categoryLost, + category: categoryUntainted, + }, + // Priority 8: Disconnected node - pending alloc + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected && + aCtx.alloc.ClientStatus == structs.AllocClientStatusPending }, - // Priority 9: Disconnected node - no disconnect timeout - { - name: "disconnected-node-no-timeout", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected && - aCtx.alloc.DisconnectTimeout(aCtx.now) == aCtx.now - }, - category: categoryLost, + category: categoryLost, + }, + // Priority 9: Disconnected node - no disconnect timeout + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected && + aCtx.alloc.DisconnectTimeout(aCtx.now) == aCtx.now }, - // Priority 10: Disconnected node - within grace period - { - name: "disconnected-node-grace-period", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected - }, - category: categoryDisconnecting, + category: categoryLost, + }, + // Priority 10: Disconnected node - within grace period + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusDisconnected }, - // Priority 11: Migrate flag set - { - name: "migrate-flag-set", - condition: func(aCtx allocContext) bool { - return aCtx.alloc.DesiredTransition.ShouldMigrate() - }, - category: categoryMigrate, + category: categoryDisconnecting, + }, + // Priority 11: Migrate flag set + { + condition: func(aCtx allocContext) bool { + return aCtx.alloc.DesiredTransition.ShouldMigrate() }, - // Priority 12: Untainted/ready node with reconnect - { - name: "untainted-or-ready-reconnect", - condition: func(aCtx allocContext) bool { - return (!aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady)) && - aCtx.shouldReconnect - }, - category: categoryReconnecting, + category: categoryMigrate, + }, + // Priority 12: Untainted/ready node with reconnect + { + condition: func(aCtx allocContext) bool { + return (!aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady)) && + aCtx.shouldReconnect }, - // Priority 13: Untainted/ready node - { - name: "untainted-or-ready-node", - condition: func(aCtx allocContext) bool { - return !aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady) - }, - category: categoryUntainted, + category: categoryReconnecting, + }, + // Priority 13: Untainted/ready node + { + condition: func(aCtx allocContext) bool { + return !aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady) }, - // Priority 14: Node GC'd (nil) - { - name: "node-gcd", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode == nil - }, - category: categoryLost, + category: categoryUntainted, + }, + // Priority 14: Node GC'd (nil) + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode == nil }, - // Priority 15: Terminal node, no replace, unknown alloc - { - name: "terminal-node-no-replace-unknown", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.TerminalStatus() && - !aCtx.alloc.ReplaceOnDisconnect() && - aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown - }, - category: categoryUntainted, + category: categoryLost, + }, + // Priority 15: Terminal node, no replace, unknown alloc + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.TerminalStatus() && + !aCtx.alloc.ReplaceOnDisconnect() && + aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown }, - // Priority 16: Terminal node, no replace, running alloc - { - name: "terminal-node-no-replace-running", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.TerminalStatus() && - !aCtx.alloc.ReplaceOnDisconnect() && - aCtx.alloc.ClientStatus == structs.AllocClientStatusRunning - }, - category: categoryDisconnecting, + category: categoryUntainted, + }, + // Priority 16: Terminal node, no replace, running alloc + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && + aCtx.taintedNode.TerminalStatus() && + !aCtx.alloc.ReplaceOnDisconnect() && + aCtx.alloc.ClientStatus == structs.AllocClientStatusRunning }, - // Priority 17: Terminal node (all other cases) - { - name: "terminal-node", - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && aCtx.taintedNode.TerminalStatus() - }, - category: categoryLost, + category: categoryDisconnecting, + }, + // Priority 17: Terminal node (all other cases) + { + condition: func(aCtx allocContext) bool { + return aCtx.taintedNode != nil && aCtx.taintedNode.TerminalStatus() }, - } + category: categoryLost, + }, } // filterByTainted takes a set of tainted nodes and filters the allocation set @@ -322,8 +303,6 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los categoryExpiring: &expiring, } - rules := getClassificationRules() - for _, alloc := range set { // Build the context for classification ctx := allocContext{ @@ -343,7 +322,7 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los // Apply classification rules in order (first match wins) classified := false - for _, rule := range rules { + for _, rule := range classificationRules { if rule.condition(ctx) { targetSet := categoryMap[rule.category] (*targetSet)[alloc.ID] = alloc From c5bc99b68dc006ddbe63c3af0e36996b816d5a49 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 24 Feb 2026 18:23:44 +0100 Subject: [PATCH 3/5] fix: remove duplicated information and declare de classification rules statically, remove the rule's names --- scheduler/reconciler/filters.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scheduler/reconciler/filters.go b/scheduler/reconciler/filters.go index da35d572bab..d6562f1f7cb 100644 --- a/scheduler/reconciler/filters.go +++ b/scheduler/reconciler/filters.go @@ -117,7 +117,6 @@ type allocContext struct { alloc *structs.Allocation shouldReconnect bool taintedNode *structs.Node - nodeIsTainted bool now time.Time } @@ -226,7 +225,8 @@ var classificationRules = []classificationRule{ // Priority 12: Untainted/ready node with reconnect { condition: func(aCtx allocContext) bool { - return (!aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady)) && + return (aCtx.taintedNode == nil || (aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusReady)) && aCtx.shouldReconnect }, category: categoryReconnecting, @@ -234,7 +234,8 @@ var classificationRules = []classificationRule{ // Priority 13: Untainted/ready node { condition: func(aCtx allocContext) bool { - return !aCtx.nodeIsTainted || (aCtx.taintedNode != nil && aCtx.taintedNode.Status == structs.NodeStatusReady) + return aCtx.taintedNode == nil || (aCtx.taintedNode != nil && + aCtx.taintedNode.Status == structs.NodeStatusReady) }, category: categoryUntainted, }, @@ -318,7 +319,7 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los } // Get node taint information - ctx.taintedNode, ctx.nodeIsTainted = state.TaintedNodes[alloc.NodeID] + ctx.taintedNode, _ = state.TaintedNodes[alloc.NodeID] // Apply classification rules in order (first match wins) classified := false From fbd20afea6a5e048a5e698e8592e780300015015 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 1 May 2026 13:10:11 +0200 Subject: [PATCH 4/5] tests: add testing for filter function --- scheduler/reconciler/filters_test.go | 200 +++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) diff --git a/scheduler/reconciler/filters_test.go b/scheduler/reconciler/filters_test.go index c54c94b6739..9f5b28a2d7a 100644 --- a/scheduler/reconciler/filters_test.go +++ b/scheduler/reconciler/filters_test.go @@ -5,10 +5,12 @@ package reconciler import ( "testing" + "time" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/shoenig/test/must" + "github.com/stretchr/testify/require" ) func TestReconciler_filterServerTerminalAllocs(t *testing.T) { @@ -72,3 +74,201 @@ func TestReconciler_filterServerTerminalAllocs(t *testing.T) { must.MapLen(t, 6, filtered) }) } + +type fakeAlloc struct { + ID string + NodeID string + ClientStatus string + DesiredStatus string + terminal bool + serverTerminal bool + canary bool + shouldMigrate bool + expired bool + reconnect bool + disconnectNow bool + replaceOnDisc bool +} + +func (a fakeAlloc) NeedsToReconnect() bool { return a.reconnect } +func (a fakeAlloc) TerminalStatus() bool { return a.terminal } +func (a fakeAlloc) ServerTerminalStatus() bool { + return a.serverTerminal +} + +func (a fakeAlloc) Expired(now time.Time) bool { return a.expired } +func (a fakeAlloc) DisconnectTimeout(now time.Time) time.Time { + if a.disconnectNow { + return now + } + return now.Add(time.Hour) +} + +func (a fakeAlloc) ReplaceOnDisconnect() bool { return a.replaceOnDisc } + +/* +func (a fakeAlloc) DesiredTransition() fakeTransition { + return fakeTransition{a.shouldMigrate} +} +*/ + +func TestAllocSet_FilterByTainted(t *testing.T) { + now := time.Now() + + nodes := map[string]*structs.Node{ + "draining": { + ID: "draining", + DrainStrategy: mock.DrainNode().DrainStrategy, + }, + "down": { + ID: "down", + Status: structs.NodeStatusDown, + }, + "nil": nil, + "normal": { + ID: "normal", + Status: structs.NodeStatusReady, + }, + "disconnected": { + ID: "disconnected", + Status: structs.NodeStatusDisconnected, + }, + } + + type testCase struct { + name string + alloc *structs.Allocation + expected string + } + + tests := []testCase{ + { + name: "failed reconnecting", + alloc: &structs.Allocation{ + ID: "a1", + NodeID: "normal", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + }, + expected: "reconnecting", + }, + /* { + name: "terminal server -> ignore", + alloc: &structs.Allocation{ + ID: "a2", + NodeID: "normal", + ClientStatus: structs.AllocClientStatusComplete, + }, + expected: "ignore", + }, + { + name: "terminal canary migrate", + alloc: &structs.Allocation{ + ID: "a3", + NodeID: "normal", + DeploymentStatus: &structs.AllocDeploymentStatus{ + Canary: true, + }, + DesiredTransition: structs.DesiredTransition{ + Migrate: pointer.Of(true), + }, + ClientStatus: structs.AllocClientStatusComplete, + }, + expected: "migrate", + }, + { + name: "expired -> expiring", + alloc: &structs.Allocation{ + ID: "a4", + NodeID: "normal", + }, + expected: "expiring", + }, + { + name: "disconnected pending -> lost", + alloc: &structs.Allocation{ + ID: "a5", + NodeID: "disconnected", + ClientStatus: structs.AllocClientStatusPending, + }, + expected: "lost", + }, + { + name: "disconnected -> disconnecting", + alloc: &structs.Allocation{ + ID: "a6", + NodeID: "disconnected", + ClientStatus: structs.AllocClientStatusRunning, + }, + expected: "disconnecting", + }, + { + name: "should migrate", + alloc: &structs.Allocation{ + ID: "a7", + NodeID: "normal", + DesiredTransition: structs.DesiredTransition{ + Migrate: pointer.Of(true), + }, + }, + expected: "migrate", + }, + { + name: "untainted normal", + alloc: &structs.Allocation{ + ID: "a8", + NodeID: "normal", + }, + expected: "untainted", + }, + { + name: "nil node -> lost", + alloc: &structs.Allocation{ + ID: "a9", + NodeID: "nil", + }, + expected: "lost", + }, + { + name: "down node -> lost", + alloc: &structs.Allocation{ + ID: "a10", + NodeID: "down", + }, + expected: "lost", + }, */ + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + set := allocSet{ + tt.alloc.ID: tt.alloc, + } + + state := ClusterState{ + Now: now, + TaintedNodes: nodes, + } + + u, m, l, d, r, i, e := set.filterByTainted(state) + + buckets := map[string]allocSet{ + "untainted": u, + "migrate": m, + "lost": l, + "disconnecting": d, + "reconnecting": r, + "ignore": i, + "expiring": e, + } + + for name, bucket := range buckets { + if name == tt.expected { + require.Contains(t, bucket, tt.alloc.ID) + } else { + require.NotContains(t, bucket, tt.alloc.ID) + } + } + }) + } +} From 8e493f6743f9f23d7e7868ca0865f71d17aa4c89 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 1 May 2026 15:22:03 +0200 Subject: [PATCH 5/5] style: rewrite tests before refatcor and use them for verification --- scheduler/reconciler/filters.go | 168 +++++++------- scheduler/reconciler/filters_test.go | 322 +++++++++++++++------------ 2 files changed, 258 insertions(+), 232 deletions(-) diff --git a/scheduler/reconciler/filters.go b/scheduler/reconciler/filters.go index d6562f1f7cb..17c135baad5 100644 --- a/scheduler/reconciler/filters.go +++ b/scheduler/reconciler/filters.go @@ -99,7 +99,7 @@ func (set allocSet) filterOldTerminalAllocs(a ReconcilerState) (remain, ignore a return remain, ignored } -// allocCategory represents the classification category for an allocation +// allocCategory represents the classification bucket for an allocation. type allocCategory string const ( @@ -112,164 +112,153 @@ const ( categoryExpiring allocCategory = "expiring" ) -// allocContext holds all the contextual information needed to classify an allocation +// allocContext holds the per-allocation data used by classification rules. type allocContext struct { alloc *structs.Allocation shouldReconnect bool taintedNode *structs.Node + nodeIsTainted bool now time.Time } -// classificationRule defines a single decision rule in the classification table type classificationRule struct { - name string - condition func(ctx allocContext) bool + condition func(allocContext) bool category allocCategory } -// getClassificationRules returns the ordered list of classification rules. -// Rules are evaluated in order, first match wins. -// To add new cases, simply add new rules to this list in the appropriate priority order. +// classificationRules are evaluated in order; first match wins. var classificationRules = []classificationRule{ - // Priority 1: Failed reconnect cases + // Failed allocs that need reconnect and are still desired to run. { - condition: func(aCtx allocContext) bool { - return aCtx.shouldReconnect && - aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusRun && - aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed + condition: func(ctx allocContext) bool { + return ctx.shouldReconnect && + ctx.alloc.DesiredStatus == structs.AllocDesiredStatusRun && + ctx.alloc.ClientStatus == structs.AllocClientStatusFailed }, category: categoryReconnecting, }, - // Priority 2: Server-terminal allocations (stopped replacements) + // Server-terminal allocs should be ignored when they are not reconnecting. { - condition: func(aCtx allocContext) bool { - return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && - aCtx.alloc.ServerTerminalStatus() + condition: func(ctx allocContext) bool { + return ctx.alloc.TerminalStatus() && !ctx.shouldReconnect && ctx.alloc.ServerTerminalStatus() }, category: categoryIgnore, }, - // Priority 3: Terminal canaries that need migration + // Terminal canaries marked for migration. { - condition: func(aCtx allocContext) bool { - return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect && - aCtx.alloc.DeploymentStatus.IsCanary() && - aCtx.alloc.DesiredTransition.ShouldMigrate() + condition: func(ctx allocContext) bool { + return ctx.alloc.TerminalStatus() && !ctx.shouldReconnect && + ctx.alloc.DeploymentStatus.IsCanary() && ctx.alloc.DesiredTransition.ShouldMigrate() }, category: categoryMigrate, }, - // Priority 4: Other terminal allocations + // Terminal allocs that are not reconnecting are untainted. { - condition: func(aCtx allocContext) bool { - return aCtx.alloc.TerminalStatus() && !aCtx.shouldReconnect + condition: func(ctx allocContext) bool { + return ctx.alloc.TerminalStatus() && !ctx.shouldReconnect }, category: categoryUntainted, }, - // Priority 5: Expired allocations + // Expired allocs are handled before reconnecting/disconnect paths. { - condition: func(aCtx allocContext) bool { - return aCtx.alloc.Expired(aCtx.now) + condition: func(ctx allocContext) bool { + return ctx.alloc.Expired(ctx.now) }, category: categoryExpiring, }, - // Priority 6: Failed reconnect marked to stop + // Failed reconnects that server already marked to stop should be ignored. { - condition: func(aCtx allocContext) bool { - return aCtx.shouldReconnect && - aCtx.alloc.ClientStatus == structs.AllocClientStatusFailed && - aCtx.alloc.DesiredStatus == structs.AllocDesiredStatusStop + condition: func(ctx allocContext) bool { + return ctx.shouldReconnect && + ctx.alloc.ClientStatus == structs.AllocClientStatusFailed && + ctx.alloc.DesiredStatus == structs.AllocDesiredStatusStop }, category: categoryIgnore, }, - // Priority 7: Disconnected node - unknown alloc + // Disconnected node and alloc already unknown. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected && - aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown + condition: func(ctx allocContext) bool { + return ctx.taintedNode != nil && + ctx.taintedNode.Status == structs.NodeStatusDisconnected && + ctx.alloc.ClientStatus == structs.AllocClientStatusUnknown }, category: categoryUntainted, }, - // Priority 8: Disconnected node - pending alloc + // Disconnected pending allocs should be replaced immediately. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected && - aCtx.alloc.ClientStatus == structs.AllocClientStatusPending + condition: func(ctx allocContext) bool { + return ctx.taintedNode != nil && + ctx.taintedNode.Status == structs.NodeStatusDisconnected && + ctx.alloc.ClientStatus == structs.AllocClientStatusPending }, category: categoryLost, }, - // Priority 9: Disconnected node - no disconnect timeout + // Disconnected allocs with nil disconnect block or lost_after=0 are lost. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected && - aCtx.alloc.DisconnectTimeout(aCtx.now) == aCtx.now + condition: func(ctx allocContext) bool { + return ctx.taintedNode != nil && + ctx.taintedNode.Status == structs.NodeStatusDisconnected && + ctx.alloc.DisconnectTimeout(ctx.now) == ctx.now }, category: categoryLost, }, - // Priority 10: Disconnected node - within grace period + // Remaining disconnected allocs are disconnecting. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusDisconnected + condition: func(ctx allocContext) bool { + return ctx.taintedNode != nil && ctx.taintedNode.Status == structs.NodeStatusDisconnected }, category: categoryDisconnecting, }, - // Priority 11: Migrate flag set + // Non-terminal allocs marked for migration always migrate. { - condition: func(aCtx allocContext) bool { - return aCtx.alloc.DesiredTransition.ShouldMigrate() + condition: func(ctx allocContext) bool { + return ctx.alloc.DesiredTransition.ShouldMigrate() }, category: categoryMigrate, }, - // Priority 12: Untainted/ready node with reconnect + // Ready or untainted nodes with reconnecting allocs. { - condition: func(aCtx allocContext) bool { - return (aCtx.taintedNode == nil || (aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusReady)) && - aCtx.shouldReconnect + condition: func(ctx allocContext) bool { + return (!ctx.nodeIsTainted || (ctx.taintedNode != nil && ctx.taintedNode.Status == structs.NodeStatusReady)) && + ctx.shouldReconnect }, category: categoryReconnecting, }, - // Priority 13: Untainted/ready node + // Ready or untainted nodes. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode == nil || (aCtx.taintedNode != nil && - aCtx.taintedNode.Status == structs.NodeStatusReady) + condition: func(ctx allocContext) bool { + return !ctx.nodeIsTainted || (ctx.taintedNode != nil && ctx.taintedNode.Status == structs.NodeStatusReady) }, category: categoryUntainted, }, - // Priority 14: Node GC'd (nil) + // GC'd (tainted map entry with nil node) nodes are lost. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode == nil + condition: func(ctx allocContext) bool { + return ctx.taintedNode == nil }, category: categoryLost, }, - // Priority 15: Terminal node, no replace, unknown alloc + // Terminal node allocations that cannot be replaced. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.TerminalStatus() && - !aCtx.alloc.ReplaceOnDisconnect() && - aCtx.alloc.ClientStatus == structs.AllocClientStatusUnknown + condition: func(ctx allocContext) bool { + return ctx.taintedNode.TerminalStatus() && + !ctx.alloc.ReplaceOnDisconnect() && + ctx.alloc.ClientStatus == structs.AllocClientStatusUnknown }, category: categoryUntainted, }, - // Priority 16: Terminal node, no replace, running alloc { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && - aCtx.taintedNode.TerminalStatus() && - !aCtx.alloc.ReplaceOnDisconnect() && - aCtx.alloc.ClientStatus == structs.AllocClientStatusRunning + condition: func(ctx allocContext) bool { + return ctx.taintedNode.TerminalStatus() && + !ctx.alloc.ReplaceOnDisconnect() && + ctx.alloc.ClientStatus == structs.AllocClientStatusRunning }, category: categoryDisconnecting, }, - // Priority 17: Terminal node (all other cases) + // Remaining terminal node allocs are lost. { - condition: func(aCtx allocContext) bool { - return aCtx.taintedNode != nil && aCtx.taintedNode.TerminalStatus() + condition: func(ctx allocContext) bool { + return ctx.taintedNode.TerminalStatus() }, category: categoryLost, }, @@ -293,7 +282,7 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los ignore = make(allocSet) expiring = make(allocSet) - // Create a map for quick category assignment + // Create a map for quick category assignment. categoryMap := map[allocCategory]*allocSet{ categoryUntainted: &untainted, categoryMigrate: &migrate, @@ -305,23 +294,24 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los } for _, alloc := range set { - // Build the context for classification + // Build context for classification rules. ctx := allocContext{ alloc: alloc, now: state.Now, } - // Compute shouldReconnect - only for unknown, running, and failed allocs + // Compute shouldReconnect for unknown/running/failed allocs. if alloc.ClientStatus == structs.AllocClientStatusUnknown || alloc.ClientStatus == structs.AllocClientStatusRunning || alloc.ClientStatus == structs.AllocClientStatusFailed { ctx.shouldReconnect = alloc.NeedsToReconnect() } - // Get node taint information - ctx.taintedNode, _ = state.TaintedNodes[alloc.NodeID] + // Get node taint information, preserving whether key existed so we can + // distinguish missing-node from GC'd-node semantics. + ctx.taintedNode, ctx.nodeIsTainted = state.TaintedNodes[alloc.NodeID] - // Apply classification rules in order (first match wins) + // Apply classification rules in order (first match wins). classified := false for _, rule := range classificationRules { if rule.condition(ctx) { @@ -332,7 +322,7 @@ func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, los } } - // Default: untainted (if no rule matched) + // Default category if no rule matched. if !classified { untainted[alloc.ID] = alloc } diff --git a/scheduler/reconciler/filters_test.go b/scheduler/reconciler/filters_test.go index 9f5b28a2d7a..4649ea59195 100644 --- a/scheduler/reconciler/filters_test.go +++ b/scheduler/reconciler/filters_test.go @@ -7,10 +7,10 @@ import ( "testing" "time" + "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/shoenig/test/must" - "github.com/stretchr/testify/require" ) func TestReconciler_filterServerTerminalAllocs(t *testing.T) { @@ -75,199 +75,235 @@ func TestReconciler_filterServerTerminalAllocs(t *testing.T) { }) } -type fakeAlloc struct { - ID string - NodeID string - ClientStatus string - DesiredStatus string - terminal bool - serverTerminal bool - canary bool - shouldMigrate bool - expired bool - reconnect bool - disconnectNow bool - replaceOnDisc bool -} - -func (a fakeAlloc) NeedsToReconnect() bool { return a.reconnect } -func (a fakeAlloc) TerminalStatus() bool { return a.terminal } -func (a fakeAlloc) ServerTerminalStatus() bool { - return a.serverTerminal -} - -func (a fakeAlloc) Expired(now time.Time) bool { return a.expired } -func (a fakeAlloc) DisconnectTimeout(now time.Time) time.Time { - if a.disconnectNow { - return now - } - return now.Add(time.Hour) -} - -func (a fakeAlloc) ReplaceOnDisconnect() bool { return a.replaceOnDisc } - -/* -func (a fakeAlloc) DesiredTransition() fakeTransition { - return fakeTransition{a.shouldMigrate} -} -*/ - -func TestAllocSet_FilterByTainted(t *testing.T) { +func TestAllocSet_filterByTainted_ClassificationRules(t *testing.T) { now := time.Now() nodes := map[string]*structs.Node{ - "draining": { - ID: "draining", - DrainStrategy: mock.DrainNode().DrainStrategy, - }, - "down": { - ID: "down", - Status: structs.NodeStatusDown, - }, - "nil": nil, - "normal": { - ID: "normal", + "ready": { + ID: "ready", Status: structs.NodeStatusReady, }, "disconnected": { ID: "disconnected", Status: structs.NodeStatusDisconnected, }, + "down": { + ID: "down", + Status: structs.NodeStatusDown, + }, + "initializing": { + ID: "initializing", + Status: structs.NodeStatusInit, + }, + "gc": nil, + } + + setDisconnect := func(alloc *structs.Allocation, lostAfter time.Duration, replace bool) { + alloc.Job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ + LostAfter: lostAfter, + Replace: pointer.Of(replace), + } } - type testCase struct { + makeAlloc := func(id, nodeID, clientStatus, desiredStatus string) *structs.Allocation { + alloc := mock.Alloc() + alloc.ID = id + alloc.NodeID = nodeID + alloc.ClientStatus = clientStatus + alloc.DesiredStatus = desiredStatus + alloc.AllocStates = nil + alloc.DeploymentStatus = nil + alloc.DesiredTransition = structs.DesiredTransition{} + setDisconnect(alloc, 5*time.Minute, true) + return alloc + } + + unknownState := []*structs.AllocState{{ + Field: structs.AllocStateFieldClientStatus, + Value: structs.AllocClientStatusUnknown, + Time: now.Add(-10 * time.Second), + }} + + testCases := []struct { name string alloc *structs.Allocation + nodeMap map[string]*structs.Node expected string - } - - tests := []testCase{ + }{ { - name: "failed reconnecting", - alloc: &structs.Allocation{ - ID: "a1", - NodeID: "normal", - ClientStatus: structs.AllocClientStatusFailed, - DesiredStatus: structs.AllocDesiredStatusRun, - }, + name: "failed reconnect run", + alloc: func() *structs.Allocation { + a := makeAlloc("c1", "ready", structs.AllocClientStatusFailed, structs.AllocDesiredStatusRun) + a.AllocStates = unknownState + return a + }(), + nodeMap: nodes, expected: "reconnecting", }, - /* { - name: "terminal server -> ignore", - alloc: &structs.Allocation{ - ID: "a2", - NodeID: "normal", - ClientStatus: structs.AllocClientStatusComplete, - }, + { + name: "terminal server status ignored", + alloc: makeAlloc("c2", "ready", structs.AllocClientStatusRunning, structs.AllocDesiredStatusStop), + nodeMap: nodes, expected: "ignore", }, { name: "terminal canary migrate", - alloc: &structs.Allocation{ - ID: "a3", - NodeID: "normal", - DeploymentStatus: &structs.AllocDeploymentStatus{ - Canary: true, - }, - DesiredTransition: structs.DesiredTransition{ - Migrate: pointer.Of(true), - }, - ClientStatus: structs.AllocClientStatusComplete, - }, + alloc: func() *structs.Allocation { + a := makeAlloc("c3", "ready", structs.AllocClientStatusComplete, structs.AllocDesiredStatusRun) + a.DeploymentStatus = &structs.AllocDeploymentStatus{Canary: true} + a.DesiredTransition = structs.DesiredTransition{Migrate: pointer.Of(true)} + return a + }(), + nodeMap: nodes, expected: "migrate", }, { - name: "expired -> expiring", - alloc: &structs.Allocation{ - ID: "a4", - NodeID: "normal", - }, + name: "terminal untainted", + alloc: makeAlloc("c4", "ready", structs.AllocClientStatusComplete, structs.AllocDesiredStatusRun), + nodeMap: nodes, + expected: "untainted", + }, + { + name: "expired alloc", + alloc: func() *structs.Allocation { + a := makeAlloc("c5", "ready", structs.AllocClientStatusUnknown, structs.AllocDesiredStatusRun) + setDisconnect(a, 1*time.Second, true) + a.AllocStates = unknownState + return a + }(), + nodeMap: nodes, expected: "expiring", }, { - name: "disconnected pending -> lost", - alloc: &structs.Allocation{ - ID: "a5", - NodeID: "disconnected", - ClientStatus: structs.AllocClientStatusPending, - }, + name: "failed reconnect stop ignored", + alloc: func() *structs.Allocation { + a := makeAlloc("c6", "ready", structs.AllocClientStatusFailed, structs.AllocDesiredStatusStop) + a.AllocStates = unknownState + return a + }(), + nodeMap: nodes, + expected: "ignore", + }, + { + name: "disconnected unknown becomes untainted", + alloc: makeAlloc("c7", "disconnected", structs.AllocClientStatusUnknown, structs.AllocDesiredStatusRun), + nodeMap: nodes, + expected: "untainted", + }, + { + name: "disconnected pending lost", + alloc: makeAlloc("c8", "disconnected", structs.AllocClientStatusPending, structs.AllocDesiredStatusRun), + nodeMap: nodes, + expected: "lost", + }, + { + name: "disconnected zero timeout lost", + alloc: func() *structs.Allocation { + a := makeAlloc("c9", "disconnected", structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun) + a.Job = nil + return a + }(), + nodeMap: nodes, expected: "lost", }, { - name: "disconnected -> disconnecting", - alloc: &structs.Allocation{ - ID: "a6", - NodeID: "disconnected", - ClientStatus: structs.AllocClientStatusRunning, - }, + name: "disconnected grace period", + alloc: makeAlloc("c10", "disconnected", structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun), + nodeMap: nodes, expected: "disconnecting", }, { - name: "should migrate", - alloc: &structs.Allocation{ - ID: "a7", - NodeID: "normal", - DesiredTransition: structs.DesiredTransition{ - Migrate: pointer.Of(true), - }, - }, + name: "migrate flag", + alloc: func() *structs.Allocation { + a := makeAlloc("c11", "ready", structs.AllocClientStatusPending, structs.AllocDesiredStatusRun) + a.DesiredTransition = structs.DesiredTransition{Migrate: pointer.Of(true)} + return a + }(), + nodeMap: nodes, expected: "migrate", }, { - name: "untainted normal", - alloc: &structs.Allocation{ - ID: "a8", - NodeID: "normal", - }, + name: "untainted reconnecting via ready node", + alloc: func() *structs.Allocation { + a := makeAlloc("c12", "ready", structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun) + a.AllocStates = unknownState + return a + }(), + nodeMap: nodes, + expected: "reconnecting", + }, + { + name: "untainted on non tainted node", + alloc: makeAlloc("c13", "missing", structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun), + nodeMap: nodes, expected: "untainted", }, { - name: "nil node -> lost", - alloc: &structs.Allocation{ - ID: "a9", - NodeID: "nil", - }, + name: "gc node lost", + alloc: makeAlloc("c14", "gc", structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun), + nodeMap: nodes, expected: "lost", }, { - name: "down node -> lost", - alloc: &structs.Allocation{ - ID: "a10", - NodeID: "down", - }, + name: "terminal node unknown no replace", + alloc: func() *structs.Allocation { + a := makeAlloc("c15", "down", structs.AllocClientStatusUnknown, structs.AllocDesiredStatusRun) + setDisconnect(a, 5*time.Minute, false) + return a + }(), + nodeMap: nodes, + expected: "untainted", + }, + { + name: "terminal node running no replace", + alloc: func() *structs.Allocation { + a := makeAlloc("c16", "down", structs.AllocClientStatusRunning, structs.AllocDesiredStatusRun) + setDisconnect(a, 5*time.Minute, false) + return a + }(), + nodeMap: nodes, + expected: "disconnecting", + }, + { + name: "terminal node default lost", + alloc: makeAlloc("c17", "down", structs.AllocClientStatusPending, structs.AllocDesiredStatusRun), + nodeMap: nodes, expected: "lost", - }, */ + }, + { + name: "other tainted node defaults to untainted", + alloc: makeAlloc("c18", "initializing", structs.AllocClientStatusPending, structs.AllocDesiredStatusRun), + nodeMap: nodes, + expected: "untainted", + }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - set := allocSet{ - tt.alloc.ID: tt.alloc, - } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + all := allocSet{tc.alloc.ID: tc.alloc} + state := ClusterState{Now: now, TaintedNodes: tc.nodeMap} - state := ClusterState{ - Now: now, - TaintedNodes: nodes, - } - - u, m, l, d, r, i, e := set.filterByTainted(state) + untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := all.filterByTainted(state) buckets := map[string]allocSet{ - "untainted": u, - "migrate": m, - "lost": l, - "disconnecting": d, - "reconnecting": r, - "ignore": i, - "expiring": e, + "untainted": untainted, + "migrate": migrate, + "lost": lost, + "disconnecting": disconnecting, + "reconnecting": reconnecting, + "ignore": ignore, + "expiring": expiring, } - for name, bucket := range buckets { - if name == tt.expected { - require.Contains(t, bucket, tt.alloc.ID) - } else { - require.NotContains(t, bucket, tt.alloc.ID) + for category, bucket := range buckets { + if category == tc.expected { + must.MapContainsKey(t, bucket, tc.alloc.ID) + must.MapLen(t, 1, bucket) + continue } + + must.MapNotContainsKey(t, bucket, tc.alloc.ID) + must.MapLen(t, 0, bucket) } }) }