diff --git a/api/tasks.go b/api/tasks.go index a0624df90dc..a2aebd8a76c 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -507,6 +507,7 @@ type TaskGroup struct { Meta map[string]string `hcl:"meta,block"` Services []*Service `hcl:"service,block"` ShutdownDelay *time.Duration `mapstructure:"shutdown_delay" hcl:"shutdown_delay,optional"` + MaxRunDuration *time.Duration `mapstructure:"max_run_duration" hcl:"max_run_duration,optional"` // Deprecated: StopAfterClientDisconnect is deprecated in Nomad 1.8 and ignored in Nomad 1.10. Use Disconnect.StopOnClientAfter. StopAfterClientDisconnect *time.Duration `mapstructure:"stop_after_client_disconnect" hcl:"stop_after_client_disconnect,optional"` // Deprecated: MaxClientDisconnect is deprecated in Nomad 1.8.0 and ignored in Nomad 1.10. Use Disconnect.LostAfter. diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index e0cb0d7daa8..e289ea27c65 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -737,8 +737,14 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { return nil } - return structs.NewTaskEvent(structs.TaskKilling). + event := structs.NewTaskEvent(structs.TaskKilling). SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) + + if ar.state.MaxRunDurationExceeded { + event.SetDisplayMessage(structs.AllocTimeoutReasonMaxRunDuration) + } + + return event } // Kill leader first, synchronously @@ -809,6 +815,23 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { } wg.Wait() + // Skip poststop tasks entirely when max_run_duration has been exceeded so + // they are not started after the allocation has timed out. + if ar.state.MaxRunDurationExceeded { + for name, tr := range ar.tasks { + if !tr.IsPoststopTask() { + continue + } + + state := tr.TaskState() + if state != nil { + states[name] = state + } + } + + return states + } + // Perform no action on post stop tasks, but retain their states if they exist. This // commonly happens at the time of alloc GC from the client node. for name, tr := range ar.tasks { @@ -845,7 +868,10 @@ func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *st } // Compute the ClientStatus - if ar.state.ClientStatus != "" { + if ar.state.MaxRunDurationExceeded { + a.ClientStatus = structs.AllocClientStatusComplete + a.ClientDescription = structs.AllocTimeoutReasonMaxRunDuration + } else if ar.state.ClientStatus != "" { // The client status is being forced a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription } else { @@ -983,7 +1009,7 @@ func (ar *allocRunner) AllocState() *state.State { // If TaskStateUpdated has not been called yet, ar.state.TaskStates // won't be set as it is not the canonical source of TaskStates. 
if len(state.TaskStates) == 0 { - ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks)) + state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks)) for k, tr := range ar.tasks { state.TaskStates[k] = tr.TaskState() } @@ -1081,6 +1107,27 @@ func (ar *allocRunner) Listener() *cstructs.AllocListener { return ar.allocBroadcaster.Listen() } +func (ar *allocRunner) EnforceMaxRunDurationTimeout(deadline time.Time) { + now := time.Now() + + if ar.isShuttingDown() { + return + } + + if now.Before(deadline) { + return + } + + ar.stateLock.Lock() + ar.state.MaxRunDurationExceeded = true + ar.state.ClientStatus = structs.AllocClientStatusComplete + ar.state.ClientDescription = structs.AllocTimeoutReasonMaxRunDuration + ar.stateLock.Unlock() + + ar.logger.Debug("allocation exceeded max_run_duration, killing tasks", "deadline", deadline) + ar.killTasks() +} + func (ar *allocRunner) destroyImpl() { // Stop any running tasks and persist states in case the client is // shutdown before Destroy finishes. @@ -1255,8 +1302,8 @@ func (ar *allocRunner) Shutdown() { go func() { ar.logger.Trace("shutting down") - // Shutdown tasks gracefully if they were run - wg := sync.WaitGroup{} + // Shutdown task runners + var wg sync.WaitGroup for _, tr := range ar.tasks { wg.Add(1) go func(tr *taskrunner.TaskRunner) { diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go index abc2898b08b..f17d9c03c43 100644 --- a/client/allocrunner/alloc_runner_hooks.go +++ b/client/allocrunner/alloc_runner_hooks.go @@ -111,6 +111,7 @@ func (ar *allocRunner) initRunnerHooks(config *clientconfig.Config) error { ar.runnerHooks = []interfaces.RunnerHook{ newIdentityHook(hookLogger, ar.widmgr), newAllocDirHook(hookLogger, ar.allocDir), + newMaxRunDurationHook(hookLogger, alloc, ar.clientBaseLabels, ar.EnforceMaxRunDurationTimeout), newConsulHook(consulHookConfig{ alloc: ar.alloc, allocdir: ar.allocDir, diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index 794923d4819..01401358998 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -497,6 +497,95 @@ func TestAllocRunner_Lifecycle_Poststop(t *testing.T) { } +func TestAllocRunner_MaxRunDuration_SkipsPoststopTasks(t *testing.T) { + ci.Parallel(t) + + alloc := mock.LifecycleAlloc() + tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] + + alloc.Job.Type = structs.JobTypeBatch + maxRunDuration := 50 * time.Millisecond + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + mainTask := alloc.Job.TaskGroups[0].Tasks[0] + mainTask.Config["run_for"] = "100s" + mainTask.KillTimeout = 10 * time.Millisecond + + poststopTask := alloc.Job.TaskGroups[0].Tasks[1] + poststopTask.Name = "poststop" + poststopTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststop + poststopTask.Config["run_for"] = "10s" + + alloc.Job.TaskGroups[0].Tasks = []*structs.Task{mainTask, poststopTask} + alloc.AllocatedResources.Tasks = map[string]*structs.AllocatedTaskResources{ + mainTask.Name: tr, + poststopTask.Name: tr, + } + + conf, cleanup := testAllocRunnerConfig(t, alloc) + defer cleanup() + + arIface, err := NewAllocRunner(conf) + must.NoError(t, err) + ar := arIface.(*allocRunner) + + go ar.Run() + defer destroy(ar) + + upd := conf.StateUpdater.(*MockStateUpdater) + + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("no updates") + } + + if last.ClientStatus != 
structs.AllocClientStatusRunning { + return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus) + } + + if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateRunning { + return false, fmt.Errorf("expected main task to be running not %s", s) + } + + if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStatePending { + return false, fmt.Errorf("expected poststop task to be pending not %s", s) + } + + return true, nil + }, func(err error) { + t.Fatalf("error waiting for initial state:\n%v", err) + }) + + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("no updates") + } + + if last.ClientStatus != structs.AllocClientStatusComplete { + return false, fmt.Errorf("expected alloc to be complete not %s", last.ClientStatus) + } + + if last.ClientDescription != structs.AllocTimeoutReasonMaxRunDuration { + return false, fmt.Errorf("expected alloc description %q not %q", structs.AllocTimeoutReasonMaxRunDuration, last.ClientDescription) + } + + if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateDead { + return false, fmt.Errorf("expected main task to be dead not %s", s) + } + + if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStatePending { + return false, fmt.Errorf("expected poststop task to remain pending not %s", s) + } + + return true, nil + }, func(err error) { + last := upd.Last() + t.Fatalf("error waiting for max_run_duration state:\n%v\nlast=%#v", err, last) + }) +} + func TestAllocRunner_Lifecycle_Restart(t *testing.T) { ci.Parallel(t) @@ -2676,6 +2765,124 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) { must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc)) } +func TestAllocRunner_MaxRunDuration_StopsExpiredAlloc(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "run_for": "10s", + } + task.KillTimeout = 10 * time.Millisecond + maxRunDuration := 50 * time.Millisecond + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + conf, cleanup := testAllocRunnerConfig(t, alloc) + defer cleanup() + + arIface, err := NewAllocRunner(conf) + must.NoError(t, err) + ar := arIface.(*allocRunner) + + go ar.Run() + defer destroy(ar) + + testutil.WaitForResult(func() (bool, error) { + state := ar.AllocState() + if state == nil { + return false, fmt.Errorf("no alloc state") + } + if state.ClientStatus != structs.AllocClientStatusComplete { + return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete) + } + if state.ClientDescription != structs.AllocTimeoutReasonMaxRunDuration { + return false, fmt.Errorf("got description %q; want %q", state.ClientDescription, structs.AllocTimeoutReasonMaxRunDuration) + } + if !state.MaxRunDurationExceeded { + return false, fmt.Errorf("max run duration was not marked exceeded") + } + return true, nil + }, func(err error) { + state := ar.AllocState() + t.Fatalf("timed out waiting for alloc runner max_run_duration enforcement: %v; state=%#v", err, state) + }) +} + +func TestAllocRunner_MaxRunDuration_UpdateExtendsRunningAlloc(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "run_for": "10s", + } + task.KillTimeout = 10 * time.Millisecond + + initialMaxRunDuration := 75 * time.Millisecond + 
alloc.Job.TaskGroups[0].MaxRunDuration = &initialMaxRunDuration + + conf, cleanup := testAllocRunnerConfig(t, alloc) + defer cleanup() + + arIface, err := NewAllocRunner(conf) + must.NoError(t, err) + ar := arIface.(*allocRunner) + + go ar.Run() + defer destroy(ar) + + testutil.WaitForResult(func() (bool, error) { + state := ar.AllocState() + if state == nil { + return false, fmt.Errorf("no alloc state") + } + if state.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusRunning) + } + return true, nil + }, func(err error) { + state := ar.AllocState() + t.Fatalf("timed out waiting for alloc runner to start: %v; state=%#v", err, state) + }) + + time.Sleep(40 * time.Millisecond) + + updatedAlloc := ar.Alloc().Copy() + updatedAlloc.AllocModifyIndex++ + updatedMaxRunDuration := 200 * time.Millisecond + updatedAlloc.Job.TaskGroups[0].MaxRunDuration = &updatedMaxRunDuration + ar.Update(updatedAlloc) + + time.Sleep(60 * time.Millisecond) + + state := ar.AllocState() + must.NotNil(t, state) + must.False(t, state.MaxRunDurationExceeded) + must.Eq(t, structs.AllocClientStatusRunning, state.ClientStatus) + + testutil.WaitForResult(func() (bool, error) { + state := ar.AllocState() + if state == nil { + return false, fmt.Errorf("no alloc state") + } + if state.ClientStatus != structs.AllocClientStatusComplete { + return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete) + } + if state.ClientDescription != structs.AllocTimeoutReasonMaxRunDuration { + return false, fmt.Errorf("got description %q; want %q", state.ClientDescription, structs.AllocTimeoutReasonMaxRunDuration) + } + if !state.MaxRunDurationExceeded { + return false, fmt.Errorf("max run duration was not marked exceeded") + } + return true, nil + }, func(err error) { + state := ar.AllocState() + t.Fatalf("timed out waiting for alloc runner max_run_duration enforcement after update: %v; state=%#v", err, state) + }) +} + func TestAllocRunner_setHookStatsHandler(t *testing.T) { ci.Parallel(t) diff --git a/client/allocrunner/max_run_duration_hook.go b/client/allocrunner/max_run_duration_hook.go new file mode 100644 index 00000000000..7369e01571e --- /dev/null +++ b/client/allocrunner/max_run_duration_hook.go @@ -0,0 +1,213 @@ +// Copyright IBM Corp. 
2015, 2025 +// SPDX-License-Identifier: BUSL-1.1 + +package allocrunner + +import ( + "sync" + "time" + + "github.com/hashicorp/go-hclog" + metrics "github.com/hashicorp/go-metrics/compat" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" + "github.com/hashicorp/nomad/client/taskenv" + "github.com/hashicorp/nomad/nomad/structs" +) + +var ( + _ interfaces.RunnerPrerunHook = (*maxRunDurationHook)(nil) + _ interfaces.RunnerPostrunHook = (*maxRunDurationHook)(nil) + _ interfaces.RunnerUpdateHook = (*maxRunDurationHook)(nil) + _ interfaces.ShutdownHook = (*maxRunDurationHook)(nil) +) + +type maxRunDurationHook struct { + mu sync.Mutex + + alloc *structs.Allocation + + timer *time.Timer + deadline time.Time + maxRunDuration time.Duration + hasMaxRunDuration bool + + onTimeout func(time.Time) + logger hclog.Logger + baseLabels []metrics.Label +} + +func newMaxRunDurationHook( + logger hclog.Logger, + alloc *structs.Allocation, + baseLabels []metrics.Label, + onTimeout func(time.Time), +) interfaces.RunnerHook { + return &maxRunDurationHook{ + alloc: alloc, + onTimeout: onTimeout, + logger: logger.Named("max_run_duration"), + baseLabels: baseLabels, + } +} + +func (h *maxRunDurationHook) Name() string { + return "max_run_duration" +} + +func (h *maxRunDurationHook) Prerun(*taskenv.TaskEnv) error { + h.mu.Lock() + defer h.mu.Unlock() + + h.resetTimer() + return nil +} + +func (h *maxRunDurationHook) Update(req *interfaces.RunnerUpdateRequest) error { + h.mu.Lock() + defer h.mu.Unlock() + + h.alloc = req.Alloc + h.resetTimer() + return nil +} + +func (h *maxRunDurationHook) Postrun() error { + h.mu.Lock() + defer h.mu.Unlock() + + h.stopTimer() + return nil +} + +func (h *maxRunDurationHook) Shutdown() { + h.mu.Lock() + defer h.mu.Unlock() + + h.stopTimer() +} + +func (h *maxRunDurationHook) resetTimer() { + deadline, maxRunDuration, ok := h.currentDeadline() + if !ok { + h.stopTimer() + h.deadline = time.Time{} + h.maxRunDuration = 0 + h.hasMaxRunDuration = false + return + } + + if h.hasMaxRunDuration && h.maxRunDuration == maxRunDuration && h.deadline.Equal(deadline) { + return + } + + prevMaxRunDuration := h.maxRunDuration + prevDeadline := h.deadline + hadMaxRunDuration := h.hasMaxRunDuration + + h.stopTimer() + + h.maxRunDuration = maxRunDuration + h.hasMaxRunDuration = true + h.deadline = deadline + h.emitMetrics(maxRunDuration, deadline) + + remaining := time.Until(deadline) + + if hadMaxRunDuration { + h.logger.Debug("updated max_run_duration", + "task_group", h.alloc.TaskGroup, + "old_configured", prevMaxRunDuration, + "new_configured", maxRunDuration, + "old_deadline", prevDeadline, + "new_deadline", deadline, + "remaining", remaining, + ) + } + + if remaining <= 0 { + h.logger.Debug("allocation exceeded max_run_duration, enforcing immediately", + "task_group", h.alloc.TaskGroup, + "configured", maxRunDuration, + "remaining", remaining, + "deadline", deadline, + ) + go h.onTimeout(deadline) + return + } + + timer := time.NewTimer(remaining) + h.timer = timer + + h.logger.Trace("armed max_run_duration timer", + "task_group", h.alloc.TaskGroup, + "configured", maxRunDuration, + "remaining", remaining, + "deadline", deadline, + ) + + go func(t *time.Timer, deadline time.Time) { + <-t.C + + h.mu.Lock() + if h.timer != t { + h.mu.Unlock() + return + } + h.timer = nil + h.mu.Unlock() + + h.onTimeout(deadline) + }(timer, deadline) +} + +func (h *maxRunDurationHook) stopTimer() { + if h.timer == nil { + return + } + + if !h.timer.Stop() { + select { + case <-h.timer.C: + default: + } + 
} + + h.timer = nil +} + +func (h *maxRunDurationHook) emitMetrics(maxRunDuration time.Duration, deadline time.Time) { + labels := h.baseLabels + labels = append(labels, metrics.Label{Name: "task_group", Value: h.alloc.TaskGroup}) + + metrics.SetGaugeWithLabels( + []string{"client", "allocs", "max_run_duration", "configured_seconds"}, + float32(maxRunDuration.Seconds()), + labels, + ) + metrics.SetGaugeWithLabels( + []string{"client", "allocs", "max_run_duration", "remaining_seconds"}, + float32(time.Until(deadline).Seconds()), + labels, + ) +} + +func (h *maxRunDurationHook) currentDeadline() (time.Time, time.Duration, bool) { + if h.alloc.TerminalStatus() { + return time.Time{}, 0, false + } + + if h.alloc.DesiredStatus != "" && h.alloc.DesiredStatus != structs.AllocDesiredStatusRun { + return time.Time{}, 0, false + } + + maxRunDuration, ok := h.alloc.MaxRunDuration() + if !ok { + return time.Time{}, 0, false + } + + if deadline, ok := h.alloc.MaxRunDurationDeadline(); ok { + return deadline, maxRunDuration, true + } + + return time.Now().Add(maxRunDuration), maxRunDuration, true +} diff --git a/client/allocrunner/max_run_duration_hook_test.go b/client/allocrunner/max_run_duration_hook_test.go new file mode 100644 index 00000000000..e24a2489179 --- /dev/null +++ b/client/allocrunner/max_run_duration_hook_test.go @@ -0,0 +1,315 @@ +// Copyright IBM Corp. 2015, 2025 +// SPDX-License-Identifier: BUSL-1.1 + +package allocrunner + +import ( + "strings" + "testing" + "time" + + log "github.com/hashicorp/go-hclog" + metrics "github.com/hashicorp/go-metrics/compat" + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" + "github.com/hashicorp/nomad/client/taskenv" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" +) + +func newTestMaxRunDurationHookCallback() (func(time.Time), chan time.Time) { + deadlines := make(chan time.Time, 8) + return func(deadline time.Time) { + deadlines <- deadline + }, deadlines +} + +func newTestMaxRunDurationHook( + alloc *structs.Allocation, + baseLabels []metrics.Label, + onTimeout func(time.Time), +) *maxRunDurationHook { + hook := newMaxRunDurationHook(log.NewNullLogger(), alloc, baseLabels, onTimeout) + + h, ok := hook.(*maxRunDurationHook) + if !ok { + panic("newMaxRunDurationHook returned unexpected hook type") + } + + return h +} + +func TestMaxRunDurationHook_Prerun_ArmsTimerImmediately(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + maxRunDuration := 50 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + select { + case deadline := <-deadlines: + must.False(t, deadline.IsZero()) + case <-time.After(500 * time.Millisecond): + t.Fatal("timed out waiting for max_run_duration deadline") + } +} + +func TestMaxRunDurationHook_Update_DoesNotExtendDeadlineOnUnrelatedAllocChange(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + maxRunDuration := 50 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + 
updated := alloc.Copy() + updated.ClientDescription = "unrelated alloc update" + + time.Sleep(20 * time.Millisecond) + + err = hook.Update(&interfaces.RunnerUpdateRequest{Alloc: updated}) + must.NoError(t, err) + + select { + case <-deadlines: + case <-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for original max_run_duration deadline after unrelated update") + } +} + +func TestMaxRunDurationHook_Update_RearmsOnDurationChange(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + initial := 200 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &initial + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + updated := alloc.Copy() + latest := 40 * time.Millisecond + updated.Job.TaskGroups[0].MaxRunDuration = &latest + + err = hook.Update(&interfaces.RunnerUpdateRequest{Alloc: updated}) + must.NoError(t, err) + + select { + case <-deadlines: + case <-time.After(500 * time.Millisecond): + t.Fatal("timed out waiting for updated max_run_duration deadline") + } +} + +func TestMaxRunDurationHook_DoesNotFireWhenAllocNotEligible(t *testing.T) { + ci.Parallel(t) + + cases := []struct { + name string + alloc *structs.Allocation + }{ + { + name: "non-batch job", + alloc: func() *structs.Allocation { + alloc := mock.BatchAlloc() + maxRunDuration := 25 * time.Millisecond + alloc.Job.Type = structs.JobTypeService + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + return alloc + }(), + }, + { + name: "desired status not run", + alloc: func() *structs.Allocation { + alloc := mock.BatchAlloc() + maxRunDuration := 25 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + alloc.DesiredStatus = structs.AllocDesiredStatusStop + return alloc + }(), + }, + { + name: "terminal alloc", + alloc: func() *structs.Allocation { + alloc := mock.BatchAlloc() + maxRunDuration := 25 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + alloc.ClientStatus = structs.AllocClientStatusComplete + return alloc + }(), + }, + { + name: "no max run duration", + alloc: func() *structs.Allocation { + alloc := mock.BatchAlloc() + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = nil + return alloc + }(), + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(tc.alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + select { + case deadline := <-deadlines: + t.Fatalf("unexpected deadline fired: %v", deadline) + case <-time.After(100 * time.Millisecond): + } + }) + } +} + +func TestMaxRunDurationHook_Postrun_CancelsTimer(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + maxRunDuration := 150 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + err = hook.Postrun() + must.NoError(t, err) + + select { + case deadline := <-deadlines: + t.Fatalf("unexpected deadline fired after postrun: 
%v", deadline) + case <-time.After(250 * time.Millisecond): + } +} + +func TestMaxRunDurationHook_Shutdown_CancelsTimer(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + maxRunDuration := 150 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + hook.Shutdown() + + select { + case deadline := <-deadlines: + t.Fatalf("unexpected deadline fired after shutdown: %v", deadline) + case <-time.After(250 * time.Millisecond): + } +} + +func TestMaxRunDurationHook_Prerun_ArmsTimerBeforeTasksAreFullyRunning(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + maxRunDuration := 50 * time.Millisecond + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + // Simulate the initial prerun state where tasks have not yet started and + // therefore no StartedAt timestamps are available to compute a fully-running + // deadline from task state. + for _, ts := range alloc.TaskStates { + ts.State = structs.TaskStatePending + ts.StartedAt = time.Time{} + } + + onTimeout, deadlines := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, nil, onTimeout) + + err := hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + select { + case deadline := <-deadlines: + must.False(t, deadline.IsZero()) + case <-time.After(500 * time.Millisecond): + t.Fatal("timed out waiting for max_run_duration deadline before tasks were fully running") + } +} + +func TestMaxRunDurationHook_EmitMetrics(t *testing.T) { + ci.Parallel(t) + + inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) + _, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) + must.NoError(t, err) + + alloc := mock.BatchAlloc() + maxRunDuration := 2 * time.Minute + alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration + + baseLabels := []metrics.Label{ + {Name: "node_id", Value: "node-123"}, + } + + onTimeout, _ := newTestMaxRunDurationHookCallback() + hook := newTestMaxRunDurationHook(alloc, baseLabels, onTimeout) + + err = hook.Prerun((*taskenv.TaskEnv)(nil)) + must.NoError(t, err) + + data := inMemorySink.Data() + must.Len(t, 1, data) + + configuredSuffix := "client.allocs.max_run_duration.configured_seconds;node_id=node-123;task_group=" + alloc.TaskGroup + remainingSuffix := "client.allocs.max_run_duration.remaining_seconds;node_id=node-123;task_group=" + alloc.TaskGroup + + var configuredFound bool + var remainingFound bool + + for key, gauge := range data[0].Gauges { + if strings.HasSuffix(key, configuredSuffix) { + must.Eq(t, float32(maxRunDuration.Seconds()), gauge.Value) + configuredFound = true + } + + if strings.HasSuffix(key, remainingSuffix) { + must.Positive(t, gauge.Value) + must.True(t, gauge.Value <= float32(maxRunDuration.Seconds())) + remainingFound = true + } + } + + must.True(t, configuredFound) + must.True(t, remainingFound) +} diff --git a/client/allocrunner/state/state.go b/client/allocrunner/state/state.go index 54954711907..dd1f2327bf9 100644 --- a/client/allocrunner/state/state.go +++ b/client/allocrunner/state/state.go @@ -19,6 +19,11 @@ type State struct { // allocations client state ClientDescription string + // MaxRunDurationExceeded indicates the allocation exceeded its configured 
+ // max_run_duration and should be reported as complete with a timeout reason + // regardless of task exit status. + MaxRunDurationExceeded bool + // DeploymentStatus captures the status of the deployment DeploymentStatus *structs.AllocDeploymentStatus @@ -60,11 +65,12 @@ func (s *State) Copy() *State { taskStates[k] = v.Copy() } return &State{ - ClientStatus: s.ClientStatus, - ClientDescription: s.ClientDescription, - DeploymentStatus: s.DeploymentStatus.Copy(), - TaskStates: taskStates, - NetworkStatus: s.NetworkStatus.Copy(), + ClientStatus: s.ClientStatus, + ClientDescription: s.ClientDescription, + MaxRunDurationExceeded: s.MaxRunDurationExceeded, + DeploymentStatus: s.DeploymentStatus.Copy(), + TaskStates: taskStates, + NetworkStatus: s.NetworkStatus.Copy(), } } diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index d4baeb9f360..99232856ba8 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -1253,6 +1253,10 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta tg.ShutdownDelay = taskGroup.ShutdownDelay } + if taskGroup.MaxRunDuration != nil { + tg.MaxRunDuration = taskGroup.MaxRunDuration + } + if taskGroup.ReschedulePolicy != nil { tg.ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: *taskGroup.ReschedulePolicy.Attempts, diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index 740126988c1..0abef0b8f70 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -2834,6 +2834,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { ProgressDeadline: pointer.Of(5 * time.Minute), AutoRevert: pointer.Of(true), }, + MaxRunDuration: pointer.Of(10 * time.Second), Meta: map[string]string{ "key": "value", }, @@ -3275,6 +3276,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { AutoPromote: false, Canary: 1, }, + MaxRunDuration: pointer.Of(10 * time.Second), Meta: map[string]string{ "key": "value", }, diff --git a/nomad/state/events.go b/nomad/state/events.go index 07ea50c3acf..396d4d42fd0 100644 --- a/nomad/state/events.go +++ b/nomad/state/events.go @@ -357,12 +357,19 @@ func eventFromChange(change memdb.Change) (structs.Event, bool) { // remove job info to help keep size of alloc event down alloc.Job = nil + allocEvent := &structs.AllocationEvent{Allocation: alloc} + if alloc.ClientStatus == structs.AllocClientStatusComplete && + alloc.ClientDescription == structs.AllocTimeoutReasonMaxRunDuration { + allocEvent.Timeout = true + allocEvent.TimeoutReason = alloc.ClientDescription + } + return structs.Event{ Topic: structs.TopicAllocation, Key: after.ID, FilterKeys: filterKeys, Namespace: after.Namespace, - Payload: &structs.AllocationEvent{Allocation: alloc}, + Payload: allocEvent, }, true case "jobs": after, ok := change.After.(*structs.Job) diff --git a/nomad/state/events_test.go b/nomad/state/events_test.go index 8cbab8f26ea..4f885e6db20 100644 --- a/nomad/state/events_test.go +++ b/nomad/state/events_test.go @@ -538,6 +538,54 @@ func TestEventsFromChanges_ApplyPlanResultsRequestType(t *testing.T) { must.Len(t, 1, evals) must.Len(t, 1, jobs) must.Len(t, 1, deploys) + + for _, e := range allocs { + allocEvent := e.Payload.(*structs.AllocationEvent) + must.False(t, allocEvent.Timeout) + must.Eq(t, "", allocEvent.TimeoutReason) + } +} + +func TestEventFromChange_AllocationTimeoutFields(t *testing.T) { + ci.Parallel(t) + s := TestStateStoreCfg(t, TestStateStorePublisher(t)) + defer s.StopEventBroker() + + timeoutAlloc := mock.Alloc() + 
timeoutAlloc.ClientStatus = structs.AllocClientStatusComplete + timeoutAlloc.ClientDescription = structs.AllocTimeoutReasonMaxRunDuration + + timeoutEvent, ok := eventFromChange(memdb.Change{ + Table: "allocs", + Before: nil, + After: timeoutAlloc, + }) + must.True(t, ok) + + timeoutPayload, ok := timeoutEvent.Payload.(*structs.AllocationEvent) + must.True(t, ok) + must.True(t, timeoutPayload.Timeout) + must.Eq(t, structs.AllocTimeoutReasonMaxRunDuration, timeoutPayload.TimeoutReason) + must.Eq(t, structs.AllocClientStatusComplete, timeoutPayload.Allocation.ClientStatus) + must.Eq(t, structs.AllocTimeoutReasonMaxRunDuration, timeoutPayload.Allocation.ClientDescription) + + nonTimeoutAlloc := mock.Alloc() + nonTimeoutAlloc.ClientStatus = structs.AllocClientStatusFailed + nonTimeoutAlloc.ClientDescription = structs.AllocTimeoutReasonMaxRunDuration + + nonTimeoutEvent, ok := eventFromChange(memdb.Change{ + Table: "allocs", + Before: nil, + After: nonTimeoutAlloc, + }) + must.True(t, ok) + + nonTimeoutPayload, ok := nonTimeoutEvent.Payload.(*structs.AllocationEvent) + must.True(t, ok) + must.False(t, nonTimeoutPayload.Timeout) + must.Eq(t, "", nonTimeoutPayload.TimeoutReason) + must.Eq(t, structs.AllocClientStatusFailed, nonTimeoutPayload.Allocation.ClientStatus) + must.Eq(t, structs.AllocTimeoutReasonMaxRunDuration, nonTimeoutPayload.Allocation.ClientDescription) } func TestEventsFromChanges_BatchNodeUpdateDrainRequestType(t *testing.T) { diff --git a/nomad/structs/alloc.go b/nomad/structs/alloc.go index fbc7777e776..e662b634175 100644 --- a/nomad/structs/alloc.go +++ b/nomad/structs/alloc.go @@ -20,6 +20,10 @@ const ( AllocDesiredStatusRun = "run" // Allocation should run AllocDesiredStatusStop = "stop" // Allocation should stop AllocDesiredStatusEvict = "evict" // Allocation should stop, and was evicted + + // AllocTimeoutReasonMaxRunDuration is the reason used when an allocation is + // stopped because it exceeded its configured max_run_duration. + AllocTimeoutReasonMaxRunDuration = "allocation exceeded max_run_duration" ) const ( @@ -357,6 +361,95 @@ func (a *Allocation) TerminalStatus() bool { return a.ServerTerminalStatus() || a.ClientTerminalStatus() } +// MaxRunDuration returns the configured max_run_duration for the allocation's +// task group, if any. +func (a *Allocation) MaxRunDuration() (time.Duration, bool) { + if a == nil || a.Job == nil { + return 0, false + } + + tg := a.Job.LookupTaskGroup(a.TaskGroup) + if tg == nil || tg.MaxRunDuration == nil || *tg.MaxRunDuration <= 0 { + return 0, false + } + + switch a.Job.Type { + case JobTypeBatch, JobTypeSysBatch: + return *tg.MaxRunDuration, true + default: + return 0, false + } +} + +// FullyRunningSince returns the latest StartedAt timestamp across all task +// states, but only when every known task state is running with a non-zero start +// time. 
+func FullyRunningSince(taskStates map[string]*TaskState) (time.Time, bool) { + if len(taskStates) == 0 { + return time.Time{}, false + } + + var latest time.Time + + for _, ts := range taskStates { + if ts == nil || ts.State != TaskStateRunning || ts.StartedAt.IsZero() { + return time.Time{}, false + } + if ts.StartedAt.After(latest) { + latest = ts.StartedAt + } + } + + if latest.IsZero() { + return time.Time{}, false + } + + return latest, true +} + +func (a *Allocation) fullyRunningSince() (time.Time, bool) { + if a == nil { + return time.Time{}, false + } + + return FullyRunningSince(a.TaskStates) +} + +// MaxRunDurationDeadline returns the deadline at which the allocation should be +// considered timed out based on max_run_duration. +func (a *Allocation) MaxRunDurationDeadline() (time.Time, bool) { + maxRunDuration, ok := a.MaxRunDuration() + if !ok { + return time.Time{}, false + } + + startedAt, ok := a.fullyRunningSince() + if !ok { + return time.Time{}, false + } + + return startedAt.Add(maxRunDuration), true +} + +// MaxRunDurationExpired returns whether the allocation has exceeded its +// configured max_run_duration. +func (a *Allocation) MaxRunDurationExpired(now time.Time) bool { + if a == nil || a.DesiredStatus != AllocDesiredStatusRun || a.ClientStatus != AllocClientStatusRunning { + return false + } + + if a.ClientTerminalStatus() || a.ServerTerminalStatus() { + return false + } + + deadline, ok := a.MaxRunDurationDeadline() + if !ok { + return false + } + + return !deadline.After(now) +} + // ServerTerminalStatus returns true if the desired state of the allocation is terminal func (a *Allocation) ServerTerminalStatus() bool { switch a.DesiredStatus { diff --git a/nomad/structs/event.go b/nomad/structs/event.go index 2affe4da618..e1aaecc6118 100644 --- a/nomad/structs/event.go +++ b/nomad/structs/event.go @@ -141,6 +141,14 @@ type EvaluationEvent struct { // Allocs embedded Job has been removed to reduce size. type AllocationEvent struct { Allocation *Allocation + + // Timeout indicates the allocation was stopped because it exceeded its + // configured max_run_duration. + Timeout bool `json:",omitempty"` + + // TimeoutReason is a human-readable explanation for timeout-triggered + // allocation stops. + TimeoutReason string `json:",omitempty"` } // DeploymentEvent holds a newly updated Deployment. diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index df142bab0dc..76576bec5b8 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6867,6 +6867,11 @@ type TaskGroup struct { // group services in consul and stopping tasks. ShutdownDelay *time.Duration + // MaxRunDuration is the maximum amount of time a batch or sysbatch task + // group allocation may run after entering the running state before Nomad + // stops it. 
+ MaxRunDuration *time.Duration + // StopAfterClientDisconnect, if set, configures the client to stop the task group // after this duration since the last known good heartbeat // To be deprecated after 1.8.0 infavor of Disconnect.StopOnClientAfter @@ -6935,6 +6940,10 @@ func (tg *TaskGroup) Copy() *TaskGroup { ntg.ShutdownDelay = tg.ShutdownDelay } + if tg.MaxRunDuration != nil { + ntg.MaxRunDuration = tg.MaxRunDuration + } + return ntg } @@ -7054,6 +7063,18 @@ func (tg *TaskGroup) Validate(j *Job) error { } } + if tg.MaxRunDuration != nil { + if *tg.MaxRunDuration <= 0 { + mErr = multierror.Append(mErr, errors.New("MaxRunDuration must be greater than zero")) + } + + switch j.Type { + case JobTypeBatch, JobTypeSysBatch: + default: + mErr = multierror.Append(mErr, fmt.Errorf("Job type %q does not allow max_run_duration", j.Type)) + } + } + for idx, constr := range tg.Constraints { if err := constr.Validate(); err != nil { outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err) diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 6f77ff76b76..e49f6d19d99 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -1543,6 +1543,81 @@ func TestTaskGroup_Validate(t *testing.T) { }, jobType: JobTypeSystem, }, + { + name: "invalid max run duration for service job", + tg: &TaskGroup{ + Name: "web", + Count: 1, + MaxRunDuration: pointer.Of(5 * time.Minute), + Tasks: []*Task{ + {Name: "web"}, + }, + }, + expErr: []string{ + `Job type "service" does not allow max_run_duration`, + }, + jobType: JobTypeService, + }, + { + name: "invalid non-positive max run duration for batch job", + tg: &TaskGroup{ + Name: "web", + Count: 1, + MaxRunDuration: pointer.Of(time.Duration(0)), + Tasks: []*Task{ + {Name: "web"}, + }, + }, + expErr: []string{ + "MaxRunDuration must be greater than zero", + }, + jobType: JobTypeBatch, + }, + { + name: "valid max run duration for batch job", + tg: &TaskGroup{ + Name: "web", + Count: 1, + MaxRunDuration: pointer.Of(5 * time.Minute), + Tasks: []*Task{ + { + Name: "web", + Driver: "mock_driver", + Resources: DefaultResources(), + LogConfig: DefaultLogConfig(), + }, + }, + RestartPolicy: NewRestartPolicy(JobTypeBatch), + ReschedulePolicy: NewReschedulePolicy(JobTypeBatch), + EphemeralDisk: DefaultEphemeralDisk(), + }, + jobType: JobTypeBatch, + }, + { + name: "valid max run duration for sysbatch job", + tg: &TaskGroup{ + Name: "web", + Count: 1, + MaxRunDuration: pointer.Of(5 * time.Minute), + Tasks: []*Task{ + { + Name: "web", + Driver: "mock_driver", + Resources: DefaultResources(), + LogConfig: DefaultLogConfig(), + }, + }, + RestartPolicy: &RestartPolicy{ + Attempts: 0, + Interval: 5 * time.Second, + Delay: 5 * time.Second, + Mode: RestartPolicyModeFail, + RenderTemplates: false, + }, + EphemeralDisk: DefaultEphemeralDisk(), + }, + jobType: JobTypeSysBatch, + }, { name: "duplicated por label", tg: &TaskGroup{ diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index ff008e91258..5c2351b6aff 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -3380,6 +3380,93 @@ func TestServiceSched_JobModify_InPlace08(t *testing.T) { must.NotNil(t, newAlloc.AllocatedResources) } +func TestServiceSched_JobModify_MaxRunDuration_InPlace(t *testing.T) { + ci.Parallel(t) + + h := tests.NewHarness(t) + + // Create a node + node := mock.Node() + must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Create a batch job with a running 
allocation + job := mock.BatchJob() + job.TaskGroups[0].Count = 1 + initialMaxRunDuration := 1 * time.Hour + job.TaskGroups[0].MaxRunDuration = &initialMaxRunDuration + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + alloc := mock.BatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = fmt.Sprintf("%s.%s[%d]", job.ID, job.TaskGroups[0].Name, 0) + must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc})) + + // Update only max_run_duration + job2 := job.Copy() + updatedMaxRunDuration := 4 * time.Hour + job2.TaskGroups[0].MaxRunDuration = &updatedMaxRunDuration + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job2)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + AnnotatePlan: true, + } + must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewBatchScheduler, eval) + must.NoError(t, err) + + // Ensure a single plan + must.SliceLen(t, 1, h.Plans) + plan := h.Plans[0] + + // Ensure the plan did not evict any allocs + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + must.SliceLen(t, 0, update) + + // Ensure the plan updated the existing alloc in place + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...) + } + must.SliceLen(t, 1, planned) + must.Eq(t, job2, planned[0].Job) + must.NotNil(t, plan.Annotations) + must.NotNil(t, plan.Annotations.DesiredTGUpdates) + must.NotNil(t, plan.Annotations.DesiredTGUpdates[job.TaskGroups[0].Name]) + must.Eq(t, uint64(1), plan.Annotations.DesiredTGUpdates[job.TaskGroups[0].Name].InPlaceUpdate) + must.Eq(t, uint64(0), plan.Annotations.DesiredTGUpdates[job.TaskGroups[0].Name].DestructiveUpdate) + must.Eq(t, uint64(0), plan.Annotations.DesiredTGUpdates[job.TaskGroups[0].Name].Place) + must.Eq(t, uint64(0), plan.Annotations.DesiredTGUpdates[job.TaskGroups[0].Name].Stop) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + must.NoError(t, err) + must.Len(t, 1, out) + + updatedAlloc := out[0] + maxRunDuration, ok := updatedAlloc.MaxRunDuration() + must.True(t, ok) + must.Eq(t, updatedMaxRunDuration, maxRunDuration) + must.Eq(t, alloc.ID, planned[0].ID) + must.Eq(t, updatedAlloc.ID, planned[0].ID) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + func TestServiceSched_JobModify_DistinctProperty(t *testing.T) { ci.Parallel(t) diff --git a/scheduler/util.go b/scheduler/util.go index 0af7dd72de9..b8b972ede73 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -844,6 +844,22 @@ func genericAllocUpdateFn(ctx feasible.Context, stack feasible.Stack, evalID str return false, true, nil } + // max_run_duration-only updates. This field does not affect placement + // or allocated resources, so we can update the alloc in place without + // re-running feasibility. 
+	if existingTG := existing.Job.LookupTaskGroup(newTG.Name); existingTG != nil {
+		// Compare the task group configs directly: a copied alloc whose Job
+		// has been cleared would always report no max_run_duration, so the
+		// new value must be read from newTG rather than from the copy.
+		oldMax, newMax := existingTG.MaxRunDuration, newTG.MaxRunDuration
+		changed := (oldMax == nil) != (newMax == nil) ||
+			(oldMax != nil && newMax != nil && *oldMax != *newMax)
+
+		if changed {
+			newAlloc := existing.Copy()
+			newAlloc.EvalID = evalID
+			newAlloc.Job = nil       // use the Job in the plan
+			newAlloc.Resources = nil // computed in plan apply
+			return false, false, newAlloc
+		}
+	}
+
 	// Set the existing node as the base set
 	stack.SetNodes([]*structs.Node{node})
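
Usage sketch (not part of the diff): the new field is exposed at the task group level, so per the hcl:"max_run_duration,optional" tag above it would appear in a jobspec as max_run_duration = "2h" on the group, and Validate restricts it to batch and sysbatch jobs. Below is a minimal, hypothetical Go example registering such a job through the api package; the job, group, and task names are invented.

package main

import (
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Hypothetical batch job that opts in to the new task group field.
	job := api.NewBatchJob("nightly-report", "nightly-report", "global", 50)

	tg := api.NewTaskGroup("report", 1)
	tg.AddTask(api.NewTask("generate", "docker"))

	// Stop the allocation once it has been fully running for two hours.
	maxRun := 2 * time.Hour
	tg.MaxRunDuration = &maxRun

	job.AddTaskGroup(tg)

	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		panic(err)
	}
	if _, _, err := client.Jobs().Register(job, nil); err != nil {
		panic(err)
	}
}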
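
The enforcement deadline is anchored at the moment the whole group is running: FullyRunningSince takes the latest StartedAt across task states and refuses to produce a time until every task is running with a non-zero start. A small sketch of that arithmetic, with invented timestamps:

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	t0 := time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC)
	states := map[string]*structs.TaskState{
		"main":    {State: structs.TaskStateRunning, StartedAt: t0},
		"sidecar": {State: structs.TaskStateRunning, StartedAt: t0.Add(3 * time.Second)},
	}

	// The latest StartedAt wins: the group counts as fully running at t0+3s.
	since, ok := structs.FullyRunningSince(states)
	fmt.Println(since, ok) // 2025-01-01 12:00:03 +0000 UTC true

	// With max_run_duration = "30m" the enforcement deadline is t0+3s+30m.
	fmt.Println(since.Add(30 * time.Minute))

	// A task that has not started yet removes the deadline entirely; the hook
	// then falls back to time.Now().Add(maxRunDuration) in currentDeadline.
	states["init"] = &structs.TaskState{State: structs.TaskStatePending}
	_, ok = structs.FullyRunningSince(states)
	fmt.Println(ok) // false
}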
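
Event-stream consumers see the timeout surfaced through the new AllocationEvent fields. A hedged consumer sketch using the api events client follows; it reads the raw payload map (the api package exposes Event.Payload as map[string]interface{}), and since Timeout is tagged omitempty the key is simply absent for non-timeout stops.

package main

import (
	"context"
	"fmt"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		panic(err)
	}

	topics := map[api.Topic][]string{api.TopicAllocation: {"*"}}
	eventCh, err := client.EventStream().Stream(context.Background(), topics, 0, nil)
	if err != nil {
		panic(err)
	}

	for events := range eventCh {
		for _, ev := range events.Events {
			// The new fields ride alongside "Allocation" in the payload; the
			// type assertion yields false when the key is missing.
			if timedOut, _ := ev.Payload["Timeout"].(bool); timedOut {
				fmt.Printf("alloc %s timed out: %v\n", ev.Key, ev.Payload["TimeoutReason"])
			}
		}
	}
}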