37 commits
605395d
core: support batch job timeouts via new field max_run_duration
pkazmierczak Apr 7, 2026
2388417
refine batch timeout watcher
pkazmierczak Apr 7, 2026
617fe2b
client status
pkazmierczak Apr 7, 2026
1f4c4db
integration-style test
pkazmierczak Apr 7, 2026
180aea3
event stream
pkazmierczak Apr 7, 2026
5fff828
AllocTimeoutReasonMaxRunDuration
pkazmierczak Apr 7, 2026
8fbfff3
batch timeout watcher improvements
pkazmierczak Apr 7, 2026
697c5c3
added batchtimeout package to test-core.json
pkazmierczak Apr 7, 2026
a31525a
move to allocrunner
pkazmierczak Apr 8, 2026
2a7fc3b
remove nomad/batchtimeout
pkazmierczak Apr 8, 2026
e9d30e7
timer tweaks
pkazmierczak Apr 8, 2026
d560d47
max_run_duration_hook
pkazmierczak Apr 8, 2026
737f23d
TestJobs_ApiJobToStructsJob
pkazmierczak Apr 8, 2026
48a9652
don't need that interface
pkazmierczak Apr 8, 2026
eda5a4f
it's actually not a hook
pkazmierczak Apr 9, 2026
ab7565d
refinements
pkazmierczak Apr 9, 2026
c44ba70
corrections
pkazmierczak Apr 9, 2026
9abba4f
test refactor
pkazmierczak Apr 9, 2026
0aecff5
tidying up
pkazmierczak Apr 9, 2026
f454526
more tidying up
pkazmierczak Apr 9, 2026
9b735a0
bugfix
pkazmierczak Apr 9, 2026
7e0509a
unify fully running since
pkazmierczak Apr 13, 2026
ea2f1ad
hook alloc can never be nil
pkazmierczak Apr 13, 2026
f996731
event stream corrections
pkazmierczak Apr 13, 2026
a521239
desired status comment
pkazmierczak Apr 14, 2026
2f420ac
allocrunner unnecessary copy fix
pkazmierczak Apr 14, 2026
ae2d08a
fix UTC and interface
pkazmierczak Apr 14, 2026
bc6344c
max_run_duration cleanups (thanks for the comments @mismithhisler)
pkazmierczak Apr 14, 2026
f246b59
client: enforce max_run_duration regardless of task state (#27827)
pkazmierczak Apr 17, 2026
3b6f4df
allow for updates to max_run_duration during job run
pkazmierczak Apr 17, 2026
53f6210
metrics
pkazmierczak Apr 17, 2026
7841eda
better logs
pkazmierczak Apr 17, 2026
ce45ae8
fix clock arming bug
pkazmierczak Apr 17, 2026
6380561
don't start poststop tasks if the timeout has passed
pkazmierczak Apr 17, 2026
879219a
TestMaxRunDurationHook_EmitMetrics correction
pkazmierczak Apr 17, 2026
6a6e6a6
TestServiceSched_JobModify_MaxRunDuration_InPlace fix
pkazmierczak Apr 17, 2026
e34dbc9
yet another fix to TestMaxRunDurationHook_EmitMetrics
pkazmierczak Apr 17, 2026
1 change: 1 addition & 0 deletions api/tasks.go
Expand Up @@ -507,6 +507,7 @@ type TaskGroup struct {
Meta map[string]string `hcl:"meta,block"`
Services []*Service `hcl:"service,block"`
ShutdownDelay *time.Duration `mapstructure:"shutdown_delay" hcl:"shutdown_delay,optional"`
MaxRunDuration *time.Duration `mapstructure:"max_run_duration" hcl:"max_run_duration,optional"`
Member

I wonder if we'd regret naming this ttl? It is literally the amount of time we allow the allocation to live, but it might be too ambiguous a name for the group block. You could interpret group.ttl as including time sitting in the eval queue as well, whereas the run aspect of this name makes the intent more obvious.

So I think this is fine, but since jobspec fields are our core user interface, I think they're worth careful consideration. 🤔

Contributor Author

Yeah, I don't know about this. ttl has a loaded meaning in my opinion. We can think of a better name for this setting, sure, but I don't think it should be ttl. (See the usage sketch after this diff.)

// Deprecated: StopAfterClientDisconnect is deprecated in Nomad 1.8 and ignored in Nomad 1.10. Use Disconnect.StopOnClientAfter.
StopAfterClientDisconnect *time.Duration `mapstructure:"stop_after_client_disconnect" hcl:"stop_after_client_disconnect,optional"`
// Deprecated: MaxClientDisconnect is deprecated in Nomad 1.8.0 and ignored in Nomad 1.10. Use Disconnect.LostAfter.
Expand Down
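To make the naming discussion above concrete, here is a minimal, hypothetical sketch of setting the new group-level field through the `api` package; the group name and the two-hour limit are made up for illustration and are not part of this diff.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Hypothetical values: cap the allocation's total run time at two hours.
	name := "transform"
	maxRun := 2 * time.Hour

	tg := &api.TaskGroup{
		Name:           &name,
		MaxRunDuration: &maxRun, // the field added by this change
	}

	fmt.Printf("group %q is stopped after %s of run time\n", *tg.Name, *tg.MaxRunDuration)
}
```

Read this way, max_run_duration signals a limit on run time rather than on total lifetime, which is the distinction the thread above draws against ttl.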
57 changes: 52 additions & 5 deletions client/allocrunner/alloc_runner.go
Expand Up @@ -737,8 +737,14 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
return nil
}

return structs.NewTaskEvent(structs.TaskKilling).
event := structs.NewTaskEvent(structs.TaskKilling).
SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)

if ar.state.MaxRunDurationExceeded {
event.SetDisplayMessage(structs.AllocTimeoutReasonMaxRunDuration)
}

return event
}

// Kill leader first, synchronously
Expand Down Expand Up @@ -809,6 +815,23 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
}
wg.Wait()

// Skip poststop tasks entirely when max_run_duration has been exceeded so
// they are not started after the allocation has timed out.
if ar.state.MaxRunDurationExceeded {
for name, tr := range ar.tasks {
if !tr.IsPoststopTask() {
continue
}

state := tr.TaskState()
if state != nil {
states[name] = state
}
}

return states
}

// Perform no action on post stop tasks, but retain their states if they exist. This
// commonly happens at the time of alloc GC from the client node.
for name, tr := range ar.tasks {
Expand Down Expand Up @@ -845,7 +868,10 @@ func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *st
}

// Compute the ClientStatus
if ar.state.ClientStatus != "" {
if ar.state.MaxRunDurationExceeded {
a.ClientStatus = structs.AllocClientStatusComplete
a.ClientDescription = structs.AllocTimeoutReasonMaxRunDuration
} else if ar.state.ClientStatus != "" {
// The client status is being forced
a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
} else {
Expand Down Expand Up @@ -983,7 +1009,7 @@ func (ar *allocRunner) AllocState() *state.State {
// If TaskStateUpdated has not been called yet, ar.state.TaskStates
// won't be set as it is not the canonical source of TaskStates.
if len(state.TaskStates) == 0 {
ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
for k, tr := range ar.tasks {
state.TaskStates[k] = tr.TaskState()
}
Expand Down Expand Up @@ -1081,6 +1107,27 @@ func (ar *allocRunner) Listener() *cstructs.AllocListener {
return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) EnforceMaxRunDurationTimeout(deadline time.Time) {
now := time.Now()

if ar.isShuttingDown() {
return
}

if now.Before(deadline) {
return
}

ar.stateLock.Lock()
ar.state.MaxRunDurationExceeded = true
ar.state.ClientStatus = structs.AllocClientStatusComplete
ar.state.ClientDescription = structs.AllocTimeoutReasonMaxRunDuration
ar.stateLock.Unlock()

ar.logger.Debug("allocation exceeded max_run_duration, killing tasks", "deadline", deadline)
ar.killTasks()
}

func (ar *allocRunner) destroyImpl() {
// Stop any running tasks and persist states in case the client is
// shutdown before Destroy finishes.
Expand Down Expand Up @@ -1255,8 +1302,8 @@ func (ar *allocRunner) Shutdown() {
go func() {
ar.logger.Trace("shutting down")

// Shutdown tasks gracefully if they were run
wg := sync.WaitGroup{}
// Shutdown task runners
var wg sync.WaitGroup
for _, tr := range ar.tasks {
wg.Add(1)
go func(tr *taskrunner.TaskRunner) {
Expand Down
1 change: 1 addition & 0 deletions client/allocrunner/alloc_runner_hooks.go
Expand Up @@ -111,6 +111,7 @@ func (ar *allocRunner) initRunnerHooks(config *clientconfig.Config) error {
ar.runnerHooks = []interfaces.RunnerHook{
newIdentityHook(hookLogger, ar.widmgr),
newAllocDirHook(hookLogger, ar.allocDir),
newMaxRunDurationHook(hookLogger, alloc, ar.clientBaseLabels, ar.EnforceMaxRunDurationTimeout),
newConsulHook(consulHookConfig{
alloc: ar.alloc,
allocdir: ar.allocDir,
Expand Down
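The hook file itself is not part of the hunks shown here, so the following is only a rough sketch of the shape such a hook could take, not the PR's implementation: compute a deadline from the group's MaxRunDuration and hand it to the EnforceMaxRunDurationTimeout callback registered above. Every name other than MaxRunDuration and EnforceMaxRunDurationTimeout is illustrative.

```go
// Sketch only: not the hook added by this PR, just one plausible shape for it.
package sketch

import "time"

type maxRunDurationTimer struct {
	deadline time.Time
	enforce  func(deadline time.Time) // e.g. allocRunner.EnforceMaxRunDurationTimeout
	timer    *time.Timer
}

// newMaxRunDurationTimer derives the deadline from the group's
// max_run_duration and keeps the enforcement callback for later use.
func newMaxRunDurationTimer(maxRun time.Duration, enforce func(time.Time)) *maxRunDurationTimer {
	return &maxRunDurationTimer{
		deadline: time.Now().UTC().Add(maxRun),
		enforce:  enforce,
	}
}

// arm schedules the callback for when the deadline passes. The callback in
// the diff above re-checks the deadline itself, so firing slightly early or
// late is harmless.
func (t *maxRunDurationTimer) arm() {
	t.timer = time.AfterFunc(time.Until(t.deadline), func() {
		t.enforce(t.deadline)
	})
}

// stop cancels the pending timer, e.g. when the allocation finishes on its
// own or the deadline is extended by a job update.
func (t *maxRunDurationTimer) stop() {
	if t.timer != nil {
		t.timer.Stop()
	}
}
```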
207 changes: 207 additions & 0 deletions client/allocrunner/alloc_runner_test.go
Expand Up @@ -497,6 +497,95 @@ func TestAllocRunner_Lifecycle_Poststop(t *testing.T) {

}

func TestAllocRunner_MaxRunDuration_SkipsPoststopTasks(t *testing.T) {
ci.Parallel(t)

alloc := mock.LifecycleAlloc()
tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]

alloc.Job.Type = structs.JobTypeBatch
maxRunDuration := 50 * time.Millisecond
alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration

mainTask := alloc.Job.TaskGroups[0].Tasks[0]
mainTask.Config["run_for"] = "100s"
mainTask.KillTimeout = 10 * time.Millisecond

poststopTask := alloc.Job.TaskGroups[0].Tasks[1]
poststopTask.Name = "poststop"
poststopTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststop
poststopTask.Config["run_for"] = "10s"

alloc.Job.TaskGroups[0].Tasks = []*structs.Task{mainTask, poststopTask}
alloc.AllocatedResources.Tasks = map[string]*structs.AllocatedTaskResources{
mainTask.Name: tr,
poststopTask.Name: tr,
}

conf, cleanup := testAllocRunnerConfig(t, alloc)
defer cleanup()

arIface, err := NewAllocRunner(conf)
must.NoError(t, err)
ar := arIface.(*allocRunner)

go ar.Run()
defer destroy(ar)

upd := conf.StateUpdater.(*MockStateUpdater)

testutil.WaitForResult(func() (bool, error) {
last := upd.Last()
if last == nil {
return false, fmt.Errorf("no updates")
}

if last.ClientStatus != structs.AllocClientStatusRunning {
return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus)
}

if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateRunning {
return false, fmt.Errorf("expected main task to be running not %s", s)
}

if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStatePending {
return false, fmt.Errorf("expected poststop task to be pending not %s", s)
}

return true, nil
}, func(err error) {
t.Fatalf("error waiting for initial state:\n%v", err)
})

testutil.WaitForResult(func() (bool, error) {
last := upd.Last()
if last == nil {
return false, fmt.Errorf("no updates")
}

if last.ClientStatus != structs.AllocClientStatusComplete {
return false, fmt.Errorf("expected alloc to be complete not %s", last.ClientStatus)
}

if last.ClientDescription != structs.AllocTimeoutReasonMaxRunDuration {
return false, fmt.Errorf("expected alloc description %q not %q", structs.AllocTimeoutReasonMaxRunDuration, last.ClientDescription)
}

if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateDead {
return false, fmt.Errorf("expected main task to be dead not %s", s)
}

if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStatePending {
return false, fmt.Errorf("expected poststop task to remain pending not %s", s)
}

return true, nil
}, func(err error) {
last := upd.Last()
t.Fatalf("error waiting for max_run_duration state:\n%v\nlast=%#v", err, last)
})
}

func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
ci.Parallel(t)

Expand Down Expand Up @@ -2676,6 +2765,124 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) {
must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc))
}

func TestAllocRunner_MaxRunDuration_StopsExpiredAlloc(t *testing.T) {
ci.Parallel(t)

alloc := mock.BatchAlloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "10s",
}
task.KillTimeout = 10 * time.Millisecond
maxRunDuration := 50 * time.Millisecond
alloc.Job.TaskGroups[0].MaxRunDuration = &maxRunDuration

conf, cleanup := testAllocRunnerConfig(t, alloc)
defer cleanup()

arIface, err := NewAllocRunner(conf)
must.NoError(t, err)
ar := arIface.(*allocRunner)

go ar.Run()
defer destroy(ar)

testutil.WaitForResult(func() (bool, error) {
state := ar.AllocState()
if state == nil {
return false, fmt.Errorf("no alloc state")
}
if state.ClientStatus != structs.AllocClientStatusComplete {
return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
}
if state.ClientDescription != structs.AllocTimeoutReasonMaxRunDuration {
return false, fmt.Errorf("got description %q; want %q", state.ClientDescription, structs.AllocTimeoutReasonMaxRunDuration)
}
if !state.MaxRunDurationExceeded {
return false, fmt.Errorf("max run duration was not marked exceeded")
}
return true, nil
}, func(err error) {
state := ar.AllocState()
t.Fatalf("timed out waiting for alloc runner max_run_duration enforcement: %v; state=%#v", err, state)
})
}

func TestAllocRunner_MaxRunDuration_UpdateExtendsRunningAlloc(t *testing.T) {
ci.Parallel(t)

alloc := mock.BatchAlloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "10s",
}
task.KillTimeout = 10 * time.Millisecond

initialMaxRunDuration := 75 * time.Millisecond
alloc.Job.TaskGroups[0].MaxRunDuration = &initialMaxRunDuration

conf, cleanup := testAllocRunnerConfig(t, alloc)
defer cleanup()

arIface, err := NewAllocRunner(conf)
must.NoError(t, err)
ar := arIface.(*allocRunner)

go ar.Run()
defer destroy(ar)

testutil.WaitForResult(func() (bool, error) {
state := ar.AllocState()
if state == nil {
return false, fmt.Errorf("no alloc state")
}
if state.ClientStatus != structs.AllocClientStatusRunning {
return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusRunning)
}
return true, nil
}, func(err error) {
state := ar.AllocState()
t.Fatalf("timed out waiting for alloc runner to start: %v; state=%#v", err, state)
})

time.Sleep(40 * time.Millisecond)

updatedAlloc := ar.Alloc().Copy()
updatedAlloc.AllocModifyIndex++
updatedMaxRunDuration := 200 * time.Millisecond
updatedAlloc.Job.TaskGroups[0].MaxRunDuration = &updatedMaxRunDuration
ar.Update(updatedAlloc)

time.Sleep(60 * time.Millisecond)

state := ar.AllocState()
must.NotNil(t, state)
must.False(t, state.MaxRunDurationExceeded)
must.Eq(t, structs.AllocClientStatusRunning, state.ClientStatus)

testutil.WaitForResult(func() (bool, error) {
state := ar.AllocState()
if state == nil {
return false, fmt.Errorf("no alloc state")
}
if state.ClientStatus != structs.AllocClientStatusComplete {
return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
}
if state.ClientDescription != structs.AllocTimeoutReasonMaxRunDuration {
return false, fmt.Errorf("got description %q; want %q", state.ClientDescription, structs.AllocTimeoutReasonMaxRunDuration)
}
if !state.MaxRunDurationExceeded {
return false, fmt.Errorf("max run duration was not marked exceeded")
}
return true, nil
}, func(err error) {
state := ar.AllocState()
t.Fatalf("timed out waiting for alloc runner max_run_duration enforcement after update: %v; state=%#v", err, state)
})
}

func TestAllocRunner_setHookStatsHandler(t *testing.T) {
ci.Parallel(t)

Expand Down