Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions internal/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,9 @@ func (d *Daemon) handleDelete(params json.RawMessage) Response {
_ = d.sessMgr.Delete(s.Key())
}
err = d.store.DeleteTeam(p.Name)
// Clear accumulated crash-loop state for this team's roles so a
// subsequent re-apply starts fresh. See ArcavenAE/marvel#29.
d.teamCtrl.ClearRoleHealthForTeam(t.Workspace, t.Name)
case "workspace":
ws, getErr := d.store.GetWorkspace(p.Name)
if getErr != nil {
Expand All @@ -597,6 +600,9 @@ func (d *Daemon) handleDelete(params json.RawMessage) Response {
}
_ = d.sessMgr.CleanupWorkspace(ws.Name)
err = d.store.DeleteWorkspace(p.Name)
// Clear accumulated crash-loop state for every role under every
// team in this workspace. See ArcavenAE/marvel#29.
d.teamCtrl.ClearRoleHealthForWorkspace(ws.Name)
default:
return Response{Error: fmt.Sprintf("unknown resource type: %s", p.ResourceType)}
}
Expand Down
36 changes: 36 additions & 0 deletions internal/team/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"log"
"sort"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -111,6 +112,41 @@ func (c *Controller) RoleHealthSnapshot(workspace, team, role string) (*RoleHeal
}, true
}

// ClearRoleHealthForTeam deletes crash-loop state for every role under
// the given (workspace, team). Called from the cascade delete path in
// daemon.handleDelete so that a subsequent re-apply of the same manifest
// starts with a fresh RestartCount and BackoffUntil — without this,
// accumulated state survives workspace/team delete (the map is keyed
// by name, which the operator is free to reuse) and the reconciler
// refuses spawns until the prior generation's backoff window elapses.
// If the prior generation hit MaxRestarts saturation, BackoffUntil
// would be frozen far in the future and the role would never recover.
// See ArcavenAE/marvel#29.
func (c *Controller) ClearRoleHealthForTeam(workspace, team string) {
c.mu.Lock()
defer c.mu.Unlock()
prefix := workspace + "/" + team + "/"
for k := range c.roleHealth {
if strings.HasPrefix(k, prefix) {
delete(c.roleHealth, k)
}
}
}

// ClearRoleHealthForWorkspace deletes crash-loop state for every role
// under every team in the given workspace. Called from the workspace-
// delete cascade. See ClearRoleHealthForTeam for the rationale.
func (c *Controller) ClearRoleHealthForWorkspace(workspace string) {
c.mu.Lock()
defer c.mu.Unlock()
prefix := workspace + "/"
for k := range c.roleHealth {
if strings.HasPrefix(k, prefix) {
delete(c.roleHealth, k)
}
}
}

// ReconcileOnce runs one reconciliation pass for all teams.
// Reaps dead sessions first so the reconciler sees accurate state.
func (c *Controller) ReconcileOnce() {
Expand Down
95 changes: 95 additions & 0 deletions internal/team/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,101 @@ func TestReapPathSaturatesMaxRestarts(t *testing.T) {

// TestComputeBackoff locks in the exponential curve shape so future
// tweaks are intentional and reviewable.
// TestClearRoleHealthForTeam verifies that the cascade-delete helper
// removes only the entries under (workspace, team), leaves siblings
// untouched, and handles the prefix edge case where one team name is
// a prefix of another (e.g., "b" vs "bb"). See ArcavenAE/marvel#29.
func TestClearRoleHealthForTeam(t *testing.T) {
t.Parallel()
store := api.NewStore()
ctrl := NewController(store, nil)

// Populate state for: ws1/teamA/{w,r}, ws1/teamAA/w, ws1/teamB/w, ws2/teamA/w
keys := []string{
"ws1/teamA/worker",
"ws1/teamA/reviewer",
"ws1/teamAA/worker",
"ws1/teamB/worker",
"ws2/teamA/worker",
}
for _, k := range keys {
ctrl.roleHealth[k] = &RoleHealth{RestartCount: 3}
}

ctrl.ClearRoleHealthForTeam("ws1", "teamA")

want := map[string]bool{
"ws1/teamAA/worker": true, // must survive — name has teamA as prefix but isn't teamA
"ws1/teamB/worker": true,
"ws2/teamA/worker": true,
}
for k, rh := range ctrl.roleHealth {
if !want[k] {
t.Errorf("ws1/teamA cleared but %q still present (count=%d)", k, rh.RestartCount)
}
delete(want, k)
}
for k := range want {
t.Errorf("expected %q to survive, but it was deleted", k)
}
}

// TestClearRoleHealthForWorkspace verifies workspace-level cascade
// clears every team's roles under that workspace, and that another
// workspace whose name shares a prefix (e.g., "ws1" vs "ws1-alt") is
// not affected. See ArcavenAE/marvel#29.
func TestClearRoleHealthForWorkspace(t *testing.T) {
t.Parallel()
store := api.NewStore()
ctrl := NewController(store, nil)

keys := []string{
"ws1/teamA/worker",
"ws1/teamA/reviewer",
"ws1/teamB/worker",
"ws1-alt/teamA/worker", // must survive
"ws2/teamA/worker", // must survive
}
for _, k := range keys {
ctrl.roleHealth[k] = &RoleHealth{RestartCount: 5}
}

ctrl.ClearRoleHealthForWorkspace("ws1")

want := map[string]bool{
"ws1-alt/teamA/worker": true,
"ws2/teamA/worker": true,
}
for k, rh := range ctrl.roleHealth {
if !want[k] {
t.Errorf("ws1 cleared but %q still present (count=%d)", k, rh.RestartCount)
}
delete(want, k)
}
for k := range want {
t.Errorf("expected %q to survive, but it was deleted", k)
}
}

// TestClearRoleHealthForTeamEmpty verifies the no-op case: clearing
// a workspace/team that was never recorded leaves the map unchanged
// and does not panic on an empty map.
func TestClearRoleHealthForTeamEmpty(t *testing.T) {
t.Parallel()
store := api.NewStore()
ctrl := NewController(store, nil)

ctrl.ClearRoleHealthForTeam("never", "recorded")
if len(ctrl.roleHealth) != 0 {
t.Errorf("expected empty map, got %d entries", len(ctrl.roleHealth))
}

ctrl.ClearRoleHealthForWorkspace("never")
if len(ctrl.roleHealth) != 0 {
t.Errorf("expected empty map, got %d entries", len(ctrl.roleHealth))
}
}

func TestComputeBackoff(t *testing.T) {
t.Parallel()
cases := []struct {
Expand Down
Loading