diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 3b28775..41154bb 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -577,6 +577,9 @@ func (d *Daemon) handleDelete(params json.RawMessage) Response { _ = d.sessMgr.Delete(s.Key()) } err = d.store.DeleteTeam(p.Name) + // Clear accumulated crash-loop state for this team's roles so a + // subsequent re-apply starts fresh. See ArcavenAE/marvel#29. + d.teamCtrl.ClearRoleHealthForTeam(t.Workspace, t.Name) case "workspace": ws, getErr := d.store.GetWorkspace(p.Name) if getErr != nil { @@ -597,6 +600,9 @@ func (d *Daemon) handleDelete(params json.RawMessage) Response { } _ = d.sessMgr.CleanupWorkspace(ws.Name) err = d.store.DeleteWorkspace(p.Name) + // Clear accumulated crash-loop state for every role under every + // team in this workspace. See ArcavenAE/marvel#29. + d.teamCtrl.ClearRoleHealthForWorkspace(ws.Name) default: return Response{Error: fmt.Sprintf("unknown resource type: %s", p.ResourceType)} } diff --git a/internal/team/controller.go b/internal/team/controller.go index eb332d0..0089db5 100644 --- a/internal/team/controller.go +++ b/internal/team/controller.go @@ -8,6 +8,7 @@ import ( "fmt" "log" "sort" + "strings" "sync" "time" @@ -111,6 +112,41 @@ func (c *Controller) RoleHealthSnapshot(workspace, team, role string) (*RoleHeal }, true } +// ClearRoleHealthForTeam deletes crash-loop state for every role under +// the given (workspace, team). Called from the cascade delete path in +// daemon.handleDelete so that a subsequent re-apply of the same manifest +// starts with a fresh RestartCount and BackoffUntil — without this, +// accumulated state survives workspace/team delete (the map is keyed +// by name, which the operator is free to reuse) and the reconciler +// refuses spawns until the prior generation's backoff window elapses. +// If the prior generation hit MaxRestarts saturation, BackoffUntil +// would be frozen far in the future and the role would never recover. +// See ArcavenAE/marvel#29. +func (c *Controller) ClearRoleHealthForTeam(workspace, team string) { + c.mu.Lock() + defer c.mu.Unlock() + prefix := workspace + "/" + team + "/" + for k := range c.roleHealth { + if strings.HasPrefix(k, prefix) { + delete(c.roleHealth, k) + } + } +} + +// ClearRoleHealthForWorkspace deletes crash-loop state for every role +// under every team in the given workspace. Called from the workspace- +// delete cascade. See ClearRoleHealthForTeam for the rationale. +func (c *Controller) ClearRoleHealthForWorkspace(workspace string) { + c.mu.Lock() + defer c.mu.Unlock() + prefix := workspace + "/" + for k := range c.roleHealth { + if strings.HasPrefix(k, prefix) { + delete(c.roleHealth, k) + } + } +} + // ReconcileOnce runs one reconciliation pass for all teams. // Reaps dead sessions first so the reconciler sees accurate state. func (c *Controller) ReconcileOnce() { diff --git a/internal/team/controller_test.go b/internal/team/controller_test.go index 597f0f7..9c5ee60 100644 --- a/internal/team/controller_test.go +++ b/internal/team/controller_test.go @@ -914,6 +914,101 @@ func TestReapPathSaturatesMaxRestarts(t *testing.T) { // TestComputeBackoff locks in the exponential curve shape so future // tweaks are intentional and reviewable. +// TestClearRoleHealthForTeam verifies that the cascade-delete helper +// removes only the entries under (workspace, team), leaves siblings +// untouched, and handles the prefix edge case where one team name is +// a prefix of another (e.g., "b" vs "bb"). See ArcavenAE/marvel#29. +func TestClearRoleHealthForTeam(t *testing.T) { + t.Parallel() + store := api.NewStore() + ctrl := NewController(store, nil) + + // Populate state for: ws1/teamA/{w,r}, ws1/teamAA/w, ws1/teamB/w, ws2/teamA/w + keys := []string{ + "ws1/teamA/worker", + "ws1/teamA/reviewer", + "ws1/teamAA/worker", + "ws1/teamB/worker", + "ws2/teamA/worker", + } + for _, k := range keys { + ctrl.roleHealth[k] = &RoleHealth{RestartCount: 3} + } + + ctrl.ClearRoleHealthForTeam("ws1", "teamA") + + want := map[string]bool{ + "ws1/teamAA/worker": true, // must survive — name has teamA as prefix but isn't teamA + "ws1/teamB/worker": true, + "ws2/teamA/worker": true, + } + for k, rh := range ctrl.roleHealth { + if !want[k] { + t.Errorf("ws1/teamA cleared but %q still present (count=%d)", k, rh.RestartCount) + } + delete(want, k) + } + for k := range want { + t.Errorf("expected %q to survive, but it was deleted", k) + } +} + +// TestClearRoleHealthForWorkspace verifies workspace-level cascade +// clears every team's roles under that workspace, and that another +// workspace whose name shares a prefix (e.g., "ws1" vs "ws1-alt") is +// not affected. See ArcavenAE/marvel#29. +func TestClearRoleHealthForWorkspace(t *testing.T) { + t.Parallel() + store := api.NewStore() + ctrl := NewController(store, nil) + + keys := []string{ + "ws1/teamA/worker", + "ws1/teamA/reviewer", + "ws1/teamB/worker", + "ws1-alt/teamA/worker", // must survive + "ws2/teamA/worker", // must survive + } + for _, k := range keys { + ctrl.roleHealth[k] = &RoleHealth{RestartCount: 5} + } + + ctrl.ClearRoleHealthForWorkspace("ws1") + + want := map[string]bool{ + "ws1-alt/teamA/worker": true, + "ws2/teamA/worker": true, + } + for k, rh := range ctrl.roleHealth { + if !want[k] { + t.Errorf("ws1 cleared but %q still present (count=%d)", k, rh.RestartCount) + } + delete(want, k) + } + for k := range want { + t.Errorf("expected %q to survive, but it was deleted", k) + } +} + +// TestClearRoleHealthForTeamEmpty verifies the no-op case: clearing +// a workspace/team that was never recorded leaves the map unchanged +// and does not panic on an empty map. +func TestClearRoleHealthForTeamEmpty(t *testing.T) { + t.Parallel() + store := api.NewStore() + ctrl := NewController(store, nil) + + ctrl.ClearRoleHealthForTeam("never", "recorded") + if len(ctrl.roleHealth) != 0 { + t.Errorf("expected empty map, got %d entries", len(ctrl.roleHealth)) + } + + ctrl.ClearRoleHealthForWorkspace("never") + if len(ctrl.roleHealth) != 0 { + t.Errorf("expected empty map, got %d entries", len(ctrl.roleHealth)) + } +} + func TestComputeBackoff(t *testing.T) { t.Parallel() cases := []struct {