From 483759a74476481562b9e77e670214024c3b6dd5 Mon Sep 17 00:00:00 2001
From: Michael Pursifull
Date: Sat, 18 Apr 2026 22:41:49 -0500
Subject: [PATCH] fix(team): clear RoleHealth on workspace/team cascade delete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this change, team.Controller.roleHealth survived
daemon.handleDelete's workspace and team cascade paths. The map is
keyed by workspace/team/role — three name components the operator is
free to reuse after a delete. When a fresh manifest re-used the same
names, accumulated RestartCount and BackoffUntil carried over. If the
prior generation had hit MaxRestarts saturation, BackoffUntil was
frozen to saturationFreezeUntil (far future) and the reconciler gate
at controller.go:230 refused spawns forever — producing the silent
stall Skippy reproduced in ArcavenAE/marvel#29.

Adds:

- team.Controller.ClearRoleHealthForTeam(workspace, team)
- team.Controller.ClearRoleHealthForWorkspace(workspace)

Both take c.mu, iterate roleHealth, and delete entries whose keys
match the workspace-or-team prefix with a trailing `/` so that
sibling names sharing a prefix (teamA vs teamAA, ws1 vs ws1-alt) are
not affected.

Wires both helpers into daemon.handleDelete's workspace and team
branches, after the corresponding Store.Delete* call so the cascade
happens in the same direction everywhere: drop the records, then drop
the crash-loop history.

Tests cover prefix-boundary correctness, sibling survival, and the
empty-map no-op case. No daemon-level regression test — the two call
sites are trivially auditable in the diff and the unit tests cover
the underlying contract.

Refs ArcavenAE/marvel#29
---
 internal/daemon/daemon.go        |  6 ++
 internal/team/controller.go      | 36 ++++++++++++
 internal/team/controller_test.go | 95 ++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+)

diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
index 3b28775..41154bb 100644
--- a/internal/daemon/daemon.go
+++ b/internal/daemon/daemon.go
@@ -577,6 +577,9 @@ func (d *Daemon) handleDelete(params json.RawMessage) Response {
 			_ = d.sessMgr.Delete(s.Key())
 		}
 		err = d.store.DeleteTeam(p.Name)
+		// Clear accumulated crash-loop state for this team's roles so a
+		// subsequent re-apply starts fresh. See ArcavenAE/marvel#29.
+		d.teamCtrl.ClearRoleHealthForTeam(t.Workspace, t.Name)
 	case "workspace":
 		ws, getErr := d.store.GetWorkspace(p.Name)
 		if getErr != nil {
@@ -597,6 +600,9 @@ func (d *Daemon) handleDelete(params json.RawMessage) Response {
 		}
 		_ = d.sessMgr.CleanupWorkspace(ws.Name)
 		err = d.store.DeleteWorkspace(p.Name)
+		// Clear accumulated crash-loop state for every role under every
+		// team in this workspace. See ArcavenAE/marvel#29.
+		d.teamCtrl.ClearRoleHealthForWorkspace(ws.Name)
 	default:
 		return Response{Error: fmt.Sprintf("unknown resource type: %s", p.ResourceType)}
 	}
diff --git a/internal/team/controller.go b/internal/team/controller.go
index eb332d0..0089db5 100644
--- a/internal/team/controller.go
+++ b/internal/team/controller.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"log"
 	"sort"
+	"strings"
 	"sync"
 	"time"
 
@@ -111,6 +112,41 @@ func (c *Controller) RoleHealthSnapshot(workspace, team, role string) (*RoleHeal
 	}, true
 }
 
+// ClearRoleHealthForTeam deletes crash-loop state for every role under
+// the given (workspace, team). Called from the cascade delete path in
+// daemon.handleDelete so that a subsequent re-apply of the same manifest
+// starts with a fresh RestartCount and BackoffUntil — without this,
+// accumulated state survives workspace/team delete (the map is keyed
+// by name, which the operator is free to reuse) and the reconciler
+// refuses spawns until the prior generation's backoff window elapses.
+// If the prior generation hit MaxRestarts saturation, BackoffUntil
+// would be frozen far in the future and the role would never recover.
+// See ArcavenAE/marvel#29.
+func (c *Controller) ClearRoleHealthForTeam(workspace, team string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	prefix := workspace + "/" + team + "/"
+	for k := range c.roleHealth {
+		if strings.HasPrefix(k, prefix) {
+			delete(c.roleHealth, k)
+		}
+	}
+}
+
+// ClearRoleHealthForWorkspace deletes crash-loop state for every role
+// under every team in the given workspace. Called from the workspace-
+// delete cascade. See ClearRoleHealthForTeam for the rationale.
+func (c *Controller) ClearRoleHealthForWorkspace(workspace string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	prefix := workspace + "/"
+	for k := range c.roleHealth {
+		if strings.HasPrefix(k, prefix) {
+			delete(c.roleHealth, k)
+		}
+	}
+}
+
 // ReconcileOnce runs one reconciliation pass for all teams.
 // Reaps dead sessions first so the reconciler sees accurate state.
 func (c *Controller) ReconcileOnce() {
diff --git a/internal/team/controller_test.go b/internal/team/controller_test.go
index 597f0f7..9c5ee60 100644
--- a/internal/team/controller_test.go
+++ b/internal/team/controller_test.go
@@ -914,6 +914,101 @@ func TestReapPathSaturatesMaxRestarts(t *testing.T) {
 
+// TestClearRoleHealthForTeam verifies that the cascade-delete helper
+// removes only the entries under (workspace, team), leaves siblings
+// untouched, and handles the prefix edge case where one team name is
+// a prefix of another (e.g., "teamA" vs "teamAA"). See ArcavenAE/marvel#29.
+func TestClearRoleHealthForTeam(t *testing.T) {
+	t.Parallel()
+	store := api.NewStore()
+	ctrl := NewController(store, nil)
+
+	// Populate state for: ws1/teamA/{w,r}, ws1/teamAA/w, ws1/teamB/w, ws2/teamA/w
+	keys := []string{
+		"ws1/teamA/worker",
+		"ws1/teamA/reviewer",
+		"ws1/teamAA/worker",
+		"ws1/teamB/worker",
+		"ws2/teamA/worker",
+	}
+	for _, k := range keys {
+		ctrl.roleHealth[k] = &RoleHealth{RestartCount: 3}
+	}
+
+	ctrl.ClearRoleHealthForTeam("ws1", "teamA")
+
+	want := map[string]bool{
+		"ws1/teamAA/worker": true, // must survive — name has teamA as prefix but isn't teamA
+		"ws1/teamB/worker":  true,
+		"ws2/teamA/worker":  true,
+	}
+	for k, rh := range ctrl.roleHealth {
+		if !want[k] {
+			t.Errorf("ws1/teamA cleared but %q still present (count=%d)", k, rh.RestartCount)
+		}
+		delete(want, k)
+	}
+	for k := range want {
+		t.Errorf("expected %q to survive, but it was deleted", k)
+	}
+}
+
+// TestClearRoleHealthForWorkspace verifies workspace-level cascade
+// clears every team's roles under that workspace, and that another
+// workspace whose name shares a prefix (e.g., "ws1" vs "ws1-alt") is
+// not affected. See ArcavenAE/marvel#29.
+func TestClearRoleHealthForWorkspace(t *testing.T) {
+	t.Parallel()
+	store := api.NewStore()
+	ctrl := NewController(store, nil)
+
+	keys := []string{
+		"ws1/teamA/worker",
+		"ws1/teamA/reviewer",
+		"ws1/teamB/worker",
+		"ws1-alt/teamA/worker", // must survive
+		"ws2/teamA/worker",     // must survive
+	}
+	for _, k := range keys {
+		ctrl.roleHealth[k] = &RoleHealth{RestartCount: 5}
+	}
+
+	ctrl.ClearRoleHealthForWorkspace("ws1")
+
+	want := map[string]bool{
+		"ws1-alt/teamA/worker": true,
+		"ws2/teamA/worker":     true,
+	}
+	for k, rh := range ctrl.roleHealth {
+		if !want[k] {
+			t.Errorf("ws1 cleared but %q still present (count=%d)", k, rh.RestartCount)
+		}
+		delete(want, k)
+	}
+	for k := range want {
+		t.Errorf("expected %q to survive, but it was deleted", k)
+	}
+}
+
+// TestClearRoleHealthForTeamEmpty verifies the no-op case: clearing
+// a workspace/team that was never recorded leaves the map unchanged
+// and does not panic on an empty map.
+func TestClearRoleHealthForTeamEmpty(t *testing.T) {
+	t.Parallel()
+	store := api.NewStore()
+	ctrl := NewController(store, nil)
+
+	ctrl.ClearRoleHealthForTeam("never", "recorded")
+	if len(ctrl.roleHealth) != 0 {
+		t.Errorf("expected empty map, got %d entries", len(ctrl.roleHealth))
+	}
+
+	ctrl.ClearRoleHealthForWorkspace("never")
+	if len(ctrl.roleHealth) != 0 {
+		t.Errorf("expected empty map, got %d entries", len(ctrl.roleHealth))
+	}
+}
+
 // TestComputeBackoff locks in the exponential curve shape so future
 // tweaks are intentional and reviewable.
 func TestComputeBackoff(t *testing.T) {
 	t.Parallel()
 	cases := []struct {