diff --git a/internal/coordinator/auto_resume_test.go b/internal/coordinator/auto_resume_test.go
new file mode 100644
index 0000000..c1049fe
--- /dev/null
+++ b/internal/coordinator/auto_resume_test.go
@@ -0,0 +1,240 @@
+package coordinator
+
+import (
+	"context"
+	"strconv"
+	"sync"
+	"testing"
+)
+
+// mockAmbientBackend is a test backend that simulates an Ambient session lifecycle
+// with support for session state transitions (missing → created).
+// The sessions map and createCount are guarded by mu.
+type mockAmbientBackend struct {
+	mu            sync.Mutex
+	sessions      map[string]bool // sessionID -> exists
+	restartCalled bool
+	createCount   int
+}
+
+// newMockAmbientBackend returns a mock backend that has no sessions yet.
+func newMockAmbientBackend() *mockAmbientBackend {
+	return &mockAmbientBackend{
+		sessions: make(map[string]bool),
+	}
+}
+
+func (b *mockAmbientBackend) Name() string             { return "ambient" }
+func (b *mockAmbientBackend) Available() bool          { return true }
+func (b *mockAmbientBackend) SupportsAutoResume() bool { return true }
+
+// CreateSession records a new session and returns its ID, which has the form
+// "mock-ambient-session-<n>" with n being the running creation count.
+func (b *mockAmbientBackend) CreateSession(_ context.Context, opts SessionCreateOpts) (string, error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.createCount++
+	// strconv.Itoa keeps the suffix a decimal number for every count;
+	// string(rune('0'+n)) only yields a digit while n is at most 9.
+	sessionID := "mock-ambient-session-" + strconv.Itoa(b.createCount)
+	b.sessions[sessionID] = true
+	return sessionID, nil
+}
+
+// KillSession removes sessionID from the recorded sessions; it never fails.
+func (b *mockAmbientBackend) KillSession(_ context.Context, sessionID string) error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	delete(b.sessions, sessionID)
+	return nil
+}
+
+// SessionExists reports whether sessionID is currently recorded.
+func (b *mockAmbientBackend) SessionExists(sessionID string) bool {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return b.sessions[sessionID]
+}
+
+// ListSessions returns the recorded session IDs in map iteration order.
+func (b *mockAmbientBackend) ListSessions() ([]string, error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	var list []string
+	for sid := range b.sessions {
+		list = append(list, sid)
+	}
+	return list, nil
+}
+
+// GetStatus reports SessionStatusRunning for a recorded session and
+// SessionStatusMissing otherwise.
+func (b *mockAmbientBackend) GetStatus(_ context.Context, sessionID string) (SessionStatus, error) {
+	if b.SessionExists(sessionID) {
+		return SessionStatusRunning, nil
+	}
+	return SessionStatusMissing, nil
+}
+
+// IsIdle and the methods below are stubs returning fixed values.
+func (b *mockAmbientBackend) IsIdle(_ string) bool                            { return true }
+func (b *mockAmbientBackend) CaptureOutput(_ string, _ int) ([]string, error) { return nil, nil }
+func (b *mockAmbientBackend) CheckApproval(_ string) ApprovalInfo             { return ApprovalInfo{} }
+func (b *mockAmbientBackend) SendInput(_ string, _ string) error              { return nil }
+func (b *mockAmbientBackend) Approve(_ string) error                          { return nil }
+func (b *mockAmbientBackend) AlwaysAllow(_ string) error                      { return nil }
+func (b *mockAmbientBackend) Interrupt(_ context.Context, _ string) error     { return nil }
+func (b *mockAmbientBackend) DiscoverSessions() (map[string]string, error)    { return nil, nil }
+
+// TestAutoResumeAmbientSession verifies that when an Ambient session is stopped
+// (SessionExists returns false), the restartAgentService call correctly creates a new
+// session. This test directly exercises restartAgentService rather than the full
+// SingleAgentCheckIn flow to avoid test timeouts.
+func TestAutoResumeAmbientSession(t *testing.T) {
+	srv, cleanup := mustStartServer(t)
+	defer cleanup()
+
+	space := "TestAutoResume"
+	agentName := "test-agent"
+
+	// Install mock ambient backend
+	mockBackend := newMockAmbientBackend()
+	srv.backends = map[string]SessionBackend{"ambient": mockBackend}
+	srv.defaultBackend = "ambient"
+
+	// Create an agent with an initial session
+	initialSessionID := "initial-session"
+	mockBackend.mu.Lock()
+	mockBackend.sessions[initialSessionID] = true
+	mockBackend.mu.Unlock()
+
+	srv.mu.Lock()
+	ks := srv.getOrCreateSpaceLocked(space)
+	ks.setAgentStatus(agentName, &AgentUpdate{
+		Status:      StatusIdle,
+		Summary:     agentName + ": ready",
+		SessionID:   initialSessionID,
+		BackendType: "ambient",
+	})
+	if _, ok := ks.Agents[agentName]; !ok {
+		ks.Agents[agentName] = &AgentRecord{}
+	}
+	ks.Agents[agentName].Config = &AgentConfig{
+		WorkDir: "/workspace",
+	}
+	srv.mu.Unlock()
+
+	// Simulate the session being stopped (e.g., due to inactivity timeout)
+	mockBackend.mu.Lock()
+	delete(mockBackend.sessions, initialSessionID)
+	mockBackend.mu.Unlock()
+
+	// Verify the session is gone
+	if mockBackend.SessionExists(initialSessionID) {
+		t.Fatal("expected initial session to be stopped")
+	}
+
+	// Directly test the restart service instead of full check-in to avoid timeout
+	newSessionID, canonical, err := srv.restartAgentService(space, agentName, spawnRequest{})
+	if err != nil {
+		t.Fatalf("restartAgentService failed: %v", err)
+	}
+
+	// Verify a new session was created. createCount is written under the mock's
+	// mutex, so it is read under the same lock here.
+	mockBackend.mu.Lock()
+	created := mockBackend.createCount
+	mockBackend.mu.Unlock()
+	if created != 1 {
+		t.Errorf("expected 1 session creation, got %d", created)
+	}
+
+	if newSessionID == initialSessionID {
+		t.Error("expected new session ID after auto-resume")
+	}
+	if newSessionID == "" {
+		t.Error("expected non-empty session ID after auto-resume")
+	}
+	if canonical != agentName {
+		t.Errorf("expected canonical name %q, got %q", agentName, canonical)
+	}
+
+	// Verify the new session exists
+	if !mockBackend.SessionExists(newSessionID) {
+		t.Errorf("new session %q does not exist", newSessionID)
+	}
+
+	// Verify the agent status was updated with the new session
+	srv.mu.RLock()
+	agent, ok := ks.agentStatusOk(agentName)
+	srv.mu.RUnlock()
+	if !ok {
+		t.Fatal("agent not found after auto-resume")
+	}
+	if agent.SessionID != newSessionID {
+		t.Errorf("agent session ID = %q, want %q", agent.SessionID, newSessionID)
+	}
+	if agent.BackendType != "ambient" {
+		t.Errorf("agent backend type = %q, want %q", agent.BackendType, "ambient")
+	}
+}
+
+// TestAutoResumeOnlyForAmbient verifies that auto-resume currently only applies to
+// backends that report SupportsAutoResume() == true (currently Ambient), not tmux sessions
+// (which should skip). This behavior is enforced via the backend capability interface.
+func TestAutoResumeOnlyForAmbient(t *testing.T) {
+	const (
+		space     = "TestTmuxNoResume"
+		agentName = "tmux-agent"
+	)
+
+	srv, cleanup := mustStartServer(t)
+	defer cleanup()
+
+	// Route everything to a capturing backend registered under "tmux".
+	capturing := newSpawnCapturingBackend()
+	srv.backends = map[string]SessionBackend{"tmux": capturing}
+	srv.defaultBackend = "tmux"
+
+	// Register an idle agent whose recorded tmux session doesn't exist.
+	update := &AgentUpdate{
+		Status:      StatusIdle,
+		Summary:     agentName + ": ready",
+		SessionID:   "missing-tmux-session",
+		BackendType: "tmux",
+	}
+	srv.mu.Lock()
+	srv.getOrCreateSpaceLocked(space).setAgentStatus(agentName, update)
+	srv.mu.Unlock()
+
+	// The check-in must skip this agent rather than auto-resume it.
+	result := srv.SingleAgentCheckIn(space, agentName, "", "")
+	if skipped := result.Skipped; len(skipped) != 1 {
+		t.Errorf("expected 1 skipped, got %d: %v", len(skipped), skipped)
+	}
+
+	// A non-blocking receive confirms no CreateSession call was captured.
+	select {
+	case <-capturing.captured:
+		t.Error("expected no session creation for tmux backend")
+	default:
+		// Expected: no session created
+	}
+}
+
+// TestSingleAgentCheckInNonexistentAgent verifies that calling SingleAgentCheckIn
+// on a nonexistent agent returns an appropriate error.
+func TestSingleAgentCheckInNonexistentAgent(t *testing.T) {
+	const space = "TestNonexistent"
+
+	srv, cleanup := mustStartServer(t)
+	defer cleanup()
+
+	// Check in an agent name that was never registered in this space.
+	errs := srv.SingleAgentCheckIn(space, "nonexistent", "", "").Errors
+
+	// Exactly one error is expected.
+	if got := len(errs); got != 1 {
+		t.Errorf("expected 1 error for nonexistent agent, got %d: %v", got, errs)
+	}
+}
diff --git a/internal/coordinator/lifecycle.go b/internal/coordinator/lifecycle.go
index 8f345b8..3de40e1 100644
--- a/internal/coordinator/lifecycle.go
+++ b/internal/coordinator/lifecycle.go
@@ -705,6 +705,32 @@ func (s *Server) restartAgentService(spaceName, agentName string, req spawnReque
 	return sessionID, canonical, nil
 }
 
+// maybeAutoResumeAgent restarts the agent's session when that session is missing and
+// the backend opts in to auto-resume (SupportsAutoResume() reports true).
+//
+// It returns the session ID to use, whether a restart occurred, and any error. When no
+// restart is needed or the restart fails, the sessionID passed in is returned unchanged.
+func (s *Server) maybeAutoResumeAgent(spaceName, canonical, sessionID string, backend SessionBackend) (string, bool, error) {
+	// Nothing to do when the backend does not auto-resume or the session is still there.
+	// The capability check runs first, so SessionExists is only asked when it matters.
+	if !backend.SupportsAutoResume() || backend.SessionExists(sessionID) {
+		return sessionID, false, nil
+	}
+
+	// Session is missing and backend supports auto-resume — restart it.
+	tag := "[" + spaceName + "/" + canonical + "] auto-resume: "
+	s.logEvent(tag + "session " + sessionID + " not found, attempting restart")
+
+	// The canonical name returned by restartAgentService is discarded.
+	resumedID, _, err := s.restartAgentService(spaceName, canonical, spawnRequest{})
+	if err != nil {
+		return sessionID, false, fmt.Errorf("auto-resume failed: %w", err)
+	}
+
+	s.logEvent(tag + "restarted in session " + resumedID)
+	return resumedID, true, nil
+}
+
 // introspectResponse is returned by GET /spaces/{space}/agent/{name}/introspect.
 type introspectResponse struct {
 	Agent string `json:"agent"`
diff --git a/internal/coordinator/lifecycle_test.go b/internal/coordinator/lifecycle_test.go
index 37912e9..5adce12 100644
--- a/internal/coordinator/lifecycle_test.go
+++ b/internal/coordinator/lifecycle_test.go
@@ -21,6 +21,7 @@ func newSpawnCapturingBackend() *spawnCapturingBackend {
 
 func (b *spawnCapturingBackend) Name() string { return "tmux" }
 func (b *spawnCapturingBackend) Available() bool { return true }
+func (b *spawnCapturingBackend) SupportsAutoResume() bool { return false }
 func (b *spawnCapturingBackend) CreateSession(_ context.Context, opts SessionCreateOpts) (string, error) {
 	b.captured <- opts
 	return "mock-session-id", nil
diff --git a/internal/coordinator/session_backend.go b/internal/coordinator/session_backend.go
index ea87ac3..b1c0ca0 100644
--- a/internal/coordinator/session_backend.go
+++ b/internal/coordinator/session_backend.go
@@ -15,6 +15,10 @@ type SessionBackend interface {
 	// Available reports whether this backend is operational.
 	Available() bool
 
+	// SupportsAutoResume reports whether this backend supports automatic
+	// session resumption when a stopped session receives a message.
+	SupportsAutoResume() bool
+
 	// --- Lifecycle ---
 
 	// CreateSession creates a new session and launches the given command.
diff --git a/internal/coordinator/session_backend_ambient.go b/internal/coordinator/session_backend_ambient.go
index 9e6728d..06f1fc5 100644
--- a/internal/coordinator/session_backend_ambient.go
+++ b/internal/coordinator/session_backend_ambient.go
@@ -131,6 +131,9 @@ func (b *AmbientSessionBackend) Available() bool {
 	return avail
 }
 
+// SupportsAutoResume always reports true for the Ambient backend.
+func (b *AmbientSessionBackend) SupportsAutoResume() bool { return true }
+
 func (b *AmbientSessionBackend) setCachedAvail(v bool) {
 	b.availMu.Lock()
 	b.availCached = v
diff --git a/internal/coordinator/session_backend_tmux.go b/internal/coordinator/session_backend_tmux.go
index 3821ad2..8f2463b 100644
--- a/internal/coordinator/session_backend_tmux.go
+++ b/internal/coordinator/session_backend_tmux.go
@@ -36,6 +36,9 @@ func (b *TmuxSessionBackend) Name() string { return "tmux" }
 
 func (b *TmuxSessionBackend) Available() bool { return tmuxAvailable() }
 
+// SupportsAutoResume always reports false for the tmux backend.
+func (b *TmuxSessionBackend) SupportsAutoResume() bool { return false }
+
 func (b *TmuxSessionBackend) CreateSession(ctx context.Context, opts SessionCreateOpts) (string, error) {
 	sessionID := opts.SessionID
 	command := opts.Command
diff --git a/internal/coordinator/tmux.go b/internal/coordinator/tmux.go
index 5fd2dab..84c858f 100644
--- a/internal/coordinator/tmux.go
+++ b/internal/coordinator/tmux.go
@@ -738,7 +738,24 @@ func (s *Server) SingleAgentCheckIn(spaceName, agentName, checkModel, workModel
 		result.Errors = append(result.Errors, backend.Name()+" not available")
 		return result
 	}
-	if !backend.SessionExists(sessionID) {
+
+	// Attempt auto-resume if the session is missing and the backend supports it.
+	// maybeAutoResumeAgent hands back the unchanged sessionID with resumed == false
+	// when the backend lacks auto-resume support or the session still exists.
+	newSessionID, resumed, err := s.maybeAutoResumeAgent(spaceName, canonical, sessionID, backend)
+	if err != nil {
+		// The restart failed: record it as an error for this agent and stop.
+		result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", canonical, err))
+		return result
+	}
+	if resumed {
+		// Switch to the session created by the restart.
+		sessionID = newSessionID
+	} else if !backend.SessionExists(sessionID) {
+		// Session doesn't exist and wasn't auto-resumed.
+		// NOTE(review): maybeAutoResumeAgent has already called SessionExists for
+		// auto-resume backends, so this check runs a second time for them — confirm
+		// the extra backend call is acceptable.
 		result.Skipped = append(result.Skipped, canonical+" (session not found: "+sessionID+")")
 		return result
 	}