diff --git a/src/commands/autopilot.ts b/src/commands/autopilot.ts index 11d1e1be..25190bbe 100644 --- a/src/commands/autopilot.ts +++ b/src/commands/autopilot.ts @@ -54,6 +54,31 @@ function logError(phase: string, e: unknown) { * 3. `which gbrain` for installs where the binary is on $PATH. * 4. Throw — nothing on $PATH, no way to supervise the worker. */ +/** + * Resolve after `ms`, or reject with an AbortError as soon as `signal` aborts. + * + * Used for the between-cycle sleep in the autopilot loop so that SIGTERM/SIGINT + * can interrupt a long (up to 600s adaptive) wait and let graceful shutdown + * complete within systemd's TimeoutStopSec window instead of being SIGKILLed + * with a stale lockfile left behind. + * + * Exported for unit testing; behavior mirrors Node's `setTimeout`-with-AbortSignal + * semantics but avoids pulling in `timers/promises` for cross-runtime use. + */ +export function sleepCancelable(ms: number, signal: AbortSignal): Promise { + return new Promise((resolve, reject) => { + if (signal.aborted) { + reject(new DOMException('aborted', 'AbortError')); + return; + } + const timer = setTimeout(resolve, ms); + signal.addEventListener('abort', () => { + clearTimeout(timer); + reject(new DOMException('aborted', 'AbortError')); + }, { once: true }); + }); +} + export function resolveGbrainCliPath(): string { const arg1 = process.argv[1] ?? ''; if (arg1.endsWith('/gbrain') || arg1.endsWith('/cli.ts') || arg1.endsWith('\\gbrain.exe')) { @@ -129,6 +154,15 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) { let workerProc: ChildProcess | null = null; let crashCount = 0; + // Cancelable between-cycle sleep. Without this, SIGTERM/SIGINT sets + // `stopping = true` but the loop only re-evaluates after the current + // `setTimeout` resolves — up to `interval` seconds (600s worst case after + // adaptive scaling). Under systemd's default TimeoutStopSec=90 that means + // SIGKILL beats the drain path, the lockfile leaks, and the next + // invocation has to wait out the 10-minute staleness check or be cleaned + // up by hand. Abort the timer on shutdown so the loop exits immediately. + const cycleAbort = new AbortController(); + if (useMinionsDispatch) { const cliPath = resolveGbrainCliPath(); const startWorker = () => { @@ -164,6 +198,9 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) { const shutdown = async (sig: string) => { if (stopping) return; stopping = true; + // Abort the in-flight cycle sleep so the main loop exits immediately + // instead of running out the remainder of the current interval. + cycleAbort.abort(); console.log(`Autopilot stopping (${sig}).`); if (workerProc) { try { workerProc.kill('SIGTERM'); } catch { /* already dead */ } @@ -280,8 +317,14 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) { } } - // Wait for next cycle - await new Promise(r => setTimeout(r, interval * 1000)); + // Wait for next cycle (cancelable on SIGTERM/SIGINT) + try { + await sleepCancelable(interval * 1000, cycleAbort.signal); + } catch (e) { + // Only swallow the abort path; let unexpected rejections surface. + if (!(e instanceof DOMException) || e.name !== 'AbortError') throw e; + // Loop will exit via the `while (!stopping)` check on next iteration. + } } } diff --git a/test/autopilot-sleep-cancelable.test.ts b/test/autopilot-sleep-cancelable.test.ts new file mode 100644 index 00000000..2ddd41cc --- /dev/null +++ b/test/autopilot-sleep-cancelable.test.ts @@ -0,0 +1,48 @@ +import { describe, test, expect } from 'bun:test'; +import { sleepCancelable } from '../src/commands/autopilot.ts'; + +describe('sleepCancelable', () => { + test('resolves after the given delay when never aborted', async () => { + const start = Date.now(); + await sleepCancelable(40, new AbortController().signal); + const elapsed = Date.now() - start; + // Allow generous slack for CI timer jitter — we only care that it waited + // at least ~40ms and didn't resolve immediately. + expect(elapsed).toBeGreaterThanOrEqual(30); + }); + + test('rejects immediately with AbortError when the signal is already aborted', async () => { + const ctl = new AbortController(); + ctl.abort(); + const start = Date.now(); + let caught: unknown; + try { + await sleepCancelable(10_000, ctl.signal); + } catch (e) { + caught = e; + } + const elapsed = Date.now() - start; + expect(caught).toBeInstanceOf(DOMException); + expect((caught as DOMException).name).toBe('AbortError'); + // Rejection must be effectively synchronous — the 10s timer is never armed. + expect(elapsed).toBeLessThan(50); + }); + + test('rejects with AbortError mid-sleep when the signal aborts', async () => { + const ctl = new AbortController(); + // Abort well before the 10s timer would fire. If sleepCancelable were + // not cancelable, this test would hang for ~10s and bun:test would kill it. + setTimeout(() => ctl.abort(), 20); + const start = Date.now(); + let caught: unknown; + try { + await sleepCancelable(10_000, ctl.signal); + } catch (e) { + caught = e; + } + const elapsed = Date.now() - start; + expect(caught).toBeInstanceOf(DOMException); + expect((caught as DOMException).name).toBe('AbortError'); + expect(elapsed).toBeLessThan(200); + }); +});