From 754ebf8250da1a9f13e2871d3490f4db84fe9621 Mon Sep 17 00:00:00 2001 From: sharziki Date: Sat, 18 Apr 2026 15:59:48 -0400 Subject: [PATCH] fix(autopilot): make between-cycle sleep cancelable on SIGTERM/SIGINT The autopilot loop wrapped its between-cycle wait in `await new Promise(r => setTimeout(r, interval * 1000))`, which is not cancelable. When SIGTERM/SIGINT fired, the shutdown handler flipped `stopping = true`, but the `while (!stopping)` guard only re-evaluates after the current sleep resolves. With adaptive intervals scaling up to 600s on a healthy brain, that means systemd's default TimeoutStopSec=90 loses the race, SIGKILL preempts the drain path, and the autopilot lockfile at `~/.gbrain/autopilot.lock` is left stale. The next invocation has to either wait out the 10-minute staleness check or be cleaned up by hand. Extract an exported `sleepCancelable(ms, signal)` helper that resolves after `ms` or rejects with an AbortError as soon as `signal` aborts. Wire an `AbortController` into the cycle loop, abort it from the `shutdown` handler, and swallow only the abort-path rejection so the loop exits on its next `stopping` check. Added focused tests covering the never-aborted, pre-aborted, and mid-sleep abort paths. Fixes #204 --- src/commands/autopilot.ts | 47 ++++++++++++++++++++++-- test/autopilot-sleep-cancelable.test.ts | 48 +++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 test/autopilot-sleep-cancelable.test.ts diff --git a/src/commands/autopilot.ts b/src/commands/autopilot.ts index 11d1e1be..25190bbe 100644 --- a/src/commands/autopilot.ts +++ b/src/commands/autopilot.ts @@ -54,6 +54,31 @@ function logError(phase: string, e: unknown) { * 3. `which gbrain` for installs where the binary is on $PATH. * 4. Throw — nothing on $PATH, no way to supervise the worker. */ +/** + * Resolve after `ms`, or reject with an AbortError as soon as `signal` aborts. + * + * Used for the between-cycle sleep in the autopilot loop so that SIGTERM/SIGINT + * can interrupt a long (up to 600s adaptive) wait and let graceful shutdown + * complete within systemd's TimeoutStopSec window instead of being SIGKILLed + * with a stale lockfile left behind. + * + * Exported for unit testing; behavior mirrors Node's `setTimeout`-with-AbortSignal + * semantics but avoids pulling in `timers/promises` for cross-runtime use. + */ +export function sleepCancelable(ms: number, signal: AbortSignal): Promise { + return new Promise((resolve, reject) => { + if (signal.aborted) { + reject(new DOMException('aborted', 'AbortError')); + return; + } + const timer = setTimeout(resolve, ms); + signal.addEventListener('abort', () => { + clearTimeout(timer); + reject(new DOMException('aborted', 'AbortError')); + }, { once: true }); + }); +} + export function resolveGbrainCliPath(): string { const arg1 = process.argv[1] ?? ''; if (arg1.endsWith('/gbrain') || arg1.endsWith('/cli.ts') || arg1.endsWith('\\gbrain.exe')) { @@ -129,6 +154,15 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) { let workerProc: ChildProcess | null = null; let crashCount = 0; + // Cancelable between-cycle sleep. Without this, SIGTERM/SIGINT sets + // `stopping = true` but the loop only re-evaluates after the current + // `setTimeout` resolves — up to `interval` seconds (600s worst case after + // adaptive scaling). Under systemd's default TimeoutStopSec=90 that means + // SIGKILL beats the drain path, the lockfile leaks, and the next + // invocation has to wait out the 10-minute staleness check or be cleaned + // up by hand. Abort the timer on shutdown so the loop exits immediately. + const cycleAbort = new AbortController(); + if (useMinionsDispatch) { const cliPath = resolveGbrainCliPath(); const startWorker = () => { @@ -164,6 +198,9 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) { const shutdown = async (sig: string) => { if (stopping) return; stopping = true; + // Abort the in-flight cycle sleep so the main loop exits immediately + // instead of running out the remainder of the current interval. + cycleAbort.abort(); console.log(`Autopilot stopping (${sig}).`); if (workerProc) { try { workerProc.kill('SIGTERM'); } catch { /* already dead */ } @@ -280,8 +317,14 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) { } } - // Wait for next cycle - await new Promise(r => setTimeout(r, interval * 1000)); + // Wait for next cycle (cancelable on SIGTERM/SIGINT) + try { + await sleepCancelable(interval * 1000, cycleAbort.signal); + } catch (e) { + // Only swallow the abort path; let unexpected rejections surface. + if (!(e instanceof DOMException) || e.name !== 'AbortError') throw e; + // Loop will exit via the `while (!stopping)` check on next iteration. + } } } diff --git a/test/autopilot-sleep-cancelable.test.ts b/test/autopilot-sleep-cancelable.test.ts new file mode 100644 index 00000000..2ddd41cc --- /dev/null +++ b/test/autopilot-sleep-cancelable.test.ts @@ -0,0 +1,48 @@ +import { describe, test, expect } from 'bun:test'; +import { sleepCancelable } from '../src/commands/autopilot.ts'; + +describe('sleepCancelable', () => { + test('resolves after the given delay when never aborted', async () => { + const start = Date.now(); + await sleepCancelable(40, new AbortController().signal); + const elapsed = Date.now() - start; + // Allow generous slack for CI timer jitter — we only care that it waited + // at least ~40ms and didn't resolve immediately. + expect(elapsed).toBeGreaterThanOrEqual(30); + }); + + test('rejects immediately with AbortError when the signal is already aborted', async () => { + const ctl = new AbortController(); + ctl.abort(); + const start = Date.now(); + let caught: unknown; + try { + await sleepCancelable(10_000, ctl.signal); + } catch (e) { + caught = e; + } + const elapsed = Date.now() - start; + expect(caught).toBeInstanceOf(DOMException); + expect((caught as DOMException).name).toBe('AbortError'); + // Rejection must be effectively synchronous — the 10s timer is never armed. + expect(elapsed).toBeLessThan(50); + }); + + test('rejects with AbortError mid-sleep when the signal aborts', async () => { + const ctl = new AbortController(); + // Abort well before the 10s timer would fire. If sleepCancelable were + // not cancelable, this test would hang for ~10s and bun:test would kill it. + setTimeout(() => ctl.abort(), 20); + const start = Date.now(); + let caught: unknown; + try { + await sleepCancelable(10_000, ctl.signal); + } catch (e) { + caught = e; + } + const elapsed = Date.now() - start; + expect(caught).toBeInstanceOf(DOMException); + expect((caught as DOMException).name).toBe('AbortError'); + expect(elapsed).toBeLessThan(200); + }); +});