Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 45 additions & 2 deletions src/commands/autopilot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,31 @@ function logError(phase: string, e: unknown) {
* 3. `which gbrain` for installs where the binary is on $PATH.
* 4. Throw — nothing on $PATH, no way to supervise the worker.
*/
/**
 * Resolve after `ms` milliseconds, or reject as soon as `signal` aborts.
 *
 * Used for the between-cycle sleep in the autopilot loop so that SIGTERM/SIGINT
 * can interrupt a long (up to 600s adaptive) wait and let graceful shutdown
 * complete within systemd's TimeoutStopSec window instead of being SIGKILLed
 * with a stale lockfile left behind.
 *
 * The `abort` listener is detached again when the timer fires normally. The
 * caller may reuse one long-lived signal for every sleep, and `{ once: true }`
 * only removes a listener after the event has actually fired — without the
 * explicit removal each completed sleep would leave a dead listener (and its
 * closure) attached to the signal for the lifetime of the process.
 *
 * Exported for unit testing; hand-rolled rather than using `timers/promises`
 * so it stays usable across runtimes.
 *
 * @param ms - Delay in milliseconds before the promise resolves.
 * @param signal - Signal whose abort cancels the wait.
 * @returns A promise that resolves with no value once `ms` has elapsed.
 * @throws DOMException named `AbortError` (as a rejection) when `signal` is
 *   already aborted on entry or aborts before the delay elapses.
 */
export function sleepCancelable(ms: number, signal: AbortSignal): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    // Already aborted: fail fast without ever arming the timer.
    if (signal.aborted) {
      reject(new DOMException('aborted', 'AbortError'));
      return;
    }

    const timer = setTimeout(() => {
      // Normal completion: drop the listener so it cannot pile up on a
      // signal that outlives this sleep.
      signal.removeEventListener('abort', onAbort);
      resolve();
    }, ms);

    // Hoisted declaration so the timer callback above can reference it.
    function onAbort(): void {
      clearTimeout(timer);
      reject(new DOMException('aborted', 'AbortError'));
    }

    signal.addEventListener('abort', onAbort, { once: true });
  });
}

export function resolveGbrainCliPath(): string {
const arg1 = process.argv[1] ?? '';
if (arg1.endsWith('/gbrain') || arg1.endsWith('/cli.ts') || arg1.endsWith('\\gbrain.exe')) {
Expand Down Expand Up @@ -129,6 +154,15 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) {
let workerProc: ChildProcess | null = null;
let crashCount = 0;

// Cancelable between-cycle sleep. Without this, SIGTERM/SIGINT sets
// `stopping = true` but the loop only re-evaluates after the current
// `setTimeout` resolves — up to `interval` seconds (600s worst case after
// adaptive scaling). Under systemd's default TimeoutStopSec=90 that means
// SIGKILL beats the drain path, the lockfile leaks, and the next
// invocation has to wait out the 10-minute staleness check or be cleaned
// up by hand. Abort the timer on shutdown so the loop exits immediately.
const cycleAbort = new AbortController();

if (useMinionsDispatch) {
const cliPath = resolveGbrainCliPath();
const startWorker = () => {
Expand Down Expand Up @@ -164,6 +198,9 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) {
const shutdown = async (sig: string) => {
if (stopping) return;
stopping = true;
// Abort the in-flight cycle sleep so the main loop exits immediately
// instead of running out the remainder of the current interval.
cycleAbort.abort();
console.log(`Autopilot stopping (${sig}).`);
if (workerProc) {
try { workerProc.kill('SIGTERM'); } catch { /* already dead */ }
Expand Down Expand Up @@ -280,8 +317,14 @@ export async function runAutopilot(engine: BrainEngine, args: string[]) {
}
}

// Wait for next cycle
await new Promise(r => setTimeout(r, interval * 1000));
// Wait for next cycle (cancelable on SIGTERM/SIGINT)
try {
await sleepCancelable(interval * 1000, cycleAbort.signal);
} catch (e) {
// Only swallow the abort path; let unexpected rejections surface.
if (!(e instanceof DOMException) || e.name !== 'AbortError') throw e;
// Loop will exit via the `while (!stopping)` check on next iteration.
}
}
}

Expand Down
48 changes: 48 additions & 0 deletions test/autopilot-sleep-cancelable.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { describe, test, expect } from 'bun:test';
import { sleepCancelable } from '../src/commands/autopilot.ts';

describe('sleepCancelable', () => {
  // Runs a single sleep and reports what it rejected with (undefined when it
  // resolved) together with the wall-clock time the await took.
  const captureRejection = async (ms: number, signal: AbortSignal) => {
    const begunAt = Date.now();
    let rejection: unknown;
    try {
      await sleepCancelable(ms, signal);
    } catch (err) {
      rejection = err;
    }
    return { rejection, tookMs: Date.now() - begunAt };
  };

  // Shared check: the captured value is a DOMException named AbortError.
  const expectAbortError = (value: unknown) => {
    expect(value).toBeInstanceOf(DOMException);
    expect((value as DOMException).name).toBe('AbortError');
  };

  test('resolves after the given delay when never aborted', async () => {
    const neverAborted = new AbortController().signal;
    const begunAt = Date.now();
    await sleepCancelable(40, neverAborted);
    const tookMs = Date.now() - begunAt;
    // Lower bound is deliberately loose (30 for a 40ms sleep) to absorb CI
    // timer jitter; the point is only that the call did not return at once.
    expect(tookMs).toBeGreaterThanOrEqual(30);
  });

  test('rejects immediately with AbortError when the signal is already aborted', async () => {
    const controller = new AbortController();
    controller.abort();
    const { rejection, tookMs } = await captureRejection(10_000, controller.signal);
    expectAbortError(rejection);
    // The 10s timer must never be armed, so the rejection is near-instant.
    expect(tookMs).toBeLessThan(50);
  });

  test('rejects with AbortError mid-sleep when the signal aborts', async () => {
    const controller = new AbortController();
    // Fire the abort long before the 10s timer is due. A non-cancelable
    // sleep would sit here for ~10s until bun:test gave up on it.
    setTimeout(() => controller.abort(), 20);
    const { rejection, tookMs } = await captureRejection(10_000, controller.signal);
    expectAbortError(rejection);
    expect(tookMs).toBeLessThan(200);
  });
});
Loading