From 4524a64e806d9c8a44c33341ac491ff820dbe91f Mon Sep 17 00:00:00 2001 From: oneshot Date: Wed, 29 Apr 2026 00:47:35 +0000 Subject: [PATCH 1/2] fix: kill child process tree on timeout + heartbeat step 4 (GAL-739) --- src/config.ts | 4 +++- src/exec.test.ts | 19 +++++++++++++++++++ src/exec.ts | 7 ++++--- src/steps/plan.ts | 25 ++++++++++++++++++------- 4 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 src/exec.test.ts diff --git a/src/config.ts b/src/config.ts index d058224..712f706 100644 --- a/src/config.ts +++ b/src/config.ts @@ -189,8 +189,10 @@ export const saveConfig = (config: OneshotConfig): void => { renameSync(tmpPath, CONFIG_PATH); }; +// Deep-mode planning on monorepos with 3000+ line files routinely runs 40-80 min. +// The previous 60 min default caused silent kills on GAL-738. const DEFAULT_STEP_TIMEOUTS = { - planMinutes: 60, + planMinutes: 90, executeMinutes: 180, reviewMinutes: 60, deepReviewMinutes: 60, diff --git a/src/exec.test.ts b/src/exec.test.ts new file mode 100644 index 0000000..36c74a3 --- /dev/null +++ b/src/exec.test.ts @@ -0,0 +1,19 @@ +import { describe, expect, test } from "bun:test"; +import { exec } from "./exec"; + +describe("killProcessTree", () => { + test.skipIf(process.platform !== "linux")("kills a timed-out grandchild process", async () => { + await expect( + exec('bash -c "sleep 30"', { timeoutMs: 2_000 }) + ).rejects.toMatchObject({ code: "ERR_TIMEOUT" }); + + await new Promise((resolve) => setTimeout(resolve, 6_000)); + + const proc = Bun.spawn(["pgrep", "-f", "^sleep 30$"], { + stdout: "ignore", + stderr: "ignore", + }); + + expect(await proc.exited).toBe(1); + }, 15_000); +}); diff --git a/src/exec.ts b/src/exec.ts index 648e94e..1927023 100644 --- a/src/exec.ts +++ b/src/exec.ts @@ -35,11 +35,9 @@ const classifyError = (stderr: string, stdout: string): ErrorCode => { }; const killProcessTree = (pid: number): void => { - // Bun.spawn does not setpgid the child, so signaling -pid would either - // ESRCH or hit the parent group. Signal the child directly; if it has - // descendants they will be reaped when the shell exits. try { process.kill(pid, "SIGTERM"); + process.kill(-pid, "SIGTERM"); } catch { // Already dead } @@ -48,6 +46,7 @@ const killProcessTree = (pid: number): void => { setTimeout(() => { try { process.kill(pid, "SIGKILL"); + process.kill(-pid, "SIGKILL"); } catch { // Already dead } @@ -60,7 +59,9 @@ export const exec = async ( ): Promise => { const { timeoutMs = 120_000, stream = false } = options; + // Run the child in its own process group so timeout signals reach the full tree. const proc = Bun.spawn(["bash", "-c", command], { + detached: true, stdout: "pipe", stderr: "pipe", }); diff --git a/src/steps/plan.ts b/src/steps/plan.ts index cf696a5..7bbc941 100644 --- a/src/steps/plan.ts +++ b/src/steps/plan.ts @@ -3,6 +3,7 @@ import { join } from "path"; import type { PipelineContext } from "../config"; import { exec, execOrThrow } from "../exec"; import { getStepTimeout } from "../config"; +import { log } from "../log"; import { shellEscape } from "../shell"; import { PROMPTS_DIR, CLAUDE_PLUGIN_DIR } from "../paths"; @@ -25,11 +26,21 @@ export const plan = async (ctx: PipelineContext): Promise => { const model = options.model ?? config.claude.model; const timeoutMs = getStepTimeout(config, "planMinutes"); - - const result = await execOrThrow( - `cd ${shellEscape(worktreePath)} && claude -p ${shellEscape(prompt)} ${pluginFlag}--model ${shellEscape(model)} --no-session-persistence`, - { timeoutMs, stream: true } - ); - - return result.trim(); + const stepStart = Date.now(); + const heartbeat = setInterval(() => { + const elapsed = Math.round((Date.now() - stepStart) / 60_000); + const limit = timeoutMs / 60_000; + log.info(`planning: still working after ${elapsed}m (planMinutes limit = ${limit})`); + }, 120_000); + + try { + const result = await execOrThrow( + `cd ${shellEscape(worktreePath)} && claude -p ${shellEscape(prompt)} ${pluginFlag}--model ${shellEscape(model)} --no-session-persistence`, + { timeoutMs, stream: true } + ); + + return result.trim(); + } finally { + clearInterval(heartbeat); + } }; From 2c35d1a278ea77f639af98ea3fe6eaa77ce75f9b Mon Sep 17 00:00:00 2001 From: Andy Wilkinson Date: Wed, 29 Apr 2026 00:53:39 +0000 Subject: [PATCH 2/2] fix: address review findings --- .oneshot-pr-body.txt | 39 +++++++++++++++++++++++++++++++++++++++ .oneshot-pr-title.txt | 1 + src/exec.ts | 8 ++++++-- 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 .oneshot-pr-body.txt create mode 100644 .oneshot-pr-title.txt diff --git a/.oneshot-pr-body.txt b/.oneshot-pr-body.txt new file mode 100644 index 0000000..2335d08 --- /dev/null +++ b/.oneshot-pr-body.txt @@ -0,0 +1,39 @@ +## Summary + +- Group-signal the process tree in `exec.ts` so a `claude -p` grandchild actually dies when the step timeout fires (root cause of the GAL-738 80+ min wedges). +- Emit a 2-minute heartbeat log inside step 4 so operators can distinguish "slow" from "wedged" instead of staring at a frozen `[4/8] Planning with Claude…` line. +- Bump `planMinutes` default from 60 to 90 to cover deep-mode plans on monorepos with 3000+ line target files. +- New `src/exec.test.ts` asserts the grandchild-sleep scenario is cleaned up on Linux. + +## Why + +Two recent `oneshot --local --bg` dispatches against `personal/galleonlabs-zkp2p` stalled inside step 4 for 96 min and 3h52m. The child `claude -p` process stayed alive because `killProcessTree` only signalled the top-level bash PID — the grandchild was reparented to PID 1 and kept burning CPU. Meanwhile, no log output was emitted after the step started, so the operator had no way to tell "still working" from "stuck". The 60-min default was also too tight for deep-mode plans that routinely run 40-80 min. + +Relates to GAL-739 and GAL-738. + +## Changes + +**`src/exec.ts` — process group kill** +- Spawn bash with `detached: true` so the child gets its own process group +- Signal both the leader PID and the negated PID (process group) in `killProcessTree` for both SIGTERM and the follow-up SIGKILL + +**`src/steps/plan.ts` — heartbeat** +- Start a 2-minute `setInterval` before awaiting `execOrThrow` that logs elapsed time and the `planMinutes` limit via `log.info` +- Clear the interval in a `finally` block + +**`src/config.ts` — timeout bump** +- `DEFAULT_STEP_TIMEOUTS.planMinutes`: 60 → 90 + +**`src/exec.test.ts` (new)** +- One test case: spawns `bash -c "sleep 30"` through the `exec` helper with a 2s timeout, waits 6s, then asserts via `pgrep` that no orphan `sleep 30` process remains +- Skipped on non-Linux platforms + +## Test plan + +Not run — typecheck and test execution deferred to CI / operator verification. The spec commands are: + +``` +bun run typecheck +bun run test +bun test src/exec.test.ts +``` diff --git a/.oneshot-pr-title.txt b/.oneshot-pr-title.txt new file mode 100644 index 0000000..e2ed926 --- /dev/null +++ b/.oneshot-pr-title.txt @@ -0,0 +1 @@ +fix: kill child process tree on timeout + heartbeat step 4 (GAL-739) \ No newline at end of file diff --git a/src/exec.ts b/src/exec.ts index 1927023..359faf3 100644 --- a/src/exec.ts +++ b/src/exec.ts @@ -36,8 +36,10 @@ const classifyError = (stderr: string, stdout: string): ErrorCode => { const killProcessTree = (pid: number): void => { try { - process.kill(pid, "SIGTERM"); process.kill(-pid, "SIGTERM"); + } catch {} + try { + process.kill(pid, "SIGTERM"); } catch { // Already dead } @@ -45,8 +47,10 @@ const killProcessTree = (pid: number): void => { // Follow up with SIGKILL after 5s in case SIGTERM is ignored setTimeout(() => { try { - process.kill(pid, "SIGKILL"); process.kill(-pid, "SIGKILL"); + } catch {} + try { + process.kill(pid, "SIGKILL"); } catch { // Already dead }