-
Notifications
You must be signed in to change notification settings - Fork 852
fix: handle zombie jobs when process crashes but status remains "running" #218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -313,6 +313,35 @@ async function resolveLatestTrackedTaskThread(cwd, options = {}) { | |
| const jobs = sortJobsNewestFirst(listJobs(workspaceRoot)).filter((job) => job.id !== options.excludeJobId); | ||
| const activeTask = jobs.find((job) => job.jobClass === "task" && (job.status === "queued" || job.status === "running")); | ||
| if (activeTask) { | ||
| // Check if the process is actually alive before blocking | ||
| if (activeTask.pid) { | ||
| try { | ||
| process.kill(activeTask.pid, 0); | ||
| } catch (err) { | ||
| // Process is dead but job status wasn't updated — mark it as failed and clean up | ||
| if (err.code === "ESRCH" || err.code === "EPERM") { | ||
| await upsertJob(workspaceRoot, { | ||
| id: activeTask.id, | ||
| status: "failed", | ||
| phase: "failed", | ||
| pid: null, | ||
| completedAt: new Date().toISOString(), | ||
| errorMessage: "Process exited unexpectedly while job status was 'running'." | ||
| }); | ||
| // Re-fetch jobs after the update | ||
| const updatedJobs = sortJobsNewestFirst(listJobs(workspaceRoot)).filter((job) => job.id !== options.excludeJobId); | ||
| const newActiveTask = updatedJobs.find((job) => job.jobClass === "task" && (job.status === "queued" || job.status === "running")); | ||
| if (!newActiveTask) { | ||
| // Zombie job was cleaned up, proceed normally | ||
| } else { | ||
| throw new Error(`Task ${newActiveTask.id} is still running. Use /codex:status before continuing it.`); | ||
| } | ||
| } else { | ||
| throw new Error(`Task ${activeTask.id} is still running. Use /codex:status before continuing it.`); | ||
| } | ||
| return null; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
After the zombie cleanup path marks Useful? React with 👍 / 👎. |
||
| } | ||
| } | ||
| throw new Error(`Task ${activeTask.id} is still running. Use /codex:status before continuing it.`); | ||
| } | ||
|
|
||
|
|
@@ -457,7 +486,7 @@ async function executeTaskRun(request) { | |
| defaultPrompt: resumeThreadId ? DEFAULT_CONTINUE_PROMPT : "", | ||
| model: request.model, | ||
| effort: request.effort, | ||
| sandbox: request.write ? "workspace-write" : "read-only", | ||
| sandbox: request.fullAccess ? "danger-full-access" : (request.write ? "workspace-write" : "read-only"), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The sandbox selection now checks Useful? React with 👍 / 👎. |
||
| onProgress: request.onProgress, | ||
| persistThread: true, | ||
| threadName: resumeThreadId ? null : buildPersistentTaskThreadName(request.prompt || DEFAULT_CONTINUE_PROMPT) | ||
|
|
@@ -704,7 +733,7 @@ async function handleReview(argv) { | |
| async function handleTask(argv) { | ||
| const { options, positionals } = parseCommandInput(argv, { | ||
| valueOptions: ["model", "effort", "cwd", "prompt-file"], | ||
| booleanOptions: ["json", "write", "resume-last", "resume", "fresh", "background"], | ||
| booleanOptions: ["json", "write", "full-access", "resume-last", "resume", "fresh", "background"], | ||
| aliasMap: { | ||
| m: "model" | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,64 @@ | ||
| import fs from "node:fs"; | ||
| import process from "node:process"; | ||
|
|
||
| import { readJobFile, resolveJobFile, resolveJobLogFile, upsertJob, writeJobFile } from "./state.mjs"; | ||
| import { readJobFile, resolveJobFile, resolveJobLogFile, upsertJob, writeJobFile, loadState, saveState } from "./state.mjs"; | ||
|
|
||
| export const SESSION_ID_ENV = "CODEX_COMPANION_SESSION_ID"; | ||
|
|
||
| export function nowIso() { | ||
| return new Date().toISOString(); | ||
| } | ||
|
|
||
| /** | ||
| * Check if a process with the given PID is still alive. | ||
| * Uses signal 0 (existence check) to avoid side effects. | ||
| * @param {number} pid | ||
| * @returns {boolean} | ||
| */ | ||
| export function isProcessAlive(pid) { | ||
| if (!Number.isFinite(pid) || pid <= 0) { | ||
| return false; | ||
| } | ||
| try { | ||
| // Signal 0 checks if process exists without sending any signal | ||
| process.kill(pid, 0); | ||
| return true; | ||
| } catch (e) { | ||
| // ESRCH = no such process — process does not exist | ||
| // EPERM = permission denied — process exists but we can't signal it | ||
| return e.code === "EPERM"; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Sweep all jobs in state and mark zombie (dead PID) jobs as failed. | ||
| * This prevents zombie "running" jobs from blocking new task submissions. | ||
| * @param {string} cwd - workspace root | ||
| * @returns {Promise<number>} - number of jobs marked as failed | ||
| */ | ||
| export async function sweepZombieJobs(cwd) { | ||
| const state = loadState(cwd); | ||
| let zombiesFixed = 0; | ||
| const now = nowIso(); | ||
|
|
||
| for (const job of state.jobs) { | ||
| if (job.status === "running" && job.pid) { | ||
| if (!isProcessAlive(job.pid)) { | ||
| console.warn(`[tracked-jobs] Zombie job detected: ${job.id} (PID ${job.pid}). Marking as failed.`); | ||
| job.status = "failed"; | ||
| job.endedAt = now; | ||
| job.error = "Process died unexpectedly — marked failed by zombie sweep"; | ||
|
Comment on lines
+48
to
+50
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When a zombie job is marked failed here, the code writes Useful? React with 👍 / 👎. |
||
| zombiesFixed++; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (zombiesFixed > 0) { | ||
| saveState(cwd, state); | ||
| } | ||
| return zombiesFixed; | ||
| } | ||
|
|
||
| function normalizeProgressEvent(value) { | ||
| if (value && typeof value === "object" && !Array.isArray(value)) { | ||
| return { | ||
|
|
@@ -140,6 +190,11 @@ function readStoredJobOrNull(workspaceRoot, jobId) { | |
| } | ||
|
|
||
| export async function runTrackedJob(job, runner, options = {}) { | ||
| // FIX #216: Clean up zombie jobs before starting a new one. | ||
| // If a previous job's worker process died without updating state, | ||
| // its record stays stuck in "running" — blocking new tasks. | ||
| await sweepZombieJobs(job.workspaceRoot); | ||
|
|
||
| const runningRecord = { | ||
| ...job, | ||
| status: "running", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The zombie cleanup branch currently treats
EPERMfromprocess.kill(pid, 0)as if the process were dead, butEPERMmeans the PID exists and is just not signalable by this user. In that case this path will incorrectly mark an active task as failed and allow another task/resume flow to start concurrently. This is especially risky in mixed-privilege environments (e.g., task started under a different uid).Useful? React with 👍 / 👎.