Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions plugins/codex/scripts/codex-companion.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,35 @@ async function resolveLatestTrackedTaskThread(cwd, options = {}) {
const jobs = sortJobsNewestFirst(listJobs(workspaceRoot)).filter((job) => job.id !== options.excludeJobId);
const activeTask = jobs.find((job) => job.jobClass === "task" && (job.status === "queued" || job.status === "running"));
if (activeTask) {
// Check if the process is actually alive before blocking
if (activeTask.pid) {
try {
process.kill(activeTask.pid, 0);
} catch (err) {
// Process is dead but job status wasn't updated — mark it as failed and clean up
if (err.code === "ESRCH" || err.code === "EPERM") {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Treat EPERM as active when probing task PID

The zombie cleanup branch currently treats EPERM from process.kill(pid, 0) as if the process were dead, but EPERM means the PID exists and is just not signalable by this user. In that case this path will incorrectly mark an active task as failed and allow another task/resume flow to start concurrently. This is especially risky in mixed-privilege environments (e.g., task started under a different uid).

Useful? React with 👍 / 👎.

await upsertJob(workspaceRoot, {
id: activeTask.id,
status: "failed",
phase: "failed",
pid: null,
completedAt: new Date().toISOString(),
errorMessage: "Process exited unexpectedly while job status was 'running'."
});
// Re-fetch jobs after the update
const updatedJobs = sortJobsNewestFirst(listJobs(workspaceRoot)).filter((job) => job.id !== options.excludeJobId);
const newActiveTask = updatedJobs.find((job) => job.jobClass === "task" && (job.status === "queued" || job.status === "running"));
if (!newActiveTask) {
// Zombie job was cleaned up, proceed normally
} else {
throw new Error(`Task ${newActiveTask.id} is still running. Use /codex:status before continuing it.`);
}
} else {
throw new Error(`Task ${activeTask.id} is still running. Use /codex:status before continuing it.`);
}
return null;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Continue thread lookup after cleaning zombie task

After the zombie cleanup path marks activeTask as failed, the function immediately returns null, which skips the normal fallback logic (trackedTask lookup and findLatestTaskThread). In a task --resume-last flow where a stale running job is cleaned up but an earlier resumable thread exists, this now raises "No previous Codex task thread was found" instead of resuming, so the recovery path still breaks for resumed tasks.

Useful? React with 👍 / 👎.

}
}
throw new Error(`Task ${activeTask.id} is still running. Use /codex:status before continuing it.`);
}

Expand Down Expand Up @@ -457,7 +486,7 @@ async function executeTaskRun(request) {
defaultPrompt: resumeThreadId ? DEFAULT_CONTINUE_PROMPT : "",
model: request.model,
effort: request.effort,
sandbox: request.write ? "workspace-write" : "read-only",
sandbox: request.fullAccess ? "danger-full-access" : (request.write ? "workspace-write" : "read-only"),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Wire --full-access through task request pipeline

The sandbox selection now checks request.fullAccess, but handleTask never reads options["full-access"] and never passes a fullAccess field into executeTaskRun/buildTaskRequest, so this condition is always false. As a result, the newly accepted --full-access flag is silently ignored and users cannot actually request danger-full-access mode.

Useful? React with 👍 / 👎.

onProgress: request.onProgress,
persistThread: true,
threadName: resumeThreadId ? null : buildPersistentTaskThreadName(request.prompt || DEFAULT_CONTINUE_PROMPT)
Expand Down Expand Up @@ -704,7 +733,7 @@ async function handleReview(argv) {
async function handleTask(argv) {
const { options, positionals } = parseCommandInput(argv, {
valueOptions: ["model", "effort", "cwd", "prompt-file"],
booleanOptions: ["json", "write", "resume-last", "resume", "fresh", "background"],
booleanOptions: ["json", "write", "full-access", "resume-last", "resume", "fresh", "background"],
aliasMap: {
m: "model"
}
Expand Down
57 changes: 56 additions & 1 deletion plugins/codex/scripts/lib/tracked-jobs.mjs
Original file line number Diff line number Diff line change
@@ -1,14 +1,64 @@
import fs from "node:fs";
import process from "node:process";

import { readJobFile, resolveJobFile, resolveJobLogFile, upsertJob, writeJobFile } from "./state.mjs";
import { readJobFile, resolveJobFile, resolveJobLogFile, upsertJob, writeJobFile, loadState, saveState } from "./state.mjs";

export const SESSION_ID_ENV = "CODEX_COMPANION_SESSION_ID";

export function nowIso() {
return new Date().toISOString();
}

/**
* Check if a process with the given PID is still alive.
* Uses signal 0 (existence check) to avoid side effects.
* @param {number} pid
* @returns {boolean}
*/
export function isProcessAlive(pid) {
if (!Number.isFinite(pid) || pid <= 0) {
return false;
}
try {
// Signal 0 checks if process exists without sending any signal
process.kill(pid, 0);
return true;
} catch (e) {
// ESRCH = no such process — process does not exist
// EPERM = permission denied — process exists but we can't signal it
return e.code === "EPERM";
}
}

/**
* Sweep all jobs in state and mark zombie (dead PID) jobs as failed.
* This prevents zombie "running" jobs from blocking new task submissions.
* @param {string} cwd - workspace root
* @returns {Promise<number>} - number of jobs marked as failed
*/
export async function sweepZombieJobs(cwd) {
const state = loadState(cwd);
let zombiesFixed = 0;
const now = nowIso();

for (const job of state.jobs) {
if (job.status === "running" && job.pid) {
if (!isProcessAlive(job.pid)) {
console.warn(`[tracked-jobs] Zombie job detected: ${job.id} (PID ${job.pid}). Marking as failed.`);
job.status = "failed";
job.endedAt = now;
job.error = "Process died unexpectedly — marked failed by zombie sweep";
Comment on lines +48 to +50
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Persist standard failure fields in zombie sweep

When a zombie job is marked failed here, the code writes endedAt/error but does not set the standard failure fields used elsewhere (phase, pid, completedAt, errorMessage). As a result, downstream status/result rendering can show stale runtime metadata and omit the failure reason/timestamp for swept jobs. The zombie path should emit the same failure shape as other failure updates.

Useful? React with 👍 / 👎.

zombiesFixed++;
}
}
}

if (zombiesFixed > 0) {
saveState(cwd, state);
}
return zombiesFixed;
}

function normalizeProgressEvent(value) {
if (value && typeof value === "object" && !Array.isArray(value)) {
return {
Expand Down Expand Up @@ -140,6 +190,11 @@ function readStoredJobOrNull(workspaceRoot, jobId) {
}

export async function runTrackedJob(job, runner, options = {}) {
// FIX #216: Clean up zombie jobs before starting a new one.
// If a previous job's worker process died without updating state,
// its record stays stuck in "running" — blocking new tasks.
await sweepZombieJobs(job.workspaceRoot);

const runningRecord = {
...job,
status: "running",
Expand Down