/**
 * Coerces a raw pid value taken from a job record into a positive integer pid.
 *
 * The value is converted with `Number()` and truncated toward zero *before*
 * it is range-checked. Checking first and truncating afterwards would let a
 * fractional value such as `0.5` through and then turn it into pid `0`.
 *
 * @param {unknown} pidValue - Raw pid as stored on the job (number or numeric string).
 * @returns {number | null} A positive integer pid, or `null` when the value is
 *   not finite or does not truncate to an integer greater than zero.
 */
function normalizePid(pidValue) {
  const pid = Math.trunc(Number(pidValue));
  if (!Number.isFinite(pid) || pid <= 0) {
    return null;
  }
  return pid;
}

/**
 * Reports whether a process with the given pid currently exists.
 *
 * The pid is validated through `normalizePid` so that both helpers agree on
 * what counts as a usable pid; an unusable pid is reported as not alive and
 * `process.kill` is never called for it (in particular never with pid 0,
 * which would address the caller's own process group).
 *
 * @param {unknown} pidValue - Raw pid (number or numeric string).
 * @returns {boolean} `true` when the signal-0 probe succeeds or is rejected
 *   with `EPERM`; `false` for an unusable pid or any other probe failure.
 */
function isProcessAlive(pidValue) {
  const pid = normalizePid(pidValue);
  if (pid === null) {
    return false;
  }

  try {
    // Signal 0 performs the existence/permission check without delivering a signal.
    process.kill(pid, 0);
    return true;
  } catch (error) {
    // EPERM means the probe was refused for permission reasons, i.e. the
    // process exists but belongs to someone else. Anything else counts as dead.
    return error?.code === "EPERM";
  }
}

/**
 * Appends a timestamped line to the job's log file, if it has one.
 *
 * @param {{ logFile?: string } | null | undefined} job - Job record; only `logFile` is read.
 * @param {string} message - Text written after the `[timestamp]` prefix.
 * @returns {void}
 */
function appendStaleJobLog(job, message) {
  if (!job?.logFile) {
    return;
  }
  try {
    fs.appendFileSync(job.logFile, `[${nowIso()}] ${message}\n`, "utf8");
  } catch {
    // Best-effort logging; status reconciliation should not fail on log write errors.
  }
}

/**
 * Marks "running" jobs whose recorded process no longer exists as failed.
 *
 * Jobs that are not in the "running" status, that have no usable pid, or
 * whose pid still answers the liveness probe are returned as the same object.
 * For each stale job a failed copy is produced, a line is appended to its log
 * file, and, when its per-job file exists on disk, the failed fields are merged
 * over the stored record and written back.
 *
 * NOTE(review): liveness is decided by pid alone, so a pid that the OS has
 * reassigned to an unrelated process would keep a dead job "running".
 *
 * @param {string} cwd - Workspace directory passed through to the job-file helpers.
 * @param {Array<object>} jobs - Job records from the state index.
 * @returns {{ changed: boolean, jobs: Array<object> }} `changed` is `true` when
 *   at least one job was rewritten; `jobs` is a new array in the original order.
 */
function reconcileRunningJobs(cwd, jobs) {
  // One timestamp is shared by every job reconciled in this pass.
  const completedAt = nowIso();
  let changed = false;

  const nextJobs = jobs.map((job) => {
    if (job?.status !== "running") {
      return job;
    }
    const pid = normalizePid(job.pid);
    // Without a usable pid there is nothing to probe, so the job is left as is.
    if (pid === null || isProcessAlive(pid)) {
      return job;
    }

    changed = true;
    const reason = `process ${pid} is not running`;
    const nextJob = {
      ...job,
      status: "failed",
      phase: "failed",
      pid: null,
      completedAt,
      errorMessage: `Codex job ended unexpectedly (${reason}); auto-reconciled as failed.`,
      updatedAt: completedAt
    };

    appendStaleJobLog(job, `Detected stale running job (${reason}). Marked as failed automatically.`);

    const jobFile = resolveJobFile(cwd, job.id);
    if (fs.existsSync(jobFile)) {
      try {
        // Fields only present in the stored record are kept; reconciled fields win.
        writeJobFile(cwd, job.id, {
          ...readJobFile(jobFile),
          ...nextJob
        });
      } catch {
        // Ignore malformed on-disk job files; state reconciliation still proceeds.
      }
    }

    return nextJob;
  });

  return {
    changed,
    jobs: nextJobs
  };
}

/**
 * Returns the workspace's jobs after reconciling stale "running" entries.
 *
 * Despite being a read accessor this may write: when reconciliation changed
 * any job, the updated list is persisted through `saveState` before returning.
 * A state without a `jobs` list is treated as having no jobs.
 *
 * @param {string} cwd - Workspace directory whose state is loaded.
 * @returns {Array<object>} The reconciled job list.
 */
export function listJobs(cwd) {
  const state = loadState(cwd);
  const reconciled = reconcileRunningJobs(cwd, state.jobs ?? []);
  if (reconciled.changed) {
    saveState(cwd, {
      ...state,
      jobs: reconciled.jobs
    });
  }
  return reconciled.jobs;
}