From 7ecded4092b723184d92529774b32fc154b1fa64 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Thu, 12 Mar 2026 01:09:40 -0700 Subject: [PATCH 01/19] Improve Codex multi-agent planning and runtime DX --- README.md | 20 +- docs/codex-app-server.md | 1438 +++++++++++++++++ docs/codex-cli-reference.md | 859 ++++++++++ specs/smoke/html-game-planning.md | 102 ++ src/agents/codex/codex-path.test.ts | 74 + src/agents/codex/codex-path.ts | 231 +++ src/agents/codex/session.test.ts | 2 +- src/agents/codex/session.ts | 283 +++- src/agents/codex/session.unit.test.ts | 322 +++- src/cli/commands/run-command.test-harness.ts | 21 +- .../run.postexec-json.integration.test.ts | 6 +- src/cli/commands/run.ts | 2 +- src/core/codex-config.test.ts | 33 +- src/core/codex-config.ts | 69 + src/core/config-loader.test.ts | 42 +- src/core/config-loader.ts | 13 +- src/types/index.ts | 1 + 17 files changed, 3399 insertions(+), 119 deletions(-) create mode 100644 docs/codex-app-server.md create mode 100644 docs/codex-cli-reference.md create mode 100644 specs/smoke/html-game-planning.md create mode 100644 src/agents/codex/codex-path.test.ts create mode 100644 src/agents/codex/codex-path.ts diff --git a/README.md b/README.md index 23419ef..51a35de 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,8 @@ Orca loads config in this order (later overrides earlier): `.ts` is preferred over `.js` when both exist. +Stale executor values from older configs are ignored and coerced to `codex`. Orca no longer supports alternate executors. 
+ ```ts // orca.config.ts import { defineOrcaConfig } from "orcastrator"; @@ -117,7 +119,13 @@ export default defineOrcaConfig({ codex: { model: "gpt-5.3-codex", - effort: "medium", // "low" | "medium" | "high" — applies to all Codex turns + effort: "medium", // fallback for all Codex turns unless overridden below + thinkingLevel: { + decision: "low", // planning gate / quick routing decisions + planning: "xhigh", // task graph generation + review: "high", // task graph consultation + post-execution review prompts + execution: "medium", // task execution turns + }, timeoutMs: 300000, multiAgent: false, // see Multi-agent section perCwdExtraUserRoots: [ @@ -168,10 +176,20 @@ After planning, Orca runs a pre-execution review that can edit the task graph (a After execution, Orca runs validation commands and asks Codex to review findings. With `onFindings: "auto_fix"`, it applies fixes and retries up to `maxCycles` times, then reports. Set `ORCA_SKIP_VALIDATORS=1` to skip validator auto-detection at runtime. +Use `codex.thinkingLevel` when you want different reasoning levels for different stages instead of a single global `codex.effort`. + ### Multi-agent mode Set `codex.multiAgent: true` to spawn parallel Codex agents per task. Faster for large refactors with independent subtasks; higher token cost. **Note:** this writes `multi_agent = true` to your global `~/.codex/config.toml`. +If `~/.codex/config.toml` already enables `[features].multi_agent = true`, Orca also treats the run as multi-agent-aware for planning, review, consultation, and execution prompts even when `codex.multiAgent` is not set in Orca config. + +### Codex binary and MCP diagnostics + +When `ORCA_CODEX_PATH` is unset, Orca auto-selects the newest installed Codex CLI/app-server it can find instead of blindly trusting the first `codex` binary on `PATH`. This avoids talking to an older global install when a newer desktop build is present. 
+ +If configured Codex MCP servers are enabled but not logged in, Orca now summarizes that once and continues without them instead of streaming raw app-server auth noise throughout the run. + ### Skills Orca auto-loads skills in this precedence order (first name wins): diff --git a/docs/codex-app-server.md b/docs/codex-app-server.md new file mode 100644 index 0000000..1b646d6 --- /dev/null +++ b/docs/codex-app-server.md @@ -0,0 +1,1438 @@ +# Codex App Server + +Codex app-server is the interface Codex uses to power rich clients (for example, the Codex VS Code extension). Use it when you want a deep integration inside your own product: authentication, conversation history, approvals, and streamed agent events. The app-server implementation is open source in the Codex GitHub repository ([openai/codex/codex-rs/app-server](https://github.com/openai/codex/tree/main/codex-rs/app-server)). See the [Open Source](https://developers.openai.com/codex/open-source) page for the full list of open-source Codex components. + +If you are automating jobs or running Codex in CI, use the + Codex SDK instead. + +## Protocol + +Like [MCP](https://modelcontextprotocol.io/), `codex app-server` supports bidirectional communication using JSON-RPC 2.0 messages (with the `"jsonrpc":"2.0"` header omitted on the wire). + +Supported transports: + +- `stdio` (`--listen stdio://`, default): newline-delimited JSON (JSONL). +- `websocket` (`--listen ws://IP:PORT`, experimental): one JSON-RPC message per WebSocket text frame. + +In WebSocket mode, app-server uses bounded queues. When request ingress is full, the server rejects new requests with JSON-RPC error code `-32001` and message `"Server overloaded; retry later."` Clients should retry with an exponentially increasing delay and jitter. 
+ +## Message schema + +Requests include `method`, `params`, and `id`: + +```json +{ "method": "thread/start", "id": 10, "params": { "model": "gpt-5.1-codex" } } +``` + +Responses echo the `id` with either `result` or `error`: + +```json +{ "id": 10, "result": { "thread": { "id": "thr_123" } } } +``` + +```json +{ "id": 10, "error": { "code": 123, "message": "Something went wrong" } } +``` + +Notifications omit `id` and use only `method` and `params`: + +```json +{ "method": "turn/started", "params": { "turn": { "id": "turn_456" } } } +``` + +You can generate a TypeScript schema or a JSON Schema bundle from the CLI. Each output is specific to the Codex version you ran, so the generated artifacts match that version exactly: + +```bash +codex app-server generate-ts --out ./schemas +codex app-server generate-json-schema --out ./schemas +``` + +## Getting started + +1. Start the server with `codex app-server` (default stdio transport) or `codex app-server --listen ws://127.0.0.1:4500` (experimental WebSocket transport). +2. Connect a client over the selected transport, then send `initialize` followed by the `initialized` notification. +3. Start a thread and a turn, then keep reading notifications from the active transport stream. + +Example (Node.js / TypeScript): + +```ts +import { spawn } from "node:child_process"; +import readline from "node:readline"; + +const proc = spawn("codex", ["app-server"], { + stdio: ["pipe", "pipe", "inherit"], +}); +const rl = readline.createInterface({ input: proc.stdout }); + +const send = (message: unknown) => { + proc.stdin.write(`${JSON.stringify(message)}\n`); +}; + +let threadId: string | null = null; + +rl.on("line", (line) => { + const msg = JSON.parse(line) as any; + console.log("server:", msg); + + if (msg.id === 1 && msg.result?.thread?.id && !threadId) { + threadId = msg.result.thread.id; + send({ + method: "turn/start", + id: 2, + params: { + threadId, + input: [{ type: "text", text: "Summarize this repo." 
}], + }, + }); + } +}); + +send({ + method: "initialize", + id: 0, + params: { + clientInfo: { + name: "my_product", + title: "My Product", + version: "0.1.0", + }, + }, +}); +send({ method: "initialized", params: {} }); +send({ method: "thread/start", id: 1, params: { model: "gpt-5.1-codex" } }); +``` + +## Core primitives + +- **Thread**: A conversation between a user and the Codex agent. Threads contain turns. +- **Turn**: A single user request and the agent work that follows. Turns contain items and stream incremental updates. +- **Item**: A unit of input or output (user message, agent message, command runs, file change, tool call, and more). + +Use the thread APIs to create, list, or archive conversations. Drive a conversation with turn APIs and stream progress via turn notifications. + +## Lifecycle overview + +- **Initialize once per connection**: Immediately after opening a transport connection, send an `initialize` request with your client metadata, then emit `initialized`. The server rejects any request on that connection before this handshake. +- **Start (or resume) a thread**: Call `thread/start` for a new conversation, `thread/resume` to continue an existing one, or `thread/fork` to branch history into a new thread id. +- **Begin a turn**: Call `turn/start` with the target `threadId` and user input. Optional fields override model, personality, `cwd`, sandbox policy, and more. +- **Steer an active turn**: Call `turn/steer` to append user input to the currently in-flight turn without creating a new turn. +- **Stream events**: After `turn/start`, keep reading notifications on stdout: `thread/archived`, `thread/unarchived`, `item/started`, `item/completed`, `item/agentMessage/delta`, tool progress, and other updates. +- **Finish the turn**: The server emits `turn/completed` with final status when the model finishes or after a `turn/interrupt` cancellation. 
+ +## Initialization + +Clients must send a single `initialize` request per transport connection before invoking any other method on that connection, then acknowledge with an `initialized` notification. Requests sent before initialization receive a `Not initialized` error, and repeated `initialize` calls on the same connection return `Already initialized`. + +The server returns the user agent string it will present to upstream services. Set `clientInfo` to identify your integration. + +`initialize.params.capabilities` also supports per-connection notification opt-out via `optOutNotificationMethods`, which is a list of exact method names to suppress for that connection. Matching is exact (no wildcards/prefixes). Unknown method names are accepted and ignored. + +**Important**: Use `clientInfo.name` to identify your client for the OpenAI Compliance Logs Platform. If you are developing a new Codex integration intended for enterprise use, please contact OpenAI to get it added to a known clients list. For more context, see the [Codex logs reference](https://chatgpt.com/admin/api-reference#tag/Logs:-Codex). + +Example (from the Codex VS Code extension): + +```json +{ + "method": "initialize", + "id": 0, + "params": { + "clientInfo": { + "name": "codex_vscode", + "title": "Codex VS Code Extension", + "version": "0.1.0" + } + } +} +``` + +Example with notification opt-out: + +```json +{ + "method": "initialize", + "id": 1, + "params": { + "clientInfo": { + "name": "my_client", + "title": "My Client", + "version": "0.1.0" + }, + "capabilities": { + "experimentalApi": true, + "optOutNotificationMethods": [ + "codex/event/session_configured", + "item/agentMessage/delta" + ] + } + } +} +``` + +## Experimental API opt-in + +Some app-server methods and fields are intentionally gated behind `experimentalApi` capability. + +- Omit `capabilities` (or set `experimentalApi` to `false`) to stay on the stable API surface, and the server rejects experimental methods/fields. 
+- Set `capabilities.experimentalApi` to `true` to enable experimental methods and fields. + +```json +{ + "method": "initialize", + "id": 1, + "params": { + "clientInfo": { + "name": "my_client", + "title": "My Client", + "version": "0.1.0" + }, + "capabilities": { + "experimentalApi": true + } + } +} +``` + +If a client sends an experimental method or field without opting in, app-server rejects it with: + +`<descriptor> requires experimentalApi capability` + +## API overview + +- `thread/start` - create a new thread; emits `thread/started` and automatically subscribes you to turn/item events for that thread. +- `thread/resume` - reopen an existing thread by id so later `turn/start` calls append to it. +- `thread/fork` - fork a thread into a new thread id by copying stored history; emits `thread/started` for the new thread. +- `thread/read` - read a stored thread by id without resuming it; set `includeTurns` to return full turn history. Returned `thread` objects include runtime `status`. +- `thread/list` - page through stored thread logs; supports cursor-based pagination plus `modelProviders`, `sourceKinds`, `archived`, and `cwd` filters. Returned `thread` objects include runtime `status`. +- `thread/loaded/list` - list the thread ids currently loaded in memory. +- `thread/archive` - move a thread's log file into the archived directory; returns `{}` on success and emits `thread/archived`. +- `thread/unsubscribe` - unsubscribe this connection from thread turn/item events. If this was the last subscriber, the server unloads the thread and emits `thread/closed`. +- `thread/unarchive` - restore an archived thread rollout back into the active sessions directory; returns the restored `thread` and emits `thread/unarchived`. +- `thread/status/changed` - notification emitted when a loaded thread's runtime `status` changes. 
+- `thread/compact/start` - trigger conversation history compaction for a thread; returns `{}` immediately while progress streams via `turn/*` and `item/*` notifications. +- `thread/rollback` - drop the last N turns from the in-memory context and persist a rollback marker; returns the updated `thread`. +- `turn/start` - add user input to a thread and begin Codex generation; responds with the initial `turn` and streams events. For `collaborationMode`, `settings.developer_instructions: null` means "use built-in instructions for the selected mode." +- `turn/steer` - append user input to the active in-flight turn for a thread; returns the accepted `turnId`. +- `turn/interrupt` - request cancellation of an in-flight turn; success is `{}` and the turn ends with `status: "interrupted"`. +- `review/start` - kick off the Codex reviewer for a thread; emits `enteredReviewMode` and `exitedReviewMode` items. +- `command/exec` - run a single command under the server sandbox without starting a thread/turn. +- `model/list` - list available models (set `includeHidden: true` to include entries with `hidden: true`) with effort options, optional `upgrade`, and `inputModalities`. +- `experimentalFeature/list` - list feature flags with lifecycle stage metadata and cursor pagination. +- `collaborationMode/list` - list collaboration mode presets (experimental, no pagination). +- `skills/list` - list skills for one or more `cwd` values (supports `forceReload` and optional `perCwdExtraUserRoots`). +- `app/list` - list available apps (connectors) with pagination plus accessibility/enabled metadata. +- `skills/config/write` - enable or disable skills by path. +- `mcpServer/oauth/login` - start an OAuth login for a configured MCP server; returns an authorization URL and emits `mcpServer/oauthLogin/completed` on completion. +- `tool/requestUserInput` - prompt the user with 1-3 short questions for a tool call (experimental); questions can set `isOther` for a free-form option. 
+- `config/mcpServer/reload` - reload MCP server configuration from disk and queue a refresh for loaded threads. +- `mcpServerStatus/list` - list MCP servers, tools, resources, and auth status (cursor + limit pagination). +- `windowsSandbox/setupStart` - start Windows sandbox setup for `elevated` or `unelevated` mode; returns quickly and later emits `windowsSandbox/setupCompleted`. +- `feedback/upload` - submit a feedback report (classification + optional reason/logs + conversation id, plus optional `extraLogFiles` attachments). +- `config/read` - fetch the effective configuration on disk after resolving configuration layering. +- `externalAgentConfig/detect` - detect migratable external-agent artifacts with `includeHome` and optional `cwds`; each detected item includes `cwd` (`null` for home). +- `externalAgentConfig/import` - apply selected external-agent migration items by passing explicit `migrationItems` with `cwd` (`null` for home). +- `config/value/write` - write a single configuration key/value to the user's `config.toml` on disk. +- `config/batchWrite` - apply configuration edits atomically to the user's `config.toml` on disk. +- `configRequirements/read` - fetch requirements from `requirements.toml` and/or MDM, including allow-lists, pinned `featureRequirements`, and residency/network requirements (or `null` if you haven't set any up). + +## Models + +### List models (`model/list`) + +Call `model/list` to discover available models and their capabilities before rendering model or personality selectors. 
+ +```json +{ "method": "model/list", "id": 6, "params": { "limit": 20, "includeHidden": false } } +{ "id": 6, "result": { + "data": [{ + "id": "gpt-5.4", + "model": "gpt-5.4", + "displayName": "GPT-5.4", + "hidden": false, + "defaultReasoningEffort": "medium", + "supportedReasoningEfforts": [{ + "reasoningEffort": "low", + "description": "Lower latency" + }], + "inputModalities": ["text", "image"], + "supportsPersonality": true, + "isDefault": true + }], + "nextCursor": null +} } +``` + +Each model entry can include: + +- `supportedReasoningEfforts` - supported effort options for the model. +- `defaultReasoningEffort` - suggested default effort for clients. +- `upgrade` - optional recommended upgrade model id for migration prompts in clients. +- `upgradeInfo` - optional upgrade metadata for migration prompts in clients. +- `hidden` - whether the model is hidden from the default picker list. +- `inputModalities` - supported input types for the model (for example `text`, `image`). +- `supportsPersonality` - whether the model supports personality-specific instructions such as `/personality`. +- `isDefault` - whether the model is the recommended default. + +By default, `model/list` returns picker-visible models only. Set `includeHidden: true` if you need the full list and want to filter on the client side using `hidden`. + +When `inputModalities` is missing (older model catalogs), treat it as `["text", "image"]` for backward compatibility. 
+ +### List experimental features (`experimentalFeature/list`) + +Use this endpoint to discover feature flags with metadata and lifecycle stage: + +```json +{ "method": "experimentalFeature/list", "id": 7, "params": { "limit": 20 } } +{ "id": 7, "result": { + "data": [{ + "name": "unified_exec", + "stage": "beta", + "displayName": "Unified exec", + "description": "Use the unified PTY-backed execution tool.", + "announcement": "Beta rollout for improved command execution reliability.", + "enabled": false, + "defaultEnabled": false + }], + "nextCursor": null +} } +``` + +`stage` can be `beta`, `underDevelopment`, `stable`, `deprecated`, or `removed`. For non-beta flags, `displayName`, `description`, and `announcement` may be `null`. + +## Threads + +- `thread/read` reads a stored thread without subscribing to it; set `includeTurns` to include turns. +- `thread/list` supports cursor pagination plus `modelProviders`, `sourceKinds`, `archived`, and `cwd` filtering. +- `thread/loaded/list` returns the thread IDs currently in memory. +- `thread/archive` moves the thread's persisted JSONL log into the archived directory. +- `thread/unsubscribe` unsubscribes the current connection from a loaded thread and can trigger `thread/closed`. +- `thread/unarchive` restores an archived thread rollout back into the active sessions directory. +- `thread/compact/start` triggers compaction and returns `{}` immediately. +- `thread/rollback` drops the last N turns from the in-memory context and records a rollback marker in the thread's persisted JSONL log. + +### Start or resume a thread + +Start a fresh thread when you need a new Codex conversation. 
+ +```json +{ "method": "thread/start", "id": 10, "params": { + "model": "gpt-5.1-codex", + "cwd": "/Users/me/project", + "approvalPolicy": "never", + "sandbox": "workspaceWrite", + "personality": "friendly", + "serviceName": "my_app_server_client" +} } +{ "id": 10, "result": { + "thread": { + "id": "thr_123", + "preview": "", + "ephemeral": false, + "modelProvider": "openai", + "createdAt": 1730910000 + } +} } +{ "method": "thread/started", "params": { "thread": { "id": "thr_123" } } } +``` + +`serviceName` is optional. Set it when you want app-server to tag thread-level metrics with your integration's service name. + +To continue a stored session, call `thread/resume` with the `thread.id` you recorded earlier. The response shape matches `thread/start`. You can also pass the same configuration overrides supported by `thread/start`, such as `personality`: + +```json +{ "method": "thread/resume", "id": 11, "params": { + "threadId": "thr_123", + "personality": "friendly" +} } +{ "id": 11, "result": { "thread": { "id": "thr_123", "name": "Bug bash notes", "ephemeral": false } } } +``` + +Resuming a thread doesn't update `thread.updatedAt` (or the rollout file's modified time) by itself. The timestamp updates when you start a turn. + +If you mark an enabled MCP server as `required` in config and that server fails to initialize, `thread/start` and `thread/resume` fail instead of continuing without it. + +`dynamicTools` on `thread/start` is an experimental field (requires `capabilities.experimentalApi = true`). Codex persists these dynamic tools in the thread rollout metadata and restores them on `thread/resume` when you don't supply new dynamic tools. + +If you resume with a different model than the one recorded in the rollout, Codex emits a warning and applies a one-time model-switch instruction on the next turn. + +To branch from a stored session, call `thread/fork` with the `thread.id`. 
This creates a new thread id and emits a `thread/started` notification for it: + +```json +{ "method": "thread/fork", "id": 12, "params": { "threadId": "thr_123" } } +{ "id": 12, "result": { "thread": { "id": "thr_456" } } } +{ "method": "thread/started", "params": { "thread": { "id": "thr_456" } } } +``` + +When a user-facing thread title has been set, app-server hydrates `thread.name` on `thread/list`, `thread/read`, `thread/resume`, `thread/unarchive`, and `thread/rollback` responses. `thread/start` and `thread/fork` may omit `name` (or return `null`) until a title is set later. + +### Read a stored thread (without resuming) + +Use `thread/read` when you want stored thread data but don't want to resume the thread or subscribe to its events. + +- `includeTurns` - when `true`, the response includes the thread's turns; when `false` or omitted, you get the thread summary only. +- Returned `thread` objects include runtime `status` (`notLoaded`, `idle`, `systemError`, or `active` with `activeFlags`). + +```json +{ "method": "thread/read", "id": 19, "params": { "threadId": "thr_123", "includeTurns": true } } +{ "id": 19, "result": { "thread": { "id": "thr_123", "name": "Bug bash notes", "ephemeral": false, "status": { "type": "notLoaded" }, "turns": [] } } } +``` + +Unlike `thread/resume`, `thread/read` doesn't load the thread into memory or emit `thread/started`. + +### List threads (with pagination & filters) + +`thread/list` lets you render a history UI. Results default to newest-first by `createdAt`. Filters apply before pagination. Pass any combination of: + +- `cursor` - opaque string from a prior response; omit for the first page. +- `limit` - server defaults to a reasonable page size if unset. +- `sortKey` - `created_at` (default) or `updated_at`. +- `modelProviders` - restrict results to specific providers; unset, null, or an empty array includes all providers. +- `sourceKinds` - restrict results to specific thread sources. 
When omitted or `[]`, the server defaults to interactive sources only: `cli` and `vscode`. +- `archived` - when `true`, list archived threads only. When `false` or omitted, list non-archived threads (default). +- `cwd` - restrict results to threads whose session current working directory exactly matches this path. + +`sourceKinds` accepts the following values: + +- `cli` +- `vscode` +- `exec` +- `appServer` +- `subAgent` +- `subAgentReview` +- `subAgentCompact` +- `subAgentThreadSpawn` +- `subAgentOther` +- `unknown` + +Example: + +```json +{ "method": "thread/list", "id": 20, "params": { + "cursor": null, + "limit": 25, + "sortKey": "created_at" +} } +{ "id": 20, "result": { + "data": [ + { "id": "thr_a", "preview": "Create a TUI", "ephemeral": false, "modelProvider": "openai", "createdAt": 1730831111, "updatedAt": 1730831111, "name": "TUI prototype", "status": { "type": "notLoaded" } }, + { "id": "thr_b", "preview": "Fix tests", "ephemeral": true, "modelProvider": "openai", "createdAt": 1730750000, "updatedAt": 1730750000, "status": { "type": "notLoaded" } } + ], + "nextCursor": "opaque-token-or-null" +} } +``` + +When `nextCursor` is `null`, you have reached the final page. + +### Track thread status changes + +`thread/status/changed` is emitted whenever a loaded thread's runtime status changes. The payload includes `threadId` and the new `status`. + +```json +{ + "method": "thread/status/changed", + "params": { + "threadId": "thr_123", + "status": { "type": "active", "activeFlags": ["waitingOnApproval"] } + } +} +``` + +### List loaded threads + +`thread/loaded/list` returns thread IDs currently loaded in memory. + +```json +{ "method": "thread/loaded/list", "id": 21 } +{ "id": 21, "result": { "data": ["thr_123", "thr_456"] } } +``` + +### Unsubscribe from a loaded thread + +`thread/unsubscribe` removes the current connection's subscription to a thread. The response status is one of: + +- `unsubscribed` when the connection was subscribed and is now removed. 
+- `notSubscribed` when the connection was not subscribed to that thread. +- `notLoaded` when the thread is not loaded. + +If this was the last subscriber, the server unloads the thread and emits a `thread/status/changed` transition to `notLoaded` plus `thread/closed`. + +```json +{ "method": "thread/unsubscribe", "id": 22, "params": { "threadId": "thr_123" } } +{ "id": 22, "result": { "status": "unsubscribed" } } +{ "method": "thread/status/changed", "params": { + "threadId": "thr_123", + "status": { "type": "notLoaded" } +} } +{ "method": "thread/closed", "params": { "threadId": "thr_123" } } +``` + +### Archive a thread + +Use `thread/archive` to move the persisted thread log (stored as a JSONL file on disk) into the archived sessions directory. + +```json +{ "method": "thread/archive", "id": 22, "params": { "threadId": "thr_b" } } +{ "id": 22, "result": {} } +{ "method": "thread/archived", "params": { "threadId": "thr_b" } } +``` + +Archived threads won't appear in future calls to `thread/list` unless you pass `archived: true`. + +### Unarchive a thread + +Use `thread/unarchive` to move an archived thread rollout back into the active sessions directory. + +```json +{ "method": "thread/unarchive", "id": 24, "params": { "threadId": "thr_b" } } +{ "id": 24, "result": { "thread": { "id": "thr_b", "name": "Bug bash notes" } } } +{ "method": "thread/unarchived", "params": { "threadId": "thr_b" } } +``` + +### Trigger thread compaction + +Use `thread/compact/start` to trigger manual history compaction for a thread. The request returns immediately with `{}`. + +App-server emits progress as standard `turn/*` and `item/*` notifications on the same `threadId`, including a `contextCompaction` item lifecycle (`item/started` then `item/completed`). 
+ +```json +{ "method": "thread/compact/start", "id": 25, "params": { "threadId": "thr_b" } } +{ "id": 25, "result": {} } +``` + +### Roll back recent turns + +Use `thread/rollback` to remove the last `numTurns` entries from the in-memory context and persist a rollback marker in the rollout log. The returned `thread` includes `turns` populated after the rollback. + +```json +{ "method": "thread/rollback", "id": 26, "params": { "threadId": "thr_b", "numTurns": 1 } } +{ "id": 26, "result": { "thread": { "id": "thr_b", "name": "Bug bash notes", "ephemeral": false } } } +``` + +## Turns + +The `input` field accepts a list of items: + +- `{ "type": "text", "text": "Explain this diff" }` +- `{ "type": "image", "url": "https://.../design.png" }` +- `{ "type": "localImage", "path": "/tmp/screenshot.png" }` + +You can override configuration settings per turn (model, effort, personality, `cwd`, sandbox policy, summary). When specified, these settings become the defaults for later turns on the same thread. `outputSchema` applies only to the current turn. For `sandboxPolicy.type = "externalSandbox"`, set `networkAccess` to `restricted` or `enabled`; for `workspaceWrite`, `networkAccess` remains a boolean. + +For `turn/start.collaborationMode`, `settings.developer_instructions: null` means "use built-in instructions for the selected mode" rather than clearing mode instructions. + +### Sandbox read access (`ReadOnlyAccess`) + +`sandboxPolicy` supports explicit read-access controls: + +- `readOnly`: optional `access` (`{ "type": "fullAccess" }` by default, or restricted roots). +- `workspaceWrite`: optional `readOnlyAccess` (`{ "type": "fullAccess" }` by default, or restricted roots). + +Restricted read access shape: + +```json +{ + "type": "restricted", + "includePlatformDefaults": true, + "readableRoots": ["/Users/me/shared-read-only"] +} +``` + +On macOS, `includePlatformDefaults: true` appends a curated platform-default Seatbelt policy for restricted-read sessions. 
This improves tool compatibility without broadly allowing all of `/System`. + +Examples: + +```json +{ "type": "readOnly", "access": { "type": "fullAccess" } } +``` + +```json +{ + "type": "workspaceWrite", + "writableRoots": ["/Users/me/project"], + "readOnlyAccess": { + "type": "restricted", + "includePlatformDefaults": true, + "readableRoots": ["/Users/me/shared-read-only"] + }, + "networkAccess": false +} +``` + +### Start a turn + +```json +{ "method": "turn/start", "id": 30, "params": { + "threadId": "thr_123", + "input": [ { "type": "text", "text": "Run tests" } ], + "cwd": "/Users/me/project", + "approvalPolicy": "unlessTrusted", + "sandboxPolicy": { + "type": "workspaceWrite", + "writableRoots": ["/Users/me/project"], + "networkAccess": true + }, + "model": "gpt-5.1-codex", + "effort": "medium", + "summary": "concise", + "personality": "friendly", + "outputSchema": { + "type": "object", + "properties": { "answer": { "type": "string" } }, + "required": ["answer"], + "additionalProperties": false + } +} } +{ "id": 30, "result": { "turn": { "id": "turn_456", "status": "inProgress", "items": [], "error": null } } } +``` + +### Steer an active turn + +Use `turn/steer` to append more user input to the active in-flight turn. + +- Include `expectedTurnId`; it must match the active turn id. +- The request fails if there is no active turn on the thread. +- `turn/steer` doesn't emit a new `turn/started` notification. +- `turn/steer` doesn't accept turn-level overrides (`model`, `cwd`, `sandboxPolicy`, or `outputSchema`). + +```json +{ "method": "turn/steer", "id": 32, "params": { + "threadId": "thr_123", + "input": [ { "type": "text", "text": "Actually focus on failing tests first." } ], + "expectedTurnId": "turn_456" +} } +{ "id": 32, "result": { "turnId": "turn_456" } } +``` + +### Start a turn (invoke a skill) + +Invoke a skill explicitly by including `$<skill-name>` in the text input and adding a `skill` input item alongside it. 
+ +```json +{ "method": "turn/start", "id": 33, "params": { + "threadId": "thr_123", + "input": [ + { "type": "text", "text": "$skill-creator Add a new skill for triaging flaky CI and include step-by-step usage." }, + { "type": "skill", "name": "skill-creator", "path": "/Users/me/.codex/skills/skill-creator/SKILL.md" } + ] +} } +{ "id": 33, "result": { "turn": { "id": "turn_457", "status": "inProgress", "items": [], "error": null } } } +``` + +### Interrupt a turn + +```json +{ "method": "turn/interrupt", "id": 31, "params": { "threadId": "thr_123", "turnId": "turn_456" } } +{ "id": 31, "result": {} } +``` + +On success, the turn finishes with `status: "interrupted"`. + +## Review + +`review/start` runs the Codex reviewer for a thread and streams review items. Targets include: + +- `uncommittedChanges` +- `baseBranch` (diff against a branch) +- `commit` (review a specific commit) +- `custom` (free-form instructions) + +Use `delivery: "inline"` (default) to run the review on the existing thread, or `delivery: "detached"` to fork a new review thread. + +Example request/response: + +```json +{ "method": "review/start", "id": 40, "params": { + "threadId": "thr_123", + "delivery": "inline", + "target": { "type": "commit", "sha": "1234567deadbeef", "title": "Polish tui colors" } +} } +{ "id": 40, "result": { + "turn": { + "id": "turn_900", + "status": "inProgress", + "items": [ + { "type": "userMessage", "id": "turn_900", "content": [ { "type": "text", "text": "Review commit 1234567: Polish tui colors" } ] } + ], + "error": null + }, + "reviewThreadId": "thr_123" +} } +``` + +For a detached review, use `"delivery": "detached"`. The response is the same shape, but `reviewThreadId` will be the id of the new review thread (different from the original `threadId`). The server also emits a `thread/started` notification for that new thread before streaming the review turn. 
+ +Codex streams the usual `turn/started` notification followed by an `item/started` with an `enteredReviewMode` item: + +```json +{ + "method": "item/started", + "params": { + "item": { + "type": "enteredReviewMode", + "id": "turn_900", + "review": "current changes" + } + } +} +``` + +When the reviewer finishes, the server emits `item/started` and `item/completed` containing an `exitedReviewMode` item with the final review text: + +```json +{ + "method": "item/completed", + "params": { + "item": { + "type": "exitedReviewMode", + "id": "turn_900", + "review": "Looks solid overall..." + } + } +} +``` + +Use this notification to render the reviewer output in your client. + +## Command execution + +`command/exec` runs a single command (`argv` array) under the server sandbox without creating a thread. + +```json +{ "method": "command/exec", "id": 50, "params": { + "command": ["ls", "-la"], + "cwd": "/Users/me/project", + "sandboxPolicy": { "type": "workspaceWrite" }, + "timeoutMs": 10000 +} } +{ "id": 50, "result": { "exitCode": 0, "stdout": "...", "stderr": "" } } +``` + +Use `sandboxPolicy.type = "externalSandbox"` if you already sandbox the server process and want Codex to skip its own sandbox enforcement. For external sandbox mode, set `networkAccess` to `restricted` (default) or `enabled`. For `readOnly` and `workspaceWrite`, use the same optional `access` / `readOnlyAccess` structure shown above. + +Notes: + +- The server rejects empty `command` arrays. +- `sandboxPolicy` accepts the same shape used by `turn/start` (for example, `dangerFullAccess`, `readOnly`, `workspaceWrite`, `externalSandbox`). +- When omitted, `timeoutMs` falls back to the server default. + +### Read admin requirements (`configRequirements/read`) + +Use `configRequirements/read` to inspect the effective admin requirements loaded from `requirements.toml` and/or MDM. 
+ +```json +{ "method": "configRequirements/read", "id": 52, "params": {} } +{ "id": 52, "result": { + "requirements": { + "allowedApprovalPolicies": ["onRequest", "unlessTrusted"], + "allowedSandboxModes": ["readOnly", "workspaceWrite"], + "featureRequirements": { + "personality": true, + "unified_exec": false + }, + "network": { + "enabled": true, + "allowedDomains": ["api.openai.com"], + "allowUnixSockets": ["/tmp/example.sock"], + "dangerouslyAllowAllUnixSockets": false + } + } +} } +``` + +`result.requirements` is `null` when no requirements are configured. See the docs on [`requirements.toml`](https://developers.openai.com/codex/config-reference#requirementstoml) for details on supported keys and values. + +### Windows sandbox setup (`windowsSandbox/setupStart`) + +Custom Windows clients can trigger sandbox setup asynchronously instead of blocking on startup checks. + +```json +{ "method": "windowsSandbox/setupStart", "id": 53, "params": { "mode": "elevated" } } +{ "id": 53, "result": { "started": true } } +``` + +App-server starts setup in the background and later emits a completion notification: + +```json +{ + "method": "windowsSandbox/setupCompleted", + "params": { "mode": "elevated", "success": true, "error": null } +} +``` + +Modes: + +- `elevated` - run the elevated Windows sandbox setup path. +- `unelevated` - run the legacy setup/preflight path. + +## Events + +Event notifications are the server-initiated stream for thread lifecycles, turn lifecycles, and the items within them. After you start or resume a thread, keep reading the active transport stream for `thread/started`, `thread/archived`, `thread/unarchived`, `thread/closed`, `thread/status/changed`, `turn/*`, `item/*`, and `serverRequest/resolved` notifications. + +### Notification opt-out + +Clients can suppress specific notifications per connection by sending exact method names in `initialize.params.capabilities.optOutNotificationMethods`. 
+ +- Exact-match only: `item/agentMessage/delta` suppresses only that method. +- Unknown method names are ignored. +- Applies to both legacy (`codex/event/*`) and v2 (`thread/*`, `turn/*`, `item/*`, etc.) notifications. +- Doesn't apply to requests, responses, or errors. + +### Fuzzy file search events (experimental) + +The fuzzy file search session API emits per-query notifications: + +- `fuzzyFileSearch/sessionUpdated` - `{ sessionId, query, files }` with the current matches for the active query. +- `fuzzyFileSearch/sessionCompleted` - `{ sessionId }` once indexing and matching for that query completes. + +### Windows sandbox setup events + +- `windowsSandbox/setupCompleted` - `{ mode, success, error }` emitted after a `windowsSandbox/setupStart` request finishes. + +### Turn events + +- `turn/started` - `{ turn }` with the turn id, empty `items`, and `status: "inProgress"`. +- `turn/completed` - `{ turn }` where `turn.status` is `completed`, `interrupted`, or `failed`; failures carry `{ error: { message, codexErrorInfo?, additionalDetails? } }`. +- `turn/diff/updated` - `{ threadId, turnId, diff }` with the latest aggregated unified diff across every file change in the turn. +- `turn/plan/updated` - `{ turnId, explanation?, plan }` whenever the agent shares or changes its plan; each `plan` entry is `{ step, status }` with `status` in `pending`, `inProgress`, or `completed`. +- `thread/tokenUsage/updated` - usage updates for the active thread. + +`turn/diff/updated` and `turn/plan/updated` currently include empty `items` arrays even when item events stream. Use `item/*` notifications as the source of truth for turn items. + +### Items + +`ThreadItem` is the tagged union carried in turn responses and `item/*` notifications. Common item types include: + +- `userMessage` - `{id, content}` where `content` is a list of user inputs (`text`, `image`, or `localImage`). +- `agentMessage` - `{id, text, phase?}` containing the accumulated agent reply. 
When present, `phase` uses Responses API wire values (`commentary`, `final_answer`). +- `plan` - `{id, text}` containing proposed plan text in plan mode. Treat the final `plan` item from `item/completed` as authoritative. +- `reasoning` - `{id, summary, content}` where `summary` holds streamed reasoning summaries and `content` holds raw reasoning blocks. +- `commandExecution` - `{id, command, cwd, status, commandActions, aggregatedOutput?, exitCode?, durationMs?}`. +- `fileChange` - `{id, changes, status}` describing proposed edits; `changes` list `{path, kind, diff}`. +- `mcpToolCall` - `{id, server, tool, status, arguments, result?, error?}`. +- `dynamicToolCall` - `{id, tool, arguments, status, contentItems?, success?, durationMs?}` for client-executed dynamic tool invocations. +- `collabToolCall` - `{id, tool, status, senderThreadId, receiverThreadId?, newThreadId?, prompt?, agentStatus?}`. +- `webSearch` - `{id, query, action?}` for web search requests issued by the agent. +- `imageView` - `{id, path}` emitted when the agent invokes the image viewer tool. +- `enteredReviewMode` - `{id, review}` sent when the reviewer starts. +- `exitedReviewMode` - `{id, review}` emitted when the reviewer finishes. +- `contextCompaction` - `{id}` emitted when Codex compacts the conversation history. + +For `webSearch.action`, the action `type` can be `search` (`query?`, `queries?`), `openPage` (`url?`), or `findInPage` (`url?`, `pattern?`). + +The app server deprecates the legacy `thread/compacted` notification; use the `contextCompaction` item instead. + +All items emit two shared lifecycle events: + +- `item/started` - emits the full `item` when a new unit of work begins; the `item.id` matches the `itemId` used by deltas. +- `item/completed` - sends the final `item` once work finishes; treat this as the authoritative state. + +### Item deltas + +- `item/agentMessage/delta` - appends streamed text for the agent message. +- `item/plan/delta` - streams proposed plan text. 
The final `plan` item may not exactly equal the concatenated deltas. +- `item/reasoning/summaryTextDelta` - streams readable reasoning summaries; `summaryIndex` increments when a new summary section opens. +- `item/reasoning/summaryPartAdded` - marks a boundary between reasoning summary sections. +- `item/reasoning/textDelta` - streams raw reasoning text (when supported by the model). +- `item/commandExecution/outputDelta` - streams stdout/stderr for a command; append deltas in order. +- `item/fileChange/outputDelta` - contains the tool call response of the underlying `apply_patch` tool call. + +## Errors + +If a turn fails, the server emits an `error` event with `{ error: { message, codexErrorInfo?, additionalDetails? } }` and then finishes the turn with `status: "failed"`. When an upstream HTTP status is available, it appears in `codexErrorInfo.httpStatusCode`. + +Common `codexErrorInfo` values include: + +- `ContextWindowExceeded` +- `UsageLimitExceeded` +- `HttpConnectionFailed` (4xx/5xx upstream errors) +- `ResponseStreamConnectionFailed` +- `ResponseStreamDisconnected` +- `ResponseTooManyFailedAttempts` +- `BadRequest`, `Unauthorized`, `SandboxError`, `InternalServerError`, `Other` + +When an upstream HTTP status is available, the server forwards it in `httpStatusCode` on the relevant `codexErrorInfo` variant. + +## Approvals + +Depending on a user's Codex settings, command execution and file changes may require approval. The app-server sends a server-initiated JSON-RPC request to the client, and the client responds with a decision payload. + +- Command execution decisions: `accept`, `acceptForSession`, `decline`, `cancel`, or `{ "acceptWithExecpolicyAmendment": { "execpolicy_amendment": ["cmd", "..."] } }`. +- File change decisions: `accept`, `acceptForSession`, `decline`, `cancel`. + +- Requests include `threadId` and `turnId` - use them to scope UI state to the active conversation. 
+- The server resumes or declines the work and ends the item with `item/completed`. + +### Command execution approvals + +Order of messages: + +1. `item/started` shows the pending `commandExecution` item with `command`, `cwd`, and other fields. +2. `item/commandExecution/requestApproval` includes `itemId`, `threadId`, `turnId`, optional `reason`, optional `command`, optional `cwd`, optional `commandActions`, optional `proposedExecpolicyAmendment`, optional `networkApprovalContext`, and optional `availableDecisions`. When `initialize.params.capabilities.experimentalApi = true`, the payload can also include experimental `additionalPermissions` describing requested per-command sandbox access. Any filesystem paths inside `additionalPermissions` are absolute on the wire. +3. Client responds with one of the command execution approval decisions above. +4. `serverRequest/resolved` confirms that the pending request has been answered or cleared. +5. `item/completed` returns the final `commandExecution` item with `status: completed | failed | declined`. + +When `networkApprovalContext` is present, the prompt is for managed network access (not a general shell-command approval). The current v2 schema exposes the target `host` and `protocol`; clients should render a network-specific prompt and not rely on `command` being a user-meaningful shell command preview. + +Codex groups concurrent network approval prompts by destination (`host`, protocol, and port). The app-server may therefore send one prompt that unblocks multiple queued requests to the same destination, while different ports on the same host are treated separately. + +### File change approvals + +Order of messages: + +1. `item/started` emits a `fileChange` item with proposed `changes` and `status: "inProgress"`. +2. `item/fileChange/requestApproval` includes `itemId`, `threadId`, `turnId`, optional `reason`, and optional `grantRoot`. +3. Client responds with one of the file change approval decisions above. +4. 
`serverRequest/resolved` confirms that the pending request has been answered or cleared. +5. `item/completed` returns the final `fileChange` item with `status: completed | failed | declined`. + +### `tool/requestUserInput` + +When the client responds to `item/tool/requestUserInput`, app-server emits `serverRequest/resolved` with `{ threadId, requestId }`. If the pending request is cleared by turn start, turn completion, or turn interruption before the client answers, the server emits the same notification for that cleanup. + +### Dynamic tool calls (experimental) + +`dynamicTools` on `thread/start` and the corresponding `item/tool/call` request or response flow are experimental APIs. + +When a dynamic tool is invoked during a turn, app-server emits: + +1. `item/started` with `item.type = "dynamicToolCall"`, `status = "inProgress"`, plus `tool` and `arguments`. +2. `item/tool/call` as a server request to the client. +3. The client response payload with returned content items. +4. `item/completed` with `item.type = "dynamicToolCall"`, the final `status`, and any returned `contentItems` or `success` value. + +### MCP tool-call approvals (apps) + +App (connector) tool calls can also require approval. When an app tool call has side effects, the server may elicit approval with `tool/requestUserInput` and options such as **Accept**, **Decline**, and **Cancel**. Destructive tool annotations always trigger approval even when the tool also advertises less-privileged hints. If the user declines or cancels, the related `mcpToolCall` item completes with an error instead of running the tool. + +## Skills + +Invoke a skill by including `$<skill-name>` in the user text input. Add a `skill` input item (recommended) so the server injects full skill instructions instead of relying on the model to resolve the name. 
+ +```json +{ + "method": "turn/start", + "id": 101, + "params": { + "threadId": "thread-1", + "input": [ + { + "type": "text", + "text": "$skill-creator Add a new skill for triaging flaky CI." + }, + { + "type": "skill", + "name": "skill-creator", + "path": "/Users/me/.codex/skills/skill-creator/SKILL.md" + } + ] + } +} +``` + +If you omit the `skill` item, the model will still parse the `$<skill-name>` marker and try to locate the skill, which can add latency. + +Example: + +``` +$skill-creator Add a new skill for triaging flaky CI and include step-by-step usage. +``` + +Use `skills/list` to fetch available skills (optionally scoped by `cwds`, with `forceReload`). You can also include `perCwdExtraUserRoots` to scan extra absolute paths as `user` scope for specific `cwd` values. App-server ignores entries whose `cwd` isn't present in `cwds`. `skills/list` may reuse a cached result per `cwd`; set `forceReload: true` to refresh from disk. When present, the server reads `interface` and `dependencies` from `SKILL.json`. 
+ +```json +{ "method": "skills/list", "id": 25, "params": { + "cwds": ["/Users/me/project", "/Users/me/other-project"], + "forceReload": true, + "perCwdExtraUserRoots": [ + { + "cwd": "/Users/me/project", + "extraUserRoots": ["/Users/me/shared-skills"] + } + ] +} } +{ "id": 25, "result": { + "data": [{ + "cwd": "/Users/me/project", + "skills": [ + { + "name": "skill-creator", + "description": "Create or update a Codex skill", + "enabled": true, + "interface": { + "displayName": "Skill Creator", + "shortDescription": "Create or update a Codex skill" + }, + "dependencies": { + "tools": [ + { + "type": "env_var", + "value": "GITHUB_TOKEN", + "description": "GitHub API token" + }, + { + "type": "mcp", + "value": "github", + "transport": "streamable_http", + "url": "https://example.com/mcp" + } + ] + } + } + ], + "errors": [] + }] +} } +``` + +To enable or disable a skill by path: + +```json +{ + "method": "skills/config/write", + "id": 26, + "params": { + "path": "/Users/me/.codex/skills/skill-creator/SKILL.md", + "enabled": false + } +} +``` + +## Apps (connectors) + +Use `app/list` to fetch available apps. In the CLI/TUI, `/apps` is the user-facing picker; in custom clients, call `app/list` directly. Each entry includes both `isAccessible` (available to the user) and `isEnabled` (enabled in `config.toml`) so clients can distinguish install/access from local enabled state. App entries can also include optional `branding`, `appMetadata`, and `labels` fields. 
+ +```json +{ "method": "app/list", "id": 50, "params": { + "cursor": null, + "limit": 50, + "threadId": "thread-1", + "forceRefetch": false +} } +{ "id": 50, "result": { + "data": [ + { + "id": "demo-app", + "name": "Demo App", + "description": "Example connector for documentation.", + "logoUrl": "https://example.com/demo-app.png", + "logoUrlDark": null, + "distributionChannel": null, + "branding": null, + "appMetadata": null, + "labels": null, + "installUrl": "https://chatgpt.com/apps/demo-app/demo-app", + "isAccessible": true, + "isEnabled": true + } + ], + "nextCursor": null +} } +``` + +If you provide `threadId`, app feature gating (`features.apps`) uses that thread's config snapshot. When omitted, app-server uses the latest global config. + +`app/list` returns after both accessible apps and directory apps load. Set `forceRefetch: true` to bypass app caches and fetch fresh data. Cache entries are only replaced when refreshes succeed. + +The server also emits `app/list/updated` notifications whenever either source (accessible apps or directory apps) finishes loading. Each notification includes the latest merged app list. + +```json +{ + "method": "app/list/updated", + "params": { + "data": [ + { + "id": "demo-app", + "name": "Demo App", + "description": "Example connector for documentation.", + "logoUrl": "https://example.com/demo-app.png", + "logoUrlDark": null, + "distributionChannel": null, + "branding": null, + "appMetadata": null, + "labels": null, + "installUrl": "https://chatgpt.com/apps/demo-app/demo-app", + "isAccessible": true, + "isEnabled": true + } + ] + } +} +``` + +Invoke an app by inserting `$<app-slug>` in the text input and adding a `mention` input item with the `app://<id>` path (recommended). + +```json +{ + "method": "turn/start", + "id": 51, + "params": { + "threadId": "thread-1", + "input": [ + { + "type": "text", + "text": "$demo-app Pull the latest updates from the team." 
+ }, + { + "type": "mention", + "name": "Demo App", + "path": "app://demo-app" + } + ] + } +} +``` + +### Config RPC examples for app settings + +Use `config/read`, `config/value/write`, and `config/batchWrite` to inspect or update app controls in `config.toml`. + +Read the effective app config shape (including `_default` and per-tool overrides): + +```json +{ "method": "config/read", "id": 60, "params": { "includeLayers": false } } +{ "id": 60, "result": { + "config": { + "apps": { + "_default": { + "enabled": true, + "destructive_enabled": true, + "open_world_enabled": true + }, + "google_drive": { + "enabled": true, + "destructive_enabled": false, + "default_tools_approval_mode": "prompt", + "tools": { + "files/delete": { "enabled": false, "approval_mode": "approve" } + } + } + } + } +} } +``` + +Update a single app setting: + +```json +{ + "method": "config/value/write", + "id": 61, + "params": { + "keyPath": "apps.google_drive.default_tools_approval_mode", + "value": "prompt", + "mergeStrategy": "replace" + } +} +``` + +Apply multiple app edits atomically: + +```json +{ + "method": "config/batchWrite", + "id": 62, + "params": { + "edits": [ + { + "keyPath": "apps._default.destructive_enabled", + "value": false, + "mergeStrategy": "upsert" + }, + { + "keyPath": "apps.google_drive.tools.files/delete.approval_mode", + "value": "approve", + "mergeStrategy": "upsert" + } + ] + } +} +``` + +### Detect and import external agent config + +Use `externalAgentConfig/detect` to discover migratable external-agent artifacts, then pass the selected entries to `externalAgentConfig/import`. 
+ +Detection example: + +```json +{ "method": "externalAgentConfig/detect", "id": 63, "params": { + "includeHome": true, + "cwds": ["/Users/me/project"] +} } +{ "id": 63, "result": { + "items": [ + { + "itemType": "AGENTS_MD", + "description": "Import /Users/me/project/CLAUDE.md to /Users/me/project/AGENTS.md.", + "cwd": "/Users/me/project" + }, + { + "itemType": "SKILLS", + "description": "Copy skill folders from /Users/me/.claude/skills to /Users/me/.agents/skills.", + "cwd": null + } + ] +} } +``` + +Import example: + +```json +{ "method": "externalAgentConfig/import", "id": 64, "params": { + "migrationItems": [ + { + "itemType": "AGENTS_MD", + "description": "Import /Users/me/project/CLAUDE.md to /Users/me/project/AGENTS.md.", + "cwd": "/Users/me/project" + } + ] +} } +{ "id": 64, "result": {} } +``` + +Supported `itemType` values are `AGENTS_MD`, `CONFIG`, `SKILLS`, and `MCP_SERVER_CONFIG`. Detection returns only items that still have work to do. For example, AGENTS migration is skipped when `AGENTS.md` already exists and is non-empty, and skill imports do not overwrite existing skill directories. + +## Auth endpoints + +The JSON-RPC auth/account surface exposes request/response methods plus server-initiated notifications (no `id`). Use these to determine auth state, start or cancel logins, logout, and inspect ChatGPT rate limits. + +### Authentication modes + +Codex supports three authentication modes. `account/updated.authMode` shows the active mode, and `account/read` also reports it. + +- **API key (`apikey`)** - the caller supplies an OpenAI API key and Codex stores it for API requests. +- **ChatGPT managed (`chatgpt`)** - Codex owns the ChatGPT OAuth flow, persists tokens, and refreshes them automatically. +- **ChatGPT external tokens (`chatgptAuthTokens`)** - a host app supplies `idToken` and `accessToken` directly. Codex stores these tokens in memory, and the host app must refresh them when asked. 
+ +### API overview + +- `account/read` - fetch current account info; optionally refresh tokens. +- `account/login/start` - begin login (`apiKey`, `chatgpt`, or `chatgptAuthTokens`). +- `account/login/completed` (notify) - emitted when a login attempt finishes (success or error). +- `account/login/cancel` - cancel a pending ChatGPT login by `loginId`. +- `account/logout` - sign out; triggers `account/updated`. +- `account/updated` (notify) - emitted whenever auth mode changes (`authMode`: `apikey`, `chatgpt`, `chatgptAuthTokens`, or `null`). +- `account/chatgptAuthTokens/refresh` (server request) - request fresh externally managed ChatGPT tokens after an authorization error. +- `account/rateLimits/read` - fetch ChatGPT rate limits. +- `account/rateLimits/updated` (notify) - emitted whenever a user's ChatGPT rate limits change. +- `mcpServer/oauthLogin/completed` (notify) - emitted after a `mcpServer/oauth/login` flow finishes; payload includes `{ name, success, error? }`. + +### 1) Check auth state + +Request: + +```json +{ "method": "account/read", "id": 1, "params": { "refreshToken": false } } +``` + +Response examples: + +```json +{ "id": 1, "result": { "account": null, "requiresOpenaiAuth": false } } +``` + +```json +{ "id": 1, "result": { "account": null, "requiresOpenaiAuth": true } } +``` + +```json +{ + "id": 1, + "result": { "account": { "type": "apiKey" }, "requiresOpenaiAuth": true } +} +``` + +```json +{ + "id": 1, + "result": { + "account": { + "type": "chatgpt", + "email": "user@example.com", + "planType": "pro" + }, + "requiresOpenaiAuth": true + } +} +``` + +Field notes: + +- `refreshToken` (boolean): set `true` to force a token refresh in managed ChatGPT mode. In external token mode (`chatgptAuthTokens`), app-server ignores this flag. +- `requiresOpenaiAuth` reflects the active provider; when `false`, Codex can run without OpenAI credentials. + +### 2) Log in with an API key + +1. 
Send: + + ```json + { + "method": "account/login/start", + "id": 2, + "params": { "type": "apiKey", "apiKey": "sk-..." } + } + ``` + +2. Expect: + + ```json + { "id": 2, "result": { "type": "apiKey" } } + ``` + +3. Notifications: + + ```json + { + "method": "account/login/completed", + "params": { "loginId": null, "success": true, "error": null } + } + ``` + + ```json + { "method": "account/updated", "params": { "authMode": "apikey" } } + ``` + +### 3) Log in with ChatGPT (browser flow) + +1. Start: + + ```json + { "method": "account/login/start", "id": 3, "params": { "type": "chatgpt" } } + ``` + + ```json + { + "id": 3, + "result": { + "type": "chatgpt", + "loginId": "", + "authUrl": "https://chatgpt.com/...&redirect_uri=http%3A%2F%2Flocalhost%3A%2Fauth%2Fcallback" + } + } + ``` + +2. Open `authUrl` in a browser; the app-server hosts the local callback. +3. Wait for notifications: + + ```json + { + "method": "account/login/completed", + "params": { "loginId": "", "success": true, "error": null } + } + ``` + + ```json + { "method": "account/updated", "params": { "authMode": "chatgpt" } } + ``` + +### 3b) Log in with externally managed ChatGPT tokens (`chatgptAuthTokens`) + +Use this mode when a host application owns the user's ChatGPT auth lifecycle and supplies tokens directly. + +1. Send: + + ```json + { + "method": "account/login/start", + "id": 7, + "params": { + "type": "chatgptAuthTokens", + "idToken": "", + "accessToken": "" + } + } + ``` + +2. Expect: + + ```json + { "id": 7, "result": { "type": "chatgptAuthTokens" } } + ``` + +3. 
Notifications: + + ```json + { + "method": "account/login/completed", + "params": { "loginId": null, "success": true, "error": null } + } + ``` + + ```json + { + "method": "account/updated", + "params": { "authMode": "chatgptAuthTokens" } + } + ``` + +When the server receives a `401 Unauthorized`, it may request refreshed tokens from the host app: + +```json +{ + "method": "account/chatgptAuthTokens/refresh", + "id": 8, + "params": { "reason": "unauthorized", "previousAccountId": "org-123" } +} +{ "id": 8, "result": { "idToken": "", "accessToken": "" } } +``` + +The server retries the original request after a successful refresh response. Requests time out after about 10 seconds. + +### 4) Cancel a ChatGPT login + +```json +{ "method": "account/login/cancel", "id": 4, "params": { "loginId": "" } } +{ "method": "account/login/completed", "params": { "loginId": "", "success": false, "error": "..." } } +``` + +### 5) Logout + +```json +{ "method": "account/logout", "id": 5 } +{ "id": 5, "result": {} } +{ "method": "account/updated", "params": { "authMode": null } } +``` + +### 6) Rate limits (ChatGPT) + +```json +{ "method": "account/rateLimits/read", "id": 6 } +{ "id": 6, "result": { + "rateLimits": { + "limitId": "codex", + "limitName": null, + "primary": { "usedPercent": 25, "windowDurationMins": 15, "resetsAt": 1730947200 }, + "secondary": null + }, + "rateLimitsByLimitId": { + "codex": { + "limitId": "codex", + "limitName": null, + "primary": { "usedPercent": 25, "windowDurationMins": 15, "resetsAt": 1730947200 }, + "secondary": null + }, + "codex_other": { + "limitId": "codex_other", + "limitName": "codex_other", + "primary": { "usedPercent": 42, "windowDurationMins": 60, "resetsAt": 1730950800 }, + "secondary": null + } + } +} } +{ "method": "account/rateLimits/updated", "params": { + "rateLimits": { + "limitId": "codex", + "primary": { "usedPercent": 31, "windowDurationMins": 15, "resetsAt": 1730948100 } + } +} } +``` + +Field notes: + +- `rateLimits` is the 
backward-compatible single-bucket view. +- `rateLimitsByLimitId` (when present) is the multi-bucket view keyed by metered `limit_id` (for example `codex`). +- `limitId` is the metered bucket identifier. +- `limitName` is an optional user-facing label for the bucket. +- `usedPercent` is current usage within the quota window. +- `windowDurationMins` is the quota window length. +- `resetsAt` is a Unix timestamp (seconds) for the next reset. \ No newline at end of file diff --git a/docs/codex-cli-reference.md b/docs/codex-cli-reference.md new file mode 100644 index 0000000..aae41c7 --- /dev/null +++ b/docs/codex-cli-reference.md @@ -0,0 +1,859 @@ +# Command line options + +export const globalFlagOptions = [ + { + key: "PROMPT", + type: "string", + description: + "Optional text instruction to start the session. Omit to launch the TUI without a pre-filled message.", + }, + { + key: "--image, -i", + type: "path[,path...]", + description: + "Attach one or more image files to the initial prompt. Separate multiple paths with commas or repeat the flag.", + }, + { + key: "--model, -m", + type: "string", + description: + "Override the model set in configuration (for example `gpt-5-codex`).", + }, + { + key: "--oss", + type: "boolean", + defaultValue: "false", + description: + 'Use the local open source model provider (equivalent to `-c model_provider="oss"`). Validates that Ollama is running.', + }, + { + key: "--profile, -p", + type: "string", + description: + "Configuration profile name to load from `~/.codex/config.toml`.", + }, + { + key: "--sandbox, -s", + type: "read-only | workspace-write | danger-full-access", + description: + "Select the sandbox policy for model-generated shell commands.", + }, + { + key: "--ask-for-approval, -a", + type: "untrusted | on-request | never", + description: + "Control when Codex pauses for human approval before running a command. 
`on-failure` is deprecated; prefer `on-request` for interactive runs or `never` for non-interactive runs.", + }, + { + key: "--full-auto", + type: "boolean", + defaultValue: "false", + description: + "Shortcut for low-friction local work: sets `--ask-for-approval on-request` and `--sandbox workspace-write`.", + }, + { + key: "--dangerously-bypass-approvals-and-sandbox, --yolo", + type: "boolean", + defaultValue: "false", + description: + "Run every command without approvals or sandboxing. Only use inside an externally hardened environment.", + }, + { + key: "--cd, -C", + type: "path", + description: + "Set the working directory for the agent before it starts processing your request.", + }, + { + key: "--search", + type: "boolean", + defaultValue: "false", + description: + 'Enable live web search (sets `web_search = "live"` instead of the default `"cached"`).', + }, + { + key: "--add-dir", + type: "path", + description: + "Grant additional directories write access alongside the main workspace. Repeat for multiple paths.", + }, + { + key: "--no-alt-screen", + type: "boolean", + defaultValue: "false", + description: + "Disable alternate screen mode for the TUI (overrides `tui.alternate_screen` for this run).", + }, + { + key: "--enable", + type: "feature", + description: + "Force-enable a feature flag (translates to `-c features.=true`). Repeatable.", + }, + { + key: "--disable", + type: "feature", + description: + "Force-disable a feature flag (translates to `-c features.=false`). Repeatable.", + }, + { + key: "--config, -c", + type: "key=value", + description: + "Override configuration values. Values parse as JSON if possible; otherwise the literal string is used.", + }, +]; + +export const commandOverview = [ + { + key: "codex", + href: "/codex/cli/reference#codex-interactive", + type: "stable", + description: + "Launch the terminal UI. 
Accepts the global flags above plus an optional prompt or image attachments.", + }, + { + key: "codex app-server", + href: "/codex/cli/reference#codex-app-server", + type: "experimental", + description: + "Launch the Codex app server for local development or debugging.", + }, + { + key: "codex app", + href: "/codex/cli/reference#codex-app", + type: "stable", + description: + "Launch the Codex desktop app on macOS, optionally opening a specific workspace path.", + }, + { + key: "codex debug app-server send-message-v2", + href: "/codex/cli/reference#codex-debug-app-server-send-message-v2", + type: "experimental", + description: + "Debug app-server by sending a single V2 message through the built-in test client.", + }, + { + key: "codex apply", + href: "/codex/cli/reference#codex-apply", + type: "stable", + description: + "Apply the latest diff generated by a Codex Cloud task to your local working tree. Alias: `codex a`.", + }, + { + key: "codex cloud", + href: "/codex/cli/reference#codex-cloud", + type: "experimental", + description: + "Browse or execute Codex Cloud tasks from the terminal without opening the TUI. Alias: `codex cloud-tasks`.", + }, + { + key: "codex completion", + href: "/codex/cli/reference#codex-completion", + type: "stable", + description: + "Generate shell completion scripts for Bash, Zsh, Fish, or PowerShell.", + }, + { + key: "codex features", + href: "/codex/cli/reference#codex-features", + type: "stable", + description: + "List feature flags and persistently enable or disable them in `config.toml`.", + }, + { + key: "codex exec", + href: "/codex/cli/reference#codex-exec", + type: "stable", + description: + "Run Codex non-interactively. Alias: `codex e`. 
Stream results to stdout or JSONL and optionally resume previous sessions.", + }, + { + key: "codex execpolicy", + href: "/codex/cli/reference#codex-execpolicy", + type: "experimental", + description: + "Evaluate execpolicy rule files and see whether a command would be allowed, prompted, or blocked.", + }, + { + key: "codex login", + href: "/codex/cli/reference#codex-login", + type: "stable", + description: + "Authenticate Codex using ChatGPT OAuth, device auth, or an API key piped over stdin.", + }, + { + key: "codex logout", + href: "/codex/cli/reference#codex-logout", + type: "stable", + description: "Remove stored authentication credentials.", + }, + { + key: "codex mcp", + href: "/codex/cli/reference#codex-mcp", + type: "experimental", + description: + "Manage Model Context Protocol servers (list, add, remove, authenticate).", + }, + { + key: "codex mcp-server", + href: "/codex/cli/reference#codex-mcp-server", + type: "experimental", + description: + "Run Codex itself as an MCP server over stdio. Useful when another agent consumes Codex.", + }, + { + key: "codex resume", + href: "/codex/cli/reference#codex-resume", + type: "stable", + description: + "Continue a previous interactive session by ID or resume the most recent conversation.", + }, + { + key: "codex fork", + href: "/codex/cli/reference#codex-fork", + type: "stable", + description: + "Fork a previous interactive session into a new thread, preserving the original transcript.", + }, + { + key: "codex sandbox", + href: "/codex/cli/reference#codex-sandbox", + type: "experimental", + description: + "Run arbitrary commands inside Codex-provided macOS seatbelt or Linux sandboxes (Landlock by default, optional bubblewrap pipeline).", + }, +]; + +export const execOptions = [ + { + key: "PROMPT", + type: "string | - (read stdin)", + description: + "Initial instruction for the task. 
Use `-` to pipe the prompt from stdin.", + }, + { + key: "--image, -i", + type: "path[,path...]", + description: + "Attach images to the first message. Repeatable; supports comma-separated lists.", + }, + { + key: "--model, -m", + type: "string", + description: "Override the configured model for this run.", + }, + { + key: "--oss", + type: "boolean", + defaultValue: "false", + description: + "Use the local open source provider (requires a running Ollama instance).", + }, + { + key: "--sandbox, -s", + type: "read-only | workspace-write | danger-full-access", + description: + "Sandbox policy for model-generated commands. Defaults to configuration.", + }, + { + key: "--profile, -p", + type: "string", + description: "Select a configuration profile defined in config.toml.", + }, + { + key: "--full-auto", + type: "boolean", + defaultValue: "false", + description: + "Apply the low-friction automation preset (`workspace-write` sandbox and `on-request` approvals).", + }, + { + key: "--dangerously-bypass-approvals-and-sandbox, --yolo", + type: "boolean", + defaultValue: "false", + description: + "Bypass approval prompts and sandboxing. Dangerous—only use inside an isolated runner.", + }, + { + key: "--cd, -C", + type: "path", + description: "Set the workspace root before executing the task.", + }, + { + key: "--skip-git-repo-check", + type: "boolean", + defaultValue: "false", + description: + "Allow running outside a Git repository (useful for one-off directories).", + }, + { + key: "--ephemeral", + type: "boolean", + defaultValue: "false", + description: "Run without persisting session rollout files to disk.", + }, + { + key: "--output-schema", + type: "path", + description: + "JSON Schema file describing the expected final response shape. 
Codex validates tool output against it.", + }, + { + key: "--color", + type: "always | never | auto", + defaultValue: "auto", + description: "Control ANSI color in stdout.", + }, + { + key: "--json, --experimental-json", + type: "boolean", + defaultValue: "false", + description: + "Print newline-delimited JSON events instead of formatted text.", + }, + { + key: "--output-last-message, -o", + type: "path", + description: + "Write the assistant’s final message to a file. Useful for downstream scripting.", + }, + { + key: "Resume subcommand", + type: "codex exec resume [SESSION_ID]", + description: + "Resume an exec session by ID or add `--last` to continue the most recent session from the current working directory. Add `--all` to consider sessions from any directory. Accepts an optional follow-up prompt.", + }, + { + key: "-c, --config", + type: "key=value", + description: + "Inline configuration override for the non-interactive run (repeatable).", + }, +]; + +export const appServerOptions = [ + { + key: "--listen", + type: "stdio:// | ws://IP:PORT", + defaultValue: "stdio://", + description: + "Transport listener URL. `ws://` is experimental and intended for development/testing.", + }, +]; + +export const appOptions = [ + { + key: "PATH", + type: "path", + defaultValue: ".", + description: + "Workspace path to open in Codex Desktop (`codex app` is available on macOS only).", + }, + { + key: "--download-url", + type: "url", + description: + "Advanced override for the Codex desktop DMG download URL used during install.", + }, +]; + +export const debugAppServerSendMessageV2Options = [ + { + key: "USER_MESSAGE", + type: "string", + description: + "Message text sent to app-server through the built-in V2 test-client flow.", + }, +]; + +export const resumeOptions = [ + { + key: "SESSION_ID", + type: "uuid", + description: + "Resume the specified session. 
export const featuresOptions = [
  // Rows for the `codex features` subcommand table: `key` is the row label,
  // `type` is the command form shown to the reader. `<feature>` marks the
  // feature-flag name the user supplies.
  {
    key: "List subcommand",
    type: "codex features list",
    description:
      "Show known feature flags, their maturity stage, and their effective state.",
  },
  {
    key: "Enable subcommand",
    type: "codex features enable <feature>",
    description:
      "Persistently enable a feature flag in `config.toml`. Respects the active `--profile` when provided.",
  },
  {
    key: "Disable subcommand",
    type: "codex features disable <feature>",
    description:
      "Persistently disable a feature flag in `config.toml`. Respects the active `--profile` when provided.",
  },
];
Separate multiple paths with commas or repeat the flag.", + }, + { + key: "PROMPT", + type: "string | - (read stdin)", + description: + "Optional follow-up instruction sent immediately after resuming.", + }, +]; + +export const forkOptions = [ + { + key: "SESSION_ID", + type: "uuid", + description: + "Fork the specified session. Omit and use `--last` to fork the most recent session.", + }, + { + key: "--last", + type: "boolean", + defaultValue: "false", + description: + "Skip the picker and fork the most recent conversation automatically.", + }, + { + key: "--all", + type: "boolean", + defaultValue: "false", + description: + "Show sessions beyond the current working directory in the picker.", + }, +]; + +export const execpolicyOptions = [ + { + key: "--rules, -r", + type: "path (repeatable)", + description: + "Path to an execpolicy rule file to evaluate. Provide multiple flags to combine rules across files.", + }, + { + key: "--pretty", + type: "boolean", + defaultValue: "false", + description: "Pretty-print the JSON result.", + }, + { + key: "COMMAND...", + type: "var-args", + description: "Command to be checked against the specified policies.", + }, +]; + +export const loginOptions = [ + { + key: "--with-api-key", + type: "boolean", + description: + "Read an API key from stdin (for example `printenv OPENAI_API_KEY | codex login --with-api-key`).", + }, + { + key: "--device-auth", + type: "boolean", + description: + "Use OAuth device code flow instead of launching a browser window.", + }, + { + key: "status subcommand", + type: "codex login status", + description: + "Print the active authentication mode and exit with 0 when logged in.", + }, +]; + +export const applyOptions = [ + { + key: "TASK_ID", + type: "string", + description: + "Identifier of the Codex Cloud task whose diff should be applied.", + }, +]; + +export const sandboxMacOptions = [ + { + key: "--full-auto", + type: "boolean", + defaultValue: "false", + description: + "Grant write access to the 
current workspace and `/tmp` without approvals.", + }, + { + key: "--config, -c", + type: "key=value", + description: + "Pass configuration overrides into the sandboxed run (repeatable).", + }, + { + key: "COMMAND...", + type: "var-args", + description: + "Shell command to execute under macOS Seatbelt. Everything after `--` is forwarded.", + }, +]; + +export const sandboxLinuxOptions = [ + { + key: "--full-auto", + type: "boolean", + defaultValue: "false", + description: + "Grant write access to the current workspace and `/tmp` inside the Landlock sandbox.", + }, + { + key: "--config, -c", + type: "key=value", + description: + "Configuration overrides applied before launching the sandbox (repeatable).", + }, + { + key: "COMMAND...", + type: "var-args", + description: + "Command to execute under Landlock + seccomp. Provide the executable after `--`.", + }, +]; + +export const completionOptions = [ + { + key: "SHELL", + type: "bash | zsh | fish | power-shell | elvish", + defaultValue: "bash", + description: "Shell to generate completions for. Output prints to stdout.", + }, +]; + +export const cloudExecOptions = [ + { + key: "QUERY", + type: "string", + description: + "Task prompt. If omitted, Codex prompts interactively for details.", + }, + { + key: "--env", + type: "ENV_ID", + description: + "Target Codex Cloud environment identifier (required). 
export const mcpCommands = [
  // Rows for the `codex mcp` subcommand table. `key` is the subcommand with
  // its positional argument (`<name>` = the MCP server entry name); `type`,
  // when present, lists the notable flags or argument forms for that row.
  {
    key: "list",
    type: "--json",
    description:
      "List configured MCP servers. Add `--json` for machine-readable output.",
  },
  {
    key: "get <name>",
    type: "--json",
    description:
      "Show a specific server configuration. `--json` prints the raw config entry.",
  },
  {
    key: "add <name>",
    type: "-- <command...> | --url <value>",
    description:
      "Register a server using a stdio launcher command or a streamable HTTP URL. Supports `--env KEY=VALUE` for stdio transports.",
  },
  {
    key: "remove <name>",
    description: "Delete a stored MCP server definition.",
  },
  {
    key: "login <name>",
    type: "--scopes scope1,scope2",
    description:
      "Start an OAuth login for a streamable HTTP server (servers that support OAuth only).",
  },
  {
    key: "logout <name>",
    description:
      "Remove stored OAuth credentials for a streamable HTTP server.",
  },
];
Provide after `--`.", + }, + { + key: "--env KEY=VALUE", + type: "repeatable", + description: + "Environment variable assignments applied when launching a stdio server.", + }, + { + key: "--url", + type: "https://…", + description: + "Register a streamable HTTP server instead of stdio. Mutually exclusive with `COMMAND...`.", + }, + { + key: "--bearer-token-env-var", + type: "ENV_VAR", + description: + "Environment variable whose value is sent as a bearer token when connecting to a streamable HTTP server.", + }, +]; + +## How to read this reference + +This page catalogs every documented Codex CLI command and flag. Use the interactive tables to search by key or description. Each section indicates whether the option is stable or experimental and calls out risky combinations. + +The CLI inherits most defaults from ~/.codex/config.toml. Any + -c key=value overrides you pass at the command line take + precedence for that invocation. See [Config + basics](https://developers.openai.com/codex/config-basic#configuration-precedence) for more information. + +## Global flags + + + +These options apply to the base `codex` command and propagate to each subcommand unless a section below specifies otherwise. +When you run a subcommand, place global flags after it (for example, `codex exec --oss ...`) so Codex applies them as intended. + +## Command overview + +The Maturity column uses feature maturity labels such as Experimental, Beta, + and Stable. See [Feature Maturity](https://developers.openai.com/codex/feature-maturity) for how to + interpret these labels. + + + +## Command details + +### `codex` (interactive) + +Running `codex` with no subcommand launches the interactive terminal UI (TUI). The agent accepts the global flags above plus image attachments. Web search defaults to cached mode; use `--search` to switch to live browsing and `--full-auto` to let Codex run most commands without prompts. + +### `codex app-server` + +Launch the Codex app server locally. 
This is primarily for development and debugging and may change without notice. + + + +`codex app-server --listen stdio://` keeps the default JSONL-over-stdio behavior. `--listen ws://IP:PORT` enables WebSocket transport (experimental). If you generate schemas for client bindings, add `--experimental` to include gated fields and methods. + +### `codex app` + +Launch Codex Desktop from the terminal on macOS and optionally open a specific workspace path. + + + +`codex app` installs/opens the desktop app on macOS, then opens the provided workspace path. This subcommand is macOS-only. + +### `codex debug app-server send-message-v2` + +Send one message through app-server's V2 thread/turn flow using the built-in app-server test client. + + + +This debug flow initializes with `experimentalApi: true`, starts a thread, sends a turn, and streams server notifications. Use it to reproduce and inspect app-server protocol behavior locally. + +### `codex apply` + +Apply the most recent diff from a Codex cloud task to your local repository. You must authenticate and have access to the task. + + + +Codex prints the patched files and exits non-zero if `git apply` fails (for example, due to conflicts). + +### `codex cloud` + +Interact with Codex cloud tasks from the terminal. The default command opens an interactive picker; `codex cloud exec` submits a task directly, and `codex cloud list` returns recent tasks for scripting or quick inspection. + + + +Authentication follows the same credentials as the main CLI. Codex exits non-zero if the task submission fails. + +#### `codex cloud list` + +List recent cloud tasks with optional filtering and pagination. + + + +Plain-text output prints a task URL followed by status details. Use `--json` for automation. The JSON payload contains a `tasks` array plus an optional `cursor` value. Each task includes `id`, `url`, `title`, `status`, `updated_at`, `environment_id`, `environment_label`, `summary`, `is_review`, and `attempt_total`. 
+ +### `codex completion` + +Generate shell completion scripts and redirect the output to the appropriate location, for example `codex completion zsh > "${fpath[1]}/_codex"`. + + + +### `codex features` + +Manage feature flags stored in `~/.codex/config.toml`. The `enable` and `disable` commands persist changes so they apply to future sessions. When you launch with `--profile`, Codex writes to that profile instead of the root configuration. + + + +### `codex exec` + +Use `codex exec` (or the short form `codex e`) for scripted or CI-style runs that should finish without human interaction. + + + +Codex writes formatted output by default. Add `--json` to receive newline-delimited JSON events (one per state change). The optional `resume` subcommand lets you continue non-interactive tasks. Use `--last` to pick the most recent session from the current working directory, or add `--all` to search across all sessions: + + + +### `codex execpolicy` + +Check `execpolicy` rule files before you save them. `codex execpolicy check` accepts one or more `--rules` flags (for example, files under `~/.codex/rules`) and emits JSON showing the strictest decision and any matching rules. Add `--pretty` to format the output. The `execpolicy` command is currently in preview. + + + +### `codex login` + +Authenticate the CLI with a ChatGPT account or API key. With no flags, Codex opens a browser for the ChatGPT OAuth flow. + + + +`codex login status` exits with `0` when credentials are present, which is helpful in automation scripts. + +### `codex logout` + +Remove saved credentials for both API key and ChatGPT authentication. This command has no flags. + +### `codex mcp` + +Manage Model Context Protocol server entries stored in `~/.codex/config.toml`. + + + +The `add` subcommand supports both stdio and streamable HTTP transports: + + + +OAuth actions (`login`, `logout`) only work with streamable HTTP servers (and only when the server supports OAuth). 
+ +### `codex mcp-server` + +Run Codex as an MCP server over stdio so that other tools can connect. This command inherits global configuration overrides and exits when the downstream client closes the connection. + +### `codex resume` + +Continue an interactive session by ID or resume the most recent conversation. `codex resume` scopes `--last` to the current working directory unless you pass `--all`. It accepts the same global flags as `codex`, including model and sandbox overrides. + + + +### `codex fork` + +Fork a previous interactive session into a new thread. By default, `codex fork` opens the session picker; add `--last` to fork your most recent session instead. + + + +### `codex sandbox` + +Use the sandbox helper to run a command under the same policies Codex uses internally. + +#### macOS seatbelt + + + +#### Linux Landlock + + + +## Flag combinations and safety tips + +- Set `--full-auto` for unattended local work, but avoid combining it with `--dangerously-bypass-approvals-and-sandbox` unless you are inside a dedicated sandbox VM. +- When you need to grant Codex write access to more directories, prefer `--add-dir` rather than forcing `--sandbox danger-full-access`. +- Pair `--json` with `--output-last-message` in CI to capture machine-readable progress and a final natural-language summary. + +## Related resources + +- [Codex CLI overview](https://developers.openai.com/codex/cli): installation, upgrades, and quick tips. +- [Config basics](https://developers.openai.com/codex/config-basic): persist defaults like the model and provider. +- [Advanced Config](https://developers.openai.com/codex/config-advanced): profiles, providers, sandbox tuning, and integrations. +- [AGENTS.md](https://developers.openai.com/codex/guides/agents-md): conceptual overview of Codex agent capabilities and best practices. 
\ No newline at end of file diff --git a/specs/smoke/html-game-planning.md b/specs/smoke/html-game-planning.md new file mode 100644 index 0000000..cdd89b8 --- /dev/null +++ b/specs/smoke/html-game-planning.md @@ -0,0 +1,102 @@ +# Bun HTML Game Planning Smoke + +Manual smoke scenario for exercising: + +- multi-step planning +- pre-execution task-graph review +- consultation before execution +- execution against a local git repo in `tmp/` +- optional multi-agent prompt guidance + +The generated project stays local and gitignored under `tmp/smoke/html-game/`. + +## Workspace setup + +```bash +mkdir -p tmp/smoke/html-game +cd tmp/smoke/html-game +bun init -y +git init +git add . +git commit -m "baseline" +``` + +If you want to exercise the multi-agent prompt path, enable one of these before running Orca: + +- set `codex.multiAgent: true` in the Orca config you use for the smoke run +- or ensure `~/.codex/config.toml` contains: + +```toml +[features] +multi_agent = true +``` + +## Local spec file + +Copy the spec below into `tmp/smoke/html-game/SMOKE_SPEC.md`, then run Orca from inside `tmp/smoke/html-game/`. + +```md +# Build a Tiny HTML Arcade Game + +Create a small single-page browser game in this Bun project using plain HTML, CSS, and JavaScript. + +## Goal + +Build a simple arcade-style game that is real enough to require planning and task coordination, but still small enough to finish in one run. + +## Requirements + +- Use vanilla HTML, CSS, and JavaScript only. No framework. +- The game must be playable in the browser from local files or a tiny local static server. +- Include a visible start state, active gameplay state, and game-over state. +- Include a restart flow so the player can immediately play again after losing. +- Include score tracking that visibly updates during play. +- Include keyboard input for movement. +- Include moving obstacles, enemies, or hazards with collision detection. 
+- Include lightweight styling so the game feels intentional, not raw browser defaults. +- Include a short on-screen explanation of controls and objective. +- Keep code organized enough that multiple implementation tasks could reasonably be split across files or concerns. + +## Suggested game shape + +Aim for a tiny dodge-or-collect game such as: + +- move a player square or ship +- avoid falling hazards or collect targets +- increase score over time or on pickups +- end the run on collision + +You do not need sound, assets, backend code, or external libraries. + +## Verification + +- Add a lightweight local verification step and run it before finishing. +- The verification can be a small Bun test, a script, or another local check that proves the required files and core game states exist. +- Keep verification simple and local. + +## Acceptance criteria + +- The project contains the files needed to run the game locally. +- A human can open the game and play it with the keyboard. +- Score, collision handling, game over, and restart all work. +- The local verification step passes. +- Keep the implementation simple and behavior-preserving relative to the spec. +``` + +## Manual run flow + +From `tmp/smoke/html-game/`: + +```bash +orca plan --spec ./SMOKE_SPEC.md +orca run --spec ./SMOKE_SPEC.md +``` + +## Manual acceptance + +- `orca plan` should produce a multi-task graph, not the single fallback execution task. +- The planned graph should show clear ownership boundaries and only necessary dependencies. +- If multi-agent is enabled, the graph should favor safe parallelizable task breakdown instead of bundled do-everything tasks. +- The review/consultation steps should complete without a hard blocking failure. +- `orca run` should finish with a playable local HTML game. +- The verification step created by the run should pass. 
diff --git a/src/agents/codex/codex-path.test.ts b/src/agents/codex/codex-path.test.ts new file mode 100644 index 0000000..9ff7a96 --- /dev/null +++ b/src/agents/codex/codex-path.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, test } from "bun:test"; + +import { + clearResolvedCodexPathCacheForTests, + compareCodexCliVersions, + parseCodexCliVersion, + selectPreferredCodexBinary, +} from "./codex-path.js"; + +describe("codex-path", () => { + test("parseCodexCliVersion handles stable releases", () => { + expect(parseCodexCliVersion("codex-cli 0.77.0")).toEqual({ + major: 0, + minor: 77, + patch: 0, + prerelease: [], + raw: "codex-cli 0.77.0", + }); + }); + + test("parseCodexCliVersion handles prereleases", () => { + expect(parseCodexCliVersion("codex-cli 0.115.0-alpha.4")).toEqual({ + major: 0, + minor: 115, + patch: 0, + prerelease: ["alpha", 4], + raw: "codex-cli 0.115.0-alpha.4", + }); + }); + + test("compareCodexCliVersions prefers newer minors", () => { + const older = parseCodexCliVersion("codex-cli 0.77.0"); + const newer = parseCodexCliVersion("codex-cli 0.115.0-alpha.4"); + + expect(older).not.toBeNull(); + expect(newer).not.toBeNull(); + expect(compareCodexCliVersions(newer!, older!)).toBeGreaterThan(0); + }); + + test("compareCodexCliVersions prefers stable over prerelease for same numeric version", () => { + const prerelease = parseCodexCliVersion("codex-cli 0.115.0-alpha.4"); + const stable = parseCodexCliVersion("codex-cli 0.115.0"); + + expect(prerelease).not.toBeNull(); + expect(stable).not.toBeNull(); + expect(compareCodexCliVersions(stable!, prerelease!)).toBeGreaterThan(0); + }); + + test("selectPreferredCodexBinary prefers the newest parsed version", () => { + expect( + selectPreferredCodexBinary([ + { path: "/usr/local/bin/codex", versionOutput: "codex-cli 0.77.0" }, + { + path: "/Applications/Codex.app/Contents/Resources/codex", + versionOutput: "codex-cli 0.115.0-alpha.4", + }, + ]), + 
const execFile = promisify(execFileCallback);

/** Bare command name returned when no concrete binary is found; lookup is then left to whoever spawns it. */
const FALLBACK_CODEX_PATH = "codex";

/** Fixed install locations probed in addition to the first `codex` entry found on PATH. */
const KNOWN_CODEX_BINARY_CANDIDATES = [
  "/Applications/Codex.app/Contents/Resources/codex",
  "/opt/homebrew/bin/codex",
  "/usr/local/bin/codex",
] as const;

/** Structured form of a `codex-cli X.Y.Z[-prerelease]` version banner. */
export interface ParsedCodexCliVersion {
  major: number;
  minor: number;
  patch: number;
  /** Dot-separated prerelease identifiers; all-digit identifiers are stored as numbers. Empty for stable releases. */
  prerelease: Array<number | string>;
  /** The exact substring that matched the version pattern. */
  raw: string;
}

/** A candidate binary together with whatever `--version` printed (or `null` if that could not be read). */
export interface CodexBinaryProbe {
  path: string;
  versionOutput: string | null;
}

/** Memoized auto-resolution result; shared by every caller in this process. */
let cachedResolvedCodexPath: Promise<string> | null = null;

/** Drops the memoized auto-resolution so the next `resolveCodexPath()` call probes again. */
export function clearResolvedCodexPathCacheForTests(): void {
  cachedResolvedCodexPath = null;
}

/**
 * Extracts the version from `codex --version` style output.
 *
 * @param output Raw text that may contain `codex-cli <major>.<minor>.<patch>[-<prerelease>]`.
 * @returns The parsed version, or `null` when the pattern is not present.
 */
export function parseCodexCliVersion(output: string): ParsedCodexCliVersion | null {
  const match = output.match(/codex-cli\s+(\d+)\.(\d+)\.(\d+)(?:-([A-Za-z0-9.-]+))?/i);
  if (!match) {
    return null;
  }

  const prereleaseText = match[4];
  const prerelease = prereleaseText
    ? prereleaseText.split(".").map((part) => (/^\d+$/.test(part) ? Number(part) : part))
    : [];

  return {
    major: Number(match[1]),
    minor: Number(match[2]),
    patch: Number(match[3]),
    prerelease,
    raw: match[0],
  };
}

/** Orders two prerelease identifiers: numbers numerically, numbers below strings, strings by code unit. */
function comparePrereleaseIdentifiers(left: number | string, right: number | string): number {
  if (left === right) {
    return 0;
  }

  if (typeof left === "number") {
    return typeof right === "number" ? left - right : -1;
  }

  if (typeof right === "number") {
    return 1;
  }

  // Plain code-unit comparison rather than localeCompare: the result must not
  // depend on the host locale, and SemVer orders identifiers in ASCII order.
  return left < right ? -1 : 1;
}

/**
 * Compares two parsed versions.
 *
 * Numeric components are compared first. For equal numeric versions a stable
 * release ranks above any prerelease, and prereleases are compared identifier
 * by identifier, with the shorter list ranking lower when all shared
 * identifiers are equal.
 *
 * @returns A negative number if `a` is older than `b`, positive if newer, `0` if equal.
 */
export function compareCodexCliVersions(a: ParsedCodexCliVersion, b: ParsedCodexCliVersion): number {
  for (const key of ["major", "minor", "patch"] as const) {
    if (a[key] !== b[key]) {
      return a[key] - b[key];
    }
  }

  const aIsStable = a.prerelease.length === 0;
  const bIsStable = b.prerelease.length === 0;
  if (aIsStable || bIsStable) {
    if (aIsStable && bIsStable) {
      return 0;
    }
    return aIsStable ? 1 : -1;
  }

  const maxLength = Math.max(a.prerelease.length, b.prerelease.length);
  for (let index = 0; index < maxLength; index += 1) {
    const left = a.prerelease[index];
    const right = b.prerelease[index];

    // Running out of identifiers first means lower precedence.
    if (left === undefined) {
      return -1;
    }
    if (right === undefined) {
      return 1;
    }

    const order = comparePrereleaseIdentifiers(left, right);
    if (order !== 0) {
      return order;
    }
  }

  return 0;
}

/**
 * Picks the binary with the newest parseable version.
 *
 * Probes with a blank path are ignored. A probe whose version cannot be parsed
 * never displaces one whose version can; if no probe has a parseable version
 * the first usable probe wins. Ties keep the earlier probe.
 *
 * @returns The chosen path, or `null` when there is no usable probe.
 */
export function selectPreferredCodexBinary(probes: CodexBinaryProbe[]): string | null {
  let best: CodexBinaryProbe | null = null;
  let bestVersion: ParsedCodexCliVersion | null = null;

  for (const probe of probes) {
    if (probe.path.trim().length === 0) {
      continue;
    }

    const version = parseCodexCliVersion(probe.versionOutput ?? "");
    if (best === null) {
      best = probe;
      bestVersion = version;
      continue;
    }

    if (version === null) {
      continue;
    }

    if (bestVersion === null || compareCodexCliVersions(version, bestVersion) > 0) {
      best = probe;
      bestVersion = version;
    }
  }

  return best?.path ?? null;
}

/** Returns the first executable `codex` found by walking the PATH entries in order, or `null`. */
function resolveCodexPathOnPath(): string | null {
  const pathValue = process.env.PATH?.trim();
  if (!pathValue) {
    return null;
  }

  for (const entry of pathValue.split(path.delimiter)) {
    const directory = entry.trim();
    if (directory.length === 0) {
      continue;
    }

    const candidatePath = path.join(directory, "codex");
    try {
      accessSync(candidatePath, fsConstants.X_OK);
      return candidatePath;
    } catch {
      // Missing or not executable in this directory; try the next PATH entry.
      continue;
    }
  }

  return null;
}

/** Builds the de-duplicated probe list: the PATH hit (if any) first, then the known locations. */
function getCandidatePaths(): string[] {
  return Array.from(
    new Set(
      [resolveCodexPathOnPath(), ...KNOWN_CODEX_BINARY_CANDIDATES].filter(
        (value): value is string => typeof value === "string" && value.trim().length > 0,
      ),
    ),
  );
}

/** Resolves to `true` when the current process has execute permission on `filePath`. */
async function isExecutable(filePath: string): Promise<boolean> {
  try {
    await access(filePath, fsConstants.X_OK);
    return true;
  } catch {
    return false;
  }
}

/**
 * Runs `<filePath> --version` and returns its combined stdout/stderr text.
 *
 * Best effort by design: a failed or timed-out invocation yields `null` so a
 * broken candidate cannot abort resolution of the others.
 */
async function readCodexCliVersion(filePath: string): Promise<string | null> {
  try {
    const { stdout, stderr } = await execFile(filePath, ["--version"], {
      timeout: 1_500,
    });
    const output = `${stdout ?? ""}\n${stderr ?? ""}`.trim();
    return output.length > 0 ? output : null;
  } catch {
    return null;
  }
}

/** Probes every candidate concurrently and returns the preferred one, or the bare fallback name. */
async function autoResolveCodexPath(): Promise<string> {
  const candidates = getCandidatePaths();
  const available = await Promise.all(
    candidates.map(async (candidatePath) => {
      if (!(await isExecutable(candidatePath))) {
        return null;
      }

      return {
        path: candidatePath,
        versionOutput: await readCodexCliVersion(candidatePath),
      } satisfies CodexBinaryProbe;
    }),
  );

  const preferred = selectPreferredCodexBinary(
    available.filter((probe): probe is CodexBinaryProbe => probe !== null),
  );

  return preferred ?? FALLBACK_CODEX_PATH;
}

/**
 * Resolves the Codex binary to launch.
 *
 * A non-blank `ORCA_CODEX_PATH` is returned as-is (trimmed, not validated) and
 * is re-read on every call. Otherwise the auto-detected path is computed once
 * and the same promise is handed to all later callers.
 */
export async function resolveCodexPath(): Promise<string> {
  const explicitPath = process.env.ORCA_CODEX_PATH?.trim();
  if (explicitPath && explicitPath.length > 0) {
    return explicitPath;
  }

  cachedResolvedCodexPath ??= autoResolveCodexPath();
  return cachedResolvedCodexPath;
}
"../../core/task-graph-review.js"; import type { CodexEffort } from "../../types/effort.js"; import { loadSkills, type LoadedSkill } from "../../utils/skill-loader.js"; +import { logger } from "../../utils/logger.js"; +import { resolveCodexPath } from "./codex-path.js"; export type { PlanResult, TaskExecutionResult }; @@ -28,12 +32,26 @@ function getCodeSimplifierGuidance(): string[] { ]; } -function buildPlanningPrompt(spec: string, systemContext: string): string { +function getMultiAgentPlanningGuidance(multiAgentActive: boolean): string[] { + if (!multiAgentActive) { + return []; + } + + return [ + "Codex multi-agent mode is enabled for this run. Shape the task graph so safe subagent parallelization is obvious.", + "Assign clear file or subsystem ownership per task so subagents do not step on each other.", + "Only add dependencies that are truly required for correctness.", + "Do not bundle unrelated work into a single do-everything task when it can be safely split.", + ]; +} + +function buildPlanningPrompt(spec: string, systemContext: string, multiAgentActive: boolean): string { return [ systemContext, "You are decomposing a spec into an ordered task graph.", "Prefer task decomposition that maximizes safe parallelism for independent workstreams.", "Isolate task ownership (files/subsystems) to avoid cross-task collisions.", + ...getMultiAgentPlanningGuidance(multiAgentActive), ...getCodeSimplifierGuidance(), "Return a JSON array of tasks.", "Each task must include fields: id, name, description, dependencies, acceptance_criteria, status, retries, maxRetries.", @@ -51,11 +69,20 @@ function buildTaskExecutionPrompt( runId: string, cwd: string, systemContext?: string, + multiAgentActive = false, ): string { return [ ...(systemContext ? [systemContext] : []), "You are Orca's task execution assistant.", ...getCodeSimplifierGuidance(), + ...(multiAgentActive + ? 
[ + "Codex multi-agent mode is enabled for this run.", + "If this task contains clearly independent subtasks with disjoint ownership, use subagents to parallelize them.", + "Do not use subagents for tightly coupled, blocking, or highly stateful work.", + "Integrate subagent results yourself before final completion.", + ] + : []), `Run ID: ${runId}`, `Repository CWD: ${cwd}`, `Task ID: ${task.id}`, @@ -88,11 +115,20 @@ function buildPlanDecisionPrompt(spec: string, systemContext: string): string { ].join("\n\n"); } -function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string): string { +function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string, multiAgentActive: boolean): string { return [ systemContext, "You are Orca's pre-execution task-graph reviewer.", ...getCodeSimplifierGuidance(), + ...(multiAgentActive + ? [ + "Codex multi-agent mode is enabled for this run. Review the graph for safe subagent parallelization.", + "Split independent work into separate tasks when subagents could execute it in parallel.", + "Remove fake dependencies that unnecessarily serialize independent work.", + "Flag ownership collisions where multiple tasks would touch the same files or subsystem without coordination.", + "Add coordination tasks when parallel work needs a final integration step.", + ] + : []), "Return JSON matching this shape exactly: {\"changes\":[...operations...]}", "Allowed operation shapes:", "- {\"op\":\"update_task\",\"taskId\":\"...\",\"fields\":{\"name\"?:string,\"description\"?:string,\"acceptance_criteria\"?:string[]}}", @@ -106,6 +142,32 @@ function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string): strin ].join("\n\n"); } +function buildTaskGraphConsultationPrompt(tasks: Task[], multiAgentActive: boolean): string { + const taskGraphJson = JSON.stringify(tasks, null, 2); + + return [ + "Review this Orca task graph before execution.", + "Flag any: missing steps, wrong dependency order, tasks that are underdefined, or 
potential blockers.", + ...(multiAgentActive + ? [ + "", + "Codex multi-agent mode is enabled for this run.", + "Treat missed safe parallelism, fake dependencies, overlapping ownership, or missing integration tasks as review concerns.", + "Flag tasks that should be split for safe subagent execution, or tasks that would cause subagents to step on each other.", + ] + : []), + "", + "Set ok: false ONLY if there is a hard blocking issue — dependency cycle, circular reference, a task that cannot possibly run as defined, or a critical missing step that would cause the run to fail.", + "For minor issues (ambiguous wording, style preferences, nice-to-haves): list them in issues but set ok: true.", + "If the graph looks generally reasonable and executable, set ok: true even if you have minor suggestions.", + "", + "Be brief. Output JSON on the last line: { \"issues\": [...], \"ok\": boolean }", + "", + "Task graph:", + taskGraphJson, + ].join("\n"); +} + function parseTaskGraphReview(raw: string): TaskGraphReviewResult { const parsed = JSON.parse(extractJson(raw)) as unknown; const result = TaskGraphReviewPayloadSchema.safeParse(parsed); @@ -289,18 +351,12 @@ function getModel(config?: OrcaConfig): string { return config?.codex?.model ?? process.env.ORCA_CODEX_MODEL ?? "gpt-5.3-codex"; } -function getCodexPath(): string { - return ( - process.env.ORCA_CODEX_PATH ?? 
- `${process.env.HOME}/.nvm/versions/node/v22.22.0/bin/codex` - ); -} - -type ThinkingStep = "decision" | "planning" | "execution"; +type ThinkingStep = "decision" | "planning" | "review" | "execution"; const DEFAULT_THINKING_BY_STEP: Record = { decision: "low", planning: "high", + review: "high", execution: "medium", }; @@ -317,15 +373,26 @@ function getEffort(config: OrcaConfig | undefined, step: ThinkingStep): CodexEff return DEFAULT_THINKING_BY_STEP[step]; } -function buildTurnInput(text: string, skills: LoadedSkill[]): Array<{ type: "text"; text: string } | { type: "skill"; name: string; path: string }> { - return [ - { type: "text", text }, - ...skills.map((skill) => ({ - type: "skill" as const, - name: skill.name, - path: skill.dirPath, - })), - ]; +function buildTurnInput(text: string, skills: LoadedSkill[]): Array<{ type: "text"; text: string }> { + const usableSkills = skills.filter((skill) => skill.body.trim().length > 0); + if (usableSkills.length === 0) { + return [{ type: "text", text }]; + } + + const skillContext = usableSkills.map((skill) => [ + `Skill: ${skill.name}`, + `Source: ${skill.filePath}`, + skill.body.trim(), + ].join("\n")).join("\n\n"); + + return [{ + type: "text", + text: [ + text, + "Referenced Orca skills:", + skillContext, + ].join("\n\n"), + }]; } interface RawSkill { @@ -367,22 +434,36 @@ function getPerCwdExtraUserRootsForCwd(config: OrcaConfig | undefined, cwd: stri } async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: OrcaConfig): Promise { - const maybeRequest = Reflect.get(client as object, "request"); - if (typeof maybeRequest !== "function") { - return []; - } - - const request = maybeRequest as (this: unknown, method: string, params?: unknown, timeoutMs?: number) => Promise; - const perCwdExtraUserRoots = getPerCwdExtraUserRootsForCwd(config, cwd); let response: unknown; try { - response = await request.call(client, "skills/list", { - cwds: [cwd], - forceReload: true, - 
...(perCwdExtraUserRoots.length > 0 ? { perCwdExtraUserRoots } : {}), - }); + const maybeListSkills = Reflect.get(client as object, "listSkills"); + if (typeof maybeListSkills === "function") { + response = await maybeListSkills.call(client, { + cwds: [cwd], + forceReload: true, + ...(perCwdExtraUserRoots.length > 0 ? { perCwdExtraUserRoots } : {}), + }); + } else { + const maybeRequest = Reflect.get(client as object, "request"); + if (typeof maybeRequest !== "function") { + return []; + } + + const request = maybeRequest as ( + this: unknown, + method: string, + params?: unknown, + timeoutMs?: number + ) => Promise; + + response = await request.call(client, "skills/list", { + cwds: [cwd], + forceReload: true, + ...(perCwdExtraUserRoots.length > 0 ? { perCwdExtraUserRoots } : {}), + }); + } } catch { return []; } @@ -408,10 +489,17 @@ async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: continue; } + let skillBody = ""; + try { + skillBody = await readFile(normalizedSkillPath, "utf8"); + } catch { + skillBody = ""; + } + discovered.push({ name: skill.name, description: "", - body: "", + body: skillBody, dirPath: path.dirname(normalizedSkillPath), filePath: normalizedSkillPath, }); @@ -459,6 +547,99 @@ async function resolveTurnSkills(client: CodexClient, config: OrcaConfig | undef return [...mergedByName.values()]; } +function extractUnknownFeatureKey(line: string): string | null { + const match = line.match(/unknown feature key in config:\s*([A-Za-z0-9_.-]+)/i); + return match?.[1] ?? 
null; +} + +function isIgnorableMcpStderrLine(line: string): boolean { + return ( + line.includes("codex_rmcp_client::oauth: failed to read OAuth tokens from keyring") || + line.includes("rmcp::transport::worker: worker quit with fatal: Transport channel closed, when AuthRequired(") || + line.includes("codex_core::mcp_connection_manager: Failed to list resources for MCP server") || + line.includes("codex_core::mcp_connection_manager: Failed to list resource templates for MCP server") || + line.includes("codex_core::shell_snapshot: Failed to delete shell snapshot") || + line.includes("codex_rmcp_client::rmcp_client: Failed to kill MCP process group") || + line.includes("codex_protocol::openai_models: Model personality requested but model_messages is missing") + ); +} + +function attachCodexStderrDiagnostics(client: CodexClient, codexPath: string): void { + const on = Reflect.get(client as object, "on"); + if (typeof on !== "function") { + return; + } + + const reportedLines = new Set(); + const reportedUnsupportedFeatures = new Set(); + + on.call(client, "stderr", (payload: unknown) => { + const line = String(payload).trim(); + if (line.length === 0) { + return; + } + + const unsupportedFeature = extractUnknownFeatureKey(line); + if (unsupportedFeature) { + if (!reportedUnsupportedFeatures.has(unsupportedFeature)) { + reportedUnsupportedFeatures.add(unsupportedFeature); + logger.warn( + `Codex binary ${codexPath} does not support feature '${unsupportedFeature}'. 
Orca will continue, but you should update Codex or point ORCA_CODEX_PATH at a newer binary.`, + ); + } + return; + } + + if (isIgnorableMcpStderrLine(line)) { + return; + } + + if (reportedLines.has(line)) { + return; + } + + reportedLines.add(line); + logger.warn(`Codex app-server: ${line}`); + }); +} + +async function warnAboutUnavailableMcpServers(client: CodexClient): Promise { + const request = Reflect.get(client as object, "request"); + if (typeof request !== "function") { + return; + } + + let response: unknown; + try { + response = await request.call(client, "mcpServerStatus/list", { limit: 50 }, 10_000); + } catch { + return; + } + + if (!response || typeof response !== "object" || !("data" in response) || !Array.isArray(response.data)) { + return; + } + + const unavailableServers = response.data + .filter((entry): entry is { name: string; authStatus: string } => + !!entry && + typeof entry === "object" && + typeof (entry as { name?: unknown }).name === "string" && + typeof (entry as { authStatus?: unknown }).authStatus === "string", + ) + .filter((entry) => entry.authStatus === "notLoggedIn") + .map((entry) => entry.name); + + if (unavailableServers.length === 0) { + return; + } + + const loginCommands = unavailableServers.map((name) => `codex mcp login ${name}`).join(" ; "); + logger.warn( + `Configured Codex MCP servers need login and will be unavailable for this Orca run: ${unavailableServers.join(", ")}. Orca will continue without them. Run ${loginCommands} or disable them in ~/.codex/config.toml if you do not need them.`, + ); +} + /** * Create a persistent Codex session. The thread persists across calls — * planSpec and executeTask share context within the same session. 
@@ -478,19 +659,24 @@ export async function createCodexSession( executeTask: (task: Task, runId: string, systemContext?: string) => Promise; consultTaskGraph: (tasks: Task[]) => Promise; reviewChanges: (threadId?: string) => Promise; - runPrompt: (prompt: string) => Promise; + runPrompt: (prompt: string, step?: ThinkingStep) => Promise; disconnect: () => Promise; threadId: string; }> { + const multiAgentActive = await isCodexMultiAgentActive(config); + const codexPath = await resolveCodexPath(); + const client = new CodexClient({ - codexPath: getCodexPath(), + codexPath, model: getModel(config), cwd, approvalPolicy: "never", sandbox: "workspace-write", }); + attachCodexStderrDiagnostics(client, codexPath); await client.connect(); + await warnAboutUnavailableMcpServers(client); let skills: LoadedSkill[]; let threadId: string; @@ -524,7 +710,7 @@ export async function createCodexSession( const result = await client.runTurn({ threadId, effort: getEffort(config, "planning"), - input: buildTurnInput(buildPlanningPrompt(spec, systemContext), skills), + input: buildTurnInput(buildPlanningPrompt(spec, systemContext, multiAgentActive), skills), }); const rawResponse = extractAgentText(result); @@ -538,8 +724,8 @@ export async function createCodexSession( async reviewTaskGraph(tasks: Task[], systemContext: string): Promise { const result = await client.runTurn({ threadId, - effort: getEffort(config, "planning"), - input: buildTurnInput(buildTaskGraphReviewPrompt(tasks, systemContext), skills), + effort: getEffort(config, "review"), + input: buildTurnInput(buildTaskGraphReviewPrompt(tasks, systemContext, multiAgentActive), skills), }); const rawResponse = extractAgentText(result); @@ -554,7 +740,7 @@ export async function createCodexSession( const result = await client.runTurn({ threadId, effort: getEffort(config, "execution"), - input: buildTurnInput(buildTaskExecutionPrompt(task, runId, cwd, systemContext), skills), + input: buildTurnInput(buildTaskExecutionPrompt(task, 
runId, cwd, systemContext, multiAgentActive), skills), }); const rawResponse = extractAgentText(result); @@ -580,24 +766,11 @@ export async function createCodexSession( }, async consultTaskGraph(tasks: Task[]): Promise { - const taskGraphJson = JSON.stringify(tasks, null, 2); - const prompt = [ - "Review this Orca task graph before execution.", - "Flag any: missing steps, wrong dependency order, tasks that are underdefined, or potential blockers.", - "", - "Set ok: false ONLY if there is a hard blocking issue — dependency cycle, circular reference, a task that cannot possibly run as defined, or a critical missing step that would cause the run to fail.", - "For minor issues (ambiguous wording, style preferences, nice-to-haves): list them in issues but set ok: true.", - "If the graph looks generally reasonable and executable, set ok: true even if you have minor suggestions.", - "", - "Be brief. Output JSON on the last line: { \"issues\": [...], \"ok\": boolean }", - "", - "Task graph:", - taskGraphJson, - ].join("\n"); + const prompt = buildTaskGraphConsultationPrompt(tasks, multiAgentActive); const result = await client.runTurn({ threadId, - effort: getEffort(config, "decision"), + effort: getEffort(config, "review"), input: buildTurnInput(prompt, skills), }); @@ -628,10 +801,10 @@ export async function createCodexSession( return result.reviewText; }, - async runPrompt(prompt: string): Promise { + async runPrompt(prompt: string, step: ThinkingStep = "execution"): Promise { const result = await client.runTurn({ threadId, - effort: getEffort(config, "execution"), + effort: getEffort(config, step), input: buildTurnInput(prompt, skills), }); diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 4f74513..7a755bf 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -1,10 +1,66 @@ +import { mkdtemp, mkdir, writeFile } from "node:fs/promises"; +import os from "node:os"; +import path from 
"node:path"; import { afterEach, describe, expect, mock, test } from "bun:test"; afterEach(() => { mock.restore(); }); +function mockMultiAgentDetection(active = false): void { + mock.module("../../core/codex-config.js", () => ({ + isCodexMultiAgentActive: async () => active, + })); +} + describe("codex session effort wiring", () => { + test("uses ORCA_CODEX_PATH override and otherwise resolves a default Codex binary", async () => { + const constructedOptions: Array<{ codexPath?: string }> = []; + const originalCodexPath = process.env.ORCA_CODEX_PATH; + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + constructor(options: { codexPath?: string }) { + constructedOptions.push(options); + } + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + + try { + delete process.env.ORCA_CODEX_PATH; + const defaultSession = await createCodexSession(process.cwd()); + await defaultSession.disconnect(); + + process.env.ORCA_CODEX_PATH = "/tmp/custom-codex"; + const overriddenSession = await createCodexSession(process.cwd()); + await overriddenSession.disconnect(); + + expect(constructedOptions[0]?.codexPath).toBeTruthy(); + expect(constructedOptions[1]?.codexPath).toBe("/tmp/custom-codex"); + } finally { + if (originalCodexPath === undefined) { + delete process.env.ORCA_CODEX_PATH; + } else { + process.env.ORCA_CODEX_PATH = originalCodexPath; + } + } + }); + test("passes configured effort into Codex runTurn", async () => { const runTurnMock = mock(async () => ({ agentMessage: "[]", @@ -12,6 +68,7 @@ describe("codex session effort wiring", () => { items: [], 
})); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -42,7 +99,7 @@ describe("codex session effort wiring", () => { } }); - test("smoke: uses per-step thinkingLevel values for decision/planning/execution turns", async () => { + test("smoke: uses per-step thinkingLevel values for decision/planning/review/execution turns", async () => { const efforts: string[] = []; const runTurnMock = mock(async (params: { effort?: string; input?: Array<{ text?: string }> }) => { efforts.push(params.effort ?? ""); @@ -55,6 +112,22 @@ describe("codex session effort wiring", () => { }; } + if (prompt.includes("pre-execution task-graph reviewer")) { + return { + agentMessage: '{"changes":[]}', + turn: { status: "completed" }, + items: [], + }; + } + + if (prompt.includes("Review this Orca task graph before execution.")) { + return { + agentMessage: '{"issues":[],"ok":true}', + turn: { status: "completed" }, + items: [], + }; + } + return { agentMessage: "[]", turn: { status: "completed" }, @@ -62,6 +135,7 @@ describe("codex session effort wiring", () => { }; }); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -86,6 +160,7 @@ describe("codex session effort wiring", () => { thinkingLevel: { decision: "low", planning: "xhigh", + review: "high", execution: "medium", }, }, @@ -94,6 +169,7 @@ describe("codex session effort wiring", () => { try { await session.decidePlanningNeed("spec", "context"); await session.planSpec("spec", "context"); + await session.reviewTaskGraph([], "context"); await session.executeTask( { id: "t1", @@ -108,8 +184,10 @@ describe("codex session effort wiring", () => { "run-1", "context", ); + await session.consultTaskGraph([]); + await session.runPrompt("review prompt", "review"); - expect(efforts).toEqual(["low", "xhigh", "medium"]); + expect(efforts).toEqual(["low", "xhigh", "high", "medium", "high", 
"high"]); } finally { await session.disconnect(); } @@ -139,6 +217,7 @@ describe("codex session code-simplifier guidance", () => { }; }); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -190,10 +269,185 @@ describe("codex session code-simplifier guidance", () => { }); }); +describe("codex session multi-agent prompt guidance", () => { + test("includes multi-agent guidance in planning, review, consultation, and execution prompts when active", async () => { + const prompts: string[] = []; + const runTurnMock = mock(async (params: { input?: Array<{ text?: string }> }) => { + const prompt = params.input?.[0]?.text ?? ""; + prompts.push(prompt); + + if (prompt.includes("pre-execution task-graph reviewer")) { + return { + agentMessage: '{"changes":[]}', + turn: { status: "completed" }, + items: [], + }; + } + + if (prompt.includes("Review this Orca task graph before execution.")) { + return { + agentMessage: '{"issues":[],"ok":true}', + turn: { status: "completed" }, + items: [], + }; + } + + return { + agentMessage: "[]", + turn: { status: "completed" }, + items: [], + }; + }); + + mockMultiAgentDetection(true); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + await session.planSpec("spec", "context"); + await session.reviewTaskGraph([], "context"); + await session.consultTaskGraph([]); + await session.executeTask( + { + id: "t1", + name: "Task", + 
description: "Do thing", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + const planningPrompt = prompts.find((prompt) => prompt.includes("You are decomposing a spec into an ordered task graph.")) ?? ""; + const reviewPrompt = prompts.find((prompt) => prompt.includes("You are Orca's pre-execution task-graph reviewer.")) ?? ""; + const consultationPrompt = prompts.find((prompt) => prompt.includes("Review this Orca task graph before execution.")) ?? ""; + const executionPrompt = prompts.find((prompt) => prompt.includes("You are Orca's task execution assistant.")) ?? ""; + + expect(planningPrompt).toContain("Codex multi-agent mode is enabled for this run. Shape the task graph so safe subagent parallelization is obvious."); + expect(planningPrompt).toContain("Do not bundle unrelated work into a single do-everything task when it can be safely split."); + + expect(reviewPrompt).toContain("Codex multi-agent mode is enabled for this run. 
Review the graph for safe subagent parallelization."); + expect(reviewPrompt).toContain("Flag ownership collisions where multiple tasks would touch the same files or subsystem without coordination."); + + expect(consultationPrompt).toContain("Codex multi-agent mode is enabled for this run."); + expect(consultationPrompt).toContain("Treat missed safe parallelism, fake dependencies, overlapping ownership, or missing integration tasks as review concerns."); + + expect(executionPrompt).toContain("Codex multi-agent mode is enabled for this run."); + expect(executionPrompt).toContain("If this task contains clearly independent subtasks with disjoint ownership, use subagents to parallelize them."); + expect(executionPrompt).toContain("Integrate subagent results yourself before final completion."); + } finally { + await session.disconnect(); + } + }); + + test("omits multi-agent guidance from planning, review, consultation, and execution prompts when inactive", async () => { + const prompts: string[] = []; + const runTurnMock = mock(async (params: { input?: Array<{ text?: string }> }) => { + const prompt = params.input?.[0]?.text ?? 
""; + prompts.push(prompt); + + if (prompt.includes("pre-execution task-graph reviewer")) { + return { + agentMessage: '{"changes":[]}', + turn: { status: "completed" }, + items: [], + }; + } + + if (prompt.includes("Review this Orca task graph before execution.")) { + return { + agentMessage: '{"issues":[],"ok":true}', + turn: { status: "completed" }, + items: [], + }; + } + + return { + agentMessage: "[]", + turn: { status: "completed" }, + items: [], + }; + }); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + await session.planSpec("spec", "context"); + await session.reviewTaskGraph([], "context"); + await session.consultTaskGraph([]); + await session.executeTask( + { + id: "t1", + name: "Task", + description: "Do thing", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + for (const prompt of prompts) { + expect(prompt).not.toContain("Codex multi-agent mode is enabled for this run."); + expect(prompt).not.toContain("use subagents to parallelize them"); + expect(prompt).not.toContain("safe subagent parallelization"); + } + } finally { + await session.disconnect(); + } + }); +}); + describe("codex session skill discovery", () => { test("calls skills/list with forceReload and perCwdExtraUserRoots", async () => { const requestMock = mock(async () => ({ data: [] })); + mockMultiAgentDetection(false); 
mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -235,10 +489,22 @@ describe("codex session skill discovery", () => { }); test("merges app-server listed skills after Orca-loaded skills without overriding deterministic precedence", async () => { - type TurnInputItem = { type: "text"; text: string } | { type: "skill"; name: string; path: string }; + type TurnInputItem = { type: "text"; text: string }; let capturedInput: TurnInputItem[] = []; - + const listedSkillsRoot = await mkdtemp(path.join(os.tmpdir(), "orca-listed-skills-")); + const alphaSkillPath = path.join(listedSkillsRoot, "alpha-skill", "SKILL.md"); + const codeSimplifierPath = path.join(listedSkillsRoot, "code-simplifier", "SKILL.md"); + const zetaSkillPath = path.join(listedSkillsRoot, "zeta-skill", "SKILL.md"); + + await mkdir(path.dirname(alphaSkillPath), { recursive: true }); + await mkdir(path.dirname(codeSimplifierPath), { recursive: true }); + await mkdir(path.dirname(zetaSkillPath), { recursive: true }); + await writeFile(alphaSkillPath, "alpha body", "utf8"); + await writeFile(codeSimplifierPath, "server code simplifier body", "utf8"); + await writeFile(zetaSkillPath, "zeta body", "utf8"); + + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -255,9 +521,9 @@ describe("codex session skill discovery", () => { data: [ { skills: [ - { name: "zeta-skill", path: "/srv/zeta/zeta-skill/SKILL.md" }, - { name: "code-simplifier", path: "/srv/override/code-simplifier/SKILL.md" }, - { name: "alpha-skill", path: "/srv/alpha/alpha-skill/SKILL.md" }, + { name: "zeta-skill", path: zetaSkillPath }, + { name: "code-simplifier", path: codeSimplifierPath }, + { name: "alpha-skill", path: alphaSkillPath }, ], }, ], @@ -287,22 +553,26 @@ describe("codex session skill discovery", () => { try { await session.planSpec("spec", "context"); - const skills = capturedInput.filter((item): item is { 
type: "skill"; name: string; path: string } => item.type === "skill"); - expect(skills).toEqual([ - { type: "skill", name: "code-simplifier", path: "/tmp/skills/code-simplifier" }, - { type: "skill", name: "alpha-skill", path: "/srv/alpha/alpha-skill" }, - { type: "skill", name: "zeta-skill", path: "/srv/zeta/zeta-skill" }, - ]); + const prompt = capturedInput[0]?.text ?? ""; + expect(prompt).toContain("Referenced Orca skills:"); + expect(prompt).toContain("Skill: code-simplifier"); + expect(prompt).toContain("Skill: alpha-skill"); + expect(prompt).toContain("Skill: zeta-skill"); + expect(prompt).toContain("body"); + expect(prompt).toContain("alpha body"); + expect(prompt).toContain("zeta body"); + expect(prompt).not.toContain("server code simplifier body"); } finally { await session.disconnect(); } }); }); -describe("codex session explicit skill input", () => { +describe("codex session inline skill context", () => { test("disconnects Codex client if skill loading fails during session creation", async () => { const disconnectMock = mock(async () => {}); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { async connect(): Promise {} @@ -331,8 +601,8 @@ describe("codex session explicit skill input", () => { expect(disconnectMock).toHaveBeenCalledTimes(1); }); - test("includes skill items with valid name/path alongside text input for every runTurn", async () => { - type TurnInputItem = { type: "text"; text: string } | { type: "skill"; name: string; path: string }; + test("includes inline skill context inside the text input for every runTurn", async () => { + type TurnInputItem = { type: "text"; text: string }; const runTurnCalls: Array<{ input?: TurnInputItem[] }> = []; const runTurnMock = mock(async (params: { input?: TurnInputItem[] }) => { @@ -362,6 +632,7 @@ describe("codex session explicit skill input", () => { }; }); + mockMultiAgentDetection(false); mock.module("@ratley/codex-client", () => ({ CodexClient: class { 
async connect(): Promise {} @@ -414,17 +685,14 @@ describe("codex session explicit skill input", () => { expect(runTurnCalls.length).toBe(5); for (const call of runTurnCalls) { - const textItem = call.input?.find((item) => item.type === "text"); - expect(textItem?.type).toBe("text"); - expect((textItem as { text?: string } | undefined)?.text).toBeTruthy(); - - const skillItems = call.input?.filter((item) => item.type === "skill") ?? []; - expect(skillItems).toHaveLength(1); - expect(skillItems[0]).toEqual({ - type: "skill", - name: "code-simplifier", - path: "/tmp/skills/code-simplifier", - }); + expect(call.input).toHaveLength(1); + + const text = call.input?.[0]?.text ?? ""; + expect(text).toBeTruthy(); + expect(text).toContain("Referenced Orca skills:"); + expect(text).toContain("Skill: code-simplifier"); + expect(text).toContain("Source: /tmp/skills/code-simplifier/SKILL.md"); + expect(text).toContain("body"); } } finally { await session.disconnect(); diff --git a/src/cli/commands/run-command.test-harness.ts b/src/cli/commands/run-command.test-harness.ts index 10afc4a..6587d6a 100644 --- a/src/cli/commands/run-command.test-harness.ts +++ b/src/cli/commands/run-command.test-harness.ts @@ -2,6 +2,7 @@ import { afterEach, beforeEach, mock } from "bun:test"; import { mkdtemp, rm } from "node:fs/promises"; import os from "node:os"; import path from "node:path"; +import { pathToFileURL } from "node:url"; import { Command } from "commander"; type RunModule = typeof import("./run.js"); @@ -25,18 +26,23 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH const originalRunsDir = process.env.ORCA_RUNS_DIR; const originalSkipValidators = process.env.ORCA_SKIP_VALIDATORS; const originalOpenaiApiKey = process.env.OPENAI_API_KEY; + const originalHome = process.env.HOME; + const originalCwd = process.cwd(); beforeEach(async () => { tempDir = await mkdtemp(path.join(os.tmpdir(), tempPrefix)); process.env.ORCA_RUNS_DIR = path.join(tempDir, 
"runs"); process.env.ORCA_SKIP_VALIDATORS = "1"; process.env.OPENAI_API_KEY = "test-openai-key"; + process.env.HOME = tempDir; + process.chdir(tempDir); process.exitCode = 0; }); afterEach(async () => { mock.restore(); process.exitCode = 0; + process.chdir(originalCwd); if (originalRunsDir === undefined) { delete process.env.ORCA_RUNS_DIR; } else { @@ -52,6 +58,11 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH } else { process.env.OPENAI_API_KEY = originalOpenaiApiKey; } + if (originalHome === undefined) { + delete process.env.HOME; + } else { + process.env.HOME = originalHome; + } await rm(tempDir, { recursive: true, force: true }); }); @@ -95,8 +106,14 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH } } - const { resolveConfig: realResolveConfig } = await import(`../../core/config-loader.js?real=${Math.random()}`); - const resolveConfigMock = mock((configPath?: string) => realResolveConfig(configPath)); + const resolveConfigMock = mock(async (configPath?: string) => { + if (configPath) { + const imported = await import(`${pathToFileURL(configPath).href}?test=${Math.random()}`); + return imported.default; + } + + return { executor: "codex" as const }; + }); const ensureCodexMultiAgentMock = mock(async () => ({ action: "skipped" as const, path: path.join(tempDir, "mock-codex-config.toml") diff --git a/src/cli/commands/run.postexec-json.integration.test.ts b/src/cli/commands/run.postexec-json.integration.test.ts index 58ca1cf..1aaabac 100644 --- a/src/cli/commands/run.postexec-json.integration.test.ts +++ b/src/cli/commands/run.postexec-json.integration.test.ts @@ -30,7 +30,8 @@ describe("post-exec reviewer JSON hardening integration", () => { expect(runPromptMock).toHaveBeenCalledTimes(2); expect(runPromptMock).toHaveBeenNthCalledWith( 2, - expect.stringContaining("previous post-execution review response was invalid") + expect.stringContaining("previous post-execution review response was 
invalid"), + "review" ); }); @@ -56,7 +57,8 @@ describe("post-exec reviewer JSON hardening integration", () => { expect(runPromptMock).toHaveBeenCalledTimes(2); expect(runPromptMock).toHaveBeenNthCalledWith( 2, - expect.stringContaining("Schema validation failed") + expect.stringContaining("Schema validation failed"), + "review" ); }); diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index b533131..0a1ad8d 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -520,7 +520,7 @@ export async function runCommandHandler(options: RunCommandOptions): Promise codexSession.runPrompt(prompt, "review"), cycleIndex, prompt, reviewConfig.prompt diff --git a/src/core/codex-config.test.ts b/src/core/codex-config.test.ts index 1cb76ab..b0dfcc9 100644 --- a/src/core/codex-config.test.ts +++ b/src/core/codex-config.test.ts @@ -3,7 +3,7 @@ import os from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "bun:test"; -import { ensureCodexMultiAgent } from "./codex-config.js"; +import { ensureCodexMultiAgent, isCodexMultiAgentActive } from "./codex-config.js"; let tmpDir: string; let tmpConfigFile: string; @@ -87,3 +87,34 @@ describe("ensureCodexMultiAgent", () => { expect(content).toContain("multi_agent = true"); }); }); + +describe("isCodexMultiAgentActive", () => { + it("returns true when Orca config enables multiAgent", async () => { + await expect(isCodexMultiAgentActive({ codex: { multiAgent: true } }, tmpConfigFile)).resolves.toBe(true); + }); + + it("returns true when root config enables multi_agent", async () => { + const fs = await import("node:fs/promises"); + await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = true\n", "utf8"); + + await expect(isCodexMultiAgentActive(undefined, tmpConfigFile)).resolves.toBe(true); + }); + + it("returns true when both Orca config and root config enable multi-agent", async () => { + const fs = await import("node:fs/promises"); + await 
fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = true\n", "utf8"); + + await expect(isCodexMultiAgentActive({ codex: { multiAgent: true } }, tmpConfigFile)).resolves.toBe(true); + }); + + it("returns false when Orca config is false and no root config enables multi-agent", async () => { + await expect(isCodexMultiAgentActive({ codex: { multiAgent: false } }, tmpConfigFile)).resolves.toBe(false); + }); + + it("returns false when root config contains multi_agent = false", async () => { + const fs = await import("node:fs/promises"); + await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = false\n", "utf8"); + + await expect(isCodexMultiAgentActive(undefined, tmpConfigFile)).resolves.toBe(false); + }); +}); diff --git a/src/core/codex-config.ts b/src/core/codex-config.ts index 6b893ba..2d18fe7 100644 --- a/src/core/codex-config.ts +++ b/src/core/codex-config.ts @@ -22,6 +22,75 @@ function containsMultiAgentSetting(content: string): boolean { return /multi_agent\s*=/.test(content); } +function isRootFeaturesSection(sectionPath: string[]): boolean { + return sectionPath.length === 1 && sectionPath[0] === "features"; +} + +function parseSectionPath(line: string): string[] | null { + const match = line.match(/^\[(.+)\]$/); + if (!match?.[1]) { + return null; + } + + return match[1] + .split(".") + .map((part) => part.trim().replace(/^"(.*)"$/, "$1")) + .filter((part) => part.length > 0); +} + +function hasEnabledRootMultiAgentSetting(content: string): boolean { + let currentSection: string[] = []; + + for (const rawLine of content.split(/\r?\n/u)) { + const line = rawLine.replace(/\s+#.*$/u, "").trim(); + if (line.length === 0 || line.startsWith("#")) { + continue; + } + + const sectionPath = parseSectionPath(line); + if (sectionPath !== null) { + currentSection = sectionPath; + continue; + } + + if (!isRootFeaturesSection(currentSection)) { + continue; + } + + const match = line.match(/^multi_agent\s*=\s*(true|false)\s*$/u); + if (!match?.[1]) { + continue; + } 
+ + return match[1] === "true"; + } + + return false; +} + +export async function isCodexMultiAgentActive( + config?: OrcaConfig, + _configFile?: string, +): Promise { + if (isMultiAgentEnabled(config)) { + return true; + } + + const configFile = _configFile ?? GLOBAL_CONFIG_FILE; + + let existingContent: string; + try { + existingContent = await readFile(configFile, "utf8"); + } catch (err) { + if ((err as NodeJS.ErrnoException).code === "ENOENT") { + return false; + } + throw err; + } + + return hasEnabledRootMultiAgentSetting(existingContent); +} + /** * Ensures `~/.codex/config.toml` has `multi_agent = true` set. * diff --git a/src/core/config-loader.test.ts b/src/core/config-loader.test.ts index af69e11..55f2ff2 100644 --- a/src/core/config-loader.test.ts +++ b/src/core/config-loader.test.ts @@ -3,7 +3,7 @@ import os from "node:os"; import path from "node:path"; import { promises as fs } from "node:fs"; -import { mergeConfigs, resolveConfig, resolveConfigFromPaths } from "./config-loader.js"; +import { mergeConfigs, resolveConfigFromPaths } from "./config-loader.js"; describe("config-loader", () => { let tempDir: string; @@ -26,14 +26,17 @@ describe("config-loader", () => { await fs.rm(tempDir, { recursive: true, force: true }); }); - test("resolveConfig prefers project ts config over project js config when both exist", async () => { - process.chdir(tempDir); - process.env.HOME = tempDir; - - await fs.writeFile(path.join(tempDir, "orca.config.js"), "export default { runsDir: 'from-js' };\n", "utf8"); - await fs.writeFile(path.join(tempDir, "orca.config.ts"), "export default { runsDir: 'from-ts' };\n", "utf8"); + test("resolveConfigFromPaths prefers project ts config over project js config when both exist", async () => { + const projectJsPath = path.join(tempDir, "orca.config.js"); + const projectTsPath = path.join(tempDir, "orca.config.ts"); + await fs.writeFile(projectJsPath, "export default { runsDir: 'from-js' };\n", "utf8"); + await 
fs.writeFile(projectTsPath, "export default { runsDir: 'from-ts' };\n", "utf8"); - const resolved = await resolveConfig(); + const resolved = await resolveConfigFromPaths( + path.join(tempDir, "missing-global.js"), + projectJsPath, + projectTsPath + ); expect(resolved?.runsDir).toBe("from-ts"); }); @@ -76,17 +79,17 @@ describe("config-loader", () => { expect(resolved?.sessionLogs).toBe("/tmp/orca-session-logs"); }); - test("resolveConfigFromPaths throws on invalid executor value", async () => { + test("resolveConfigFromPaths coerces stale executor values to codex", async () => { const cliPath = path.join(tempDir, "cli.config.js"); - await fs.writeFile(cliPath, "export default { executor: 'invalid-executor' };\n", "utf8"); + await fs.writeFile(cliPath, "export default { executor: 'claude' };\n", "utf8"); - await expect( - resolveConfigFromPaths( - path.join(tempDir, "missing-global.js"), - path.join(tempDir, "missing-project.js"), - cliPath - ) - ).rejects.toThrow("Config.executor must be 'codex', got invalid-executor"); + const resolved = await resolveConfigFromPaths( + path.join(tempDir, "missing-global.js"), + path.join(tempDir, "missing-project.js"), + cliPath + ); + + expect(resolved?.executor).toBe("codex"); }); test("resolveConfigFromPaths rejects unknown hookCommands keys", async () => { @@ -184,7 +187,7 @@ describe("config-loader", () => { const cliPath = path.join(tempDir, "cli.config.js"); await fs.writeFile( cliPath, - "export default { codex: { thinkingLevel: { decision: 'low', planning: 'xhigh', execution: 'medium' } } };\n", + "export default { codex: { thinkingLevel: { decision: 'low', planning: 'xhigh', review: 'high', execution: 'medium' } } };\n", "utf8" ); @@ -197,6 +200,7 @@ describe("config-loader", () => { expect(resolved?.codex?.thinkingLevel).toEqual({ decision: "low", planning: "xhigh", + review: "high", execution: "medium", }); }); @@ -367,6 +371,7 @@ describe("config-loader", () => { const projectConfig = { codex: { thinkingLevel: { + 
review: "high" as const, execution: "medium" as const, }, }, @@ -377,6 +382,7 @@ describe("config-loader", () => { expect(merged?.codex?.thinkingLevel).toEqual({ decision: "low", planning: "high", + review: "high", execution: "medium", }); }); diff --git a/src/core/config-loader.ts b/src/core/config-loader.ts index 8eb4d5f..faf6d50 100644 --- a/src/core/config-loader.ts +++ b/src/core/config-loader.ts @@ -97,16 +97,7 @@ function coerceConfig(candidate: unknown): OrcaConfig { } if ("executor" in candidate && candidate.executor !== undefined) { - if (candidate.executor !== "codex") { - const executorDisplay = - typeof candidate.executor === "string" - ? candidate.executor - : (JSON.stringify(candidate.executor) ?? describeType(candidate.executor)); - - throw new Error( - `Config.executor must be 'codex', got ${executorDisplay}` - ); - } + candidate.executor = "codex"; } if ("codex" in candidate && candidate.codex !== undefined) { @@ -129,7 +120,7 @@ function coerceConfig(candidate: unknown): OrcaConfig { throw new Error(`Config.codex.thinkingLevel must be an object, got ${describeType(candidate.codex.thinkingLevel)}`); } - for (const key of ["decision", "planning", "execution"] as const) { + for (const key of ["decision", "planning", "review", "execution"] as const) { const value = candidate.codex.thinkingLevel[key]; if (value !== undefined) { if (typeof value !== "string") { diff --git a/src/types/index.ts b/src/types/index.ts index 4de444f..e53f1e4 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -152,6 +152,7 @@ export interface OrcaConfig { thinkingLevel?: { decision?: CodexEffort; planning?: CodexEffort; + review?: CodexEffort; execution?: CodexEffort; }; command?: string; From 2ab121d2b37969bb93b0c7e3c104a485433555ef Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Mon, 16 Mar 2026 00:41:39 -0700 Subject: [PATCH 02/19] Implement Orca clarification question flow --- README.md | 4 +- session-logs/LATEST.md | 13 ++ src/agents/codex/session.ts | 227 
++++++++++++++++++++++++-- src/agents/codex/session.unit.test.ts | 179 +++++++++++++++++++- src/cli/commands/answer.test.ts | 135 ++++++++++++++- src/cli/commands/answer.ts | 44 ++++- src/cli/commands/help.ts | 2 + src/cli/commands/run.ts | 15 +- src/cli/commands/status.test.ts | 83 ++++++++++ src/cli/commands/status.ts | 7 + src/core/config-loader.ts | 1 + src/core/planner.ts | 35 +++- src/core/question-flow.ts | 157 ++++++++++++++++++ src/core/task-runner.ts | 8 +- src/state/schema.ts | 24 +++ src/state/store.ts | 2 +- src/types/config-typing.typecheck.ts | 4 + src/types/index.ts | 32 ++++ 18 files changed, 926 insertions(+), 46 deletions(-) create mode 100644 session-logs/LATEST.md create mode 100644 src/cli/commands/status.test.ts create mode 100644 src/core/question-flow.ts diff --git a/README.md b/README.md index 51a35de..0aa06b4 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ If a run hits `waiting_for_answer`, it's blocked until you respond: ```bash orca status --last # read the question -orca answer "yes, use migration A" # unblock it +orca answer "yes, use migration A" # answer and resume the live run ``` ### Spec / plan files @@ -247,7 +247,7 @@ orca setup Interactive setup wizard ### Hooks -Available hook names: `onMilestone`, `onTaskComplete`, `onTaskFail`, `onInvalidPlan`, `onFindings`, `onComplete`, `onError`. +Available hook names: `onMilestone`, `onQuestion`, `onTaskComplete`, `onTaskFail`, `onInvalidPlan`, `onFindings`, `onComplete`, `onError`. 
- Function hooks (`config.hooks`): receive `(event, context)` where `context = { cwd, pid, invokedAt }` - Command hooks (`config.hookCommands` / `--on-*` flags): receive full event JSON over stdin diff --git a/session-logs/LATEST.md b/session-logs/LATEST.md new file mode 100644 index 0000000..c009db1 --- /dev/null +++ b/session-logs/LATEST.md @@ -0,0 +1,13 @@ +# Session Log + +- Timestamp: 2026-03-16T07:40:30Z +- Scope: Codex clarification question flow, pending-question persistence, onQuestion hooks, answer/resume handling, and live CLI smoke coverage. +- Verification: + - `bun test src` + - `npm run typecheck:tsc` + - `npm run build` + - `bun test src/__tests__/client.test.ts src/__tests__/integration.test.ts` +- Notes: + - Orca now surfaces Codex `requestUserInput` prompts in `status.json`, `orca status`, and `onQuestion` hooks. + - `orca answer` writes structured answers and the original live run resumes without `orca resume`. + - The CLI smoke passed against a fake Codex app-server exercising `waiting_for_answer` end to end. 
diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index f853694..c907919 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -1,19 +1,32 @@ -import { readFile } from "node:fs/promises"; +import { readFile, unlink } from "node:fs/promises"; import path from "node:path"; import { CodexClient } from "@ratley/codex-client"; -import type { CompletedTurn } from "@ratley/codex-client"; +import type { + CompletedTurn, + RequestId, + ToolRequestUserInputParams, + ToolRequestUserInputResponse, +} from "@ratley/codex-client"; import type { + HookEvent, OrcaConfig, PlanResult, + RunId, Task, TaskExecutionResult, TaskGraphReviewOperation, TaskGraphReviewResult } from "../../types/index.js"; import { isCodexMultiAgentActive } from "../../core/codex-config.js"; +import { + buildQuestionHookMessage, + createPendingQuestion, + parseQuestionAnswerInput, +} from "../../core/question-flow.js"; import { TaskGraphReviewPayloadSchema } from "../../core/task-graph-review.js"; +import { RunStore } from "../../state/store.js"; import type { CodexEffort } from "../../types/effort.js"; import { loadSkills, type LoadedSkill } from "../../utils/skill-loader.js"; import { logger } from "../../utils/logger.js"; @@ -360,6 +373,8 @@ const DEFAULT_THINKING_BY_STEP: Record = { execution: "medium", }; +const ANSWER_FILE_POLL_MS = 500; + function getEffort(config: OrcaConfig | undefined, step: ThinkingStep): CodexEffort { const explicitThinkingLevel = config?.codex?.thinkingLevel?.[step]; if (explicitThinkingLevel !== undefined) { @@ -640,6 +655,39 @@ async function warnAboutUnavailableMcpServers(client: CodexClient): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +async function appendRunError( + store: RunStore, + runId: RunId, + message: string, + taskId?: string, +): Promise { + const run = await store.getRun(runId); + if (!run) { + return; + } + + await store.updateRun(runId, { + errors: [...run.errors, { at: new 
Date().toISOString(), message, ...(taskId ? { taskId } : {}) }], + }); +} + +async function clearAnswerFile(store: RunStore, runId: RunId): Promise { + const answerPath = path.join(store.getRunDir(runId), "answer.txt"); + await unlink(answerPath).catch(() => undefined); +} + +export interface SessionInteractionContext { + runId: RunId; + store: RunStore; + emitHook?: (event: HookEvent) => Promise; +} + /** * Create a persistent Codex session. The thread persists across calls — * planSpec and executeTask share context within the same session. @@ -652,6 +700,7 @@ export interface ConsultationResult { export async function createCodexSession( cwd: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise<{ decidePlanningNeed: (spec: string, systemContext: string) => Promise; planSpec: (spec: string, systemContext: string) => Promise; @@ -678,6 +727,152 @@ export async function createCodexSession( await client.connect(); await warnAboutUnavailableMcpServers(client); + let activeTaskContext: { taskId: string; taskName: string } | undefined; + + const respondToUserInputRequest = (requestId: RequestId, response: ToolRequestUserInputResponse): void => { + const specificResponder = Reflect.get(client as object, "respondToUserInputRequest"); + if (typeof specificResponder === "function") { + specificResponder.call(client, requestId, response); + return; + } + + const genericResponder = Reflect.get(client as object, "respondToServerRequest"); + if (typeof genericResponder === "function") { + genericResponder.call(client, requestId, response); + return; + } + + throw new Error("Codex client does not support responding to server requests"); + }; + + const rejectUserInputRequest = (requestId: RequestId, message: string): void => { + const rejector = Reflect.get(client as object, "rejectServerRequest"); + if (typeof rejector === "function") { + rejector.call(client, requestId, { code: -32603, message }); + return; + } + + throw new Error("Codex 
client does not support rejecting server requests"); + }; + + const clearPendingQuestion = async (requestId: RequestId, overallStatus: "running" | "waiting_for_answer"): Promise => { + if (!interactionContext) { + return; + } + + const currentRun = await interactionContext.store.getRun(interactionContext.runId); + if (!currentRun || currentRun.pendingQuestion?.requestId !== requestId) { + return; + } + + await interactionContext.store.updateRun(interactionContext.runId, { + overallStatus, + pendingQuestion: undefined, + }); + }; + + const on = Reflect.get(client as object, "on"); + if (typeof on === "function") { + on.call( + client, + "request:userInput", + (request: { requestId: RequestId } & ToolRequestUserInputParams) => { + void (async () => { + if (!interactionContext) { + rejectUserInputRequest( + request.requestId, + "Orca cannot answer Codex requestUserInput prompts without an interactive run context.", + ); + return; + } + + const pendingQuestion = createPendingQuestion(request.requestId, request); + await clearAnswerFile(interactionContext.store, interactionContext.runId); + await interactionContext.store.updateRun(interactionContext.runId, { + overallStatus: "waiting_for_answer", + pendingQuestion, + }); + + if (interactionContext.emitHook) { + await interactionContext.emitHook({ + runId: interactionContext.runId, + hook: "onQuestion", + message: buildQuestionHookMessage(pendingQuestion), + timestamp: pendingQuestion.receivedAt, + requestId: pendingQuestion.requestId, + threadId: pendingQuestion.threadId, + turnId: pendingQuestion.turnId, + itemId: pendingQuestion.itemId, + questions: pendingQuestion.questions, + ...(activeTaskContext + ? 
{ taskId: activeTaskContext.taskId, taskName: activeTaskContext.taskName } + : {}), + metadata: { + questionCount: pendingQuestion.questions.length, + }, + }); + } + + const answerPath = path.join(interactionContext.store.getRunDir(interactionContext.runId), "answer.txt"); + + while (true) { + const currentRun = await interactionContext.store.getRun(interactionContext.runId); + if (!currentRun) { + rejectUserInputRequest(request.requestId, `Run not found while waiting for answer: ${interactionContext.runId}`); + return; + } + + if (currentRun.overallStatus === "cancelled") { + rejectUserInputRequest(request.requestId, `Run ${interactionContext.runId} was cancelled while waiting for input.`); + await clearPendingQuestion(request.requestId, "waiting_for_answer"); + return; + } + + let rawAnswer: string; + try { + rawAnswer = await readFile(answerPath, "utf8"); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + await sleep(ANSWER_FILE_POLL_MS); + continue; + } + + throw error; + } + + try { + const parsedAnswer = parseQuestionAnswerInput(rawAnswer, pendingQuestion); + respondToUserInputRequest(request.requestId, parsedAnswer); + await clearAnswerFile(interactionContext.store, interactionContext.runId); + await clearPendingQuestion(request.requestId, "running"); + return; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + logger.warn(`Invalid answer for run ${interactionContext.runId}; waiting for another response (${message})`); + await appendRunError( + interactionContext.store, + interactionContext.runId, + `invalid-answer: ${message}`, + activeTaskContext?.taskId, + ); + await clearAnswerFile(interactionContext.store, interactionContext.runId); + } + } + })().catch(async (error) => { + const message = error instanceof Error ? 
error.message : String(error); + logger.warn(`Failed while handling Codex requestUserInput: ${message}`); + if (interactionContext) { + await appendRunError(interactionContext.store, interactionContext.runId, `request-user-input-failed: ${message}`, activeTaskContext?.taskId); + } + }); + }, + ); + + on.call(client, "serverRequest:resolved", (notification: { requestId: RequestId }) => { + void clearPendingQuestion(notification.requestId, "running"); + }); + } + let skills: LoadedSkill[]; let threadId: string; try { @@ -737,11 +932,17 @@ export async function createCodexSession( runId: string, systemContext?: string, ): Promise { - const result = await client.runTurn({ - threadId, - effort: getEffort(config, "execution"), - input: buildTurnInput(buildTaskExecutionPrompt(task, runId, cwd, systemContext, multiAgentActive), skills), - }); + activeTaskContext = { taskId: task.id, taskName: task.name }; + let result: CompletedTurn; + try { + result = await client.runTurn({ + threadId, + effort: getEffort(config, "execution"), + input: buildTurnInput(buildTaskExecutionPrompt(task, runId, cwd, systemContext, multiAgentActive), skills), + }); + } finally { + activeTaskContext = undefined; + } const rawResponse = extractAgentText(result); @@ -826,8 +1027,9 @@ export async function decidePlanningNeed( spec: string, systemContext: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.decidePlanningNeed(spec, systemContext); @@ -840,8 +1042,9 @@ export async function planSpec( spec: string, systemContext: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return 
await session.planSpec(spec, systemContext); @@ -854,8 +1057,9 @@ export async function reviewTaskGraph( tasks: Task[], systemContext: string, config?: OrcaConfig, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.reviewTaskGraph(tasks, systemContext); @@ -869,8 +1073,9 @@ export async function executeTask( runId: string, config?: OrcaConfig, systemContext?: string, + interactionContext?: SessionInteractionContext, ): Promise { - const session = await createCodexSession(process.cwd(), config); + const session = await createCodexSession(process.cwd(), config, interactionContext); try { return await session.executeTask(task, runId, systemContext); diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 7a755bf..b876be2 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -1,12 +1,30 @@ -import { mkdtemp, mkdir, writeFile } from "node:fs/promises"; +import { mkdtemp, mkdir, readFile, rm, writeFile } from "node:fs/promises"; +import { EventEmitter } from "node:events"; import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, mock, test } from "bun:test"; +import { RunStore } from "../../state/store.js"; + afterEach(() => { mock.restore(); }); +async function waitFor(load: () => Promise, timeoutMs = 2_000): Promise { + const start = Date.now(); + + while (Date.now() - start < timeoutMs) { + const value = await load(); + if (value !== null) { + return value; + } + + await new Promise((resolve) => setTimeout(resolve, 20)); + } + + throw new Error(`Timed out after ${timeoutMs}ms`); +} + function mockMultiAgentDetection(active = false): void { mock.module("../../core/codex-config.js", () => ({ isCodexMultiAgentActive: async () => active, @@ -699,3 +717,162 @@ 
describe("codex session inline skill context", () => { } }); }); + +describe("codex session question flow", () => { + test("persists pending questions, emits onQuestion, and resumes the same run after an answer", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "run", overallStatus: "running" }); + + const hookEvents: Array<{ hook: string; message: string; taskId?: string; questions?: Array<{ id: string }> }> = []; + const responses: Array<{ requestId: string | number; response: unknown }> = []; + let resolveAnswerResponse: (() => void) | undefined; + const answerResponse = new Promise((resolve) => { + resolveAnswerResponse = resolve; + }); + let clientInstance: EventEmitter | null = null; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + respondToUserInputRequest(requestId: string | number, response: unknown): void { + responses.push({ requestId, response }); + resolveAnswerResponse?.(); + } + rejectServerRequest(): void {} + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade 
style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }); + }); + + await answerResponse; + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + emitHook: async (event) => { + hookEvents.push({ + hook: event.hook, + message: event.message, + ...(event.taskId ? { taskId: event.taskId } : {}), + ...("questions" in event ? { questions: event.questions.map((question) => ({ id: question.id })) } : {}), + }); + }, + }); + + try { + const executionPromise = session.executeTask( + { + id: "task-1", + name: "Build the game", + description: "Implement the requested game.", + dependencies: [], + acceptance_criteria: ["Game is implemented"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + const waitingRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run?.pendingQuestion ? 
run : null; + }); + + expect(waitingRun.overallStatus).toBe("waiting_for_answer"); + expect(waitingRun.pendingQuestion?.requestId).toBe("req-1"); + expect(waitingRun.pendingQuestion?.questions[0]?.id).toBe("game_type"); + expect(hookEvents).toContainEqual({ + hook: "onQuestion", + message: "Which game type should I build?", + taskId: "task-1", + questions: [{ id: "game_type" }], + }); + + const answerPath = path.join(store.getRunDir(runId), "answer.txt"); + await writeFile( + answerPath, + `${JSON.stringify({ answers: { game_type: { answers: ["Arcade"] } } })}\n`, + "utf8", + ); + + const result = await executionPromise; + expect(result.outcome).toBe("done"); + expect(responses).toEqual([ + { + requestId: "req-1", + response: { + answers: { + game_type: { + answers: ["Arcade"], + }, + }, + }, + }, + ]); + + const resumedRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run && run.pendingQuestion === undefined ? run : null; + }); + expect(resumedRun.overallStatus).toBe("running"); + await expect(readFile(answerPath, "utf8")).rejects.toThrow(); + } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); +}); diff --git a/src/cli/commands/answer.test.ts b/src/cli/commands/answer.test.ts index 6d8a28d..d361ac0 100644 --- a/src/cli/commands/answer.test.ts +++ b/src/cli/commands/answer.test.ts @@ -85,21 +85,41 @@ afterEach(async () => { }); describe("answer command", () => { - test("submits positional answer and resumes waiting run", async () => { + test("submits positional answer for a single pending question", async () => { const runId = "answer-positional-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: 
"turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: [{ label: "A", description: "Use migration A" }], + }, + ], + }, + }); const { answerModule } = await loadAnswerModule(); await answerModule.answerCommandHandler(runId, "yes", {}); const run = await store.getRun(runId); - expect(run?.overallStatus).toBe("running"); + expect(run?.overallStatus).toBe("waiting_for_answer"); const answerPath = path.join(runsDir, runId, "answer.txt"); const payload = await readFile(answerPath, "utf8"); - expect(payload).toBe("yes\n"); + expect(payload).toContain('"migration"'); + expect(payload).toContain('"yes"'); expect(logs.join("\n")).toContain(`Answer submitted. Run ${runId} will resume shortly.`); }); @@ -120,7 +140,26 @@ describe("answer command", () => { const runId = "answer-no-input-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); const { answerModule } = await loadAnswerModule(); @@ -133,7 +172,26 @@ describe("answer command", () => { const runId = "answer-prompt-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: 
"item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: [{ label: "A", description: "Use migration A" }], + }, + ], + }, + }); setStdoutTty(true); const { answerModule, inputMock } = await loadAnswerModule({ @@ -145,14 +203,34 @@ describe("answer command", () => { expect(inputMock).toHaveBeenCalled(); const answerPath = path.join(runsDir, runId, "answer.txt"); const payload = await readFile(answerPath, "utf8"); - expect(payload).toBe("from prompt\n"); + expect(payload).toContain('"migration"'); + expect(payload).toContain('"from prompt"'); }); test("uses interactive run selection when no run id is provided in tty mode", async () => { const runId = "selected-run-1000-abcd"; const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); - await store.updateRun(runId, { overallStatus: "waiting_for_answer" }); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Migration", + id: "migration", + question: "Which migration should I use?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); setStdoutTty(true); const { answerModule, selectMock } = await loadAnswerModule({ @@ -164,7 +242,46 @@ describe("answer command", () => { expect(selectMock).toHaveBeenCalled(); const answerPath = path.join(runsDir, runId, "answer.txt"); const payload = await readFile(answerPath, "utf8"); - expect(payload).toBe("selected answer\n"); + expect(payload).toContain('"selected answer"'); + }); + + test("requires JSON mapping when multiple pending questions are answered non-interactively", async () => { + const runId = "answer-multi-1000-abcd"; + const store = new RunStore(runsDir); + await store.createRun(runId, 
"/tmp/spec.md"); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Backend", + id: "backend", + question: "Which backend?", + isOther: true, + isSecret: false, + options: null, + }, + { + header: "Frontend", + id: "frontend", + question: "Which frontend?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); + + const { answerModule } = await loadAnswerModule(); + await expect(answerModule.answerCommandHandler(runId, "just one answer", {})).rejects.toThrow( + "multiple pending questions require a JSON object mapping question ids to answers", + ); }); test("fails when positional run-id and --run are both provided", async () => { diff --git a/src/cli/commands/answer.ts b/src/cli/commands/answer.ts index e299394..5c51718 100644 --- a/src/cli/commands/answer.ts +++ b/src/cli/commands/answer.ts @@ -4,7 +4,9 @@ import { promises as fs } from "node:fs"; import { input } from "@inquirer/prompts"; import type { Command } from "commander"; +import { parseQuestionAnswerInput, serializeQuestionAnswerResponse } from "../../core/question-flow.js"; import { RunStore } from "../../state/store.js"; +import type { PendingQuestion } from "../../types/index.js"; import { selectRun } from "../../utils/select-run.js"; export interface AnswerCommandOptions { @@ -24,7 +26,17 @@ function resolveRunId(positionalRunId: string | undefined, optionRunId: string | return positionalRunId ?? optionRunId; } -async function resolveAnswer(answerArg: string | undefined): Promise { +function formatQuestionPrompt(question: PendingQuestion["questions"][number]): string { + const options = question.options && question.options.length > 0 + ? 
` Options: ${question.options.map((option) => option.label).join(", ")}.` + : ""; + return `${question.header}: ${question.question}${options}`; +} + +async function resolveAnswerPayload( + pendingQuestion: PendingQuestion | undefined, + answerArg: string | undefined, +): Promise { if (answerArg) { return answerArg; } @@ -33,12 +45,26 @@ async function resolveAnswer(answerArg: string | undefined): Promise { throw new Error("no answer provided"); } - const value = await input({ message: "Answer:" }); - if (!value) { - throw new Error("no answer provided"); + if (!pendingQuestion || pendingQuestion.questions.length === 0) { + const value = await input({ message: "Answer:" }); + if (!value) { + throw new Error("no answer provided"); + } + + return value; + } + + const answers: Record = {}; + for (const question of pendingQuestion.questions) { + const value = await input({ message: formatQuestionPrompt(question) }); + if (!value) { + throw new Error(`no answer provided for question '${question.id}'`); + } + + answers[question.id] = { answers: [value] }; } - return value; + return JSON.stringify({ answers }); } export async function answerCommandHandler( @@ -75,12 +101,14 @@ export async function answerCommandHandler( return; } - const answer = await resolveAnswer(answerArg); + const answerPayload = await resolveAnswerPayload(run.pendingQuestion, answerArg); + const serialized = run.pendingQuestion + ? serializeQuestionAnswerResponse(parseQuestionAnswerInput(answerPayload, run.pendingQuestion)) + : `${answerPayload}\n`; const answerPath = path.join(store.getRunDir(runId), "answer.txt"); await fs.mkdir(path.dirname(answerPath), { recursive: true }); - await fs.writeFile(answerPath, `${answer}\n`, "utf8"); + await fs.writeFile(answerPath, serialized, "utf8"); - await store.updateRun(runId, { overallStatus: "running" }); console.log(`Answer submitted. 
Run ${runId} will resume shortly.`); } diff --git a/src/cli/commands/help.ts b/src/cli/commands/help.ts index 53559aa..50e490f 100644 --- a/src/cli/commands/help.ts +++ b/src/cli/commands/help.ts @@ -34,6 +34,7 @@ function printStyledHelpPage(): void { { command: "orca status", description: "list all runs" }, { command: "orca status --last", description: "show most recent run" }, { command: "orca status --run ", description: "show run details" }, + { command: "orca answer ", description: "answer a waiting question" }, { command: "orca resume --last", description: "resume most recent run" }, { command: "orca resume --run ", description: "resume incomplete run" }, { command: "orca cancel --run ", description: "cancel active run" } @@ -59,6 +60,7 @@ function printStyledHelpPage(): void { { command: "--codex-effort ", description: "override Codex thinking level for the current run" }, { command: "--full-auto", description: "skip all questions, proceed autonomously" }, { command: "--on-complete ", description: "shell hook on run complete" }, + { command: "--on-question ", description: "shell hook on question required" }, { command: "--on-error ", description: "shell hook on run error" }, { command: "-h, --help", description: "show help for any command" }, { command: "-V, --version", description: "show version" } diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 0a1ad8d..71b3d52 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -58,6 +58,7 @@ export interface RunCommandOptions { codexOnly?: boolean; codexEffort?: CodexEffort; onMilestone?: string; + onQuestion?: string; onTaskComplete?: string; onTaskFail?: string; onInvalidPlan?: string; @@ -68,6 +69,7 @@ export interface RunCommandOptions { const ALL_HOOKS: HookName[] = [ "onMilestone", + "onQuestion", "onTaskComplete", "onTaskFail", "onInvalidPlan", @@ -77,6 +79,7 @@ const ALL_HOOKS: HookName[] = [ ]; const VALID_HOOK_NAMES = new Set([ "onMilestone", + "onQuestion", 
"onTaskComplete", "onTaskFail", "onInvalidPlan", @@ -146,6 +149,7 @@ function computeFinalStatus(overallStatus: string, allTasksDone: boolean): "comp function buildCliCommandHooks(options: RunCommandOptions): Partial> { return { ...(options.onMilestone ? { onMilestone: options.onMilestone } : {}), + ...(options.onQuestion ? { onQuestion: options.onQuestion } : {}), ...(options.onTaskComplete ? { onTaskComplete: options.onTaskComplete } : {}), ...(options.onTaskFail ? { onTaskFail: options.onTaskFail } : {}), ...(options.onInvalidPlan ? { onInvalidPlan: options.onInvalidPlan } : {}), @@ -440,7 +444,7 @@ export async function runCommandHandler(options: RunCommandOptions): Promise", "Codex thinking level override for this run", parseCodexEffortOption) .option("--on-milestone ", "Shell hook command for onMilestone") + .option("--on-question ", "Shell hook command for onQuestion") .option("--on-task-complete ", "Shell hook command for onTaskComplete") .option("--on-task-fail ", "Shell hook command for onTaskFail") .option("--on-invalid-plan ", "Shell hook command for onInvalidPlan") diff --git a/src/cli/commands/status.test.ts b/src/cli/commands/status.test.ts new file mode 100644 index 0000000..af5a3f8 --- /dev/null +++ b/src/cli/commands/status.test.ts @@ -0,0 +1,83 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import os from "node:os"; +import path from "node:path"; +import { mkdtemp, rm } from "node:fs/promises"; + +import { RunStore } from "../../state/store.js"; + +type StatusModule = typeof import("./status.js"); + +let tempDir = ""; +let runsDir = ""; +let logs: string[] = []; +const originalRunsDir = process.env.ORCA_RUNS_DIR; +const originalConsoleLog = console.log; + +async function loadStatusModule(): Promise { + return import(`./status.js?test=${Math.random()}`); +} + +beforeEach(async () => { + tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-status-test-")); + runsDir = path.join(tempDir, "runs"); + 
process.env.ORCA_RUNS_DIR = runsDir; + logs = []; + process.exitCode = 0; + + console.log = (...args: unknown[]) => { + logs.push(args.map(String).join(" ")); + }; +}); + +afterEach(async () => { + console.log = originalConsoleLog; + + if (originalRunsDir === undefined) { + delete process.env.ORCA_RUNS_DIR; + } else { + process.env.ORCA_RUNS_DIR = originalRunsDir; + } + + process.exitCode = 0; + await rm(tempDir, { recursive: true, force: true }); +}); + +describe("status command", () => { + test("prints pending question details for runs waiting for input", async () => { + const runId = "run-1000-abcd"; + const store = new RunStore(runsDir); + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { + mode: "run", + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }, + }); + + const statusModule = await loadStatusModule(); + await statusModule.statusCommandHandler({ run: runId }); + + const output = logs.join("\n"); + expect(output).toContain("Pending Question:"); + expect(output).toContain("Game Type: Which game type should I build?"); + expect(output).toContain("Options: Arcade, Puzzle."); + }); +}); diff --git a/src/cli/commands/status.ts b/src/cli/commands/status.ts index fd46565..d738aba 100644 --- a/src/cli/commands/status.ts +++ b/src/cli/commands/status.ts @@ -1,5 +1,6 @@ import type { Command } from "commander"; +import { formatPendingQuestionForStatus } from "../../core/question-flow.js"; import { RunStore } from "../../state/store.js"; import type { RunStatus, Task } from "../../types/index.js"; import { getLastRun } from 
"../../utils/last-run.js"; @@ -64,6 +65,12 @@ async function printDetailedRun(run: RunStatus): Promise { console.log(`Updated: ${run.updatedAt}`); console.log(`Milestones: ${run.milestones.length}`); console.log(`Errors: ${run.errors.length}`); + if (run.pendingQuestion) { + console.log(""); + for (const line of formatPendingQuestionForStatus(run.pendingQuestion)) { + console.log(line); + } + } console.log(""); console.log("Tasks:"); if (run.tasks.length === 0) { diff --git a/src/core/config-loader.ts b/src/core/config-loader.ts index faf6d50..ebcb83d 100644 --- a/src/core/config-loader.ts +++ b/src/core/config-loader.ts @@ -9,6 +9,7 @@ import type { HookName, OrcaConfig } from "../types/index.js"; const KNOWN_HOOK_NAMES: HookName[] = [ "onMilestone", + "onQuestion", "onTaskComplete", "onTaskFail", "onInvalidPlan", diff --git a/src/core/planner.ts b/src/core/planner.ts index 512630e..7342f36 100644 --- a/src/core/planner.ts +++ b/src/core/planner.ts @@ -1,8 +1,13 @@ import { promises as fs } from "node:fs"; import path from "node:path"; -import { decidePlanningNeed as decidePlanningNeedWithCodex, planSpec as planSpecWithCodex, reviewTaskGraph as reviewTaskGraphWithCodex } from "../agents/codex/session.js"; -import type { OrcaConfig, Task, TaskGraphReviewResult } from "../types/index.js"; +import { + decidePlanningNeed as decidePlanningNeedWithCodex, + planSpec as planSpecWithCodex, + reviewTaskGraph as reviewTaskGraphWithCodex, + type SessionInteractionContext, +} from "../agents/codex/session.js"; +import type { HookEvent, OrcaConfig, Task, TaskGraphReviewResult } from "../types/index.js"; import { logger } from "../utils/logger.js"; import { loadSkills, type LoadedSkill } from "../utils/skill-loader.js"; import { RunStore } from "../state/store.js"; @@ -177,6 +182,7 @@ async function runTaskGraphReview( tasks: Task[], systemContext: string, config: OrcaConfig | undefined, + interactionContext?: SessionInteractionContext, ): Promise<{ finalTasks: Task[]; review: 
TaskGraphReviewResult | null }> { const planReviewConfig = getPlanReviewConfig(config); if (!planReviewConfig.enabled) { @@ -188,7 +194,7 @@ async function runTaskGraphReview( const reviewFn = resolveReviewTaskGraphImpl(config); let review: TaskGraphReviewResult; try { - review = await reviewFn(tasks, systemContext, config); + review = await reviewFn(tasks, systemContext, config, interactionContext); } catch (error) { if (planReviewConfig.onInvalid === "warn_skip") { logger.warn(`Review output invalid; skipping review changes (${error instanceof Error ? error.message : String(error)})`); @@ -218,6 +224,7 @@ async function runTaskGraphReview( type PlannerOptions = { allowPlanSkip?: boolean; + emitHook?: (event: HookEvent) => Promise; }; function buildSingleExecutionTask(spec: string): Task[] { @@ -236,9 +243,14 @@ function buildSingleExecutionTask(spec: string): Task[] { ]; } -async function runFullPlanning(spec: string, systemContext: string, config?: OrcaConfig): Promise { +async function runFullPlanning( + spec: string, + systemContext: string, + config?: OrcaConfig, + interactionContext?: SessionInteractionContext, +): Promise { const planSpecImpl = resolvePlanSpecImpl(config); - const result = await planSpecImpl(spec, systemContext, config); + const result = await planSpecImpl(spec, systemContext, config, interactionContext); try { validateDAG(result.tasks); @@ -249,7 +261,7 @@ async function runFullPlanning(spec: string, systemContext: string, config?: Orc const planReviewConfig = getPlanReviewConfig(config); let finalTasks = result.tasks; try { - const reviewed = await runTaskGraphReview(result.tasks, systemContext, config); + const reviewed = await runTaskGraphReview(result.tasks, systemContext, config, interactionContext); finalTasks = reviewed.finalTasks; } catch (error) { if (planReviewConfig.onInvalid === "warn_skip") { @@ -275,21 +287,26 @@ export async function runPlanner( const spec = await fs.readFile(specPath, "utf8"); const [skills, instructions] = 
await Promise.all([loadSkills(config), loadProjectInstructions(specPath)]); const systemContext = buildSystemContext(skills, instructions); + const interactiveContext = { + runId: runId as HookEvent["runId"], + store, + ...(options?.emitHook ? { emitHook: options.emitHook } : {}), + }; let finalTasks: Task[]; if (options?.allowPlanSkip === true) { const decidePlanningNeed = resolveDecidePlanningNeedImpl(config); - const decision = await decidePlanningNeed(spec, systemContext, config); + const decision = await decidePlanningNeed(spec, systemContext, config, interactiveContext); if (!decision.needsPlan) { logger.info(`Planning skipped: ${decision.reason}`); finalTasks = buildSingleExecutionTask(spec); } else { logger.info(`Planning required: ${decision.reason}`); - finalTasks = await runFullPlanning(spec, systemContext, config); + finalTasks = await runFullPlanning(spec, systemContext, config, interactiveContext); } } else { - finalTasks = await runFullPlanning(spec, systemContext, config); + finalTasks = await runFullPlanning(spec, systemContext, config, interactiveContext); } await store.writeTasks(runId, finalTasks); diff --git a/src/core/question-flow.ts b/src/core/question-flow.ts new file mode 100644 index 0000000..062d61d --- /dev/null +++ b/src/core/question-flow.ts @@ -0,0 +1,157 @@ +import type { + ToolRequestUserInputParams, + ToolRequestUserInputResponse, +} from "@ratley/codex-client"; + +import type { PendingQuestion, PendingQuestionPrompt } from "../types/index.js"; + +function normalizeQuestionPrompt(question: ToolRequestUserInputParams["questions"][number]): PendingQuestionPrompt { + return { + header: question.header, + id: question.id, + question: question.question, + isOther: question.isOther ?? false, + isSecret: question.isSecret ?? false, + ...(question.options !== undefined ? 
{ options: question.options } : {}), + }; +} + +function normalizeAnswerList(value: unknown): string[] | null { + if (typeof value === "string") { + const trimmed = value.trim(); + return trimmed.length > 0 ? [trimmed] : []; + } + + if (Array.isArray(value)) { + const answers = value + .filter((entry): entry is string => typeof entry === "string") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0); + return answers; + } + + if ( + value && + typeof value === "object" && + "answers" in value && + Array.isArray((value as { answers?: unknown }).answers) + ) { + return normalizeAnswerList((value as { answers?: unknown[] }).answers); + } + + return null; +} + +function formatQuestionBlock(question: PendingQuestionPrompt): string { + const optionText = question.options && question.options.length > 0 + ? ` Options: ${question.options.map((option) => option.label).join(", ")}.` + : ""; + + return `${question.header}: ${question.question}${optionText}`; +} + +export function createPendingQuestion( + requestId: string | number, + params: ToolRequestUserInputParams, + receivedAt: string = new Date().toISOString(), +): PendingQuestion { + return { + requestId, + threadId: params.threadId, + turnId: params.turnId, + itemId: params.itemId, + receivedAt, + questions: params.questions.map((question) => normalizeQuestionPrompt(question)), + }; +} + +export function buildQuestionHookMessage(pendingQuestion: PendingQuestion): string { + if (pendingQuestion.questions.length === 1) { + return pendingQuestion.questions[0]?.question ?? 
"Codex requested user input."; + } + + return `Codex requested answers for ${pendingQuestion.questions.length} questions.`; +} + +export function formatPendingQuestionForStatus(pendingQuestion: PendingQuestion): string[] { + return [ + "Pending Question:", + ...pendingQuestion.questions.map((question) => `- ${formatQuestionBlock(question)}`), + ]; +} + +export function serializeQuestionAnswerResponse(response: ToolRequestUserInputResponse): string { + return `${JSON.stringify(response, null, 2)}\n`; +} + +export function parseQuestionAnswerInput( + rawInput: string, + pendingQuestion: PendingQuestion, +): ToolRequestUserInputResponse { + const trimmed = rawInput.trim(); + if (trimmed.length === 0) { + throw new Error("answer payload is empty"); + } + + if (pendingQuestion.questions.length === 1 && !trimmed.startsWith("{")) { + const onlyQuestion = pendingQuestion.questions[0]; + if (!onlyQuestion) { + throw new Error("pending question is missing its question definition"); + } + + return { + answers: { + [onlyQuestion.id]: { + answers: [trimmed], + }, + }, + }; + } + + let parsed: unknown; + try { + parsed = JSON.parse(trimmed); + } catch (error) { + throw new Error( + pendingQuestion.questions.length === 1 + ? `answer payload is not valid JSON: ${error instanceof Error ? 
error.message : String(error)}` + : "multiple pending questions require a JSON object mapping question ids to answers", + ); + } + + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + throw new Error("answer payload must be a JSON object"); + } + + const record = parsed as Record; + if ("answers" in record && record.answers && typeof record.answers === "object" && !Array.isArray(record.answers)) { + const normalizedAnswers: Record = {}; + + for (const [questionId, answerValue] of Object.entries(record.answers as Record)) { + const answers = normalizeAnswerList(answerValue); + if (answers === null) { + throw new Error(`answer payload for '${questionId}' must be a string, string array, or { answers: string[] }`); + } + + normalizedAnswers[questionId] = { answers }; + } + + return { answers: normalizedAnswers }; + } + + const normalizedAnswers: Record = {}; + for (const question of pendingQuestion.questions) { + if (!(question.id in record)) { + throw new Error(`answer payload is missing question id '${question.id}'`); + } + + const answers = normalizeAnswerList(record[question.id]); + if (answers === null) { + throw new Error(`answer payload for '${question.id}' must be a string, string array, or { answers: string[] }`); + } + + normalizedAnswers[question.id] = { answers }; + } + + return { answers: normalizedAnswers }; +} diff --git a/src/core/task-runner.ts b/src/core/task-runner.ts index f45af15..3f2f255 100644 --- a/src/core/task-runner.ts +++ b/src/core/task-runner.ts @@ -68,7 +68,7 @@ function hasPendingTasks(tasks: Task[]): boolean { } export interface TaskRunnerOptions { - runId: string; + runId: RunId; store: RunStore; config?: OrcaConfig; emitHook?: EmitHook; @@ -153,7 +153,11 @@ export async function runTaskRunner(options: TaskRunnerOptions): Promise { if (mockFn) { executeTaskFn = mockFn; } else { - codexSession = await createCodexSession(process.cwd(), config); + codexSession = await createCodexSession(process.cwd(), config, { + 
runId, + store, + emitHook, + }); executeTaskFn = (task, taskRunId, _cfg, systemContext) => codexSession!.executeTask(task, taskRunId, systemContext); } diff --git a/src/state/schema.ts b/src/state/schema.ts index 267eb98..bc4ac03 100644 --- a/src/state/schema.ts +++ b/src/state/schema.ts @@ -28,6 +28,29 @@ const ErrorEntrySchema = z.object({ taskId: z.string().optional() }); +const PendingQuestionOptionSchema = z.object({ + label: z.string(), + description: z.string() +}); + +const PendingQuestionPromptSchema = z.object({ + header: z.string(), + id: z.string(), + question: z.string(), + isOther: z.boolean(), + isSecret: z.boolean(), + options: z.array(PendingQuestionOptionSchema).nullable().optional() +}); + +const PendingQuestionSchema = z.object({ + requestId: z.union([z.string(), z.number().int()]), + threadId: z.string(), + turnId: z.string(), + itemId: z.string(), + receivedAt: z.string(), + questions: z.array(PendingQuestionPromptSchema) +}); + const PrStatusSchema = z.object({ draftTitle: z.string().optional(), draftBody: z.string().optional(), @@ -54,6 +77,7 @@ export const RunStatusSchema = z.object({ tasks: z.array(TaskSchema), milestones: z.array(z.string()), errors: z.array(ErrorEntrySchema), + pendingQuestion: PendingQuestionSchema.optional(), pr: PrStatusSchema.optional() }); diff --git a/src/state/store.ts b/src/state/store.ts index 7385d3d..f243fdf 100644 --- a/src/state/store.ts +++ b/src/state/store.ts @@ -103,7 +103,7 @@ export class RunStore { } private async writeJsonAtomic(filePath: string, data: unknown): Promise { - const tmpPath = `${filePath}.tmp`; + const tmpPath = `${filePath}.${process.pid}.${Date.now()}.${Math.random().toString(16).slice(2)}.tmp`; const payload = `${JSON.stringify(data, null, 2)}\n`; await fs.writeFile(tmpPath, payload, "utf8"); diff --git a/src/types/config-typing.typecheck.ts b/src/types/config-typing.typecheck.ts index f515a95..6450177 100644 --- a/src/types/config-typing.typecheck.ts +++ 
b/src/types/config-typing.typecheck.ts @@ -2,6 +2,10 @@ import { defineOrcaConfig } from "./index.js"; defineOrcaConfig({ hooks: { + onQuestion: async (event) => { + const questionId: string = event.questions[0]?.id ?? ""; + void questionId; + }, onTaskComplete: async (event, context) => { const taskId: string = event.taskId; const taskName: string = event.taskName; diff --git a/src/types/index.ts b/src/types/index.ts index e53f1e4..4a55b02 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -48,6 +48,7 @@ export interface RunStatus { tasks: Task[]; milestones: string[]; errors: Array<{ at: string; message: string; taskId?: string }>; + pendingQuestion?: PendingQuestion | undefined; pr?: { draftTitle?: string; draftBody?: string; @@ -57,6 +58,29 @@ export interface RunStatus { }; } +export interface PendingQuestionOption { + label: string; + description: string; +} + +export interface PendingQuestionPrompt { + header: string; + id: string; + question: string; + isOther: boolean; + isSecret: boolean; + options?: PendingQuestionOption[] | null; +} + +export interface PendingQuestion { + requestId: string | number; + threadId: string; + turnId: string; + itemId: string; + receivedAt: string; + questions: PendingQuestionPrompt[]; +} + export interface BaseHookEvent { runId: RunId; message: string; @@ -69,6 +93,14 @@ export interface BaseHookEvent { export interface HookEventMap { onMilestone: BaseHookEvent & { hook: "onMilestone" }; + onQuestion: BaseHookEvent & { + hook: "onQuestion"; + requestId: string | number; + threadId: string; + turnId: string; + itemId: string; + questions: PendingQuestionPrompt[]; + }; onTaskComplete: BaseHookEvent & { hook: "onTaskComplete"; taskId: string; taskName: string }; onTaskFail: BaseHookEvent & { hook: "onTaskFail"; taskId: string; taskName: string; error: string }; onInvalidPlan: BaseHookEvent & { hook: "onInvalidPlan"; error: string }; From 97fc21ef39951ff5a2ad006e142dafe0089b671d Mon Sep 17 00:00:00 2001 From: bradley 
inniss Date: Mon, 16 Mar 2026 12:18:08 -0700 Subject: [PATCH 03/19] Tighten README for question flow and Codex docs --- README.md | 24 ++++++++++++++++-------- session-logs/LATEST.md | 5 +++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0aa06b4..9ad2bb6 100644 --- a/README.md +++ b/README.md @@ -52,11 +52,17 @@ Run states: `planning` → `running` → `completed` | `failed` | `cancelled` | ### Answering questions -If a run hits `waiting_for_answer`, it's blocked until you respond: +If a run hits `waiting_for_answer`, execution pauses until a response is submitted to the live run: ```bash orca status --last # read the question -orca answer "yes, use migration A" # answer and resume the live run +orca answer "yes, use migration A" # answer and resume the same run +``` + +For multi-question prompts, pass JSON mapping question IDs to answers: + +```bash +orca answer '{"answers":{"q1":{"answers":["yes"]},"q2":{"answers":["option-a"]}}}' ``` ### Spec / plan files @@ -104,7 +110,7 @@ Orca loads config in this order (later overrides earlier): `.ts` is preferred over `.js` when both exist. -Stale executor values from older configs are ignored and coerced to `codex`. Orca no longer supports alternate executors. +Orca is Codex-only. Stale executor values from older configs are coerced to `codex`. ```ts // orca.config.ts @@ -176,11 +182,11 @@ After planning, Orca runs a pre-execution review that can edit the task graph (a After execution, Orca runs validation commands and asks Codex to review findings. With `onFindings: "auto_fix"`, it applies fixes and retries up to `maxCycles` times, then reports. Set `ORCA_SKIP_VALIDATORS=1` to skip validator auto-detection at runtime. -Use `codex.thinkingLevel` when you want different reasoning levels for different stages instead of a single global `codex.effort`. +Use `codex.thinkingLevel` when you want per-step reasoning levels instead of a single global `codex.effort`. 
Supported steps: `decision`, `planning`, `review`, `execution`. ### Multi-agent mode -Set `codex.multiAgent: true` to spawn parallel Codex agents per task. Faster for large refactors with independent subtasks; higher token cost. **Note:** this writes `multi_agent = true` to your global `~/.codex/config.toml`. +Set `codex.multiAgent: true` to enable multi-agent-aware prompt guidance. Orca's task runner stays sequential, but Codex can use subagents inside a task turn when work is independent. **Note:** this writes `multi_agent = true` to your global `~/.codex/config.toml`. If `~/.codex/config.toml` already enables `[features].multi_agent = true`, Orca also treats the run as multi-agent-aware for planning, review, consultation, and execution prompts even when `codex.multiAgent` is not set in Orca config. @@ -228,14 +234,15 @@ orca setup Interactive setup wizard **Key flags for `orca` (run):** -- `--codex-only` — force Codex executor -- `--codex-effort ` — override effort for this run +- `--codex-only` — compatibility flag; executor is already Codex-only +- `--codex-effort ` — override effort for this run - `--config ` — explicit config file +- `--on-question ` — command hook when Codex requests user input - `--on-complete `, `--on-error `, `--on-task-complete `, `--on-findings `, etc. 
**Key flags for `orca resume`:** -- `--codex-only`, `--codex-effort `, `--config `, `--run `, `--last` +- `--codex-only`, `--codex-effort `, `--config `, `--run `, `--last` **`orca setup` flags:** @@ -251,6 +258,7 @@ Available hook names: `onMilestone`, `onQuestion`, `onTaskComplete`, `onTaskFail - Function hooks (`config.hooks`): receive `(event, context)` where `context = { cwd, pid, invokedAt }` - Command hooks (`config.hookCommands` / `--on-*` flags): receive full event JSON over stdin +- `onQuestion` includes request metadata (`requestId`, `threadId`, `turnId`, `itemId`) and `questions[]` - Unknown hook keys in config are rejected at load time ### Run ID format diff --git a/session-logs/LATEST.md b/session-logs/LATEST.md index c009db1..d4607d8 100644 --- a/session-logs/LATEST.md +++ b/session-logs/LATEST.md @@ -1,7 +1,7 @@ # Session Log -- Timestamp: 2026-03-16T07:40:30Z -- Scope: Codex clarification question flow, pending-question persistence, onQuestion hooks, answer/resume handling, and live CLI smoke coverage. +- Timestamp: 2026-03-17T02:18:00Z +- Scope: Codex clarification question flow, pending-question persistence, onQuestion hooks, answer/resume handling, multi-agent prompt guidance, README/docs cleanup, and release-readiness verification. - Verification: - `bun test src` - `npm run typecheck:tsc` @@ -11,3 +11,4 @@ - Orca now surfaces Codex `requestUserInput` prompts in `status.json`, `orca status`, and `onQuestion` hooks. - `orca answer` writes structured answers and the original live run resumes without `orca resume`. - The CLI smoke passed against a fake Codex app-server exercising `waiting_for_answer` end to end. + - README/docs were tightened to reflect Codex-only execution, per-step `thinkingLevel`, multi-agent prompt guidance, and the current answer flow. 
From faadccabe127bcf4bb33e7c4083f07962b001c38 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Mon, 16 Mar 2026 23:00:02 -0700 Subject: [PATCH 04/19] Prepare Orca 0.2.25 for codex-client 0.1.5 --- package-lock.json | 13 ++++++------- package.json | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/package-lock.json b/package-lock.json index a0a0fe4..71df0c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,16 +1,16 @@ { "name": "orcastrator", - "version": "0.2.24", + "version": "0.2.25", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "orcastrator", - "version": "0.2.24", + "version": "0.2.25", "license": "MIT", "dependencies": { "@inquirer/prompts": "^8.2.1", - "@ratley/codex-client": "^0.1.4", + "@ratley/codex-client": "^0.1.5", "chalk": "^5.3.0", "commander": "^13.1.0", "zod": "^4.3.6" @@ -912,10 +912,9 @@ } }, "node_modules/@ratley/codex-client": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/@ratley/codex-client/-/codex-client-0.1.4.tgz", - "integrity": "sha512-lefQMXoR12cImkNnZiHN/MraDdxKzrwH5DZSft6cQJ1s+tIZllNyCBBD6dtGX8O+8uz19SL/NyyHbBIDUtlIlw==", - "license": "MIT", + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/@ratley/codex-client/-/codex-client-0.1.5.tgz", + "integrity": "sha512-sYpKYpCXn//QZpKJyboWPCC1YuZrMBacKYmDBMyFRw3F59w4n1tec5n1yHRrfF19IBGsx5MtHevZjfu6tuEaXw==", "peerDependencies": { "typescript": "^5" } diff --git a/package.json b/package.json index ba261be..285e0f3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "orcastrator", - "version": "0.2.24", + "version": "0.2.25", "type": "module", "types": "dist/index.d.ts", "exports": { @@ -40,7 +40,7 @@ }, "dependencies": { "@inquirer/prompts": "^8.2.1", - "@ratley/codex-client": "^0.1.4", + "@ratley/codex-client": "^0.1.5", "chalk": "^5.3.0", "commander": "^13.1.0", "zod": "^4.3.6" From 124de523a398f5a12f3457572865dd3c79e80cf9 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Mon, 16 Mar 
2026 23:01:13 -0700 Subject: [PATCH 05/19] Stop tracking local session logs and TODO --- .gitignore | 1 + TODO.md | 40 ---------------------------------------- session-logs/LATEST.md | 14 -------------- 3 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 TODO.md delete mode 100644 session-logs/LATEST.md diff --git a/.gitignore b/.gitignore index bf3b17a..0f98a7d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ dist .npmrc tmp/ session-logs/ +TODO.md diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 90ca9cd..0000000 --- a/TODO.md +++ /dev/null @@ -1,40 +0,0 @@ -# Orca TODO - -## Skills System -- ✅ Shipped in v0.2.6 — skill loader, frontmatter parsing, injection into planner + task-runner -- ✅ Skill discovery precedence is now explicit and deterministic: config.skills[] > .orca/skills/ > ~/.orca/skills/ > bundled `/.orca/skills/` (first name wins) -- ✅ `orca skills list` command shipped (includes bundled source labeling) - -## Codex-as-Executor -- ✅ Shipped in v0.2.7 — Codex is now default executor; persistent session per run; Claude fallback on init failure -- config: executor?: "claude" | "codex" in OrcaConfig (default: "codex") - -## Multi-Agent -- ✅ Shipped: opt-in via `codex: { multiAgent: true }` in orca.config.js — writes to `~/.codex/config.toml` -- Smoke test once Bradley has a real project to run it against (watch for "spawning sub-agents" in codex output) - -## Validation Hardening -- ✅ Shipped in v0.2.8 — executor config validation, symlink guard, EACCES/EPERM resilience, parseTaskArray field defaults, Codex session leak fix, claude session unit tests (19 new), shared PlanResult/TaskExecutionResult types - -## Recent Ships -- ✅ Bundled `code-simplifier` skill added at `.orca/skills/code-simplifier/SKILL.md`; planner/reviewer/executor prompts now explicitly apply it for all code-writing and code-review steps (behavior-preserving by default) -- ✅ Codex turn input now includes explicit `skill` items (`{ type: "skill", 
name, path }`) for all loaded skills (same precedence as loader), plus text input for every turn -- ✅ Codex app-server `skills/list` integration shipped with `forceReload: true` + optional `codex.perCwdExtraUserRoots` support (app-server-discovered skills append deterministically) -- ✅ `orca skills list` command shipped -- ✅ Executor override flags shipped: `--codex-only` / `--claude-only` -- ✅ Claude planner/executor deterministic structured-output path shipped (text JSON fallback gated) -- ✅ Effort controls shipped: `--codex-effort `, `--claude-effort ` -- ✅ Planning gate shipped: low-thinking `needsPlan` decision pass before heavy planning; skips full planner/review when a single execution task is enough -- ✅ Fine-grained Codex thinking controls shipped: `codex.thinkingLevel.decision|planning|execution` (with clear defaults) -- ✅ `orca setup --check` key detection improved (OpenClaw env + cross-platform `.env` fallbacks) -- ✅ Dedicated post-exec reviewer JSON hardening integration target shipped (`npm run test:postexec-json`) - -## Remaining -- Zod v3→v4 upgrade (peer dep conflict with @anthropic-ai/claude-agent-sdk@0.2.47) -- Review → improvement step: pre-execution review that modifies the task graph -- AGENTS.md / CLAUDE.md injection into planning context -- Review cycle depth: `maxReviewCycles` config property + `--max-review-cycles ` CLI flag - - Controls how many back-and-forth exchanges between executor and reviewer are allowed - - Always ends with a review (reviewer has last word) - - Default: 1 (one exchange, one final review) - - Example: maxReviewCycles=2 means executor→reviewer→executor→reviewer diff --git a/session-logs/LATEST.md b/session-logs/LATEST.md deleted file mode 100644 index d4607d8..0000000 --- a/session-logs/LATEST.md +++ /dev/null @@ -1,14 +0,0 @@ -# Session Log - -- Timestamp: 2026-03-17T02:18:00Z -- Scope: Codex clarification question flow, pending-question persistence, onQuestion hooks, answer/resume handling, multi-agent prompt 
guidance, README/docs cleanup, and release-readiness verification. -- Verification: - - `bun test src` - - `npm run typecheck:tsc` - - `npm run build` - - `bun test src/__tests__/client.test.ts src/__tests__/integration.test.ts` -- Notes: - - Orca now surfaces Codex `requestUserInput` prompts in `status.json`, `orca status`, and `onQuestion` hooks. - - `orca answer` writes structured answers and the original live run resumes without `orca resume`. - - The CLI smoke passed against a fake Codex app-server exercising `waiting_for_answer` end to end. - - README/docs were tightened to reflect Codex-only execution, per-step `thinkingLevel`, multi-agent prompt guidance, and the current answer flow. From e74d658c18a8c31fbde42ffdf636147ae40176ba Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Wed, 18 Mar 2026 19:42:04 -0700 Subject: [PATCH 06/19] Preserve cancelled runs during question flow --- src/agents/codex/session.ts | 9 +- src/agents/codex/session.unit.test.ts | 130 ++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index c907919..655085f 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -755,7 +755,10 @@ export async function createCodexSession( throw new Error("Codex client does not support rejecting server requests"); }; - const clearPendingQuestion = async (requestId: RequestId, overallStatus: "running" | "waiting_for_answer"): Promise => { + const clearPendingQuestion = async ( + requestId: RequestId, + overallStatus?: "running" | "waiting_for_answer" + ): Promise => { if (!interactionContext) { return; } @@ -766,7 +769,7 @@ export async function createCodexSession( } await interactionContext.store.updateRun(interactionContext.runId, { - overallStatus, + ...(overallStatus ? 
{ overallStatus } : {}), pendingQuestion: undefined, }); }; @@ -824,7 +827,7 @@ export async function createCodexSession( if (currentRun.overallStatus === "cancelled") { rejectUserInputRequest(request.requestId, `Run ${interactionContext.runId} was cancelled while waiting for input.`); - await clearPendingQuestion(request.requestId, "waiting_for_answer"); + await clearPendingQuestion(request.requestId); return; } diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index b876be2..d26aa05 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -875,4 +875,134 @@ describe("codex session question flow", () => { await rm(tempDir, { recursive: true, force: true }); } }); + + test("keeps a run cancelled when cancellation happens while waiting for input", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-cancelled-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "run", overallStatus: "running" }); + + const rejectedRequests: Array<{ requestId: string | number; error: { code: number; message: string } }> = []; + let settleRequest: (() => void) | undefined; + const requestSettled = new Promise((resolve) => { + settleRequest = resolve; + }); + let clientInstance: EventEmitter | null = null; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + respondToUserInputRequest(): void { + throw new Error("request should be rejected when run is cancelled"); + } + 
rejectServerRequest(requestId: string | number, error: { code: number; message: string }): void { + rejectedRequests.push({ requestId, error }); + settleRequest?.(); + } + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }); + }); + + await requestSettled; + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + }); + + try { + const executionPromise = session.executeTask( + { + id: "task-1", + name: "Build the game", + description: "Implement the requested game.", + dependencies: [], + acceptance_criteria: ["Game is implemented"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + await waitFor(async () => { + const run = await store.getRun(runId); + return run?.pendingQuestion ? run : null; + }); + + await store.updateRun(runId, { overallStatus: "cancelled" }); + + await executionPromise; + + const cancelledRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run && run.pendingQuestion === undefined ? 
run : null; + }); + + expect(cancelledRun.overallStatus).toBe("cancelled"); + expect(rejectedRequests).toEqual([ + { + requestId: "req-1", + error: { + code: -32603, + message: `Run ${runId} was cancelled while waiting for input.`, + }, + }, + ]); + } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); }); From b48c5bcab12dfdffcea0b6eccc8e0407e3ea8813 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 02:49:19 -0700 Subject: [PATCH 07/19] Replace Claude review with Codex workflow --- .github/workflows/claude-code-review.yml | 91 -------- .github/workflows/codex-code-review.yml | 269 +++++++++++++++++++++++ bun.lock | 4 +- 3 files changed, 271 insertions(+), 93 deletions(-) delete mode 100644 .github/workflows/claude-code-review.yml create mode 100644 .github/workflows/codex-code-review.yml diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml deleted file mode 100644 index 5cdb4ec..0000000 --- a/.github/workflows/claude-code-review.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Claude Code Review - -on: - workflow_dispatch: - inputs: - pr_number: - description: "PR number to review when manually triggered" - required: true - pull_request: - types: [opened, synchronize] - -jobs: - claude-review: - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - issues: write - id-token: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Setup Bun - uses: oven-sh/setup-bun@v2 - - - name: Install dependencies - run: bun install --frozen-lockfile - - - name: Run Claude Code Review (structured) - id: claude-review - uses: anthropics/claude-code-action@v1 - with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - prompt: | - REPO: ${{ github.repository }} - PR NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }} - - You are reviewing a PR 
for Orca, a TypeScript CLI package. - Focus only on correctness, regressions, and security/runtime breakage. - - RULES: - 1. "No issues found" is valid and preferred when true. - 2. Review the diff as source of truth; ignore stale PR descriptions. - 3. Do not report style-only suggestions. - 4. For each finding, include exact file:line + concrete failure scenario. - 5. Use only these severities: - - CRITICAL: production breakage, security, data loss - - BUG: incorrect behavior users will hit - - RISK: concrete reproducible failure scenario with current code paths - - Return valid JSON matching schema: - - result: "NO_ISSUES" or "HAS_FINDINGS" - - comment_body: full human PR comment markdown - - If no issues: "No issues found. [one sentence summary of what the PR does]." - If issues: list findings by severity with file:line and quoted code. - - claude_args: | - --model claude-opus-4-6 - --allowed-tools "Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)" - --json-schema '{"type":"object","properties":{"result":{"type":"string","enum":["NO_ISSUES","HAS_FINDINGS"]},"comment_body":{"type":"string"}},"required":["result","comment_body"]}' - - - name: Post review comment (for humans) - if: ${{ steps.claude-review.outputs.structured_output != '' }} - env: - GH_TOKEN: ${{ github.token }} - PR_NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }} - STRUCTURED: ${{ steps.claude-review.outputs.structured_output }} - run: | - echo "$STRUCTURED" | jq -r '.comment_body' > body.txt - gh pr comment "$PR_NUMBER" --body-file body.txt - - - name: Gate PR on Claude result (deterministic) - env: - STRUCTURED: ${{ steps.claude-review.outputs.structured_output }} - run: | - if [ -z "$STRUCTURED" ]; then - echo "Claude review gate skipped: structured output missing (expected when this workflow is first introduced and not yet on default branch)." 
- exit 0 - fi - - result="$(echo "$STRUCTURED" | jq -r '.result')" - if [ "$result" != "NO_ISSUES" ]; then - echo "Claude review gate failed: result=$result" - exit 1 - fi - echo "Claude review gate passed: NO_ISSUES" diff --git a/.github/workflows/codex-code-review.yml b/.github/workflows/codex-code-review.yml new file mode 100644 index 0000000..71caeb1 --- /dev/null +++ b/.github/workflows/codex-code-review.yml @@ -0,0 +1,269 @@ +name: Codex Code Review + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + +concurrency: + group: codex-structured-review-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + codex-structured-review: + name: Run Codex structured review + if: ${{ !github.event.pull_request.draft }} + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GITHUB_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha }} + REPOSITORY: ${{ github.repository }} + PR_TITLE: ${{ github.event.pull_request.title }} + PR_BODY: ${{ github.event.pull_request.body }} + CHECKOUT_DIR: repo-checkout + REVIEW_WORKSPACE: codex-review-workspace + + steps: + - name: Checkout pull request merge commit + uses: actions/checkout@v5 + with: + path: ${{ env.CHECKOUT_DIR }} + ref: refs/pull/${{ github.event.pull_request.number }}/merge + + - name: Fetch base and head refs + run: | + set -euxo pipefail + git -C "${CHECKOUT_DIR}" fetch --no-tags origin \ + "${{ github.event.pull_request.base.ref }}" \ + +refs/pull/${{ github.event.pull_request.number }}/head + + - name: Prepare isolated review workspace + run: | + set -euo pipefail + rm -rf "${REVIEW_WORKSPACE}" + mkdir -p "${REVIEW_WORKSPACE}/.github/tmp" + python3 <<'PY' + import os + import pathlib + import shutil + import subprocess + + checkout = 
pathlib.Path(os.environ["CHECKOUT_DIR"]).resolve() + workspace = pathlib.Path(os.environ["REVIEW_WORKSPACE"]).resolve() + + def copy_file(rel_path: str) -> None: + src = checkout / rel_path + if not src.exists(): + return + dest = workspace / rel_path + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + + required_files = [ + "README.md", + "SKILL.md", + "docs/PLAN.md", + "docs/codex-app-server.md", + "docs/codex-cli-reference.md", + ".orca/skills/code-simplifier/SKILL.md", + ] + for rel_path in required_files: + copy_file(rel_path) + + changed_files = subprocess.run( + [ + "git", + "-C", + str(checkout), + "diff", + "--name-only", + "--diff-filter=ACMR", + os.environ["BASE_SHA"], + os.environ["HEAD_SHA"], + ], + check=True, + capture_output=True, + text=True, + ).stdout.splitlines() + + manifest = workspace / ".github" / "tmp" / "changed-files.txt" + manifest.write_text("\n".join(changed_files) + ("\n" if changed_files else "")) + + for rel_path in changed_files: + copy_file(rel_path) + PY + + - name: Generate structured output schema + run: | + set -euo pipefail + cat <<'JSON' > "${REVIEW_WORKSPACE}/codex-output-schema.json" + { + "type": "object", + "properties": { + "result": { + "type": "string", + "enum": ["NO_ISSUES", "HAS_FINDINGS"] + }, + "comment_body": { + "type": "string", + "minLength": 1 + } + }, + "required": ["result", "comment_body"], + "additionalProperties": false + } + JSON + + - name: Build Codex review prompt + run: | + set -euo pipefail + cat <<'PROMPT' > "${REVIEW_WORKSPACE}/codex-prompt.md" + REPO: ${REPOSITORY} + PR NUMBER: ${PR_NUMBER} + + You are reviewing a PR for Orca, a TypeScript CLI that plans and executes task graphs through Codex. + + REVIEW CONTEXT: + - Read `README.md` for the current public CLI/config surface. + - Read `SKILL.md` for project operating conventions and expected Orca behavior. + - Read `docs/PLAN.md` for architecture and run lifecycle expectations. 
+ - If the diff touches Codex protocol, session, or config behavior, also read: + - `docs/codex-app-server.md` + - `docs/codex-cli-reference.md` + - `code-simplifier` is bundled for Orca, but this is still a correctness review, not a style review. + + WORKSPACE SCOPE: + - The review workspace is intentionally isolated. It contains only: + 1. required context files (`README.md`, `SKILL.md`, relevant docs, and bundled review skill context) + 2. `.github/tmp/changed-files.txt` + 3. the PR-changed files that still exist at HEAD + - Treat `.github/tmp/changed-files.txt` as the allowlist for code review targets. + - Do not raise findings against files that are not in the changed-files allowlist. + - If a deleted file matters, use the unified diff only. Do not go hunting through unrelated code. + + SELF-REVIEW EXCLUSION: + - Ignore `.github/workflows/codex-code-review.yml` and the removed Claude review workflow file when reviewing this PR. + + PR TITLE: + ${PR_TITLE} + + PR BODY: + ${PR_BODY} + + YOUR JOB: Find correctness bugs, regressions, and security/runtime breakage that will affect real Orca users. + NOT YOUR JOB: Style preferences, naming suggestions, speculative future improvements, or docs nitpicks. + + RULES: + 1. "No issues found" is a VALID and PREFERRED outcome. Most PRs by experienced engineers are correct. If you are hedging, delete the finding. + 2. The diff is the source of truth, not the PR description. Review what changed, not what the description says should have changed. + 3. Before including any finding, ask: "Would this break current Orca behavior, a documented CLI/config contract, or a real user workflow?" If no, do not include it. + 4. Orca is Codex-only. Do not invent findings based on removed or unsupported Claude behavior. + 5. For task-runner, state-store, question-flow, planning, review, and PR workflow changes, focus on observable run-state regressions, stuck states, lost updates, incorrect transitions, and broken CLI behavior. + 6. 
For Codex session/app-server changes, focus on protocol mismatches, request/response handling, prompt/turn regressions, cancellation behavior, and failures caused by outdated or missing compatibility handling. + 7. For config and docs changes, only flag issues when the docs/config surface is demonstrably inconsistent with the implemented behavior in the diff or with included source files. Do not nitpick wording. + 8. Do not suggest tests unless the missing test is directly tied to a real bug or regression scenario in the changed code. + 9. When you find a real issue, be specific: exact file:line references, the concrete failure scenario, and why the current code causes it. + 10. Do not produce style-only findings such as extracting constants, renaming symbols, reorganizing helpers, or future-proofing abstractions. + 11. FINAL SELF-CHECK: If a finding depends on "someone might add X later", "consider", "could be cleaner", or "the abstraction feels wrong", delete it. + + SEVERITY LEVELS (only use these): + - CRITICAL: Production breakage, security issue, data loss, or a run-state/protocol bug that makes Orca unusable. + - BUG: Incorrect current behavior that users will hit. + - RISK: A concrete reproducible failure scenario with current inputs and callers. + + You MUST return valid JSON matching the schema. + - result: "NO_ISSUES" or "HAS_FINDINGS" + - comment_body: a complete PR comment for humans. If issues are found, list findings by severity with file:line references. If no issues are found, write: "No issues found. [one sentence summary of what the PR does]." 
+ PROMPT + + { + echo "" + echo "Repository: ${REPOSITORY}" + echo "Pull Request #: ${PR_NUMBER}" + echo "Base SHA: ${BASE_SHA}" + echo "Head SHA: ${HEAD_SHA}" + echo "" + echo "Changed files allowlist:" + cat "${REVIEW_WORKSPACE}/.github/tmp/changed-files.txt" + echo "" + echo "Changed files:" + git -C "${CHECKOUT_DIR}" --no-pager diff --name-status "${BASE_SHA}" "${HEAD_SHA}" + echo "" + echo "Unified diff (context=5):" + git -C "${CHECKOUT_DIR}" --no-pager diff --unified=5 "${BASE_SHA}" "${HEAD_SHA}" + } >> "${REVIEW_WORKSPACE}/codex-prompt.md" + + - name: Remove full checkout before review + run: | + set -euo pipefail + rm -rf "${CHECKOUT_DIR}" + + - name: Run Codex structured review + id: run-codex + uses: openai/codex-action@main + with: + openai-api-key: ${{ secrets.OPENAI_API_KEY }} + prompt-file: ${{ env.REVIEW_WORKSPACE }}/codex-prompt.md + output-schema-file: ${{ env.REVIEW_WORKSPACE }}/codex-output-schema.json + output-file: ${{ env.REVIEW_WORKSPACE }}/codex-output.json + working-directory: ${{ env.REVIEW_WORKSPACE }} + sandbox: read-only + safety-strategy: unsafe + allow-bots: true + model: gpt-5.4-2026-03-05 + + - name: Inspect structured output + if: ${{ always() }} + run: | + if [ -s "${REVIEW_WORKSPACE}/codex-output.json" ]; then + jq '.' 
"${REVIEW_WORKSPACE}/codex-output.json" + else + echo "Codex output file missing" + exit 1 + fi + + - name: Publish review comment + if: ${{ steps.run-codex.outcome == 'success' }} + env: + REVIEW_JSON: ${{ env.REVIEW_WORKSPACE }}/codex-output.json + run: | + set -euo pipefail + comment_body=$(jq -r '.comment_body' "$REVIEW_JSON") + + curl -sS \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/${REPOSITORY}/issues/${PR_NUMBER}/comments" \ + -d "$(jq -n --arg body "$comment_body" '{body: $body}')" >/dev/null + + - name: Gate PR on Codex result + if: ${{ steps.run-codex.outcome == 'success' }} + env: + REVIEW_JSON: ${{ env.REVIEW_WORKSPACE }}/codex-output.json + run: | + set -euo pipefail + result=$(jq -r '.result' "$REVIEW_JSON") + if [ "$result" != "NO_ISSUES" ]; then + echo "Codex review gate failed: result=$result" + exit 1 + fi + echo "Codex review gate passed: NO_ISSUES" + + - name: Upload Codex artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: codex-review-artifacts + path: | + ${{ env.REVIEW_WORKSPACE }}/codex-prompt.md + ${{ env.REVIEW_WORKSPACE }}/codex-output-schema.json + ${{ env.REVIEW_WORKSPACE }}/codex-output.json + ${{ env.REVIEW_WORKSPACE }}/.github/tmp/changed-files.txt diff --git a/bun.lock b/bun.lock index 5152e2a..541dc8e 100644 --- a/bun.lock +++ b/bun.lock @@ -6,7 +6,7 @@ "name": "orca", "dependencies": { "@inquirer/prompts": "^8.2.1", - "@ratley/codex-client": "^0.1.4", + "@ratley/codex-client": "^0.1.5", "chalk": "^5.3.0", "commander": "^13.1.0", "zod": "^4.3.6", @@ -121,7 +121,7 @@ "@publint/pack": ["@publint/pack@0.1.4", "", {}, "sha512-HDVTWq3H0uTXiU0eeSQntcVUTPP3GamzeXI41+x7uU9J65JgWQh3qWZHblR1i0npXfFtF+mxBiU2nJH8znxWnQ=="], - "@ratley/codex-client": ["@ratley/codex-client@0.1.4", "", { "peerDependencies": { "typescript": "^5" } }, 
"sha512-lefQMXoR12cImkNnZiHN/MraDdxKzrwH5DZSft6cQJ1s+tIZllNyCBBD6dtGX8O+8uz19SL/NyyHbBIDUtlIlw=="], + "@ratley/codex-client": ["@ratley/codex-client@0.1.5", "", { "peerDependencies": { "typescript": "^5" } }, "sha512-sYpKYpCXn//QZpKJyboWPCC1YuZrMBacKYmDBMyFRw3F59w4n1tec5n1yHRrfF19IBGsx5MtHevZjfu6tuEaXw=="], "@sindresorhus/is": ["@sindresorhus/is@4.6.0", "", {}, "sha512-t09vSN3MdfsyCHoFcTRCH/iUtG7OJ0CsjzB8cjAmKc/va/kIgeDI/TxsigdncE/4be734m0cvIYwNaV4i2XqAw=="], From c1c493e190597398e0fa773bfac31cc3586c9db5 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 02:56:38 -0700 Subject: [PATCH 08/19] Stabilize Codex review workflow --- .github/workflows/codex-code-review.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codex-code-review.yml b/.github/workflows/codex-code-review.yml index 71caeb1..03cc975 100644 --- a/.github/workflows/codex-code-review.yml +++ b/.github/workflows/codex-code-review.yml @@ -123,7 +123,7 @@ jobs: - name: Build Codex review prompt run: | set -euo pipefail - cat <<'PROMPT' > "${REVIEW_WORKSPACE}/codex-prompt.md" + cat < "${REVIEW_WORKSPACE}/codex-prompt.md" REPO: ${REPOSITORY} PR NUMBER: ${PR_NUMBER} @@ -206,7 +206,7 @@ jobs: - name: Run Codex structured review id: run-codex - uses: openai/codex-action@main + uses: openai/codex-action@v1 with: openai-api-key: ${{ secrets.OPENAI_API_KEY }} prompt-file: ${{ env.REVIEW_WORKSPACE }}/codex-prompt.md @@ -214,7 +214,6 @@ jobs: output-file: ${{ env.REVIEW_WORKSPACE }}/codex-output.json working-directory: ${{ env.REVIEW_WORKSPACE }} sandbox: read-only - safety-strategy: unsafe allow-bots: true model: gpt-5.4-2026-03-05 From d1c252f9ef9cb1cb086bf79454ca15efd8ba0f8d Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 03:00:42 -0700 Subject: [PATCH 09/19] Align Codex review workflow with working runner setup --- .github/workflows/codex-code-review.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/codex-code-review.yml b/.github/workflows/codex-code-review.yml index 03cc975..7ada7af 100644 --- a/.github/workflows/codex-code-review.yml +++ b/.github/workflows/codex-code-review.yml @@ -12,7 +12,7 @@ jobs: codex-structured-review: name: Run Codex structured review if: ${{ !github.event.pull_request.draft }} - runs-on: ubuntu-latest + runs-on: blacksmith-2vcpu-ubuntu-2404 permissions: contents: read pull-requests: write @@ -206,7 +206,7 @@ jobs: - name: Run Codex structured review id: run-codex - uses: openai/codex-action@v1 + uses: openai/codex-action@main with: openai-api-key: ${{ secrets.OPENAI_API_KEY }} prompt-file: ${{ env.REVIEW_WORKSPACE }}/codex-prompt.md @@ -214,6 +214,7 @@ jobs: output-file: ${{ env.REVIEW_WORKSPACE }}/codex-output.json working-directory: ${{ env.REVIEW_WORKSPACE }} sandbox: read-only + safety-strategy: unsafe allow-bots: true model: gpt-5.4-2026-03-05 From f872d41f4a9977383846325ee6fad75e68626f1a Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 03:05:44 -0700 Subject: [PATCH 10/19] Fix Codex review prompt generation --- .github/workflows/codex-code-review.yml | 29 +++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/codex-code-review.yml b/.github/workflows/codex-code-review.yml index 7ada7af..85f24f4 100644 --- a/.github/workflows/codex-code-review.yml +++ b/.github/workflows/codex-code-review.yml @@ -30,6 +30,14 @@ jobs: REVIEW_WORKSPACE: codex-review-workspace steps: + - name: Validate OpenAI secret + run: | + set -euo pipefail + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "::error::OPENAI_API_KEY is not configured for this repository or organization." 
+ exit 1 + fi + - name: Checkout pull request merge commit uses: actions/checkout@v5 with: @@ -123,10 +131,7 @@ jobs: - name: Build Codex review prompt run: | set -euo pipefail - cat < "${REVIEW_WORKSPACE}/codex-prompt.md" - REPO: ${REPOSITORY} - PR NUMBER: ${PR_NUMBER} - + cat <<'PROMPT' > "${REVIEW_WORKSPACE}/codex-prompt.md" You are reviewing a PR for Orca, a TypeScript CLI that plans and executes task graphs through Codex. REVIEW CONTEXT: @@ -150,12 +155,6 @@ jobs: SELF-REVIEW EXCLUSION: - Ignore `.github/workflows/codex-code-review.yml` and the removed Claude review workflow file when reviewing this PR. - PR TITLE: - ${PR_TITLE} - - PR BODY: - ${PR_BODY} - YOUR JOB: Find correctness bugs, regressions, and security/runtime breakage that will affect real Orca users. NOT YOUR JOB: Style preferences, naming suggestions, speculative future improvements, or docs nitpicks. @@ -183,6 +182,14 @@ jobs: PROMPT { + echo "REPO: ${REPOSITORY}" + echo "PR NUMBER: ${PR_NUMBER}" + echo "" + echo "PR TITLE:" + printf '%s\n' "${PR_TITLE}" + echo "" + echo "PR BODY:" + printf '%s\n' "${PR_BODY}" echo "" echo "Repository: ${REPOSITORY}" echo "Pull Request #: ${PR_NUMBER}" @@ -219,7 +226,7 @@ jobs: model: gpt-5.4-2026-03-05 - name: Inspect structured output - if: ${{ always() }} + if: ${{ steps.run-codex.outcome == 'success' }} run: | if [ -s "${REVIEW_WORKSPACE}/codex-output.json" ]; then jq '.' 
"${REVIEW_WORKSPACE}/codex-output.json" From 5c7ff0cc92f5a4389bd5c7578256ed83d52f47d0 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 03:34:50 -0700 Subject: [PATCH 11/19] Harden question answer parsing --- src/core/question-flow.test.ts | 76 ++++++++++++++++++++++++ src/core/question-flow.ts | 102 +++++++++++++++++++-------------- 2 files changed, 135 insertions(+), 43 deletions(-) create mode 100644 src/core/question-flow.test.ts diff --git a/src/core/question-flow.test.ts b/src/core/question-flow.test.ts new file mode 100644 index 0000000..c7fb8a7 --- /dev/null +++ b/src/core/question-flow.test.ts @@ -0,0 +1,76 @@ +import { describe, expect, test } from "bun:test"; + +import type { PendingQuestion } from "../types/index.js"; +import { parseQuestionAnswerInput } from "./question-flow.js"; + +function makePendingQuestion(questions: PendingQuestion["questions"]): PendingQuestion { + return { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions, + }; +} + +describe("parseQuestionAnswerInput", () => { + test("treats JSON snippets as plain text for a single pending question", () => { + const pendingQuestion = makePendingQuestion([ + { + header: "Config", + id: "config", + question: "What config should I use?", + isOther: true, + isSecret: false, + }, + ]); + + expect(parseQuestionAnswerInput('{"useMigration":true}', pendingQuestion)).toEqual({ + answers: { + config: { + answers: ['{"useMigration":true}'], + }, + }, + }); + }); + + test("rejects documented answers payloads that include unknown question ids", () => { + const pendingQuestion = makePendingQuestion([ + { + header: "Backend", + id: "backend", + question: "Which backend should I use?", + isOther: true, + isSecret: false, + }, + ]); + + expect(() => + parseQuestionAnswerInput('{"answers":{"backedn":{"answers":["bun"]}}}', pendingQuestion) + ).toThrow("answer payload includes unknown question id 
'backedn'"); + }); + + test("requires every pending question id in explicit answers payloads", () => { + const pendingQuestion = makePendingQuestion([ + { + header: "Runtime", + id: "runtime", + question: "Which runtime should I use?", + isOther: true, + isSecret: false, + }, + { + header: "Package Manager", + id: "package_manager", + question: "Which package manager should I use?", + isOther: true, + isSecret: false, + }, + ]); + + expect(() => + parseQuestionAnswerInput('{"answers":{"runtime":{"answers":["bun"]}}}', pendingQuestion) + ).toThrow("answer payload is missing question id 'package_manager'"); + }); +}); diff --git a/src/core/question-flow.ts b/src/core/question-flow.ts index 062d61d..1f4c011 100644 --- a/src/core/question-flow.ts +++ b/src/core/question-flow.ts @@ -84,6 +84,45 @@ export function serializeQuestionAnswerResponse(response: ToolRequestUserInputRe return `${JSON.stringify(response, null, 2)}\n`; } +function buildSingleQuestionTextResponse(question: PendingQuestionPrompt, answer: string): ToolRequestUserInputResponse { + return { + answers: { + [question.id]: { + answers: [answer], + }, + }, + }; +} + +function normalizeStructuredAnswers( + answerRecord: Record, + pendingQuestion: PendingQuestion, +): Record { + const expectedQuestionIds = new Set(pendingQuestion.questions.map((question) => question.id)); + + for (const questionId of Object.keys(answerRecord)) { + if (!expectedQuestionIds.has(questionId)) { + throw new Error(`answer payload includes unknown question id '${questionId}'`); + } + } + + const normalizedAnswers: Record = {}; + for (const question of pendingQuestion.questions) { + if (!(question.id in answerRecord)) { + throw new Error(`answer payload is missing question id '${question.id}'`); + } + + const answers = normalizeAnswerList(answerRecord[question.id]); + if (answers === null) { + throw new Error(`answer payload for '${question.id}' must be a string, string array, or { answers: string[] }`); + } + + 
normalizedAnswers[question.id] = { answers }; + } + + return normalizedAnswers; +} + export function parseQuestionAnswerInput( rawInput: string, pendingQuestion: PendingQuestion, @@ -93,65 +132,42 @@ export function parseQuestionAnswerInput( throw new Error("answer payload is empty"); } - if (pendingQuestion.questions.length === 1 && !trimmed.startsWith("{")) { - const onlyQuestion = pendingQuestion.questions[0]; - if (!onlyQuestion) { - throw new Error("pending question is missing its question definition"); - } + const onlyQuestion = pendingQuestion.questions.length === 1 ? pendingQuestion.questions[0] : undefined; + if (pendingQuestion.questions.length === 1 && !onlyQuestion) { + throw new Error("pending question is missing its question definition"); + } - return { - answers: { - [onlyQuestion.id]: { - answers: [trimmed], - }, - }, - }; + if (pendingQuestion.questions.length === 1 && !trimmed.startsWith("{")) { + return buildSingleQuestionTextResponse(onlyQuestion!, trimmed); } let parsed: unknown; try { parsed = JSON.parse(trimmed); - } catch (error) { - throw new Error( - pendingQuestion.questions.length === 1 - ? `answer payload is not valid JSON: ${error instanceof Error ? 
error.message : String(error)}` - : "multiple pending questions require a JSON object mapping question ids to answers", - ); + } catch { + if (pendingQuestion.questions.length === 1) { + return buildSingleQuestionTextResponse(onlyQuestion!, trimmed); + } + + throw new Error("multiple pending questions require a JSON object mapping question ids to answers"); } if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + if (pendingQuestion.questions.length === 1) { + return buildSingleQuestionTextResponse(onlyQuestion!, trimmed); + } + throw new Error("answer payload must be a JSON object"); } const record = parsed as Record; if ("answers" in record && record.answers && typeof record.answers === "object" && !Array.isArray(record.answers)) { - const normalizedAnswers: Record = {}; - - for (const [questionId, answerValue] of Object.entries(record.answers as Record)) { - const answers = normalizeAnswerList(answerValue); - if (answers === null) { - throw new Error(`answer payload for '${questionId}' must be a string, string array, or { answers: string[] }`); - } - - normalizedAnswers[questionId] = { answers }; - } - - return { answers: normalizedAnswers }; + return { answers: normalizeStructuredAnswers(record.answers as Record, pendingQuestion) }; } - const normalizedAnswers: Record = {}; - for (const question of pendingQuestion.questions) { - if (!(question.id in record)) { - throw new Error(`answer payload is missing question id '${question.id}'`); - } - - const answers = normalizeAnswerList(record[question.id]); - if (answers === null) { - throw new Error(`answer payload for '${question.id}' must be a string, string array, or { answers: string[] }`); - } - - normalizedAnswers[question.id] = { answers }; + if (pendingQuestion.questions.length === 1 && !(onlyQuestion!.id in record)) { + return buildSingleQuestionTextResponse(onlyQuestion!, trimmed); } - return { answers: normalizedAnswers }; + return { answers: normalizeStructuredAnswers(record, 
pendingQuestion) }; } From 83c125625e700fff7334825d9289c6f304a89f53 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 03:56:38 -0700 Subject: [PATCH 12/19] Fix review feedback on question flow and Codex path resolution --- src/agents/codex/codex-path.test.ts | 28 +++ src/agents/codex/codex-path.ts | 17 +- src/agents/codex/session.ts | 176 ++++++++++++++- src/agents/codex/session.unit.test.ts | 304 ++++++++++++++++++++++++++ src/cli/commands/answer.test.ts | 64 +++++- src/cli/commands/answer.ts | 70 +++++- src/cli/commands/run.ts | 1 + src/core/planner.ts | 1 + src/core/task-runner.ts | 1 + src/state/schema.ts | 7 + src/types/index.ts | 7 + 11 files changed, 654 insertions(+), 22 deletions(-) diff --git a/src/agents/codex/codex-path.test.ts b/src/agents/codex/codex-path.test.ts index 9ff7a96..819b1a6 100644 --- a/src/agents/codex/codex-path.test.ts +++ b/src/agents/codex/codex-path.test.ts @@ -1,8 +1,12 @@ +import { chmodSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import os from "node:os"; +import path from "node:path"; import { describe, expect, test } from "bun:test"; import { clearResolvedCodexPathCacheForTests, compareCodexCliVersions, + resolveCodexPathsOnPath, parseCodexCliVersion, selectPreferredCodexBinary, } from "./codex-path.js"; @@ -67,6 +71,30 @@ describe("codex-path", () => { ).toBe("/first/codex"); }); + test("resolveCodexPathsOnPath includes all executable codex binaries on PATH", () => { + const tempRoot = mkdtempSync(path.join(os.tmpdir(), "orca-codex-path-")); + const firstDir = path.join(tempRoot, "first"); + const secondDir = path.join(tempRoot, "second"); + const firstCodex = path.join(firstDir, "codex"); + const secondCodex = path.join(secondDir, "codex"); + + mkdirSync(firstDir, { recursive: true }); + mkdirSync(secondDir, { recursive: true }); + writeFileSync(firstCodex, "#!/bin/sh\necho codex-cli 0.77.0\n", { mode: 0o755 }); + writeFileSync(secondCodex, "#!/bin/sh\necho codex-cli 0.115.0\n", { 
mode: 0o755 }); + chmodSync(firstCodex, 0o755); + chmodSync(secondCodex, 0o755); + + try { + expect(resolveCodexPathsOnPath([firstDir, secondDir].join(path.delimiter))).toEqual([ + firstCodex, + secondCodex, + ]); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + }); + test("clearResolvedCodexPathCacheForTests is callable", () => { clearResolvedCodexPathCacheForTests(); expect(true).toBe(true); diff --git a/src/agents/codex/codex-path.ts b/src/agents/codex/codex-path.ts index 85d58eb..0845eee 100644 --- a/src/agents/codex/codex-path.ts +++ b/src/agents/codex/codex-path.ts @@ -143,13 +143,14 @@ export function selectPreferredCodexBinary(probes: CodexBinaryProbe[]): string | return best.path; } -function resolveCodexPathOnPath(): string | null { - const pathValue = process.env.PATH?.trim(); - if (!pathValue) { - return null; +export function resolveCodexPathsOnPath(pathValue: string | undefined = process.env.PATH): string[] { + const normalizedPath = pathValue?.trim(); + if (!normalizedPath) { + return []; } - for (const entry of pathValue.split(path.delimiter)) { + const discovered: string[] = []; + for (const entry of normalizedPath.split(path.delimiter)) { const trimmed = entry.trim(); if (trimmed.length === 0) { continue; @@ -158,19 +159,19 @@ function resolveCodexPathOnPath(): string | null { const candidatePath = path.join(trimmed, "codex"); try { accessSync(candidatePath, fsConstants.X_OK); - return candidatePath; + discovered.push(candidatePath); } catch { continue; } } - return null; + return discovered; } function getCandidatePaths(): string[] { return Array.from( new Set( - [resolveCodexPathOnPath(), ...KNOWN_CODEX_BINARY_CANDIDATES].filter( + [...resolveCodexPathsOnPath(), ...KNOWN_CODEX_BINARY_CANDIDATES].filter( (value): value is string => typeof value === "string" && value.trim().length > 0, ), ), diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index 655085f..49bc0ec 100644 --- 
a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -1,4 +1,7 @@ +import { randomUUID } from "node:crypto"; +import { createServer } from "node:net"; import { readFile, unlink } from "node:fs/promises"; +import os from "node:os"; import path from "node:path"; import { CodexClient } from "@ratley/codex-client"; @@ -12,6 +15,7 @@ import type { import type { HookEvent, OrcaConfig, + PendingAnswerChannel, PlanResult, RunId, Task, @@ -682,9 +686,127 @@ async function clearAnswerFile(store: RunStore, runId: RunId): Promise { await unlink(answerPath).catch(() => undefined); } +type ResumeOverallStatus = "planning" | "running"; + +type SecretAnswerChannelState = { + requestId: RequestId; + descriptor: PendingAnswerChannel; + nextSubmission: () => Promise; + close: () => Promise; +}; + +type SecretAnswerChannelFactory = (requestId: RequestId) => Promise; + +let testSecretAnswerChannelFactory: SecretAnswerChannelFactory | null = null; + +export function setSecretAnswerChannelFactoryForTests(factory: SecretAnswerChannelFactory | null): void { + testSecretAnswerChannelFactory = factory; +} + +function hasSecretQuestions(params: ToolRequestUserInputParams | { questions: Array<{ isSecret?: boolean }> }): boolean { + return params.questions.some((question) => question.isSecret === true); +} + +async function createSecretAnswerChannel(requestId: RequestId): Promise { + if (testSecretAnswerChannelFactory) { + return await testSecretAnswerChannelFactory(requestId); + } + + const token = randomUUID(); + const socketPath = process.platform === "win32" + ? 
`\\\\.\\pipe\\orca-answer-${process.pid}-${Date.now()}-${Math.random().toString(16).slice(2)}` + : path.join(os.tmpdir(), `orca-answer-${process.pid}-${Date.now()}-${Math.random().toString(16).slice(2)}.sock`); + const queuedAnswers: string[] = []; + const waitingResolvers: Array<(answer: string) => void> = []; + const server = createServer((socket) => { + socket.setEncoding("utf8"); + let buffer = ""; + let handled = false; + + socket.on("data", (chunk) => { + buffer += chunk; + if (handled || !buffer.includes("\n")) { + return; + } + + handled = true; + let response: { ok: boolean; error?: string } = { ok: true }; + + try { + const parsed = JSON.parse(buffer.trim()) as { token?: unknown; answer?: unknown }; + if (parsed.token !== token) { + throw new Error("invalid secret answer token"); + } + if (typeof parsed.answer !== "string" || parsed.answer.trim().length === 0) { + throw new Error("secret answer payload must include a non-empty answer string"); + } + + const resolver = waitingResolvers.shift(); + if (resolver) { + resolver(parsed.answer); + } else { + queuedAnswers.push(parsed.answer); + } + } catch (error) { + response = { + ok: false as const, + error: error instanceof Error ? 
error.message : String(error), + }; + } + + socket.end(`${JSON.stringify(response)}\n`); + }); + + socket.on("error", () => { + socket.destroy(); + }); + }); + + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(socketPath, () => { + server.off("error", reject); + resolve(); + }); + }); + + return { + requestId, + descriptor: { + transport: "ipc", + path: socketPath, + token, + }, + nextSubmission: async () => { + const next = queuedAnswers.shift(); + if (next !== undefined) { + return next; + } + + return await new Promise((resolve) => { + waitingResolvers.push(resolve); + }); + }, + close: async () => { + for (const resolve of waitingResolvers.splice(0)) { + resolve(""); + } + + await new Promise((resolve) => { + server.close(() => resolve()); + }); + + if (process.platform !== "win32") { + await unlink(socketPath).catch(() => undefined); + } + }, + }; +} + export interface SessionInteractionContext { runId: RunId; store: RunStore; + resumeOverallStatus?: ResumeOverallStatus; emitHook?: (event: HookEvent) => Promise; } @@ -728,6 +850,8 @@ export async function createCodexSession( await warnAboutUnavailableMcpServers(client); let activeTaskContext: { taskId: string; taskName: string } | undefined; + let activeSecretAnswerChannel: SecretAnswerChannelState | undefined; + const resumedOverallStatus: ResumeOverallStatus = interactionContext?.resumeOverallStatus ?? 
"running"; const respondToUserInputRequest = (requestId: RequestId, response: ToolRequestUserInputResponse): void => { const specificResponder = Reflect.get(client as object, "respondToUserInputRequest"); @@ -757,8 +881,13 @@ export async function createCodexSession( const clearPendingQuestion = async ( requestId: RequestId, - overallStatus?: "running" | "waiting_for_answer" + overallStatus?: ResumeOverallStatus | "waiting_for_answer" ): Promise => { + if (activeSecretAnswerChannel?.requestId === requestId) { + await activeSecretAnswerChannel.close().catch(() => undefined); + activeSecretAnswerChannel = undefined; + } + if (!interactionContext) { return; } @@ -771,6 +900,7 @@ export async function createCodexSession( await interactionContext.store.updateRun(interactionContext.runId, { ...(overallStatus ? { overallStatus } : {}), pendingQuestion: undefined, + answerChannel: undefined, }); }; @@ -791,9 +921,19 @@ export async function createCodexSession( const pendingQuestion = createPendingQuestion(request.requestId, request); await clearAnswerFile(interactionContext.store, interactionContext.runId); + let secretAnswerChannel: SecretAnswerChannelState | undefined; + if (hasSecretQuestions(request)) { + secretAnswerChannel = await createSecretAnswerChannel(request.requestId); + activeSecretAnswerChannel = secretAnswerChannel; + } else if (activeSecretAnswerChannel) { + await activeSecretAnswerChannel.close().catch(() => undefined); + activeSecretAnswerChannel = undefined; + } + await interactionContext.store.updateRun(interactionContext.runId, { overallStatus: "waiting_for_answer", pendingQuestion, + ...(secretAnswerChannel ? 
{ answerChannel: secretAnswerChannel.descriptor } : { answerChannel: undefined }), }); if (interactionContext.emitHook) { @@ -817,6 +957,7 @@ export async function createCodexSession( } const answerPath = path.join(interactionContext.store.getRunDir(interactionContext.runId), "answer.txt"); + let nextSecretAnswer = secretAnswerChannel?.nextSubmission(); while (true) { const currentRun = await interactionContext.store.getRun(interactionContext.runId); @@ -832,22 +973,35 @@ export async function createCodexSession( } let rawAnswer: string; - try { - rawAnswer = await readFile(answerPath, "utf8"); - } catch (error) { - if ((error as NodeJS.ErrnoException).code === "ENOENT") { - await sleep(ANSWER_FILE_POLL_MS); + if (secretAnswerChannel) { + const submittedSecretAnswer = await Promise.race([ + nextSecretAnswer ?? Promise.resolve(""), + sleep(ANSWER_FILE_POLL_MS).then(() => null), + ]); + if (submittedSecretAnswer === null) { continue; } - throw error; + rawAnswer = submittedSecretAnswer; + nextSecretAnswer = secretAnswerChannel.nextSubmission(); + } else { + try { + rawAnswer = await readFile(answerPath, "utf8"); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + await sleep(ANSWER_FILE_POLL_MS); + continue; + } + + throw error; + } } try { const parsedAnswer = parseQuestionAnswerInput(rawAnswer, pendingQuestion); respondToUserInputRequest(request.requestId, parsedAnswer); await clearAnswerFile(interactionContext.store, interactionContext.runId); - await clearPendingQuestion(request.requestId, "running"); + await clearPendingQuestion(request.requestId, resumedOverallStatus); return; } catch (error) { const message = error instanceof Error ? 
error.message : String(error); @@ -872,7 +1026,7 @@ export async function createCodexSession( ); on.call(client, "serverRequest:resolved", (notification: { requestId: RequestId }) => { - void clearPendingQuestion(notification.requestId, "running"); + void clearPendingQuestion(notification.requestId, resumedOverallStatus); }); } @@ -1016,6 +1170,10 @@ export async function createCodexSession( }, async disconnect(): Promise { + if (activeSecretAnswerChannel) { + await activeSecretAnswerChannel.close().catch(() => undefined); + activeSecretAnswerChannel = undefined; + } await client.disconnect(); }, }; diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index d26aa05..8623027 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -719,6 +719,130 @@ describe("codex session inline skill context", () => { }); describe("codex session question flow", () => { + test("restores planning status after answering a planning-time clarification", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-planning-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "plan", overallStatus: "planning" }); + + const responses: Array<{ requestId: string | number; response: unknown }> = []; + let resolveAnswerResponse: (() => void) | undefined; + const answerResponse = new Promise((resolve) => { + resolveAnswerResponse = resolve; + }); + let clientInstance: EventEmitter | null = null; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ 
reviewText: string }> { + return { reviewText: "ok" }; + } + respondToUserInputRequest(requestId: string | number, response: unknown): void { + responses.push({ requestId, response }); + resolveAnswerResponse?.(); + } + rejectServerRequest(): void {} + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Framework", + id: "framework", + question: "Which framework should I target?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }); + }); + + await answerResponse; + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + + return { + agentMessage: "[]", + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "planning", + }); + + try { + const planningPromise = session.planSpec("spec", "context"); + + const waitingRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run?.pendingQuestion ? run : null; + }); + + expect(waitingRun.overallStatus).toBe("waiting_for_answer"); + + const answerPath = path.join(store.getRunDir(runId), "answer.txt"); + await writeFile( + answerPath, + `${JSON.stringify({ answers: { framework: { answers: ["bun"] } } })}\n`, + "utf8", + ); + + await planningPromise; + + const resumedRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run && run.pendingQuestion === undefined ? 
run : null; + }); + + expect(resumedRun.overallStatus).toBe("planning"); + expect(responses).toEqual([ + { + requestId: "req-1", + response: { + answers: { + framework: { + answers: ["bun"], + }, + }, + }, + }, + ]); + } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); + test("persists pending questions, emits onQuestion, and resumes the same run after an answer", async () => { const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-")); const store = new RunStore(path.join(tempDir, "runs")); @@ -999,6 +1123,186 @@ describe("codex session question flow", () => { }, ]); } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); + + test("uses a direct answer channel for secret questions and clears it after resume", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-secret-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "run", overallStatus: "running" }); + + const responses: Array<{ requestId: string | number; response: unknown }> = []; + let resolveAnswerResponse: (() => void) | undefined; + const answerResponse = new Promise((resolve) => { + resolveAnswerResponse = resolve; + }); + let clientInstance: EventEmitter | null = null; + let submitSecretAnswer: ((answer: string) => void) | undefined; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + 
respondToUserInputRequest(requestId: string | number, response: unknown): void { + responses.push({ requestId, response }); + resolveAnswerResponse?.(); + } + rejectServerRequest(): void {} + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "API Key", + id: "api_key", + question: "Which API key should I use?", + isOther: true, + isSecret: true, + options: null, + }, + ], + }); + }); + + await answerResponse; + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const sessionModule = await import(`./session.ts?test=${Math.random()}`); + sessionModule.setSecretAnswerChannelFactoryForTests(async (requestId) => { + let queuedAnswer: string | undefined; + let waitingResolver: ((answer: string) => void) | undefined; + + submitSecretAnswer = (answer: string) => { + if (waitingResolver) { + const resolve = waitingResolver; + waitingResolver = undefined; + resolve(answer); + return; + } + + queuedAnswer = answer; + }; + + return { + requestId, + descriptor: { + transport: "ipc", + path: "/tmp/orca-test-secret-answer.sock", + token: "secret-token", + }, + nextSubmission: async () => { + if (queuedAnswer !== undefined) { + const answer = queuedAnswer; + queuedAnswer = undefined; + return answer; + } + + return await new Promise((resolve) => { + waitingResolver = resolve; + }); + }, + close: async () => { + if (waitingResolver) { + const resolve = waitingResolver; + waitingResolver = undefined; + resolve(""); + } + }, + }; + }); + const session = await sessionModule.createCodexSession(process.cwd(), undefined, { + 
runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + const executionPromise = session.executeTask( + { + id: "task-1", + name: "Configure auth", + description: "Use the provided secret.", + dependencies: [], + acceptance_criteria: ["Auth is configured"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + const waitingRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run?.answerChannel ? run : null; + }); + + expect(waitingRun.overallStatus).toBe("waiting_for_answer"); + expect(waitingRun.answerChannel?.transport).toBe("ipc"); + + submitSecretAnswer?.("super-secret"); + + const result = await executionPromise; + expect(result.outcome).toBe("done"); + expect(responses).toEqual([ + { + requestId: "req-1", + response: { + answers: { + api_key: { + answers: ["super-secret"], + }, + }, + }, + }, + ]); + + const resumedRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run && run.pendingQuestion === undefined ? 
run : null; + }); + + expect(resumedRun.answerChannel).toBeUndefined(); + expect(resumedRun.overallStatus).toBe("running"); + await expect(readFile(path.join(store.getRunDir(runId), "answer.txt"), "utf8")).rejects.toThrow(); + } finally { + sessionModule.setSecretAnswerChannelFactoryForTests(null); await session.disconnect(); } } finally { diff --git a/src/cli/commands/answer.test.ts b/src/cli/commands/answer.test.ts index d361ac0..e2f42f4 100644 --- a/src/cli/commands/answer.test.ts +++ b/src/cli/commands/answer.test.ts @@ -27,17 +27,25 @@ function setStdoutTty(value: boolean): void { async function loadAnswerModule(options?: { input?: () => Promise; select?: () => Promise; -}): Promise<{ answerModule: AnswerModule; inputMock: ReturnType; selectMock: ReturnType }> { + password?: () => Promise; +}): Promise<{ + answerModule: AnswerModule; + inputMock: ReturnType; + selectMock: ReturnType; + passwordMock: ReturnType; +}> { const inputMock = mock(options?.input ?? (async () => "prompt answer")); const selectMock = mock(options?.select ?? (async () => "selected-run-1000-abcd")); + const passwordMock = mock(options?.password ?? 
(async () => "prompt secret")); mock.module("@inquirer/prompts", () => ({ input: inputMock, + password: passwordMock, select: selectMock })); const answerModule = await import(`./answer.js?test=${Math.random()}`); - return { answerModule, inputMock, selectMock }; + return { answerModule, inputMock, selectMock, passwordMock }; } beforeEach(async () => { @@ -284,6 +292,58 @@ describe("answer command", () => { ); }); + test("uses password prompt and direct answer channel for secret questions", async () => { + const runId = "answer-secret-1000-abcd"; + const submissions: string[] = []; + + const store = new RunStore(runsDir); + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-1", + threadId: "thread-1", + turnId: "turn-1", + itemId: "item-1", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "API Key", + id: "api_key", + question: "What API key should I use?", + isOther: true, + isSecret: true, + options: null, + }, + ], + }, + answerChannel: { + transport: "ipc", + path: path.join(tempDir, "secret-answer.sock"), + token: "secret-token", + }, + }); + setStdoutTty(true); + + const { answerModule, passwordMock, inputMock } = await loadAnswerModule({ + password: async () => "super-secret", + }); + answerModule.setAnswerChannelSubmitterForTests(async (channel, payload) => { + submissions.push(JSON.stringify({ channel, payload })); + }); + + await answerModule.answerCommandHandler(runId, undefined, {}); + + expect(passwordMock).toHaveBeenCalled(); + expect(inputMock).not.toHaveBeenCalled(); + expect(submissions).toHaveLength(1); + expect(submissions[0]).toContain('"token":"secret-token"'); + expect(submissions[0]).toContain('\\"super-secret\\"'); + + const answerPath = path.join(runsDir, runId, "answer.txt"); + await expect(readFile(answerPath, "utf8")).rejects.toThrow(); + }); + test("fails when positional run-id and --run are both provided", async 
() => { const { answerModule } = await loadAnswerModule(); diff --git a/src/cli/commands/answer.ts b/src/cli/commands/answer.ts index 5c51718..30e06e4 100644 --- a/src/cli/commands/answer.ts +++ b/src/cli/commands/answer.ts @@ -1,18 +1,27 @@ import path from "node:path"; import { promises as fs } from "node:fs"; +import { connect } from "node:net"; -import { input } from "@inquirer/prompts"; +import { input, password } from "@inquirer/prompts"; import type { Command } from "commander"; import { parseQuestionAnswerInput, serializeQuestionAnswerResponse } from "../../core/question-flow.js"; import { RunStore } from "../../state/store.js"; -import type { PendingQuestion } from "../../types/index.js"; +import type { PendingAnswerChannel, PendingQuestion } from "../../types/index.js"; import { selectRun } from "../../utils/select-run.js"; export interface AnswerCommandOptions { run?: string; } +type AnswerChannelSubmitter = (channel: PendingAnswerChannel, payload: string) => Promise; + +let testAnswerChannelSubmitter: AnswerChannelSubmitter | null = null; + +export function setAnswerChannelSubmitterForTests(submitter: AnswerChannelSubmitter | null): void { + testAnswerChannelSubmitter = submitter; +} + function createStore(): RunStore { const runsDir = process.env.ORCA_RUNS_DIR; return runsDir ? new RunStore(runsDir) : new RunStore(); @@ -33,6 +42,15 @@ function formatQuestionPrompt(question: PendingQuestion["questions"][number]): s return `${question.header}: ${question.question}${options}`; } +function hasSecretQuestions(pendingQuestion: PendingQuestion | undefined): boolean { + return pendingQuestion?.questions.some((question) => question.isSecret) ?? false; +} + +async function promptForQuestionAnswer(question: PendingQuestion["questions"][number]): Promise { + const prompt = { message: formatQuestionPrompt(question) }; + return question.isSecret ? 
await password(prompt) : await input(prompt); +} + async function resolveAnswerPayload( pendingQuestion: PendingQuestion | undefined, answerArg: string | undefined, @@ -56,7 +74,7 @@ async function resolveAnswerPayload( const answers: Record = {}; for (const question of pendingQuestion.questions) { - const value = await input({ message: formatQuestionPrompt(question) }); + const value = await promptForQuestionAnswer(question); if (!value) { throw new Error(`no answer provided for question '${question.id}'`); } @@ -67,6 +85,41 @@ async function resolveAnswerPayload( return JSON.stringify({ answers }); } +async function submitAnswerViaChannel(channel: PendingAnswerChannel, payload: string): Promise { + if (testAnswerChannelSubmitter) { + await testAnswerChannelSubmitter(channel, payload); + return; + } + + await new Promise((resolve, reject) => { + const socket = connect(channel.path); + socket.setEncoding("utf8"); + + let responseBuffer = ""; + + socket.on("connect", () => { + socket.write(`${JSON.stringify({ token: channel.token, answer: payload })}\n`); + }); + + socket.on("data", (chunk) => { + responseBuffer += chunk; + }); + + socket.on("error", reject); + socket.on("end", () => { + try { + const parsed = JSON.parse(responseBuffer.trim()) as { ok?: boolean; error?: unknown }; + if (parsed.ok !== true) { + throw new Error(typeof parsed.error === "string" ? parsed.error : "secret answer channel rejected the payload"); + } + resolve(); + } catch (error) { + reject(error); + } + }); + }); +} + export async function answerCommandHandler( positionalRunId: string | undefined, answerArg: string | undefined, @@ -105,6 +158,17 @@ export async function answerCommandHandler( const serialized = run.pendingQuestion ? 
serializeQuestionAnswerResponse(parseQuestionAnswerInput(answerPayload, run.pendingQuestion)) : `${answerPayload}\n`; + + if (hasSecretQuestions(run.pendingQuestion)) { + if (!run.answerChannel) { + throw new Error("run is waiting for a secret answer but has no active answer channel"); + } + + await submitAnswerViaChannel(run.answerChannel, answerPayload); + console.log(`Answer submitted. Run ${runId} will resume shortly.`); + return; + } + const answerPath = path.join(store.getRunDir(runId), "answer.txt"); await fs.mkdir(path.dirname(answerPath), { recursive: true }); await fs.writeFile(answerPath, serialized, "utf8"); diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 71b3d52..20ee6f9 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -478,6 +478,7 @@ export async function runCommandHandler(options: RunCommandOptions): Promise { codexSession = await createCodexSession(process.cwd(), config, { runId, store, + resumeOverallStatus: "running", emitHook, }); executeTaskFn = (task, taskRunId, _cfg, systemContext) => diff --git a/src/state/schema.ts b/src/state/schema.ts index bc4ac03..9c95c5a 100644 --- a/src/state/schema.ts +++ b/src/state/schema.ts @@ -51,6 +51,12 @@ const PendingQuestionSchema = z.object({ questions: z.array(PendingQuestionPromptSchema) }); +const PendingAnswerChannelSchema = z.object({ + transport: z.literal("ipc"), + path: z.string(), + token: z.string(), +}); + const PrStatusSchema = z.object({ draftTitle: z.string().optional(), draftBody: z.string().optional(), @@ -78,6 +84,7 @@ export const RunStatusSchema = z.object({ milestones: z.array(z.string()), errors: z.array(ErrorEntrySchema), pendingQuestion: PendingQuestionSchema.optional(), + answerChannel: PendingAnswerChannelSchema.optional(), pr: PrStatusSchema.optional() }); diff --git a/src/types/index.ts b/src/types/index.ts index 4a55b02..b556443 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -49,6 +49,7 @@ export interface RunStatus { 
milestones: string[]; errors: Array<{ at: string; message: string; taskId?: string }>; pendingQuestion?: PendingQuestion | undefined; + answerChannel?: PendingAnswerChannel | undefined; pr?: { draftTitle?: string; draftBody?: string; @@ -81,6 +82,12 @@ export interface PendingQuestion { questions: PendingQuestionPrompt[]; } +export interface PendingAnswerChannel { + transport: "ipc"; + path: string; + token: string; +} + export interface BaseHookEvent { runId: RunId; message: string; From ef20a2126e5e79fc6647ca664ef0eea24138f5ee Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 04:07:04 -0700 Subject: [PATCH 13/19] Fix status question ids and skills list metadata fallback --- src/agents/codex/session.ts | 54 ++++++++++++++----- src/agents/codex/session.unit.test.ts | 77 +++++++++++++++++++++++++++ src/cli/commands/status.test.ts | 44 ++++++++++++++- src/core/question-flow.ts | 2 +- 4 files changed, 163 insertions(+), 14 deletions(-) diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index 49bc0ec..0dd4fe0 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -416,6 +416,10 @@ function buildTurnInput(text: string, skills: LoadedSkill[]): Array<{ type: "tex interface RawSkill { name?: unknown; + description?: unknown; + shortDescription?: unknown; + interface?: unknown; + dependencies?: unknown; path?: unknown; } @@ -424,6 +428,26 @@ interface RawSkillsListEntry { skills?: unknown; } +function renderSkillMetadataBody(skill: RawSkill): string { + const sections: string[] = []; + + if (typeof skill.description === "string" && skill.description.trim().length > 0) { + sections.push(skill.description.trim()); + } else if (typeof skill.shortDescription === "string" && skill.shortDescription.trim().length > 0) { + sections.push(skill.shortDescription.trim()); + } + + if (skill.interface && typeof skill.interface === "object") { + sections.push(`Interface:\n${JSON.stringify(skill.interface, null, 2)}`); + } + + 
if (skill.dependencies && typeof skill.dependencies === "object") { + sections.push(`Dependencies:\n${JSON.stringify(skill.dependencies, null, 2)}`); + } + + return sections.join("\n\n").trim(); +} + function normalizePerCwdExtraUserRoots(config?: OrcaConfig): Array<{ cwd: string; extraUserRoots: string[] }> { const configured = config?.codex?.perCwdExtraUserRoots; if (!configured || configured.length === 0) { @@ -499,28 +523,34 @@ async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: } for (const skill of entry.skills as RawSkill[]) { - if (!skill || typeof skill !== "object" || typeof skill.name !== "string" || typeof skill.path !== "string") { + if (!skill || typeof skill !== "object" || typeof skill.name !== "string") { continue; } - const normalizedSkillPath = skill.path.trim(); - if (normalizedSkillPath.length === 0) { - continue; + let skillBody = ""; + let normalizedSkillPath: string | null = null; + if (typeof skill.path === "string" && skill.path.trim().length > 0) { + normalizedSkillPath = skill.path.trim(); + try { + skillBody = await readFile(normalizedSkillPath, "utf8"); + } catch { + skillBody = ""; + } } - let skillBody = ""; - try { - skillBody = await readFile(normalizedSkillPath, "utf8"); - } catch { - skillBody = ""; + if (skillBody.trim().length === 0) { + skillBody = renderSkillMetadataBody(skill); } discovered.push({ name: skill.name, - description: "", + description: + typeof skill.description === "string" + ? skill.description + : (typeof skill.shortDescription === "string" ? skill.shortDescription : ""), body: skillBody, - dirPath: path.dirname(normalizedSkillPath), - filePath: normalizedSkillPath, + dirPath: normalizedSkillPath ? path.dirname(normalizedSkillPath) : cwd, + filePath: normalizedSkillPath ?? 
`${cwd}#skills/list:${skill.name}`, }); } } diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 8623027..d3965a2 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -584,6 +584,83 @@ describe("codex session skill discovery", () => { await session.disconnect(); } }); + + test("keeps metadata-only skills/list entries when path is omitted", async () => { + type TurnInputItem = { type: "text"; text: string }; + + let capturedInput: TurnInputItem[] = []; + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runTurn(params: { input?: TurnInputItem[] }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + capturedInput = params.input ?? []; + return { agentMessage: "[]", turn: { status: "completed" }, items: [] }; + } + async request(): Promise<{ + data: Array<{ + cwd: string; + skills: Array<{ + name: string; + description: string; + enabled: boolean; + interface: { displayName: string }; + dependencies: { tools: Array<{ type: string; value: string }> }; + }>; + errors: []; + }>; + }> { + return { + data: [ + { + cwd: process.cwd(), + skills: [ + { + name: "metadata-only-skill", + description: "Use metadata when path is absent.", + enabled: true, + interface: { displayName: "Metadata Skill" }, + dependencies: { tools: [{ type: "env_var", value: "OPENAI_API_KEY" }] }, + }, + ], + errors: [], + }, + ], + }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { 
+ await session.planSpec("spec", "context"); + + const prompt = capturedInput[0]?.text ?? ""; + expect(prompt).toContain("Referenced Orca skills:"); + expect(prompt).toContain("Skill: metadata-only-skill"); + expect(prompt).toContain("Use metadata when path is absent."); + expect(prompt).toContain("Interface:"); + expect(prompt).toContain('"displayName": "Metadata Skill"'); + expect(prompt).toContain("Dependencies:"); + expect(prompt).toContain('"OPENAI_API_KEY"'); + } finally { + await session.disconnect(); + } + }); }); describe("codex session inline skill context", () => { diff --git a/src/cli/commands/status.test.ts b/src/cli/commands/status.test.ts index af5a3f8..5da09ee 100644 --- a/src/cli/commands/status.test.ts +++ b/src/cli/commands/status.test.ts @@ -77,7 +77,49 @@ describe("status command", () => { const output = logs.join("\n"); expect(output).toContain("Pending Question:"); - expect(output).toContain("Game Type: Which game type should I build?"); + expect(output).toContain("Game Type (game_type): Which game type should I build?"); expect(output).toContain("Options: Arcade, Puzzle."); }); + + test("prints question ids for multi-question answer payloads", async () => { + const runId = "run-2000-abcd"; + const store = new RunStore(runsDir); + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { + mode: "run", + overallStatus: "waiting_for_answer", + pendingQuestion: { + requestId: "req-2", + threadId: "thread-2", + turnId: "turn-2", + itemId: "item-2", + receivedAt: new Date().toISOString(), + questions: [ + { + header: "Backend", + id: "backend", + question: "Which backend should I use?", + isOther: true, + isSecret: false, + options: null, + }, + { + header: "Frontend", + id: "frontend", + question: "Which frontend should I use?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }, + }); + + const statusModule = await loadStatusModule(); + await statusModule.statusCommandHandler({ run: runId }); + + const 
output = logs.join("\n"); + expect(output).toContain("Backend (backend): Which backend should I use?"); + expect(output).toContain("Frontend (frontend): Which frontend should I use?"); + }); }); diff --git a/src/core/question-flow.ts b/src/core/question-flow.ts index 1f4c011..0287129 100644 --- a/src/core/question-flow.ts +++ b/src/core/question-flow.ts @@ -47,7 +47,7 @@ function formatQuestionBlock(question: PendingQuestionPrompt): string { ? ` Options: ${question.options.map((option) => option.label).join(", ")}.` : ""; - return `${question.header}: ${question.question}${optionText}`; + return `${question.header} (${question.id}): ${question.question}${optionText}`; } export function createPendingQuestion( From d6b5a0908802cd5b05a4a86560cc395b94872773 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 04:20:00 -0700 Subject: [PATCH 14/19] Fix Codex task outcome parsing and resolved input waits --- src/agents/codex/session.ts | 20 +-- src/agents/codex/session.unit.test.ts | 174 ++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 9 deletions(-) diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index 0dd4fe0..f64728b 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -882,6 +882,7 @@ export async function createCodexSession( let activeTaskContext: { taskId: string; taskName: string } | undefined; let activeSecretAnswerChannel: SecretAnswerChannelState | undefined; const resumedOverallStatus: ResumeOverallStatus = interactionContext?.resumeOverallStatus ?? 
"running"; + const resolvedServerRequests = new Set(); const respondToUserInputRequest = (requestId: RequestId, response: ToolRequestUserInputResponse): void => { const specificResponder = Reflect.get(client as object, "respondToUserInputRequest"); @@ -1002,6 +1003,11 @@ export async function createCodexSession( return; } + if (resolvedServerRequests.delete(request.requestId)) { + await clearPendingQuestion(request.requestId, resumedOverallStatus); + return; + } + let rawAnswer: string; if (secretAnswerChannel) { const submittedSecretAnswer = await Promise.race([ @@ -1056,6 +1062,7 @@ export async function createCodexSession( ); on.call(client, "serverRequest:resolved", (notification: { requestId: RequestId }) => { + resolvedServerRequests.add(notification.requestId); void clearPendingQuestion(notification.requestId, resumedOverallStatus); }); } @@ -1132,25 +1139,20 @@ export async function createCodexSession( } const rawResponse = extractAgentText(result); - - // Primary signal: use the SDK's structured turn status. + const parsedResult = parseTaskExecution(rawResponse); const status = result.turn.status; - if (status === "completed") { - return { outcome: "done", rawResponse }; - } if (status === "failed") { return { outcome: "failed", - error: result.turn.error?.message ?? "Turn failed", + error: parsedResult.error ?? result.turn.error?.message ?? "Turn failed", rawResponse, }; } if (status === "interrupted") { - return { outcome: "failed", error: "Turn was interrupted", rawResponse }; + return { outcome: "failed", error: parsedResult.error ?? "Turn was interrupted", rawResponse }; } - // Fallback: status is unexpected/missing — parse text as before. 
- return parseTaskExecution(rawResponse); + return parsedResult; }, async consultTaskGraph(tasks: Task[]): Promise { diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index d3965a2..7710715 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -117,6 +117,61 @@ describe("codex session effort wiring", () => { } }); + test("executeTask respects an assistant failure marker even when turn status is completed", async () => { + const runTurnMock = mock(async () => ({ + agentMessage: '{"outcome":"failed","error":"missing dependency"}', + turn: { status: "completed" }, + items: [], + })); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + const result = await session.executeTask( + { + id: "t1", + name: "Task", + description: "Do thing", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + expect(result).toEqual({ + outcome: "failed", + error: "missing dependency", + rawResponse: '{"outcome":"failed","error":"missing dependency"}', + }); + } finally { + await session.disconnect(); + } + }); + test("smoke: uses per-step thinkingLevel values for decision/planning/review/execution turns", async () => { const efforts: string[] = []; const runTurnMock = mock(async (params: { effort?: string; input?: Array<{ text?: string }> }) => { @@ 
-1386,4 +1441,123 @@ describe("codex session question flow", () => { await rm(tempDir, { recursive: true, force: true }); } }); + + test("stops waiting when app-server resolves a user-input request before any answer is provided", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-resolved-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "run", overallStatus: "running" }); + + const responses: Array<{ requestId: string | number; response: unknown }> = []; + let clientInstance: EventEmitter | null = null; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + respondToUserInputRequest(requestId: string | number, response: unknown): void { + responses.push({ requestId, response }); + } + rejectServerRequest(): void {} + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Framework", + id: "framework", + question: "Which framework should I target?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }); + }); + + queueMicrotask(() => { + setTimeout(() => { + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + }, 20); + }); + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + 
mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + const executionPromise = session.executeTask( + { + id: "task-1", + name: "Build the game", + description: "Implement the requested game.", + dependencies: [], + acceptance_criteria: ["Game is implemented"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + const waitingRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run?.pendingQuestion ? run : null; + }); + + expect(waitingRun.overallStatus).toBe("waiting_for_answer"); + + const result = await executionPromise; + expect(result.outcome).toBe("done"); + expect(responses).toEqual([]); + + const resumedRun = await waitFor(async () => { + const run = await store.getRun(runId); + return run && run.pendingQuestion === undefined ? 
run : null; + }); + + expect(resumedRun.overallStatus).toBe("running"); + await expect(readFile(path.join(store.getRunDir(runId), "answer.txt"), "utf8")).rejects.toThrow(); + } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); }); From 48e7fdcd01f7560c3c6f5d024ef3cbde43c9177d Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 13:04:19 -0700 Subject: [PATCH 15/19] Fix Codex skill and secret question regressions --- src/agents/codex/session.ts | 102 +++++++++++++++++-- src/agents/codex/session.unit.test.ts | 139 ++++++++++++++------------ src/cli/commands/answer.test.ts | 11 +- src/cli/commands/answer.ts | 6 +- src/core/codex-config.test.ts | 7 ++ src/core/codex-config.ts | 11 +- src/core/secret-answer-channel.ts | 63 ++++++++++++ src/state/schema.ts | 7 -- src/types/index.ts | 1 - 9 files changed, 255 insertions(+), 92 deletions(-) create mode 100644 src/core/secret-answer-channel.ts diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index f64728b..87ef7ba 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -1,6 +1,6 @@ import { randomUUID } from "node:crypto"; import { createServer } from "node:net"; -import { readFile, unlink } from "node:fs/promises"; +import { readdir, readFile, unlink } from "node:fs/promises"; import os from "node:os"; import path from "node:path"; @@ -29,10 +29,15 @@ import { createPendingQuestion, parseQuestionAnswerInput, } from "../../core/question-flow.js"; +import { + clearSecretAnswerChannel, + writeSecretAnswerChannel, +} from "../../core/secret-answer-channel.js"; import { TaskGraphReviewPayloadSchema } from "../../core/task-graph-review.js"; import { RunStore } from "../../state/store.js"; import type { CodexEffort } from "../../types/effort.js"; -import { loadSkills, type LoadedSkill } from "../../utils/skill-loader.js"; +import * as skillLoader from "../../utils/skill-loader.js"; +import type { 
LoadedSkill } from "../../utils/skill-loader.js"; import { logger } from "../../utils/logger.js"; import { resolveCodexPath } from "./codex-path.js"; @@ -476,6 +481,70 @@ function getPerCwdExtraUserRootsForCwd(config: OrcaConfig | undefined, cwd: stri return normalizePerCwdExtraUserRoots(config).filter((entry) => entry.cwd === normalizedCwd); } +async function loadConfiguredPerCwdExtraRootSkills( + config: OrcaConfig | undefined, + cwd: string, +): Promise { + const configuredRoots = getPerCwdExtraUserRootsForCwd(config, cwd); + if (configuredRoots.length === 0) { + return []; + } + + const candidateDirs = new Set(); + for (const entry of configuredRoots) { + for (const root of entry.extraUserRoots) { + const resolvedRoot = path.resolve(root); + candidateDirs.add(resolvedRoot); + candidateDirs.add(path.join(resolvedRoot, "skills")); + candidateDirs.add(path.join(resolvedRoot, ".agents", "skills")); + candidateDirs.add(path.join(resolvedRoot, ".codex", "skills")); + } + } + + const discovered: LoadedSkill[] = []; + for (const candidateDir of candidateDirs) { + let entries; + try { + entries = await readdir(candidateDir, { withFileTypes: true, encoding: "utf8" }); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + continue; + } + + throw error; + } + + for (const entry of entries) { + if (!entry.isDirectory()) { + continue; + } + + const skillDir = path.join(candidateDir, entry.name); + const skillFile = path.join(skillDir, "SKILL.md"); + let skillFileContent: string; + try { + skillFileContent = await readFile(skillFile, "utf8"); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + continue; + } + + throw error; + } + + const frontmatterMatch = skillFileContent.match(/^---\r?\n[\s\S]*?\r?\n---\r?\n?/u); + discovered.push({ + name: entry.name, + description: "", + body: frontmatterMatch ? 
skillFileContent.slice(frontmatterMatch[0].length) : skillFileContent, + dirPath: skillDir, + filePath: skillFile, + }); + } + } + return discovered; +} + async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: OrcaConfig): Promise { const perCwdExtraUserRoots = getPerCwdExtraUserRootsForCwd(config, cwd); @@ -575,11 +644,12 @@ async function loadCodexListedSkills(client: CodexClient, cwd: string, config?: } async function resolveTurnSkills(client: CodexClient, config: OrcaConfig | undefined, cwd: string): Promise { - const baseSkills = await loadSkills(config); + const baseSkills = await skillLoader.loadSkills(config); + const configuredExtraRootSkills = await loadConfiguredPerCwdExtraRootSkills(config, cwd); const listedSkills = await loadCodexListedSkills(client, cwd, config); - if (listedSkills.length === 0) { + if (configuredExtraRootSkills.length === 0 && listedSkills.length === 0) { return baseSkills; } @@ -587,6 +657,11 @@ async function resolveTurnSkills(client: CodexClient, config: OrcaConfig | undef for (const skill of baseSkills) { mergedByName.set(skill.name, skill); } + for (const skill of configuredExtraRootSkills) { + if (!mergedByName.has(skill.name)) { + mergedByName.set(skill.name, skill); + } + } for (const skill of listedSkills) { if (!mergedByName.has(skill.name)) { mergedByName.set(skill.name, skill); @@ -914,9 +989,15 @@ export async function createCodexSession( requestId: RequestId, overallStatus?: ResumeOverallStatus | "waiting_for_answer" ): Promise => { - if (activeSecretAnswerChannel?.requestId === requestId) { - await activeSecretAnswerChannel.close().catch(() => undefined); - activeSecretAnswerChannel = undefined; + const secretAnswerChannel = activeSecretAnswerChannel; + if (secretAnswerChannel?.requestId === requestId) { + if (interactionContext) { + await clearSecretAnswerChannel(interactionContext.runId).catch(() => undefined); + } + await secretAnswerChannel.close().catch(() => undefined); + if 
(activeSecretAnswerChannel === secretAnswerChannel) { + activeSecretAnswerChannel = undefined; + } } if (!interactionContext) { @@ -931,7 +1012,6 @@ export async function createCodexSession( await interactionContext.store.updateRun(interactionContext.runId, { ...(overallStatus ? { overallStatus } : {}), pendingQuestion: undefined, - answerChannel: undefined, }); }; @@ -956,7 +1036,9 @@ export async function createCodexSession( if (hasSecretQuestions(request)) { secretAnswerChannel = await createSecretAnswerChannel(request.requestId); activeSecretAnswerChannel = secretAnswerChannel; + await writeSecretAnswerChannel(interactionContext.runId, secretAnswerChannel.descriptor); } else if (activeSecretAnswerChannel) { + await clearSecretAnswerChannel(interactionContext.runId).catch(() => undefined); await activeSecretAnswerChannel.close().catch(() => undefined); activeSecretAnswerChannel = undefined; } @@ -964,7 +1046,6 @@ export async function createCodexSession( await interactionContext.store.updateRun(interactionContext.runId, { overallStatus: "waiting_for_answer", pendingQuestion, - ...(secretAnswerChannel ? 
{ answerChannel: secretAnswerChannel.descriptor } : { answerChannel: undefined }), }); if (interactionContext.emitHook) { @@ -1203,6 +1284,9 @@ export async function createCodexSession( async disconnect(): Promise { if (activeSecretAnswerChannel) { + if (interactionContext) { + await clearSecretAnswerChannel(interactionContext.runId).catch(() => undefined); + } await activeSecretAnswerChannel.close().catch(() => undefined); activeSecretAnswerChannel = undefined; } diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 7710715..2aba49e 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -640,80 +640,83 @@ describe("codex session skill discovery", () => { } }); - test("keeps metadata-only skills/list entries when path is omitted", async () => { + test("loads real skill bodies from configured perCwdExtraUserRoots even when skills/list omits path", async () => { type TurnInputItem = { type: "text"; text: string }; let capturedInput: TurnInputItem[] = []; + const sharedRoot = await mkdtemp(path.join(os.tmpdir(), "orca-extra-skill-root-")); + const skillName = "shared-root-skill"; + const sharedSkillPath = path.join(sharedRoot, ".agents", "skills", skillName, "SKILL.md"); + await mkdir(path.dirname(sharedSkillPath), { recursive: true }); + await writeFile(sharedSkillPath, "Real shared skill workflow body", "utf8"); - mockMultiAgentDetection(false); - mock.module("@ratley/codex-client", () => ({ - CodexClient: class { - async connect(): Promise {} - async disconnect(): Promise {} - async startThread(): Promise<{ id: string }> { - return { id: "thread-1" }; - } - async runTurn(params: { input?: TurnInputItem[] }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { - capturedInput = params.input ?? 
[]; - return { agentMessage: "[]", turn: { status: "completed" }, items: [] }; - } - async request(): Promise<{ - data: Array<{ - cwd: string; - skills: Array<{ - name: string; - description: string; - enabled: boolean; - interface: { displayName: string }; - dependencies: { tools: Array<{ type: string; value: string }> }; + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runTurn(params: { input?: TurnInputItem[] }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + capturedInput = params.input ?? []; + return { agentMessage: "[]", turn: { status: "completed" }, items: [] }; + } + async request(): Promise<{ + data: Array<{ + cwd: string; + skills: Array<{ + name: string; + description: string; + enabled: boolean; + interface: { displayName: string }; + dependencies: { tools: Array<{ type: string; value: string }> }; + }>; + errors: []; }>; - errors: []; - }>; - }> { - return { - data: [ - { - cwd: process.cwd(), - skills: [ - { - name: "metadata-only-skill", - description: "Use metadata when path is absent.", - enabled: true, - interface: { displayName: "Metadata Skill" }, - dependencies: { tools: [{ type: "env_var", value: "OPENAI_API_KEY" }] }, - }, - ], - errors: [], - }, - ], - }; - } - async runReview(): Promise<{ reviewText: string }> { - return { reviewText: "ok" }; - } - }, - })); - - mock.module("../../utils/skill-loader.js", () => ({ - loadSkills: async () => [], - })); + }> { + return { + data: [ + { + cwd: process.cwd(), + skills: [ + { + name: skillName, + description: "Use metadata when path is absent.", + enabled: true, + interface: { displayName: "Metadata Skill" }, + dependencies: { tools: [{ type: "env_var", value: "OPENAI_API_KEY" }] }, + }, + ], + errors: [], + }, + ], + }; + } + async runReview(): 
Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); - const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); - const session = await createCodexSession(process.cwd()); + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), { + codex: { + perCwdExtraUserRoots: [{ cwd: process.cwd(), extraUserRoots: [sharedRoot] }], + }, + }); - try { await session.planSpec("spec", "context"); const prompt = capturedInput[0]?.text ?? ""; expect(prompt).toContain("Referenced Orca skills:"); - expect(prompt).toContain("Skill: metadata-only-skill"); - expect(prompt).toContain("Use metadata when path is absent."); - expect(prompt).toContain("Interface:"); - expect(prompt).toContain('"displayName": "Metadata Skill"'); - expect(prompt).toContain("Dependencies:"); - expect(prompt).toContain('"OPENAI_API_KEY"'); - } finally { + expect(prompt).toContain(`Skill: ${skillName}`); + expect(prompt).toContain("Real shared skill workflow body"); + expect(prompt).not.toContain("Use metadata when path is absent."); await session.disconnect(); + } finally { + await rm(sharedRoot, { recursive: true, force: true }); } }); }); @@ -1400,13 +1403,19 @@ describe("codex session question flow", () => { "context", ); + const { readSecretAnswerChannel } = await import("../../core/secret-answer-channel.js"); + const waitingRun = await waitFor(async () => { const run = await store.getRun(runId); - return run?.answerChannel ? run : null; + return run?.pendingQuestion ? 
run : null; }); expect(waitingRun.overallStatus).toBe("waiting_for_answer"); - expect(waitingRun.answerChannel?.transport).toBe("ipc"); + await expect(readSecretAnswerChannel(runId as `${string}-${number}-${string}`)).resolves.toEqual({ + transport: "ipc", + path: "/tmp/orca-test-secret-answer.sock", + token: "secret-token", + }); submitSecretAnswer?.("super-secret"); @@ -1430,7 +1439,7 @@ describe("codex session question flow", () => { return run && run.pendingQuestion === undefined ? run : null; }); - expect(resumedRun.answerChannel).toBeUndefined(); + await expect(readSecretAnswerChannel(runId as `${string}-${number}-${string}`)).resolves.toBeNull(); expect(resumedRun.overallStatus).toBe("running"); await expect(readFile(path.join(store.getRunDir(runId), "answer.txt"), "utf8")).rejects.toThrow(); } finally { diff --git a/src/cli/commands/answer.test.ts b/src/cli/commands/answer.test.ts index e2f42f4..fe26f77 100644 --- a/src/cli/commands/answer.test.ts +++ b/src/cli/commands/answer.test.ts @@ -295,6 +295,7 @@ describe("answer command", () => { test("uses password prompt and direct answer channel for secret questions", async () => { const runId = "answer-secret-1000-abcd"; const submissions: string[] = []; + const { writeSecretAnswerChannel } = await import("../../core/secret-answer-channel.js"); const store = new RunStore(runsDir); await store.createRun(runId, "/tmp/spec.md"); @@ -317,11 +318,11 @@ describe("answer command", () => { }, ], }, - answerChannel: { - transport: "ipc", - path: path.join(tempDir, "secret-answer.sock"), - token: "secret-token", - }, + }); + await writeSecretAnswerChannel(runId as `${string}-${number}-${string}`, { + transport: "ipc", + path: path.join(tempDir, "secret-answer.sock"), + token: "secret-token", }); setStdoutTty(true); diff --git a/src/cli/commands/answer.ts b/src/cli/commands/answer.ts index 30e06e4..ae4b7b1 100644 --- a/src/cli/commands/answer.ts +++ b/src/cli/commands/answer.ts @@ -6,6 +6,7 @@ import { input, password } 
from "@inquirer/prompts"; import type { Command } from "commander"; import { parseQuestionAnswerInput, serializeQuestionAnswerResponse } from "../../core/question-flow.js"; +import { readSecretAnswerChannel } from "../../core/secret-answer-channel.js"; import { RunStore } from "../../state/store.js"; import type { PendingAnswerChannel, PendingQuestion } from "../../types/index.js"; import { selectRun } from "../../utils/select-run.js"; @@ -160,11 +161,12 @@ export async function answerCommandHandler( : `${answerPayload}\n`; if (hasSecretQuestions(run.pendingQuestion)) { - if (!run.answerChannel) { + const answerChannel = await readSecretAnswerChannel(runId as `${string}-${number}-${string}`); + if (!answerChannel) { throw new Error("run is waiting for a secret answer but has no active answer channel"); } - await submitAnswerViaChannel(run.answerChannel, answerPayload); + await submitAnswerViaChannel(answerChannel, answerPayload); console.log(`Answer submitted. Run ${runId} will resume shortly.`); return; } diff --git a/src/core/codex-config.test.ts b/src/core/codex-config.test.ts index b0dfcc9..87e2f65 100644 --- a/src/core/codex-config.test.ts +++ b/src/core/codex-config.test.ts @@ -111,6 +111,13 @@ describe("isCodexMultiAgentActive", () => { await expect(isCodexMultiAgentActive({ codex: { multiAgent: false } }, tmpConfigFile)).resolves.toBe(false); }); + it("returns false when Orca config explicitly disables multi-agent even if root config enables it", async () => { + const fs = await import("node:fs/promises"); + await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = true\n", "utf8"); + + await expect(isCodexMultiAgentActive({ codex: { multiAgent: false } }, tmpConfigFile)).resolves.toBe(false); + }); + it("returns false when root config contains multi_agent = false", async () => { const fs = await import("node:fs/promises"); await fs.writeFile(tmpConfigFile, "[features]\nmulti_agent = false\n", "utf8"); diff --git a/src/core/codex-config.ts 
b/src/core/codex-config.ts index 2d18fe7..eab7e39 100644 --- a/src/core/codex-config.ts +++ b/src/core/codex-config.ts @@ -13,9 +13,13 @@ const ORCA_MULTI_AGENT_BLOCK = `# Added by orca — remove or set multi_agent = multi_agent = true `; +function getExplicitMultiAgentSetting(config?: OrcaConfig): boolean | undefined { + return typeof config?.codex?.multiAgent === "boolean" ? config.codex.multiAgent : undefined; +} + function isMultiAgentEnabled(config?: OrcaConfig): boolean { // Default: off. Only enable if explicitly set to true. - return config?.codex?.multiAgent === true; + return getExplicitMultiAgentSetting(config) === true; } function containsMultiAgentSetting(content: string): boolean { @@ -72,8 +76,9 @@ export async function isCodexMultiAgentActive( config?: OrcaConfig, _configFile?: string, ): Promise { - if (isMultiAgentEnabled(config)) { - return true; + const explicitMultiAgentSetting = getExplicitMultiAgentSetting(config); + if (explicitMultiAgentSetting !== undefined) { + return explicitMultiAgentSetting; } const configFile = _configFile ?? 
GLOBAL_CONFIG_FILE; diff --git a/src/core/secret-answer-channel.ts b/src/core/secret-answer-channel.ts new file mode 100644 index 0000000..cf35d63 --- /dev/null +++ b/src/core/secret-answer-channel.ts @@ -0,0 +1,63 @@ +import { mkdir, readFile, rename, rm, writeFile } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import type { PendingAnswerChannel, RunId } from "../types/index.js"; + +function getSecretAnswerChannelsDir(): string { + return path.join(os.homedir(), ".orca", "runtime", "answer-channels"); +} + +function getSecretAnswerChannelFilePath(runId: RunId): string { + return path.join(getSecretAnswerChannelsDir(), `${runId}.json`); +} + +export async function writeSecretAnswerChannel( + runId: RunId, + channel: PendingAnswerChannel, +): Promise { + const channelsDir = getSecretAnswerChannelsDir(); + const channelFile = getSecretAnswerChannelFilePath(runId); + const tempFile = `${channelFile}.${process.pid}.${Date.now()}.tmp`; + const payload = JSON.stringify(channel); + + await mkdir(channelsDir, { recursive: true, mode: 0o700 }); + await writeFile(tempFile, payload, { encoding: "utf8", mode: 0o600 }); + await rename(tempFile, channelFile); +} + +export async function readSecretAnswerChannel(runId: RunId): Promise { + const channelFile = getSecretAnswerChannelFilePath(runId); + + let raw: string; + try { + raw = await readFile(channelFile, "utf8"); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + return null; + } + + throw error; + } + + const parsed = JSON.parse(raw) as Partial; + if ( + parsed.transport !== "ipc" || + typeof parsed.path !== "string" || + parsed.path.length === 0 || + typeof parsed.token !== "string" || + parsed.token.length === 0 + ) { + throw new Error(`invalid secret answer channel metadata for run ${runId}`); + } + + return { + transport: "ipc", + path: parsed.path, + token: parsed.token, + }; +} + +export async function clearSecretAnswerChannel(runId: RunId): 
Promise { + await rm(getSecretAnswerChannelFilePath(runId), { force: true }); +} diff --git a/src/state/schema.ts b/src/state/schema.ts index 9c95c5a..bc4ac03 100644 --- a/src/state/schema.ts +++ b/src/state/schema.ts @@ -51,12 +51,6 @@ const PendingQuestionSchema = z.object({ questions: z.array(PendingQuestionPromptSchema) }); -const PendingAnswerChannelSchema = z.object({ - transport: z.literal("ipc"), - path: z.string(), - token: z.string(), -}); - const PrStatusSchema = z.object({ draftTitle: z.string().optional(), draftBody: z.string().optional(), @@ -84,7 +78,6 @@ export const RunStatusSchema = z.object({ milestones: z.array(z.string()), errors: z.array(ErrorEntrySchema), pendingQuestion: PendingQuestionSchema.optional(), - answerChannel: PendingAnswerChannelSchema.optional(), pr: PrStatusSchema.optional() }); diff --git a/src/types/index.ts b/src/types/index.ts index b556443..4d157f3 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -49,7 +49,6 @@ export interface RunStatus { milestones: string[]; errors: Array<{ at: string; message: string; taskId?: string }>; pendingQuestion?: PendingQuestion | undefined; - answerChannel?: PendingAnswerChannel | undefined; pr?: { draftTitle?: string; draftBody?: string; From e864becbb3b93f8fda2a02b79773b73a8c0e0dd9 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 18:08:51 -0700 Subject: [PATCH 16/19] Harden Orca e2e question flow and smoke coverage --- specs/smoke/README.md | 32 + specs/smoke/no-plan-library.md | 80 ++ specs/smoke/question-flow-greeter.md | 117 +++ specs/smoke/review-cycle-validator.md | 94 +++ src/agents/codex/session.test.ts | 129 +-- src/agents/codex/session.ts | 400 +++++++++- src/agents/codex/session.unit.test.ts | 779 +++++++++++++++++-- src/cli/commands/plan.test.ts | 62 ++ src/cli/commands/plan.ts | 7 +- src/cli/commands/run-command.test-harness.ts | 3 +- src/cli/commands/run.test.ts | 46 ++ src/cli/commands/run.ts | 47 +- src/core/task-runner.test.ts | 34 + 
src/core/task-runner.ts | 26 +- src/state/schema.ts | 1 + src/types/index.ts | 1 + 16 files changed, 1665 insertions(+), 193 deletions(-) create mode 100644 specs/smoke/README.md create mode 100644 specs/smoke/no-plan-library.md create mode 100644 specs/smoke/question-flow-greeter.md create mode 100644 specs/smoke/review-cycle-validator.md create mode 100644 src/cli/commands/plan.test.ts diff --git a/specs/smoke/README.md b/specs/smoke/README.md new file mode 100644 index 0000000..466b8ee --- /dev/null +++ b/specs/smoke/README.md @@ -0,0 +1,32 @@ +# Orca Smoke Specs + +Checked-in manual smoke scenarios for exercising Orca end-to-end against real local repos in `tmp/`. + +These specs are intentionally small. The goal is to verify Orca behavior, not to build impressive apps. + +## Scenarios + +- `html-game-planning.md` + - planning-heavy Bun HTML game + - exercises `orca plan`, task-graph review, and consultation +- `question-flow-greeter.md` + - execution run that should require a clarification question + - exercises `waiting_for_answer`, `onQuestion`, `orca answer`, and same-run resume +- `no-plan-library.md` + - small execution run that Orca should automatically keep as a single task + - exercises the planning-skip path plus standard completion hooks +- `review-cycle-validator.md` + - execution run with a validator that should fail on the first attempt and succeed after Orca fixes findings + - exercises post-exec review, validator findings, `onFindings`, and auto-fix looping + +## Usage + +Create each smoke project under `tmp/smoke//`, copy the spec body into `SMOKE_SPEC.md`, `cd` into that repo, and run Orca from that project directory. + +Prefer isolated run state when smoking locally: + +```bash +export ORCA_RUNS_DIR="$(pwd)/.orca-runs" +``` + +For hook validation, point the hook commands at simple local scripts that append stdin payloads to a JSONL file. 
diff --git a/specs/smoke/no-plan-library.md b/specs/smoke/no-plan-library.md new file mode 100644 index 0000000..a974c94 --- /dev/null +++ b/specs/smoke/no-plan-library.md @@ -0,0 +1,80 @@ +# No-Plan Library Smoke + +Manual smoke scenario for exercising: + +- automatic planning skip for a tiny task +- standard task execution +- task completion hooks +- final completion without review-loop complications + +The generated project stays local and gitignored under `tmp/smoke/no-plan-lib/`. + +## Workspace setup + +```bash +mkdir -p tmp/smoke/no-plan-lib +cd tmp/smoke/no-plan-lib +bun init -y +git init +git add . +git commit -m "baseline" +``` + +Create a small starting project: + +```bash +cat > math.ts <<'EOF' +export function sum(a: number, b: number): number { + return a + b; +} +EOF + +cat > math.test.ts <<'EOF' +import { expect, test } from "bun:test"; +import { sum } from "./math"; + +test("sum adds two numbers", () => { + expect(sum(2, 3)).toBe(5); +}); +EOF +``` + +## Local spec file + +Copy the spec below into `tmp/smoke/no-plan-lib/SMOKE_SPEC.md`. + +```md +# Add A Small Math Helper + +Update this Bun project. Orca should decide that no multi-step plan is needed and keep execution as a single task. + +## Requirements + +- Add `multiply(a, b)` to `math.ts`. +- Add a Bun test that covers the new helper. +- Keep the project tiny and dependency-free. +- Run local verification before finishing. + +## Verification + +- Run `bun test`. +``` + +## Suggested run flow + +From `tmp/smoke/no-plan-lib/`: + +```bash +export ORCA_RUNS_DIR="$(pwd)/.orca-runs" +orca run --spec ./SMOKE_SPEC.md \ + --on-task-complete 'node ./hook-log.mjs task-complete' \ + --on-complete 'node ./hook-log.mjs complete' \ + --on-error 'node ./hook-log.mjs error' +``` + +## Manual acceptance + +- Orca decides planning can be skipped and finishes in a single execution task. +- `bun test` passes. +- Completion hooks fire. +- The project remains simple and framework-free. 
diff --git a/specs/smoke/question-flow-greeter.md b/specs/smoke/question-flow-greeter.md new file mode 100644 index 0000000..b43961d --- /dev/null +++ b/specs/smoke/question-flow-greeter.md @@ -0,0 +1,117 @@ +# Question Flow Greeter Smoke + +Manual smoke scenario for exercising: + +- a small execution run where Orca should skip multi-step planning +- live clarification during execution +- `waiting_for_answer` +- `onQuestion` +- `orca answer` +- same-run resume and completion hooks + +The generated project stays local and gitignored under `tmp/smoke/question-greeter/`. + +## Workspace setup + +```bash +mkdir -p tmp/smoke/question-greeter +cd tmp/smoke/question-greeter +bun init -y +git init +git add . +git commit -m "baseline" +``` + +Create a minimal starting project: + +```bash +mkdir -p test +cat > index.ts <<'EOF' +export const releaseCodename = "TODO"; + +export function greet(): string { + return `Release: ${releaseCodename}`; +} + +if (import.meta.main) { + console.log(greet()); +} +EOF + +cat > test/greet.test.ts <<'EOF' +import { expect, test } from "bun:test"; +import { greet, releaseCodename } from "../index"; + +test("release codename is set", () => { + expect(releaseCodename).not.toBe("TODO"); +}); + +test("greet includes the release codename", () => { + expect(greet()).toContain(releaseCodename); +}); +EOF + +cat > validate-answer.ts <<'EOF' +import { readFileSync } from "node:fs"; + +const codename = readFileSync(new URL("./codename.txt", import.meta.url), "utf8").trim(); +if (!codename) { + throw new Error("codename.txt was empty"); +} +console.log(`Validated codename: ${codename}`); +EOF +``` + +## Local spec file + +Copy the spec below into `tmp/smoke/question-greeter/SMOKE_SPEC.md`. + +```md +# Fill In A Missing Release Codename + +Update this Bun project so it asks for the missing release codename if needed during execution instead of guessing. + +## Requirements + +- Do not invent a codename. 
+- If the codename is not specified in the repo or the prompt context, ask for it explicitly before making the change. +- Update `index.ts` so the exported value is no longer `TODO`. +- Write the chosen codename into `codename.txt`. +- Keep the project minimal and avoid adding dependencies. +- Run local verification before finishing. + +## Verification + +- Run `bun test`. +- Run `bun run validate-answer.ts`. +``` + +## Suggested run flow + +From `tmp/smoke/question-greeter/`: + +```bash +export ORCA_RUNS_DIR="$(pwd)/.orca-runs" +orca run --spec ./SMOKE_SPEC.md \ + --on-question 'node ./hook-log.mjs question' \ + --on-complete 'node ./hook-log.mjs complete' \ + --on-error 'node ./hook-log.mjs error' \ + --on-task-complete 'node ./hook-log.mjs task-complete' +``` + +When the run enters `waiting_for_answer`, answer it: + +```bash +orca status --run +orca answer "Nebula-7" +``` + +## Manual acceptance + +- Orca asks for the missing codename instead of guessing. +- `orca status` shows the pending question and question IDs if multiple prompts are present. +- `orca answer` resumes the same live run. +- The final run completes successfully. +- `bun test` passes. +- `bun run validate-answer.ts` passes. +- `codename.txt` contains the answer that was provided. diff --git a/specs/smoke/review-cycle-validator.md b/specs/smoke/review-cycle-validator.md new file mode 100644 index 0000000..8d634ae --- /dev/null +++ b/specs/smoke/review-cycle-validator.md @@ -0,0 +1,94 @@ +# Review Cycle Validator Smoke + +Manual smoke scenario for exercising: + +- planned or unplanned execution with validator commands +- post-exec findings collection +- `onFindings` +- auto-fix review loop +- final completion after review + +The generated project stays local and gitignored under `tmp/smoke/review-cycle/`. + +## Workspace setup + +```bash +mkdir -p tmp/smoke/review-cycle +cd tmp/smoke/review-cycle +bun init -y +git init +git add . 
+git commit -m "baseline" +``` + +Create a small starting project: + +```bash +cat > index.ts <<'EOF' +export function label(name: string): string { + return `Hello, ${name}`; +} +EOF + +cat > index.test.ts <<'EOF' +import { expect, test } from "bun:test"; +import { label } from "./index"; + +test("label greets by name", () => { + expect(label("Orca")).toBe("Hello, Orca"); +}); +EOF + +cat > verify-output.ts <<'EOF' +import { readFileSync } from "node:fs"; + +const source = readFileSync(new URL("./index.ts", import.meta.url), "utf8"); +if (!source.includes("Goodbye")) { + throw new Error("expected updated farewell text"); +} +console.log("Verified expected farewell text"); +EOF +``` + +## Local spec file + +Copy the spec below into `tmp/smoke/review-cycle/SMOKE_SPEC.md`. + +```md +# Change The Greeting Contract + +Update this Bun project so the exported function now returns a farewell string instead of a greeting. + +## Requirements + +- Change `label(name)` so it returns `Goodbye, `. +- Update tests to match the new behavior. +- Keep the project tiny and dependency-free. +- Run local verification before finishing. + +## Verification + +- Run `bun test`. +- Run `bun run verify-output.ts`. +``` + +## Suggested run flow + +Use an Orca config that enables execution review and validator commands, then run: + +```bash +export ORCA_RUNS_DIR="$(pwd)/.orca-runs" +orca run --spec ./SMOKE_SPEC.md \ + --on-findings 'node ./hook-log.mjs findings' \ + --on-complete 'node ./hook-log.mjs complete' \ + --on-error 'node ./hook-log.mjs error' +``` + +## Manual acceptance + +- The validator commands run after execution. +- If execution or review misses something, Orca records findings and loops. +- `onFindings` fires when the review cycle finds a real problem. +- The final run reaches `completed` only after review succeeds. +- `bun test` passes. +- `bun run verify-output.ts` passes. 
diff --git a/src/agents/codex/session.test.ts b/src/agents/codex/session.test.ts index 52b3a45..923865b 100644 --- a/src/agents/codex/session.test.ts +++ b/src/agents/codex/session.test.ts @@ -1,10 +1,9 @@ import { existsSync } from "node:fs"; +import path from "node:path"; import { describe, expect, test } from "bun:test"; import type { Task } from "../../types/index.js"; -import { createCodexSession } from "./session.js"; -// Try common locations for the codex binary const CODEX_PATHS = [ "/Applications/Codex.app/Contents/Resources/codex", Bun.which("codex"), @@ -12,7 +11,8 @@ const CODEX_PATHS = [ "/usr/local/bin/codex", ].filter(Boolean) as string[]; -const codexPath = CODEX_PATHS.find((p) => existsSync(p)) ?? null; +const codexPath = CODEX_PATHS.find((candidate) => existsSync(candidate)) ?? null; +const sessionModulePath = path.resolve(import.meta.dir, "session.ts"); function makeTask(overrides: Partial = {}): Task { return { @@ -35,64 +35,87 @@ function makeTask(overrides: Partial = {}): Task { }; } +function runCodexIntegrationSnippet(snippet: string): { exitCode: number; stdout: string; stderr: string } { + const proc = Bun.spawnSync({ + cmd: ["bun", "--eval", snippet], + stdout: "pipe", + stderr: "pipe", + env: { + ...process.env, + ORCA_CODEX_PATH: codexPath ?? 
"", + }, + }); + + return { + exitCode: proc.exitCode, + stdout: proc.stdout.toString("utf8"), + stderr: proc.stderr.toString("utf8"), + }; +} + if (!codexPath) { test("codex adapter integration skipped (codex binary not found)", () => { - // Guard: skip gracefully when codex is not installed expect(codexPath).toBeNull(); }); } else { describe("Codex adapter integration (createCodexSession)", () => { - test("creates session, executes a simple task, and disconnects", async () => { - const session = await createCodexSession("/tmp"); - - try { - expect(typeof session.threadId).toBe("string"); - expect(session.threadId.length).toBeGreaterThan(0); - - const result = await session.executeTask(makeTask(), "test-run-id"); - - expect(result.outcome === "done" || result.outcome === "failed").toBe(true); - expect(typeof result.rawResponse).toBe("string"); - expect(result.rawResponse.length).toBeGreaterThan(0); - - console.log(`executeTask outcome: ${result.outcome}`); - console.log(`rawResponse length: ${result.rawResponse.length}`); - } finally { - await session.disconnect(); - } - }, 300_000); - - test("consultTaskGraph returns valid ConsultationResult", async () => { - const session = await createCodexSession("/tmp"); - - try { - const tasks: Task[] = [ - makeTask({ id: "task-1", name: "Create file", dependencies: [] }), - makeTask({ id: "task-2", name: "Read file", dependencies: ["task-1"] }), - ]; - - const result = await session.consultTaskGraph(tasks); - - expect(Array.isArray(result.issues)).toBe(true); - expect(typeof result.ok).toBe("boolean"); - - console.log(`Consultation ok: ${result.ok}`); - console.log(`Issues: ${JSON.stringify(result.issues)}`); - } finally { - await session.disconnect(); - } + test("creates session, executes a simple task, and disconnects", () => { + const task = JSON.stringify(makeTask()); + const { exitCode, stdout, stderr } = runCodexIntegrationSnippet(` + import { createCodexSession } from ${JSON.stringify(sessionModulePath)}; + const 
task = ${task}; + const session = await createCodexSession("/tmp"); + try { + const result = await session.executeTask(task, "test-run-id"); + console.log(JSON.stringify({ + threadId: session.threadId, + outcome: result.outcome, + rawLength: result.rawResponse.length, + })); + } finally { + await session.disconnect(); + } + `); + + expect(exitCode).toBe(0); + expect(stderr).toBe(""); + + const parsed = JSON.parse(stdout.trim()) as { + threadId: string; + outcome: string; + rawLength: number; + }; + expect(parsed.threadId.length).toBeGreaterThan(0); + expect(parsed.outcome === "done" || parsed.outcome === "failed").toBe(true); + expect(parsed.rawLength).toBeGreaterThan(0); }, 300_000); - test("reviewChanges returns a string", async () => { - const session = await createCodexSession("/tmp"); - - try { - const review = await session.reviewChanges(); - expect(typeof review).toBe("string"); - console.log(`Review length: ${review.length}`); - } finally { - await session.disconnect(); - } + test("consultTaskGraph returns valid ConsultationResult", () => { + const tasks = JSON.stringify([ + makeTask({ id: "task-1", name: "Create file", dependencies: [] }), + makeTask({ id: "task-2", name: "Read file", dependencies: ["task-1"] }), + ]); + const { exitCode, stdout, stderr } = runCodexIntegrationSnippet(` + import { createCodexSession } from ${JSON.stringify(sessionModulePath)}; + const tasks = ${tasks}; + const session = await createCodexSession("/tmp"); + try { + const result = await session.consultTaskGraph(tasks); + console.log(JSON.stringify(result)); + } finally { + await session.disconnect(); + } + `); + + expect(exitCode).toBe(0); + expect(stderr).toBe(""); + + const parsed = JSON.parse(stdout.trim()) as { + issues: unknown[]; + ok: boolean; + }; + expect(Array.isArray(parsed.issues)).toBe(true); + expect(typeof parsed.ok).toBe("boolean"); }, 300_000); }); } diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index 87ef7ba..1c25843 100644 --- 
a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -8,6 +8,7 @@ import { CodexClient } from "@ratley/codex-client"; import type { CompletedTurn, RequestId, + ThreadItem, ToolRequestUserInputParams, ToolRequestUserInputResponse, } from "@ratley/codex-client"; @@ -67,12 +68,41 @@ function getMultiAgentPlanningGuidance(multiAgentActive: boolean): string[] { ]; } -function buildPlanningPrompt(spec: string, systemContext: string, multiAgentActive: boolean): string { +function getClarificationRequestGuidance(clarificationToolAvailable: boolean, scope: "planning" | "execution" | "review"): string[] { + if (!clarificationToolAvailable) { + return []; + } + + const firstLine = scope === "execution" + ? "If you need any user-provided value, preference, approval, or clarification to complete this task correctly, use Codex's request_user_input tool instead of guessing, failing, or baking the question into a later task." + : "If a blocking ambiguity prevents correct work, use Codex's request_user_input tool to ask concise clarification questions instead of guessing."; + + return [ + firstLine, + "Ask at most 3 short questions with stable snake_case ids.", + "If you need a secret such as a token or password, mark that question as secret.", + ...(scope === "execution" + ? [ + "If this task is itself about obtaining clarification, it is not complete until the question has been asked, answered, and the answer has been applied.", + "After the user answers, continue the same turn and finish the requested file changes, commands, and verification before responding.", + "Do not stop after acknowledging the answer. 
Resume implementation immediately and only finish once the requested edits are on disk and validated.", + ] + : []), + ]; +} + +function buildPlanningPrompt( + spec: string, + systemContext: string, + multiAgentActive: boolean, + clarificationToolAvailable: boolean, +): string { return [ systemContext, "You are decomposing a spec into an ordered task graph.", "Prefer task decomposition that maximizes safe parallelism for independent workstreams.", "Isolate task ownership (files/subsystems) to avoid cross-task collisions.", + ...getClarificationRequestGuidance(clarificationToolAvailable, "planning"), ...getMultiAgentPlanningGuidance(multiAgentActive), ...getCodeSimplifierGuidance(), "Return a JSON array of tasks.", @@ -92,6 +122,7 @@ function buildTaskExecutionPrompt( cwd: string, systemContext?: string, multiAgentActive = false, + clarificationContext?: string, ): string { return [ ...(systemContext ? [systemContext] : []), @@ -111,6 +142,9 @@ function buildTaskExecutionPrompt( `Task Name: ${task.name}`, "Task Description:", task.description, + ...(clarificationContext && clarificationContext.trim().length > 0 + ? ["Resolved Clarification Context:", clarificationContext.trim()] + : []), "Acceptance Criteria:", ...task.acceptance_criteria.map( (criterion, index) => `${index + 1}. ${criterion}`, @@ -124,10 +158,44 @@ function buildTaskExecutionPrompt( ].join("\n\n"); } -function buildPlanDecisionPrompt(spec: string, systemContext: string): string { +function buildTaskExecutionClarificationPrompt( + task: Task, + runId: string, + cwd: string, + systemContext?: string, +): string { + return [ + ...(systemContext ? 
[systemContext] : []), + "You are Orca's execution clarification gate.", + ...getClarificationRequestGuidance(true, "execution"), + `Run ID: ${runId}`, + `Repository CWD: ${cwd}`, + `Task ID: ${task.id}`, + `Task Name: ${task.name}`, + "Task Description:", + task.description, + "Acceptance Criteria:", + ...task.acceptance_criteria.map( + (criterion, index) => `${index + 1}. ${criterion}`, + ), + "Inspect the repository and task to decide whether execution needs any user-provided value or preference.", + "Do not edit files, do not run mutating commands, and do not claim the task is complete in this turn.", + "If user input is needed, ask via request_user_input, wait for the answer, then continue and summarize the resolved constraint.", + 'Return JSON only with shape: {"needsInput":boolean,"context":string}', + "Set needsInput=true only if you actually asked the user in this clarification turn.", + "Set context to a concise execution-ready summary of the user-provided value(s) and any discovered constraints the execution turn must honor. 
Use an empty string when no extra context is needed.", + ].join("\n\n"); +} + +function buildPlanDecisionPrompt( + spec: string, + systemContext: string, + clarificationToolAvailable: boolean, +): string { return [ systemContext, "You are Orca's planning gate.", + ...getClarificationRequestGuidance(clarificationToolAvailable, "planning"), "Decide whether this spec needs multi-step planning or can run as one direct execution task.", "Set needsPlan=true when coordination/dependencies/research/design across multiple steps are required.", "Set needsPlan=false when a single focused execution task is sufficient.", @@ -137,11 +205,17 @@ function buildPlanDecisionPrompt(spec: string, systemContext: string): string { ].join("\n\n"); } -function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string, multiAgentActive: boolean): string { +function buildTaskGraphReviewPrompt( + tasks: Task[], + systemContext: string, + multiAgentActive: boolean, + clarificationToolAvailable: boolean, +): string { return [ systemContext, "You are Orca's pre-execution task-graph reviewer.", ...getCodeSimplifierGuidance(), + ...getClarificationRequestGuidance(clarificationToolAvailable, "review"), ...(multiAgentActive ? [ "Codex multi-agent mode is enabled for this run. 
Review the graph for safe subagent parallelization.", @@ -164,12 +238,25 @@ function buildTaskGraphReviewPrompt(tasks: Task[], systemContext: string, multiA ].join("\n\n"); } -function buildTaskGraphConsultationPrompt(tasks: Task[], multiAgentActive: boolean): string { +function buildTaskGraphConsultationPrompt( + tasks: Task[], + multiAgentActive: boolean, + clarificationToolAvailable: boolean, +): string { const taskGraphJson = JSON.stringify(tasks, null, 2); return [ "Review this Orca task graph before execution.", "Flag any: missing steps, wrong dependency order, tasks that are underdefined, or potential blockers.", + ...getClarificationRequestGuidance(clarificationToolAvailable, "review"), + ...(clarificationToolAvailable + ? [ + "Execution tasks are allowed to pause and ask request_user_input questions when they truly need a user-provided value.", + "Do not ask the user whether Orca may pause during execution for clarification. Assume that execution-time request_user_input is available.", + "If a task already says it should ask for a missing user value during execution, treat that as a valid execution mechanism, not a reason to ask a meta-question about whether clarification is allowed.", + "Only ask a review-time clarification question if the graph cannot be assessed or corrected without an answer right now.", + ] + : []), ...(multiAgentActive ? 
[ "", @@ -219,6 +306,14 @@ function extractAgentText(result: CompletedTurn): string { } } + const completedTurnAgentItems = result.turn.items.filter((item) => item.type === "agentMessage"); + if (completedTurnAgentItems.length > 0) { + const last = completedTurnAgentItems[completedTurnAgentItems.length - 1]; + if (last !== undefined && "text" in last && typeof last.text === "string") { + return last.text; + } + } + throw new Error("Codex response was empty"); } @@ -262,6 +357,11 @@ export interface PlanNeedDecision { reason: string; } +interface ExecutionClarificationDecision { + needsInput: boolean; + context: string; +} + function parsePlanDecision(raw: string): PlanNeedDecision { const json = extractJson(raw); const parsed = JSON.parse(json) as unknown; @@ -284,6 +384,28 @@ function parsePlanDecision(raw: string): PlanNeedDecision { }; } +function parseExecutionClarificationDecision(raw: string): ExecutionClarificationDecision { + const json = extractJson(raw); + const parsed = JSON.parse(json) as unknown; + if (!parsed || typeof parsed !== "object") { + throw new Error("Codex execution clarification response was not a JSON object"); + } + + const candidate = parsed as { needsInput?: unknown; context?: unknown }; + if (typeof candidate.needsInput !== "boolean") { + throw new Error("Codex execution clarification response missing boolean needsInput"); + } + + if (typeof candidate.context !== "string") { + throw new Error("Codex execution clarification response missing string context"); + } + + return { + needsInput: candidate.needsInput, + context: candidate.context, + }; +} + const POSITIVE_COMPLETION_PATTERNS = [ /\bdone\b/i, /\bcomplet/i, @@ -336,7 +458,7 @@ function inferOutcomeFromText(raw: string): TaskExecutionResult { return { outcome: "done", rawResponse: raw }; } -function parseTaskExecution(raw: string): TaskExecutionResult { +function parseTaskExecutionWithSource(raw: string): { result: TaskExecutionResult; usedCompletionMarker: boolean } { let json: 
string; let parsed: unknown; @@ -344,18 +466,26 @@ function parseTaskExecution(raw: string): TaskExecutionResult { json = extractJson(raw); parsed = JSON.parse(json); } catch { - // Codex did not emit a JSON completion marker — fall back to text inference. - return inferOutcomeFromText(raw); + return { + result: inferOutcomeFromText(raw), + usedCompletionMarker: false, + }; } if (!parsed || typeof parsed !== "object") { - return inferOutcomeFromText(raw); + return { + result: inferOutcomeFromText(raw), + usedCompletionMarker: false, + }; } const candidate = parsed as { outcome?: unknown; error?: unknown }; if (candidate.outcome !== "done" && candidate.outcome !== "failed") { - return inferOutcomeFromText(raw); + return { + result: inferOutcomeFromText(raw), + usedCompletionMarker: false, + }; } if (candidate.error !== undefined && typeof candidate.error !== "string") { @@ -363,12 +493,103 @@ function parseTaskExecution(raw: string): TaskExecutionResult { } return { - outcome: candidate.outcome, - rawResponse: raw, - ...(typeof candidate.error === "string" ? { error: candidate.error } : {}), + result: { + outcome: candidate.outcome, + rawResponse: raw, + ...(typeof candidate.error === "string" ? { error: candidate.error } : {}), + }, + usedCompletionMarker: true, }; } +function collectCompletedTurnItems(result: CompletedTurn): ThreadItem[] { + return [...result.items, ...(Array.isArray(result.turn.items) ? 
result.turn.items : [])]; +} + +function taskLikelyMutatesFiles(task: Task): boolean { + const normalized = [ + task.name, + task.description, + ...task.acceptance_criteria, + ] + .join("\n") + .toLowerCase(); + + return ( + normalized.includes(".ts") || + normalized.includes(".js") || + normalized.includes(".tsx") || + normalized.includes(".jsx") || + normalized.includes(".json") || + normalized.includes(".md") || + normalized.includes(".txt") || + normalized.includes("create ") || + normalized.includes("update ") || + normalized.includes("write ") || + normalized.includes("edit ") || + normalized.includes("export ") + ); +} + +function hasRecordedFileChanges(items: ThreadItem[]): boolean { + return items.some((item) => { + if (item.type !== "fileChange") { + return false; + } + + const status = "status" in item ? item.status : undefined; + const changes = "changes" in item ? item.changes : undefined; + return status === "completed" && Array.isArray(changes) && changes.length > 0; + }); +} + +function hasSuccessfulVerificationCommand(items: ThreadItem[]): boolean { + return items.some((item) => { + if (item.type !== "commandExecution") { + return false; + } + + const command = typeof item.command === "string" ? 
item.command : ""; + return ( + item.exitCode === 0 && + /(bun test|npm run test|npm test|validate|lint|typecheck|tsc|build|pytest|cargo test)/i.test(command) + ); + }); +} + +function enforceFallbackExecutionEvidence( + task: Task, + result: CompletedTurn, + parsedResult: TaskExecutionResult, + usedCompletionMarker: boolean, +): TaskExecutionResult { + if (usedCompletionMarker || parsedResult.outcome !== "done") { + return parsedResult; + } + + const items = collectCompletedTurnItems(result); + const fileChangesRecorded = hasRecordedFileChanges(items); + const verificationRan = hasSuccessfulVerificationCommand(items); + + if (taskLikelyMutatesFiles(task) && !fileChangesRecorded) { + return { + outcome: "failed", + rawResponse: parsedResult.rawResponse, + error: "Codex did not emit a JSON completion marker and no file changes were recorded for a task that required file edits.", + }; + } + + if (!fileChangesRecorded && !verificationRan && parsedResult.rawResponse.trim().length === 0) { + return { + outcome: "failed", + rawResponse: parsedResult.rawResponse, + error: "Codex did not emit a JSON completion marker or any concrete execution artifacts.", + }; + } + + return parsedResult; +} + function getModel(config?: OrcaConfig): string { return config?.codex?.model ?? process.env.ORCA_CODEX_MODEL ?? "gpt-5.3-codex"; } @@ -958,6 +1179,52 @@ export async function createCodexSession( let activeSecretAnswerChannel: SecretAnswerChannelState | undefined; const resumedOverallStatus: ResumeOverallStatus = interactionContext?.resumeOverallStatus ?? 
"running"; const resolvedServerRequests = new Set(); + const clarificationToolAvailable = interactionContext !== undefined; + + const buildRunTurnParams = ( + step: ThinkingStep, + input: Array<{ type: "text"; text: string }>, + enableQuestionTool = false, + ) => { + const effort = getEffort(config, step); + const usePlanCollaborationMode = enableQuestionTool && step !== "execution"; + return { + threadId, + effort, + input, + ...(usePlanCollaborationMode + ? { + collaborationMode: { + mode: "plan" as const, + settings: { + model: getModel(config), + reasoning_effort: effort, + developer_instructions: null, + }, + }, + } + : {}), + }; + }; + + const buildExecutionClarificationTurnParams = ( + input: Array<{ type: "text"; text: string }>, + ) => { + const effort = getEffort(config, "execution"); + return { + threadId, + effort, + input, + collaborationMode: { + mode: "plan" as const, + settings: { + model: getModel(config), + reasoning_effort: effort, + developer_instructions: null, + }, + }, + }; + }; const respondToUserInputRequest = (requestId: RequestId, response: ToolRequestUserInputResponse): void => { const specificResponder = Reflect.get(client as object, "respondToUserInputRequest"); @@ -1031,6 +1298,24 @@ export async function createCodexSession( } const pendingQuestion = createPendingQuestion(request.requestId, request); + const currentRun = await interactionContext.store.getRun(interactionContext.runId); + if (!currentRun) { + rejectUserInputRequest(request.requestId, `Run not found while waiting for input: ${interactionContext.runId}`); + return; + } + + if ( + currentRun.overallStatus === "completed" || + currentRun.overallStatus === "failed" || + currentRun.overallStatus === "cancelled" + ) { + rejectUserInputRequest( + request.requestId, + `Run ${interactionContext.runId} is already ${currentRun.overallStatus}; ignoring late requestUserInput prompt.`, + ); + return; + } + await clearAnswerFile(interactionContext.store, interactionContext.runId); let 
secretAnswerChannel: SecretAnswerChannelState | undefined; if (hasSecretQuestions(request)) { @@ -1150,24 +1435,32 @@ export async function createCodexSession( let skills: LoadedSkill[]; let threadId: string; - try { - skills = await resolveTurnSkills(client, config, cwd); + const startNewThread = async (): Promise => { const thread = await client.startThread({}); threadId = thread.id; + return threadId; + }; + try { + skills = await resolveTurnSkills(client, config, cwd); + await startNewThread(); } catch (error) { await client.disconnect(); throw error; } return { - threadId, + get threadId(): string { + return threadId; + }, async decidePlanningNeed(spec: string, systemContext: string): Promise { - const result = await client.runTurn({ - threadId, - effort: getEffort(config, "decision"), - input: buildTurnInput(buildPlanDecisionPrompt(spec, systemContext), skills), - }); + const result = await client.runTurn( + buildRunTurnParams( + "decision", + buildTurnInput(buildPlanDecisionPrompt(spec, systemContext, clarificationToolAvailable), skills), + clarificationToolAvailable, + ), + ); const rawResponse = extractAgentText(result); return parsePlanDecision(rawResponse); @@ -1178,9 +1471,11 @@ export async function createCodexSession( systemContext: string, ): Promise { const result = await client.runTurn({ - threadId, - effort: getEffort(config, "planning"), - input: buildTurnInput(buildPlanningPrompt(spec, systemContext, multiAgentActive), skills), + ...buildRunTurnParams( + "planning", + buildTurnInput(buildPlanningPrompt(spec, systemContext, multiAgentActive, clarificationToolAvailable), skills), + clarificationToolAvailable, + ), }); const rawResponse = extractAgentText(result); @@ -1193,9 +1488,14 @@ export async function createCodexSession( async reviewTaskGraph(tasks: Task[], systemContext: string): Promise { const result = await client.runTurn({ - threadId, - effort: getEffort(config, "review"), - input: buildTurnInput(buildTaskGraphReviewPrompt(tasks, 
systemContext, multiAgentActive), skills), + ...buildRunTurnParams( + "review", + buildTurnInput( + buildTaskGraphReviewPrompt(tasks, systemContext, multiAgentActive, clarificationToolAvailable), + skills, + ), + clarificationToolAvailable, + ), }); const rawResponse = extractAgentText(result); @@ -1208,19 +1508,49 @@ export async function createCodexSession( systemContext?: string, ): Promise { activeTaskContext = { taskId: task.id, taskName: task.name }; + let clarificationContext = ""; let result: CompletedTurn; try { + if (clarificationToolAvailable) { + await startNewThread(); + const clarificationResult = await client.runTurn( + buildExecutionClarificationTurnParams( + buildTurnInput( + buildTaskExecutionClarificationPrompt(task, runId, cwd, systemContext), + skills, + ), + ), + ); + const clarificationRawResponse = extractAgentText(clarificationResult); + const clarificationDecision = parseExecutionClarificationDecision(clarificationRawResponse); + clarificationContext = clarificationDecision.context.trim(); + } + + await startNewThread(); result = await client.runTurn({ - threadId, - effort: getEffort(config, "execution"), - input: buildTurnInput(buildTaskExecutionPrompt(task, runId, cwd, systemContext, multiAgentActive), skills), + ...buildRunTurnParams( + "execution", + buildTurnInput( + buildTaskExecutionPrompt( + task, + runId, + cwd, + systemContext, + multiAgentActive, + clarificationContext, + ), + skills, + ), + false, + ), }); } finally { activeTaskContext = undefined; } const rawResponse = extractAgentText(result); - const parsedResult = parseTaskExecution(rawResponse); + const { result: parsedTaskResult, usedCompletionMarker } = parseTaskExecutionWithSource(rawResponse); + const parsedResult = enforceFallbackExecutionEvidence(task, result, parsedTaskResult, usedCompletionMarker); const status = result.turn.status; if (status === "failed") { return { @@ -1237,12 +1567,10 @@ export async function createCodexSession( }, async consultTaskGraph(tasks: 
Task[]): Promise { - const prompt = buildTaskGraphConsultationPrompt(tasks, multiAgentActive); + const prompt = buildTaskGraphConsultationPrompt(tasks, multiAgentActive, clarificationToolAvailable); const result = await client.runTurn({ - threadId, - effort: getEffort(config, "review"), - input: buildTurnInput(prompt, skills), + ...buildRunTurnParams("review", buildTurnInput(prompt, skills), clarificationToolAvailable), }); const rawResponse = extractAgentText(result); @@ -1274,9 +1602,7 @@ export async function createCodexSession( async runPrompt(prompt: string, step: ThinkingStep = "execution"): Promise { const result = await client.runTurn({ - threadId, - effort: getEffort(config, step), - input: buildTurnInput(prompt, skills), + ...buildRunTurnParams(step, buildTurnInput(prompt, skills), clarificationToolAvailable && step !== "execution"), }); return extractAgentText(result); diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 2aba49e..0f7a64d 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -172,6 +172,198 @@ describe("codex session effort wiring", () => { } }); + test("falls back to completed turn items when streamed agentMessage text is missing", async () => { + const runTurnMock = mock(async () => ({ + agentMessage: "", + turn: { + status: "completed", + items: [ + { + type: "agentMessage", + id: "msg-1", + text: '{"outcome":"done"}', + }, + ], + }, + items: [], + })); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = 
await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + const result = await session.executeTask( + { + id: "t1", + name: "Task", + description: "Do thing", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + expect(result).toEqual({ + outcome: "done", + rawResponse: '{"outcome":"done"}', + }); + } finally { + await session.disconnect(); + } + }); + + test("starts a fresh Codex thread before executing each task", async () => { + const startThreadMock = mock(async () => ({ id: `thread-${startThreadMock.mock.calls.length + 1}` })); + const runTurnMock = mock(async () => { + if (runTurnMock.mock.calls.length === 1) { + return { + agentMessage: "[]", + turn: { status: "completed", items: [] }, + items: [], + }; + } + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed", items: [] }, + items: [], + }; + }); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + startThread = startThreadMock; + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + const initialThreadId = session.threadId; + await session.planSpec("spec", "context"); + await session.executeTask( + { + id: "t1", + name: "Task", + description: "Update index.ts and run tests", + dependencies: [], + acceptance_criteria: ["Done"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + expect(startThreadMock).toHaveBeenCalledTimes(2); + const executeCall = 
(runTurnMock.mock.calls as Array>)[1]?.[0]; + expect(executeCall?.threadId).toBe(session.threadId); + expect(session.threadId).not.toBe(initialThreadId); + } finally { + await session.disconnect(); + } + }); + + test("fails fallback success for file-edit tasks when no file changes were recorded", async () => { + const runTurnMock = mock(async () => ({ + agentMessage: "Implemented the requested update.", + turn: { + status: "completed", + items: [ + { + type: "agentMessage", + id: "msg-1", + text: "Implemented the requested update.", + }, + ], + }, + items: [], + })); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + const result = await session.executeTask( + { + id: "t1", + name: "Write file", + description: "Create codename.txt with the exact answer.", + dependencies: [], + acceptance_criteria: ["codename.txt exists"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + expect(result).toEqual({ + outcome: "failed", + rawResponse: "Implemented the requested update.", + error: "Codex did not emit a JSON completion marker and no file changes were recorded for a task that required file edits.", + }); + } finally { + await session.disconnect(); + } + }); + test("smoke: uses per-step thinkingLevel values for decision/planning/review/execution turns", async () => { const efforts: string[] = []; const runTurnMock = mock(async (params: { effort?: string; input?: 
Array<{ text?: string }> }) => { @@ -201,6 +393,14 @@ describe("codex session effort wiring", () => { }; } + if (prompt.includes("execution clarification gate")) { + return { + agentMessage: '{"needsInput":false,"context":"No extra clarification needed."}', + turn: { status: "completed" }, + items: [], + }; + } + return { agentMessage: "[]", turn: { status: "completed" }, @@ -345,6 +545,10 @@ describe("codex session code-simplifier guidance", () => { describe("codex session multi-agent prompt guidance", () => { test("includes multi-agent guidance in planning, review, consultation, and execution prompts when active", async () => { const prompts: string[] = []; + const runsDir = await mkdtemp(path.join(os.tmpdir(), "orca-session-multi-agent-active-")); + const store = new RunStore(runsDir); + const runId = "multi-agent-active-run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); const runTurnMock = mock(async (params: { input?: Array<{ text?: string }> }) => { const prompt = params.input?.[0]?.text ?? ""; prompts.push(prompt); @@ -365,6 +569,14 @@ describe("codex session multi-agent prompt guidance", () => { }; } + if (prompt.includes("execution clarification gate")) { + return { + agentMessage: '{"needsInput":false,"context":"No extra clarification needed."}', + turn: { status: "completed" }, + items: [], + }; + } + return { agentMessage: "[]", turn: { status: "completed" }, @@ -392,7 +604,11 @@ describe("codex session multi-agent prompt guidance", () => { })); const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); - const session = await createCodexSession(process.cwd()); + const session = await createCodexSession(process.cwd(), undefined, { + runId, + store, + resumeOverallStatus: "running", + }); try { await session.planSpec("spec", "context"); @@ -424,6 +640,9 @@ describe("codex session multi-agent prompt guidance", () => { expect(reviewPrompt).toContain("Codex multi-agent mode is enabled for this run. 
Review the graph for safe subagent parallelization."); expect(reviewPrompt).toContain("Flag ownership collisions where multiple tasks would touch the same files or subsystem without coordination."); + expect(consultationPrompt).toContain("Execution tasks are allowed to pause and ask request_user_input questions when they truly need a user-provided value."); + expect(consultationPrompt).toContain("Do not ask the user whether Orca may pause during execution for clarification. Assume that execution-time request_user_input is available."); + expect(consultationPrompt).toContain("If a task already says it should ask for a missing user value during execution, treat that as a valid execution mechanism, not a reason to ask a meta-question about whether clarification is allowed."); expect(consultationPrompt).toContain("Codex multi-agent mode is enabled for this run."); expect(consultationPrompt).toContain("Treat missed safe parallelism, fake dependencies, overlapping ownership, or missing integration tasks as review concerns."); @@ -432,11 +651,16 @@ describe("codex session multi-agent prompt guidance", () => { expect(executionPrompt).toContain("Integrate subagent results yourself before final completion."); } finally { await session.disconnect(); + await rm(runsDir, { recursive: true, force: true }); } }); test("omits multi-agent guidance from planning, review, consultation, and execution prompts when inactive", async () => { const prompts: string[] = []; + const runsDir = await mkdtemp(path.join(os.tmpdir(), "orca-session-multi-agent-inactive-")); + const store = new RunStore(runsDir); + const runId = "multi-agent-inactive-run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); const runTurnMock = mock(async (params: { input?: Array<{ text?: string }> }) => { const prompt = params.input?.[0]?.text ?? 
""; prompts.push(prompt); @@ -457,6 +681,14 @@ describe("codex session multi-agent prompt guidance", () => { }; } + if (prompt.includes("execution clarification gate")) { + return { + agentMessage: '{"needsInput":false,"context":"No extra clarification needed."}', + turn: { status: "completed" }, + items: [], + }; + } + return { agentMessage: "[]", turn: { status: "completed" }, @@ -484,7 +716,11 @@ describe("codex session multi-agent prompt guidance", () => { })); const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); - const session = await createCodexSession(process.cwd()); + const session = await createCodexSession(process.cwd(), undefined, { + runId, + store, + resumeOverallStatus: "running", + }); try { await session.planSpec("spec", "context"); @@ -506,12 +742,219 @@ describe("codex session multi-agent prompt guidance", () => { ); for (const prompt of prompts) { + if (prompt.includes("Review this Orca task graph before execution.")) { + expect(prompt).toContain("Execution tasks are allowed to pause and ask request_user_input questions when they truly need a user-provided value."); + expect(prompt).toContain("Do not ask the user whether Orca may pause during execution for clarification. 
Assume that execution-time request_user_input is available."); + } expect(prompt).not.toContain("Codex multi-agent mode is enabled for this run."); expect(prompt).not.toContain("use subagents to parallelize them"); expect(prompt).not.toContain("safe subagent parallelization"); } } finally { await session.disconnect(); + await rm(runsDir, { recursive: true, force: true }); + } + }); + + test("uses a clarification turn before interactive execution", async () => { + type TurnInputItem = { type: "text"; text: string }; + + const runTurnCalls: Array<{ + collaborationMode?: { + mode?: string; + settings?: { + model?: string | null; + reasoning_effort?: string | null; + developer_instructions?: string | null; + }; + }; + input?: TurnInputItem[]; + }> = []; + const runsDir = await mkdtemp(path.join(os.tmpdir(), "orca-session-interactive-")); + const store = new RunStore(runsDir); + const runId = "interactive-run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runTurn(params: { + collaborationMode?: { + mode?: string; + settings?: { + model?: string | null; + reasoning_effort?: string | null; + developer_instructions?: string | null; + }; + }; + input?: TurnInputItem[]; + }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + runTurnCalls.push(params); + const prompt = params.input?.[0]?.text ?? 
""; + if (prompt.includes("decomposing a spec")) { + return { agentMessage: "[]", turn: { status: "completed" }, items: [] }; + } + + if (prompt.includes("execution clarification gate")) { + return { + agentMessage: '{"needsInput":true,"context":"Use the user-provided release codename when updating files."}', + turn: { status: "completed" }, + items: [], + }; + } + + return { agentMessage: '{"outcome":"done"}', turn: { status: "completed" }, items: [] }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + await session.planSpec("spec", "context"); + await session.executeTask( + { + id: "T1", + name: "Collect Release Codename", + description: "Ask the user which release codename to use.", + dependencies: [], + acceptance_criteria: ["Ask exactly one clarification question and use the answer."], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + const planningCall = runTurnCalls[0]; + const clarificationCall = runTurnCalls[1]; + const executionCall = runTurnCalls[2]; + expect(planningCall?.collaborationMode).toEqual({ + mode: "plan", + settings: { + model: "gpt-5.3-codex", + reasoning_effort: "high", + developer_instructions: null, + }, + }); + expect(clarificationCall?.collaborationMode).toEqual({ + mode: "plan", + settings: { + model: "gpt-5.3-codex", + reasoning_effort: "medium", + developer_instructions: null, + }, + }); + expect(executionCall?.collaborationMode).toBeUndefined(); + + const clarificationPrompt = clarificationCall?.input?.[0]?.text ?? 
""; + expect(clarificationPrompt).toContain("You are Orca's execution clarification gate."); + expect(clarificationPrompt).toContain("use Codex's request_user_input tool instead of guessing, failing, or baking the question into a later task"); + + const executionPrompt = executionCall?.input?.[0]?.text ?? ""; + expect(executionPrompt).toContain("Resolved Clarification Context:"); + expect(executionPrompt).toContain("Use the user-provided release codename when updating files."); + expect(executionPrompt).not.toContain("request_user_input"); + } finally { + await session.disconnect(); + await rm(runsDir, { recursive: true, force: true }); + } + }); + + test("runs a non-mutating clarification pass before ordinary interactive execution", async () => { + type TurnInputItem = { type: "text"; text: string }; + + const runTurnCalls: Array<{ collaborationMode?: { mode?: string }; input?: TurnInputItem[] }> = []; + const runsDir = await mkdtemp(path.join(os.tmpdir(), "orca-session-default-exec-")); + const store = new RunStore(runsDir); + const runId = "default-exec-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runTurn(params: { + collaborationMode?: { mode?: string }; + input?: TurnInputItem[]; + }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + runTurnCalls.push(params); + const prompt = params.input?.[0]?.text ?? 
""; + if (prompt.includes("execution clarification gate")) { + return { + agentMessage: '{"needsInput":false,"context":""}', + turn: { status: "completed" }, + items: [], + }; + } + return { agentMessage: '{"outcome":"done"}', turn: { status: "completed" }, items: [] }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + await session.executeTask( + { + id: "T1", + name: "Add subtract export", + description: "Add subtract(a, b) and update tests.", + dependencies: [], + acceptance_criteria: ["bun test passes"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + expect(runTurnCalls[0]?.collaborationMode).toMatchObject({ mode: "plan" }); + expect(runTurnCalls[1]?.collaborationMode).toBeUndefined(); + + const clarificationPrompt = runTurnCalls[0]?.input?.[0]?.text ?? ""; + expect(clarificationPrompt).toContain("You are Orca's execution clarification gate."); + + const executionPrompt = runTurnCalls[1]?.input?.[0]?.text ?? 
""; + expect(executionPrompt).not.toContain("request_user_input"); + } finally { + await session.disconnect(); + await rm(runsDir, { recursive: true, force: true }); } }); }); @@ -1015,31 +1458,41 @@ describe("codex session question flow", () => { resolveAnswerResponse?.(); } rejectServerRequest(): void {} + private runTurnCount = 0; async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { - queueMicrotask(() => { - clientInstance?.emit("request:userInput", { - requestId: "req-1", - itemId: "item-1", - threadId: "thread-1", - turnId: "turn-1", - questions: [ - { - header: "Game Type", - id: "game_type", - question: "Which game type should I build?", - isOther: true, - isSecret: false, - options: [ - { label: "Arcade", description: "Arcade style" }, - { label: "Puzzle", description: "Puzzle style" }, - ], - }, - ], + this.runTurnCount += 1; + if (this.runTurnCount === 1) { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }); }); - }); - await answerResponse; - clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + await answerResponse; + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + + return { + agentMessage: '{"needsInput":true,"context":"Game type selected by user: Arcade."}', + turn: { status: "completed" }, + items: [], + }; + } return { agentMessage: '{"outcome":"done"}', @@ -1173,30 +1626,40 @@ describe("codex session question flow", () => { rejectedRequests.push({ requestId, error }); settleRequest?.(); } + private runTurnCount = 0; async runTurn(): Promise<{ agentMessage: string; turn: { status: 
"completed" }; items: [] }> { - queueMicrotask(() => { - clientInstance?.emit("request:userInput", { - requestId: "req-1", - itemId: "item-1", - threadId: "thread-1", - turnId: "turn-1", - questions: [ - { - header: "Game Type", - id: "game_type", - question: "Which game type should I build?", - isOther: true, - isSecret: false, - options: [ - { label: "Arcade", description: "Arcade style" }, - { label: "Puzzle", description: "Puzzle style" }, - ], - }, - ], + this.runTurnCount += 1; + if (this.runTurnCount === 1) { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Game Type", + id: "game_type", + question: "Which game type should I build?", + isOther: true, + isSecret: false, + options: [ + { label: "Arcade", description: "Arcade style" }, + { label: "Puzzle", description: "Puzzle style" }, + ], + }, + ], + }); }); - }); - await requestSettled; + await requestSettled; + + return { + agentMessage: '{"needsInput":true,"context":"Game type must come from the user response."}', + turn: { status: "completed" }, + items: [], + }; + } return { agentMessage: '{"outcome":"done"}', @@ -1302,28 +1765,38 @@ describe("codex session question flow", () => { resolveAnswerResponse?.(); } rejectServerRequest(): void {} + private runTurnCount = 0; async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { - queueMicrotask(() => { - clientInstance?.emit("request:userInput", { - requestId: "req-1", - itemId: "item-1", - threadId: "thread-1", - turnId: "turn-1", - questions: [ - { - header: "API Key", - id: "api_key", - question: "Which API key should I use?", - isOther: true, - isSecret: true, - options: null, - }, - ], + this.runTurnCount += 1; + if (this.runTurnCount === 1) { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + 
turnId: "turn-1", + questions: [ + { + header: "API Key", + id: "api_key", + question: "Which API key should I use?", + isOther: true, + isSecret: true, + options: null, + }, + ], + }); }); - }); - await answerResponse; - clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + await answerResponse; + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + + return { + agentMessage: '{"needsInput":true,"context":"Use the secret API key provided by the user."}', + turn: { status: "completed" }, + items: [], + }; + } return { agentMessage: '{"outcome":"done"}', @@ -1482,31 +1955,41 @@ describe("codex session question flow", () => { responses.push({ requestId, response }); } rejectServerRequest(): void {} + private runTurnCount = 0; async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { - queueMicrotask(() => { - clientInstance?.emit("request:userInput", { - requestId: "req-1", - itemId: "item-1", - threadId: "thread-1", - turnId: "turn-1", - questions: [ - { - header: "Framework", - id: "framework", - question: "Which framework should I target?", - isOther: true, - isSecret: false, - options: null, - }, - ], + this.runTurnCount += 1; + if (this.runTurnCount === 1) { + queueMicrotask(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-1", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Framework", + id: "framework", + question: "Which framework should I target?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }); }); - }); - queueMicrotask(() => { - setTimeout(() => { - clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); - }, 20); - }); + queueMicrotask(() => { + setTimeout(() => { + clientInstance?.emit("serverRequest:resolved", { requestId: "req-1" }); + }, 20); + }); + + return { + agentMessage: '{"needsInput":false,"context":""}', + turn: { status: "completed" }, + items: [], + }; + } 
return { agentMessage: '{"outcome":"done"}', @@ -1569,4 +2052,128 @@ describe("codex session question flow", () => { await rm(tempDir, { recursive: true, force: true }); } }); + + test("ignores late user-input requests after the run has already failed", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-question-flow-late-")); + const store = new RunStore(path.join(tempDir, "runs")); + const runId = "run-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + await store.updateRun(runId, { mode: "run", overallStatus: "running" }); + + const rejectedRequests: Array<{ requestId: string | number; error: { code: number; message: string } }> = []; + let clientInstance: EventEmitter | null = null; + + try { + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class extends EventEmitter { + constructor() { + super(); + clientInstance = this; + } + + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + rejectServerRequest(requestId: string | number, error: { code: number; message: string }): void { + rejectedRequests.push({ requestId, error }); + } + private runTurnCount = 0; + async runTurn(): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + this.runTurnCount += 1; + if (this.runTurnCount === 1) { + await store.updateRun(runId, { overallStatus: "failed" }); + setTimeout(() => { + clientInstance?.emit("request:userInput", { + requestId: "req-late", + itemId: "item-1", + threadId: "thread-1", + turnId: "turn-1", + questions: [ + { + header: "Codename", + id: "codename", + question: "Which codename should I use?", + isOther: true, + isSecret: false, + options: null, + }, + ], + }); + }, 10); + + return { + agentMessage: '{"needsInput":false,"context":""}', + turn: { status: "completed" }, + 
items: [], + }; + } + + return { + agentMessage: '{"outcome":"done"}', + turn: { status: "completed" }, + items: [], + }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + const result = await session.executeTask( + { + id: "task-1", + name: "Configure release", + description: "Use the provided codename.", + dependencies: [], + acceptance_criteria: ["Release is configured"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + expect(result.outcome).toBe("done"); + + await waitFor(async () => { + const run = await store.getRun(runId); + return rejectedRequests.length > 0 && run ? run : null; + }); + + const run = await store.getRun(runId); + expect(run?.overallStatus).toBe("failed"); + expect(run?.pendingQuestion).toBeUndefined(); + expect(rejectedRequests).toEqual([ + { + requestId: "req-late", + error: { + code: -32603, + message: `Run ${runId} is already failed; ignoring late requestUserInput prompt.`, + }, + }, + ]); + } finally { + await session.disconnect(); + } + } finally { + await rm(tempDir, { recursive: true, force: true }); + } + }); }); diff --git a/src/cli/commands/plan.test.ts b/src/cli/commands/plan.test.ts new file mode 100644 index 0000000..f3641f6 --- /dev/null +++ b/src/cli/commands/plan.test.ts @@ -0,0 +1,62 @@ +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; + +type PlanModule = typeof import("./plan.js"); + +let tempDir = ""; +let runsDir = ""; +let specPath = ""; +let logs: string[] = []; +const originalRunsDir = 
process.env.ORCA_RUNS_DIR; +const originalConsoleLog = console.log; + +async function loadPlanModule(): Promise { + return import(`./plan.js?test=${Math.random()}`); +} + +beforeEach(async () => { + tempDir = await mkdtemp(path.join(os.tmpdir(), "orca-plan-test-")); + runsDir = path.join(tempDir, "runs"); + specPath = path.join(tempDir, "spec.md"); + await writeFile(specPath, "# Spec\n", "utf8"); + process.env.ORCA_RUNS_DIR = runsDir; + logs = []; + + console.log = (...args: unknown[]) => { + logs.push(args.map(String).join(" ")); + }; + + mock.module("../../core/config-loader.js", () => ({ + resolveConfig: async () => undefined, + })); + mock.module("../../core/planner.js", () => ({ + runPlanner: async () => {}, + })); +}); + +afterEach(async () => { + mock.restore(); + console.log = originalConsoleLog; + + if (originalRunsDir === undefined) { + delete process.env.ORCA_RUNS_DIR; + } else { + process.env.ORCA_RUNS_DIR = originalRunsDir; + } + + await rm(tempDir, { recursive: true, force: true }); +}); + +describe("plan command", () => { + test("honors ORCA_RUNS_DIR for plan runs", async () => { + const planModule = await loadPlanModule(); + + await planModule.planCommand({ spec: specPath }); + + const runDirLine = logs.find((line) => line.startsWith("Run dir: ")); + expect(runDirLine).toBeTruthy(); + expect(runDirLine?.slice("Run dir: ".length)).toStartWith(runsDir); + }); +}); diff --git a/src/cli/commands/plan.ts b/src/cli/commands/plan.ts index fd2f7bb..c24fa4c 100644 --- a/src/cli/commands/plan.ts +++ b/src/cli/commands/plan.ts @@ -15,6 +15,11 @@ export interface PlanCommandOptions { onError?: string; } +function createStore(): RunStore { + const runsDir = process.env.ORCA_RUNS_DIR; + return runsDir ? 
new RunStore(runsDir) : new RunStore(); +} + export async function planCommand(options: { spec: string; config?: string }): Promise { const specPath = path.resolve(options.spec); await access(specPath, fsConstants.R_OK); @@ -23,7 +28,7 @@ export async function planCommand(options: { spec: string; config?: string }): P const runId = generateRunId(specPath); console.log(`Run ID: ${runId}`); - const store = new RunStore(); + const store = createStore(); await store.createRun(runId, specPath); await runPlanner(specPath, store, runId, orcaConfig); diff --git a/src/cli/commands/run-command.test-harness.ts b/src/cli/commands/run-command.test-harness.ts index 6587d6a..90c5b22 100644 --- a/src/cli/commands/run-command.test-harness.ts +++ b/src/cli/commands/run-command.test-harness.ts @@ -131,7 +131,8 @@ export function createRunCommandTestHarness(tempPrefix: string): RunCommandTestH InvalidPlanError: TestInvalidPlanError })); void mock.module("../../core/task-runner.js", () => ({ - runTaskRunner: runTaskRunnerMock + runTaskRunner: runTaskRunnerMock, + writeSessionSummary: async () => {} })); void mock.module("../../core/config-loader.js", () => ({ resolveConfig: resolveConfigMock diff --git a/src/cli/commands/run.test.ts b/src/cli/commands/run.test.ts index 4b23cd4..9c49beb 100644 --- a/src/cli/commands/run.test.ts +++ b/src/cli/commands/run.test.ts @@ -205,6 +205,52 @@ describe("run command executor flags", () => { expect(findingsEvent?.metadata?.cycleIndex).toBe(1); }); + test("skips fallback review when structured execution review already ran", async () => { + const { runModule, createCodexSessionMock } = await loadRunModule(); + const reviewChangesMock = mock(async () => "review"); + createCodexSessionMock.mockImplementationOnce(async () => ({ + consultTaskGraph: async () => ({ issues: [], ok: true }), + executeTask: async () => ({ outcome: "done" as const, rawResponse: '{"outcome":"done"}' }), + runPrompt: async () => '{"summary":"clean","findings":[],"fixed":false}', 
+ reviewChanges: reviewChangesMock, + disconnect: async () => {} + })); + + const configPath = path.join(getTempDir(), "orca.config.js"); + await writeFile(configPath, "export default { review: { execution: { validator: { auto: false } } } };\n", "utf8"); + + await parseRun(runModule, ["run", "--task", "x", "--config", configPath]); + + expect(reviewChangesMock).not.toHaveBeenCalled(); + }); + + test("treats failed validator commands as findings even if reviewer reports none", async () => { + const { runModule, createCodexSessionMock, hookDispatchMock } = await loadRunModule(); + createCodexSessionMock.mockImplementationOnce(async () => ({ + consultTaskGraph: async () => ({ issues: [], ok: true }), + executeTask: async () => ({ outcome: "done" as const, rawResponse: '{"outcome":"done"}' }), + runPrompt: async () => '{"summary":"clean","findings":[],"fixed":false}', + reviewChanges: async () => "review", + disconnect: async () => {} + })); + + const configPath = path.join(getTempDir(), "orca.config.js"); + await writeFile( + configPath, + "export default { review: { execution: { onFindings: 'report_only', validator: { auto: false, commands: ['node -e \"process.exit(1)\"'] } } } };\n", + "utf8", + ); + + await parseRun(runModule, ["run", "--task", "x", "--config", configPath]); + + const findingsEvent = hookDispatchMock.mock.calls.find( + (call) => (call[0] as { hook?: string })?.hook === "onFindings" + )?.[0] as { message?: string; metadata?: { findingsCount?: number } } | undefined; + + expect(findingsEvent?.metadata?.findingsCount).toBe(1); + expect(findingsEvent?.message).toContain("Validator failures still need attention"); + }); + test("auto_fix loop stops when clean", async () => { const { runModule, createCodexSessionMock } = await loadRunModule(); const runPromptMock = mock(async () => '{"summary":"clean","findings":[],"fixed":false}'); diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 20ee6f9..cc7af34 100644 --- a/src/cli/commands/run.ts 
+++ b/src/cli/commands/run.ts @@ -13,7 +13,7 @@ import { createCodexSession } from "../../agents/codex/session.js"; import { ensureCodexMultiAgent } from "../../core/codex-config.js"; import { resolveConfig } from "../../core/config-loader.js"; import { InvalidPlanError, runPlanner } from "../../core/planner.js"; -import { runTaskRunner } from "../../core/task-runner.js"; +import { runTaskRunner, writeSessionSummary } from "../../core/task-runner.js"; import { createOpenclawHookHandler, detectOpenclawAvailability } from "../../hooks/adapters/openclaw.js"; import { createStdoutHookHandler } from "../../hooks/adapters/stdout.js"; import { HookDispatcher } from "../../hooks/dispatcher.js"; @@ -246,10 +246,18 @@ async function runValidatorCommands(commands: string[]): Promise result.exitCode !== 0); return [ "You are Orca's post-execution reviewer.", "Inspect uncommitted repository changes and validation command output.", "If there are fixable findings, apply fixes directly in the workspace before responding.", + "Any validator command with a non-zero exit code is a real finding until it is fixed.", + ...(failedValidations.length > 0 + ? 
[ + "One or more validators failed in this cycle.", + "Do not return findings=[] unless you have actually fixed the failures and rerun validation successfully.", + ] + : []), "Respond with JSON only using this exact shape:", '{"summary":"...","findings":["..."],"fixed":true|false}', `Cycle: ${cycleIndex}`, @@ -506,16 +514,18 @@ export async function runCommandHandler(options: RunCommandOptions): Promise codexSession.executeTask(task, taskRunId, systemContext), }); - const reviewConfig = getExecutionReviewConfig(effectiveConfig); const finalSummaries: string[] = []; const runAfterExecution = await store.getRun(runId); @@ -528,12 +538,26 @@ export async function runCommandHandler(options: RunCommandOptions): Promise codexSession.runPrompt(prompt, "review"), cycleIndex, prompt, reviewConfig.prompt ); + const failedValidations = validationResults.filter((result) => result.exitCode !== 0); + const reviewResult = + failedValidations.length > 0 && initialReviewResult.findings.length === 0 + ? { + ...initialReviewResult, + summary: initialReviewResult.summary.trim().length > 0 + ? 
`${initialReviewResult.summary} Validator failures still need attention.` + : "Validator failures still need attention.", + findings: failedValidations.map( + (result) => `Validator failed: ${result.command} (exit ${result.exitCode})`, + ), + fixed: false, + } + : initialReviewResult; finalSummaries.push(`cycle ${cycleIndex}: ${reviewResult.summary}`); if (reviewResult.findings.length === 0) { @@ -568,7 +592,8 @@ export async function runCommandHandler(options: RunCommandOptions): Promise 0) { console.log(fallbackReview); } + + const finalRun = await store.getRun(runId); + if (finalRun && finalRun.overallStatus !== "failed" && finalRun.overallStatus !== "cancelled") { + await store.updateRun(runId, { overallStatus: "completed" }); + const completedAt = new Date().toISOString(); + await emitHook({ + runId: runId as HookEvent["runId"], + hook: "onComplete", + message: "run-completed", + timestamp: completedAt, + metadata: { overallStatus: "completed" } + }); + await writeSessionSummary(store, runId, effectiveConfig?.sessionLogs); + } } finally { await codexSession.disconnect(); } diff --git a/src/core/task-runner.test.ts b/src/core/task-runner.test.ts index 6ba1976..26a375f 100644 --- a/src/core/task-runner.test.ts +++ b/src/core/task-runner.test.ts @@ -76,6 +76,40 @@ describe("task-runner", () => { expect(hookEvents.some((event) => event.hook === "onComplete" && event.message === "run-completed")).toBe(true); }); + test("defers final completion when configured for post-execution review", async () => { + const tasks = [makeTask("t1")]; + await store.updateRun(runId, { + mode: "run", + overallStatus: "running", + tasks + }); + + const hookEvents: HookEvent[] = []; + + setExecuteTaskForTests(async () => ({ + outcome: "done", + rawResponse: '{"outcome":"done"}' + })); + + await runTaskRunner({ + runId, + store, + deferCompletion: true, + emitHook: async (event) => { + hookEvents.push(event); + } + }); + + const run = await store.getRun(runId); + if (!run) { + throw new 
Error("Run missing after deferred completion"); + } + + expect(run.overallStatus).toBe("reviewing"); + expect(hookEvents.some((event) => event.hook === "onMilestone" && event.message === "execution-completed")).toBe(true); + expect(hookEvents.some((event) => event.hook === "onComplete")).toBe(false); + }); + test("retries transient task failure and then succeeds", async () => { const tasks = [makeTask("t1")]; await store.updateRun(runId, { diff --git a/src/core/task-runner.ts b/src/core/task-runner.ts index cc14e6f..7db06ce 100644 --- a/src/core/task-runner.ts +++ b/src/core/task-runner.ts @@ -72,6 +72,7 @@ export interface TaskRunnerOptions { store: RunStore; config?: OrcaConfig; emitHook?: EmitHook; + deferCompletion?: boolean; /** Override executor — used by tests only. In production, use config.executor. */ executeTask?: ExecuteTaskFn; } @@ -118,7 +119,7 @@ function buildSessionSummary(run: RunStatus): string { ].join("\n"); } -async function writeSessionSummary(store: RunStore, runId: string, sessionLogsDir?: string): Promise { +export async function writeSessionSummary(store: RunStore, runId: string, sessionLogsDir?: string): Promise { if (!sessionLogsDir) { return; } @@ -138,7 +139,7 @@ async function writeSessionSummary(store: RunStore, runId: string, sessionLogsDi export async function runTaskRunner(options: TaskRunnerOptions): Promise { const emitHook = options.emitHook ?? defaultEmitHook; - const { runId, store, config } = options; + const { runId, store, config, deferCompletion = false } = options; const skills = await loadSkills(config); const taskSystemContext = skills.length === 0 ? undefined : formatSkillsSection(skills); @@ -205,22 +206,25 @@ export async function runTaskRunner(options: TaskRunnerOptions): Promise { const allDone = run.tasks.every((task) => task.status === "done"); if (allDone) { - await store.updateRun(runId, { overallStatus: "completed" }); + const nextOverallStatus = deferCompletion ? 
"reviewing" : "completed"; + await store.updateRun(runId, { overallStatus: nextOverallStatus }); const completedAt = new Date().toISOString(); await emitHook({ runId: run.runId, hook: "onMilestone", message: "execution-completed", timestamp: completedAt, - metadata: { overallStatus: "completed" } - }); - await emitHook({ - runId: run.runId, - hook: "onComplete", - message: "run-completed", - timestamp: completedAt, - metadata: { overallStatus: "completed" } + metadata: { overallStatus: nextOverallStatus } }); + if (!deferCompletion) { + await emitHook({ + runId: run.runId, + hook: "onComplete", + message: "run-completed", + timestamp: completedAt, + metadata: { overallStatus: "completed" } + }); + } await writeSessionSummary(store, runId, config?.sessionLogs); return; } diff --git a/src/state/schema.ts b/src/state/schema.ts index bc4ac03..247c06e 100644 --- a/src/state/schema.ts +++ b/src/state/schema.ts @@ -69,6 +69,7 @@ export const RunStatusSchema = z.object({ overallStatus: z.enum([ "planning", "running", + "reviewing", "waiting_for_answer", "completed", "failed", diff --git a/src/types/index.ts b/src/types/index.ts index 4d157f3..2f5ebcd 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -41,6 +41,7 @@ export interface RunStatus { overallStatus: | "planning" | "running" + | "reviewing" | "waiting_for_answer" | "completed" | "failed" From 7e466235286108cb4e00cdb929654729bc70594a Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Fri, 20 Mar 2026 18:47:25 -0700 Subject: [PATCH 17/19] Fix Codex fallback completion and duplicate onComplete --- src/agents/codex/session.ts | 4 +- src/agents/codex/session.unit.test.ts | 82 ++++++++++++++++++++++++++- src/cli/commands/run.test.ts | 47 +++++++++++++++ src/cli/commands/run.ts | 7 ++- 4 files changed, 136 insertions(+), 4 deletions(-) diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index 1c25843..e522dc8 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -571,11 
+571,11 @@ function enforceFallbackExecutionEvidence( const fileChangesRecorded = hasRecordedFileChanges(items); const verificationRan = hasSuccessfulVerificationCommand(items); - if (taskLikelyMutatesFiles(task) && !fileChangesRecorded) { + if (taskLikelyMutatesFiles(task) && !fileChangesRecorded && !verificationRan) { return { outcome: "failed", rawResponse: parsedResult.rawResponse, - error: "Codex did not emit a JSON completion marker and no file changes were recorded for a task that required file edits.", + error: "Codex did not emit a JSON completion marker, no file changes were recorded, and no successful verification command ran for a task that required file edits.", }; } diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 0f7a64d..097b2e6 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -357,7 +357,87 @@ describe("codex session effort wiring", () => { expect(result).toEqual({ outcome: "failed", rawResponse: "Implemented the requested update.", - error: "Codex did not emit a JSON completion marker and no file changes were recorded for a task that required file edits.", + error: + "Codex did not emit a JSON completion marker, no file changes were recorded, and no successful verification command ran for a task that required file edits.", + }); + } finally { + await session.disconnect(); + } + }); + + test("accepts verification-backed shell edits when no file changes were recorded", async () => { + const runTurnMock = mock(async () => ({ + agentMessage: "Implemented the requested update and verified it.", + turn: { + status: "completed", + items: [ + { + type: "agentMessage", + id: "msg-1", + text: "Implemented the requested update and verified it.", + }, + { + type: "commandExecution", + id: "cmd-1", + command: "python scripts/update_release.py", + status: "completed", + exitCode: 0, + commandActions: [], + }, + { + type: "commandExecution", + id: "cmd-2", + command: 
"bun test", + status: "completed", + exitCode: 0, + commandActions: [], + }, + ], + }, + items: [], + })); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + runTurn = runTurnMock; + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd()); + + try { + const result = await session.executeTask( + { + id: "t1", + name: "Write file", + description: "Create codename.txt with the exact answer.", + dependencies: [], + acceptance_criteria: ["codename.txt exists"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + "run-1", + "context", + ); + + expect(result).toEqual({ + outcome: "done", + rawResponse: "Implemented the requested update and verified it.", }); } finally { await session.disconnect(); diff --git a/src/cli/commands/run.test.ts b/src/cli/commands/run.test.ts index 9c49beb..f54ae23 100644 --- a/src/cli/commands/run.test.ts +++ b/src/cli/commands/run.test.ts @@ -182,6 +182,53 @@ describe("run command executor flags", () => { expect(reviewChangesMock).not.toHaveBeenCalled(); }); + test("does not dispatch onComplete twice when execution review is disabled", async () => { + const { runModule, runTaskRunnerMock, hookDispatchMock } = await loadRunModule(); + runTaskRunnerMock.mockImplementationOnce( + async (options: { + runId: string; + store: { updateRun: (runId: string, patch: unknown) => Promise }; + emitHook?: (event: unknown) => Promise; + }) => { + await options.store.updateRun(options.runId, { + overallStatus: "completed", + tasks: [ + { + id: "t1", + name: "task", + 
description: "task", + dependencies: [], + acceptance_criteria: ["done"], + status: "done", + retries: 0, + maxRetries: 3, + startedAt: new Date().toISOString(), + finishedAt: new Date().toISOString(), + }, + ], + }); + + await options.emitHook?.({ + runId: options.runId, + hook: "onComplete", + message: "run-completed", + timestamp: new Date().toISOString(), + metadata: { overallStatus: "completed" }, + }); + }, + ); + + const configPath = path.join(getTempDir(), "orca.config.js"); + await writeFile(configPath, "export default { review: { enabled: false } };\n", "utf8"); + + await parseRun(runModule, ["run", "--task", "x", "--config", configPath]); + + const onCompleteCalls = hookDispatchMock.mock.calls.filter( + (call) => (call[0] as { hook?: string })?.hook === "onComplete", + ); + expect(onCompleteCalls).toHaveLength(1); + }); + test("dispatches onFindings hook when post-execution review reports findings", async () => { const { runModule, createCodexSessionMock, hookDispatchMock } = await loadRunModule(); createCodexSessionMock.mockImplementationOnce(async () => ({ diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index cc7af34..4315b0c 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -611,7 +611,12 @@ export async function runCommandHandler(options: RunCommandOptions): Promise Date: Fri, 20 Mar 2026 19:24:57 -0700 Subject: [PATCH 18/19] Restore Codex thread persistence and review turn routing --- src/agents/codex/session.ts | 4 +- src/agents/codex/session.unit.test.ts | 57 +++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index e522dc8..b4da4a6 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -1512,7 +1512,6 @@ export async function createCodexSession( let result: CompletedTurn; try { if (clarificationToolAvailable) { - await startNewThread(); const clarificationResult = await client.runTurn( 
buildExecutionClarificationTurnParams( buildTurnInput( @@ -1526,7 +1525,6 @@ export async function createCodexSession( clarificationContext = clarificationDecision.context.trim(); } - await startNewThread(); result = await client.runTurn({ ...buildRunTurnParams( "execution", @@ -1602,7 +1600,7 @@ export async function createCodexSession( async runPrompt(prompt: string, step: ThinkingStep = "execution"): Promise { const result = await client.runTurn({ - ...buildRunTurnParams(step, buildTurnInput(prompt, skills), clarificationToolAvailable && step !== "execution"), + ...buildRunTurnParams(step, buildTurnInput(prompt, skills), false), }); return extractAgentText(result); diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index 097b2e6..ff61099 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -235,7 +235,7 @@ describe("codex session effort wiring", () => { } }); - test("starts a fresh Codex thread before executing each task", async () => { + test("reuses the persistent Codex thread across planning and execution", async () => { const startThreadMock = mock(async () => ({ id: `thread-${startThreadMock.mock.calls.length + 1}` })); const runTurnMock = mock(async () => { if (runTurnMock.mock.calls.length === 1) { @@ -291,10 +291,10 @@ describe("codex session effort wiring", () => { "context", ); - expect(startThreadMock).toHaveBeenCalledTimes(2); + expect(startThreadMock).toHaveBeenCalledTimes(1); const executeCall = (runTurnMock.mock.calls as Array>)[1]?.[0]; expect(executeCall?.threadId).toBe(session.threadId); - expect(session.threadId).not.toBe(initialThreadId); + expect(session.threadId).toBe(initialThreadId); } finally { await session.disconnect(); } @@ -1037,6 +1037,57 @@ describe("codex session multi-agent prompt guidance", () => { await rm(runsDir, { recursive: true, force: true }); } }); + + test("does not force review prompts through plan collaboration mode", async () => { + 
type TurnInputItem = { type: "text"; text: string }; + + const runTurnCalls: Array<{ collaborationMode?: { mode?: string }; input?: TurnInputItem[] }> = []; + const runsDir = await mkdtemp(path.join(os.tmpdir(), "orca-session-review-prompt-")); + const store = new RunStore(runsDir); + const runId = "review-prompt-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { id: "thread-1" }; + } + async runTurn(params: { + collaborationMode?: { mode?: string }; + input?: TurnInputItem[]; + }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + runTurnCalls.push(params); + return { agentMessage: '{"summary":"clean","findings":[],"fixed":false}', turn: { status: "completed" }, items: [] }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + await session.runPrompt("review prompt", "review"); + expect(runTurnCalls).toHaveLength(1); + expect(runTurnCalls[0]?.collaborationMode).toBeUndefined(); + } finally { + await session.disconnect(); + await rm(runsDir, { recursive: true, force: true }); + } + }); }); describe("codex session skill discovery", () => { From 84ff460aa2543660f9ced8429b663876ade55bc5 Mon Sep 17 00:00:00 2001 From: bradley inniss Date: Sat, 21 Mar 2026 03:46:59 -0700 Subject: [PATCH 19/19] Fallback when Codex collaboration mode is unsupported --- src/agents/codex/session.ts | 48 +++++++++++++++- 
src/agents/codex/session.unit.test.ts | 81 +++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) diff --git a/src/agents/codex/session.ts b/src/agents/codex/session.ts index b4da4a6..21da6cc 100644 --- a/src/agents/codex/session.ts +++ b/src/agents/codex/session.ts @@ -116,6 +116,47 @@ function buildPlanningPrompt( ].join("\n\n"); } +function isUnsupportedCollaborationModeError(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return ( + message.includes("-32601") || + message.includes("Method not found") || + message.includes("Invalid collaboration mode list response") + ); +} + +async function detectCollaborationModeSupport( + client: CodexClient, + codexPath: string, + interactiveRunEnabled: boolean, +): Promise { + if (!interactiveRunEnabled) { + return false; + } + + const maybeListCollaborationModes = ( + client as CodexClient & { listCollaborationModes?: () => Promise } + ).listCollaborationModes; + + if (typeof maybeListCollaborationModes !== "function") { + return true; + } + + try { + await maybeListCollaborationModes.call(client); + return true; + } catch (error) { + if (!isUnsupportedCollaborationModeError(error)) { + throw error; + } + + logger.warn( + `Codex binary at ${codexPath} does not support collaboration mode / question flow. 
Falling back to non-interactive prompts for this run.`, + ); + return false; + } +} + function buildTaskExecutionPrompt( task: Task, runId: string, @@ -1174,12 +1215,17 @@ export async function createCodexSession( attachCodexStderrDiagnostics(client, codexPath); await client.connect(); await warnAboutUnavailableMcpServers(client); + const collaborationModeAvailable = await detectCollaborationModeSupport( + client, + codexPath, + interactionContext !== undefined, + ); let activeTaskContext: { taskId: string; taskName: string } | undefined; let activeSecretAnswerChannel: SecretAnswerChannelState | undefined; const resumedOverallStatus: ResumeOverallStatus = interactionContext?.resumeOverallStatus ?? "running"; const resolvedServerRequests = new Set(); - const clarificationToolAvailable = interactionContext !== undefined; + const clarificationToolAvailable = interactionContext !== undefined && collaborationModeAvailable; const buildRunTurnParams = ( step: ThinkingStep, diff --git a/src/agents/codex/session.unit.test.ts b/src/agents/codex/session.unit.test.ts index ff61099..3c4b61a 100644 --- a/src/agents/codex/session.unit.test.ts +++ b/src/agents/codex/session.unit.test.ts @@ -1038,6 +1038,87 @@ describe("codex session multi-agent prompt guidance", () => { } }); + test("falls back to non-interactive prompts when collaboration mode is unsupported", async () => { + type TurnInputItem = { type: "text"; text: string }; + + const runTurnCalls: Array<{ collaborationMode?: { mode?: string }; input?: TurnInputItem[] }> = []; + const runsDir = await mkdtemp(path.join(os.tmpdir(), "orca-session-unsupported-collab-")); + const store = new RunStore(runsDir); + const runId = "unsupported-collab-1000-abcd"; + await store.createRun(runId, "/tmp/spec.md"); + + mockMultiAgentDetection(false); + mock.module("@ratley/codex-client", () => ({ + CodexClient: class { + async connect(): Promise {} + async disconnect(): Promise {} + async startThread(): Promise<{ id: string }> { + return { 
id: "thread-1" }; + } + async listCollaborationModes(): Promise { + throw new Error("-32601: Method not found"); + } + async runTurn(params: { + collaborationMode?: { mode?: string }; + input?: TurnInputItem[]; + }): Promise<{ agentMessage: string; turn: { status: "completed" }; items: [] }> { + runTurnCalls.push(params); + const prompt = params.input?.[0]?.text ?? ""; + if (prompt.includes("decomposing a spec")) { + return { agentMessage: "[]", turn: { status: "completed" }, items: [] }; + } + return { agentMessage: '{"outcome":"done"}', turn: { status: "completed" }, items: [] }; + } + async runReview(): Promise<{ reviewText: string }> { + return { reviewText: "ok" }; + } + }, + })); + + mock.module("../../utils/skill-loader.js", () => ({ + loadSkills: async () => [], + })); + + const { createCodexSession } = await import(`./session.ts?test=${Math.random()}`); + const session = await createCodexSession(process.cwd(), undefined, { + runId: runId as `${string}-${number}-${string}`, + store, + resumeOverallStatus: "running", + }); + + try { + await session.planSpec("spec", "context"); + await session.executeTask( + { + id: "T1", + name: "Add subtract export", + description: "Add subtract(a, b) and update tests.", + dependencies: [], + acceptance_criteria: ["bun test passes"], + status: "pending", + retries: 0, + maxRetries: 3, + }, + runId, + "context", + ); + + expect(runTurnCalls).toHaveLength(2); + expect(runTurnCalls[0]?.collaborationMode).toBeUndefined(); + expect(runTurnCalls[1]?.collaborationMode).toBeUndefined(); + + const planningPrompt = runTurnCalls[0]?.input?.[0]?.text ?? ""; + expect(planningPrompt).not.toContain("request_user_input"); + + const executionPrompt = runTurnCalls[1]?.input?.[0]?.text ?? 
""; + expect(executionPrompt).not.toContain("execution clarification gate"); + expect(executionPrompt).not.toContain("request_user_input"); + } finally { + await session.disconnect(); + await rm(runsDir, { recursive: true, force: true }); + } + }); + test("does not force review prompts through plan collaboration mode", async () => { type TurnInputItem = { type: "text"; text: string };