feat: add support for Codex

crisbeto · crisbeto · commit 5d00dac620fc · 2025-10-02T13:04:56.000+02:00
Adds support for running evals using Codex.
diff --git a/README.md b/README.md
@@ -83,7 +83,7 @@ You can customize the `web-codegen-scorer eval` script with the following flags:
     - Example: `web-codegen-scorer eval --model=gemini-2.5-flash --autorater-model=gemini-2.5-flash --env=<config path>`
 
 - `--runner=<name>`: Specifies the runner to use to execute the eval. Supported runners are
-  `genkit` (default), `gemini-cli` or `claude-code`.
+  `genkit` (default), `gemini-cli`, `claude-code` or `codex`.
 
 - `--local`: Runs the script in local mode for the initial code generation request. Instead of
   calling the LLM, it will attempt to read the initial code from a corresponding file in the
diff --git a/package.json b/package.json
@@ -91,11 +91,13 @@
   },
   "optionalDependencies": {
     "@anthropic-ai/claude-code": "^2.0.0",
-    "@google/gemini-cli": "^0.5.0"
+    "@google/gemini-cli": "^0.5.0",
+    "@openai/codex": "^0.42.0"
   },
   "devDependencies": {
     "@anthropic-ai/claude-code": "^2.0.0",
     "@google/gemini-cli": "^0.5.0",
+    "@openai/codex": "^0.42.0",
     "prettier": "^3.5.3",
     "tsx": "^4.20.3"
   }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/report-app/src/app/shared/provider-label.ts b/report-app/src/app/shared/provider-label.ts
@@ -8,6 +8,7 @@ const exactMatches: Record<string, string> = {
   solid: 'frameworks/solid.svg',
   'gemini-cli': 'gemini.webp',
   genkit: 'genkit.png',
+  codex: 'open-ai.png',
 };
 
 @Component({
diff --git a/runner/codegen/claude-code-runner.ts b/runner/codegen/claude-code-runner.ts
@@ -16,6 +16,8 @@ export class ClaudeCodeRunner extends BaseCliAgentRunner implements LlmRunner {
   readonly hasBuiltInRepairLoop = true;
   protected ignoredFilePatterns = ['**/CLAUDE.md', '**/.claude/**'];
   protected binaryName = 'claude';
+
+  // Claude only outputs once at the end so we bump the inactivity timeout.
   protected override inactivityTimeoutMins = 10;
   protected override totalRequestTimeoutMins = 10;
 
diff --git a/runner/codegen/codex-runner.ts b/runner/codegen/codex-runner.ts
@@ -0,0 +1,62 @@
+import {LlmGenerateFilesRequestOptions, LlmRunner} from './llm-runner.js';
+import {join} from 'path';
+import {mkdirSync} from 'fs';
+import {writeFile} from 'fs/promises';
+import {BaseCliAgentRunner} from './base-cli-agent-runner.js';
+
+/** Maps each supported model name to the value passed after the `--model` flag. */
+const MODEL_MAPPING: Record<string, string> = {
+  'openai-o3': 'o3',
+  'openai-o4-mini': 'o4-mini',
+  'openai-gpt-5': 'gpt-5-codex',
+};
+
+/** Runner that generates code using Codex. */
+export class CodexRunner extends BaseCliAgentRunner implements LlmRunner {
+  readonly id = 'codex';
+  readonly displayName = 'Codex';
+  readonly hasBuiltInRepairLoop = true;
+  protected ignoredFilePatterns = ['**/AGENTS.md', '**/.codex/**'];
+  protected binaryName = 'codex';
+
+  /** Returns the model names that can be requested from this runner. */
+  getSupportedModels(): string[] {
+    return Object.keys(MODEL_MAPPING);
+  }
+
+  /**
+   * Builds the command-line arguments: the `exec` subcommand, the mapped model and the prompt.
+   * NOTE(review): assumes `options.model` is a key of `MODEL_MAPPING` - confirm callers validate.
+   */
+  protected getCommandLineFlags(options: LlmGenerateFilesRequestOptions): string[] {
+    return [
+      'exec',
+      '--model',
+      MODEL_MAPPING[options.model],
+      // Skip all confirmations.
+      '--dangerously-bypass-approvals-and-sandbox',
+      '--skip-git-repo-check',
+      options.context.executablePrompt,
+    ];
+  }
+
+  /** Writes `AGENTS.md` and `.codex/config.toml` into the context directory. */
+  protected async writeAgentFiles(options: LlmGenerateFilesRequestOptions): Promise<void> {
+    const {context} = options;
+    const instructionFilePath = join(context.directory, 'AGENTS.md');
+    const settingsDir = join(context.directory, '.codex');
+
+    // `recursive: true` prevents an EEXIST error when the directory is already present.
+    mkdirSync(settingsDir, {recursive: true});
+
+    await Promise.all([
+      writeFile(join(settingsDir, 'config.toml'), this.getSettingsFile()),
+      writeFile(instructionFilePath, super.getCommonInstructions(options)),
+    ]);
+  }
+
+  /** Returns the contents of the `config.toml` file written for Codex. */
+  private getSettingsFile(): string {
+    return ['hide_agent_reasoning = true', ''].join('\n');
+  }
+}
diff --git a/runner/codegen/runner-creation.ts b/runner/codegen/runner-creation.ts
@@ -2,11 +2,13 @@ import {UserFacingError} from '../utils/errors.js';
 import type {GeminiCliRunner} from './gemini-cli-runner.js';
 import type {ClaudeCodeRunner} from './claude-code-runner.js';
 import type {GenkitRunner} from './genkit/genkit-runner.js';
+import type {CodexRunner} from './codex-runner.js';
 
 interface AvailableRunners {
   genkit: GenkitRunner;
   'gemini-cli': GeminiCliRunner;
   'claude-code': ClaudeCodeRunner;
+  'codex': CodexRunner;
 }
 
 /** Names of supported runners. */
@@ -31,6 +33,8 @@ export async function getRunnerByName<T extends RunnerName>(name: T): Promise<Av
       return import('./claude-code-runner.js').then(
         m => new m.ClaudeCodeRunner() as AvailableRunners[T],
       );
+    case 'codex':
+      return import('./codex-runner.js').then(m => new m.CodexRunner() as AvailableRunners[T]);
     default:
       throw new UserFacingError(`Unsupported runner ${name}`);
   }
diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
@@ -57,7 +57,7 @@ function builder(argv: Argv): Argv<Options> {
       .option('runner', {
         type: 'string',
         default: 'genkit' as const,
-        choices: ['genkit', 'gemini-cli', 'claude-code'] as RunnerName[],
+        choices: ['genkit', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
         description: 'Runner to use to execute the eval',
       })
       .option('local', {