Skip to content

Commit 5d00dac

Browse files
committed
feat: add support for Codex
Adds support for running evals using Codex.
1 parent 9416a96 commit 5d00dac

File tree

8 files changed

+76
-3
lines changed

8 files changed

+76
-3
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ You can customize the `web-codegen-scorer eval` script with the following flags:
8383
- Example: `web-codegen-scorer eval --model=gemini-2.5-flash --autorater-model=gemini-2.5-flash --env=<config path>`
8484

8585
- `--runner=<name>`: Specifies the runner to use to execute the eval. Supported runners are
86-
`genkit` (default), `gemini-cli` or `claude-code`.
86+
`genkit` (default), `gemini-cli`, `claude-code` or `codex`.
8787

8888
- `--local`: Runs the script in local mode for the initial code generation request. Instead of
8989
calling the LLM, it will attempt to read the initial code from a corresponding file in the

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,13 @@
9191
},
9292
"optionalDependencies": {
9393
"@anthropic-ai/claude-code": "^2.0.0",
94-
"@google/gemini-cli": "^0.5.0"
94+
"@google/gemini-cli": "^0.5.0",
95+
"@openai/codex": "^0.42.0"
9596
},
9697
"devDependencies": {
9798
"@anthropic-ai/claude-code": "^2.0.0",
9899
"@google/gemini-cli": "^0.5.0",
100+
"@openai/codex": "^0.42.0",
99101
"prettier": "^3.5.3",
100102
"tsx": "^4.20.3"
101103
}

pnpm-lock.yaml

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

report-app/src/app/shared/provider-label.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ const exactMatches: Record<string, string> = {
88
solid: 'frameworks/solid.svg',
99
'gemini-cli': 'gemini.webp',
1010
genkit: 'genkit.png',
11+
codex: 'open-ai.png',
1112
};
1213

1314
@Component({

runner/codegen/claude-code-runner.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ export class ClaudeCodeRunner extends BaseCliAgentRunner implements LlmRunner {
1616
readonly hasBuiltInRepairLoop = true;
1717
protected ignoredFilePatterns = ['**/CLAUDE.md', '**/.claude/**'];
1818
protected binaryName = 'claude';
19+
20+
// Claude only emits output once, at the end of a run, so we raise the inactivity timeout.
1921
protected override inactivityTimeoutMins = 10;
2022
protected override totalRequestTimeoutMins = 10;
2123

runner/codegen/codex-runner.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import {LlmGenerateFilesRequestOptions, LlmRunner} from './llm-runner.js';
2+
import {join} from 'path';
3+
import {mkdirSync} from 'fs';
4+
import {writeFile} from 'fs/promises';
5+
import {BaseCliAgentRunner} from './base-cli-agent-runner.js';
6+
7+
const MODEL_MAPPING: Record<string, string> = {
8+
'openai-o3': 'o3',
9+
'openai-o4-mini': 'o4-mini',
10+
'openai-gpt-5': 'gpt-5-codex',
11+
};
12+
13+
/** Runner that generates code using Codex. */
14+
export class CodexRunner extends BaseCliAgentRunner implements LlmRunner {
15+
readonly id = 'codex';
16+
readonly displayName = 'Codex';
17+
readonly hasBuiltInRepairLoop = true;
18+
protected ignoredFilePatterns = ['**/AGENTS.md', '**/.codex/**'];
19+
protected binaryName = 'codex';
20+
21+
getSupportedModels(): string[] {
22+
return Object.keys(MODEL_MAPPING);
23+
}
24+
25+
protected getCommandLineFlags(options: LlmGenerateFilesRequestOptions): string[] {
26+
return [
27+
'exec',
28+
'--model',
29+
MODEL_MAPPING[options.model],
30+
// Skip all confirmations.
31+
'--dangerously-bypass-approvals-and-sandbox',
32+
'--skip-git-repo-check',
33+
options.context.executablePrompt,
34+
];
35+
}
36+
37+
protected async writeAgentFiles(options: LlmGenerateFilesRequestOptions): Promise<void> {
38+
const {context} = options;
39+
const instructionFilePath = join(context.directory, 'AGENTS.md');
40+
const settingsDir = join(context.directory, '.codex');
41+
42+
mkdirSync(settingsDir);
43+
44+
await Promise.all([
45+
writeFile(join(settingsDir, 'config.toml'), this.getSettingsFile()),
46+
writeFile(instructionFilePath, super.getCommonInstructions(options)),
47+
]);
48+
}
49+
50+
private getSettingsFile(): string {
51+
return ['hide_agent_reasoning = true', ''].join('\n');
52+
}
53+
}

runner/codegen/runner-creation.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@ import {UserFacingError} from '../utils/errors.js';
22
import type {GeminiCliRunner} from './gemini-cli-runner.js';
33
import type {ClaudeCodeRunner} from './claude-code-runner.js';
44
import type {GenkitRunner} from './genkit/genkit-runner.js';
5+
import type {CodexRunner} from './codex-runner.js';
56

67
/** Maps each supported runner name to the runner type it resolves to. */
interface AvailableRunners {
  genkit: GenkitRunner;
  'gemini-cli': GeminiCliRunner;
  'claude-code': ClaudeCodeRunner;
  codex: CodexRunner;
}
1113

1214
/** Names of supported runners. */
@@ -31,6 +33,8 @@ export async function getRunnerByName<T extends RunnerName>(name: T): Promise<Av
3133
return import('./claude-code-runner.js').then(
3234
m => new m.ClaudeCodeRunner() as AvailableRunners[T],
3335
);
36+
case 'codex':
37+
return import('./codex-runner.js').then(m => new m.CodexRunner() as AvailableRunners[T]);
3438
default:
3539
throw new UserFacingError(`Unsupported runner ${name}`);
3640
}

runner/eval-cli.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ function builder(argv: Argv): Argv<Options> {
5757
.option('runner', {
5858
type: 'string',
5959
default: 'genkit' as const,
60-
choices: ['genkit', 'gemini-cli', 'claude-code'] as RunnerName[],
60+
choices: ['genkit', 'gemini-cli', 'claude-code', 'codex'] as RunnerName[],
6161
description: 'Runner to use to execute the eval',
6262
})
6363
.option('local', {

0 commit comments

Comments
 (0)