From 30dbe215c55d444f5f6cef285c2c85b74fcbeded Mon Sep 17 00:00:00 2001
From: Yifeng He
Date: Tue, 21 Apr 2026 14:15:30 -0700
Subject: [PATCH 1/2] fix: skip model/API-key validation for oracle agent

The oracle agent runs solution/solve.sh and never calls an LLM, but
resolve_agent_env() was validating API keys for whatever model the CLI
defaulted to (claude-haiku-4-5-20251001). This made
`bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even
though oracle doesn't need it.
---
 src/benchflow/_agent_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py
index 8885b5e..5609e6c 100644
--- a/src/benchflow/_agent_env.py
+++ b/src/benchflow/_agent_env.py
@@ -144,7 +144,7 @@ def resolve_agent_env(
     """Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
     agent_env = dict(agent_env or {})
     auto_inherit_env(agent_env)
-    if model:
+    if model and agent != "oracle":
         inject_vertex_credentials(agent_env, model)
         resolve_provider_env(agent_env, model, agent)
         # Validate required API key for the chosen model

From 360c460a33e4758325611a7a4bdd4b0a599a8eb6 Mon Sep 17 00:00:00 2001
From: Yifeng He
Date: Tue, 21 Apr 2026 14:37:13 -0700
Subject: [PATCH 2/2] fix: don't assign default model to oracle agent

Move the fix from resolve_agent_env to the CLI layer: oracle runs
solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL
at all. Both _run_single and _run_batch now pass model=None for oracle.
Widen JobConfig.model to str | None to support this.
---
 src/benchflow/_agent_env.py | 2 +-
 src/benchflow/cli/eval.py   | 6 ++++--
 src/benchflow/job.py        | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py
index 5609e6c..8885b5e 100644
--- a/src/benchflow/_agent_env.py
+++ b/src/benchflow/_agent_env.py
@@ -144,7 +144,7 @@ def resolve_agent_env(
     """Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
     agent_env = dict(agent_env or {})
     auto_inherit_env(agent_env)
-    if model and agent != "oracle":
+    if model:
         inject_vertex_credentials(agent_env, model)
         resolve_provider_env(agent_env, model, agent)
         # Validate required API key for the chosen model
diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py
index 0b13fd8..a92c0be 100644
--- a/src/benchflow/cli/eval.py
+++ b/src/benchflow/cli/eval.py
@@ -228,11 +228,12 @@ def _run_single(
     from benchflow.sdk import SDK
 
     sdk = SDK()
+    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
     result = asyncio.run(
         sdk.run(
             task_path=task_dir,
             agent=agent,
-            model=model or DEFAULT_MODEL,
+            model=effective_model,
             environment=environment,
             prompts=cast("list[str | None] | None", prompt),
             agent_env=agent_env,
@@ -268,9 +269,10 @@ def _run_batch(
 ) -> None:
     from benchflow.job import Job, JobConfig, RetryConfig
 
+    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
     config = JobConfig(
         agent=agent,
-        model=model or DEFAULT_MODEL,
+        model=effective_model,
         environment=environment,
         concurrency=concurrency,
         retry=RetryConfig(max_retries=max_retries),
diff --git a/src/benchflow/job.py b/src/benchflow/job.py
index 6d8fe9b..95414fe 100644
--- a/src/benchflow/job.py
+++ b/src/benchflow/job.py
@@ -146,7 +146,7 @@ class JobConfig:
     """Configuration for a benchmark job."""
 
     agent: str = DEFAULT_AGENT
-    model: str = DEFAULT_MODEL
+    model: str | None = DEFAULT_MODEL
     environment: str = "docker"
     concurrency: int = 4
     prompts: list[str | None] | None = None