Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/benchflow/_agent_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ def resolve_agent_env(
"""Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
agent_env = dict(agent_env or {})
auto_inherit_env(agent_env)
if model:
# Oracle runs solve.sh and never calls an LLM — model env vars and
# API-key validation are skipped even if a caller forwards a model.
if model and agent != "oracle":
inject_vertex_credentials(agent_env, model)
resolve_provider_env(agent_env, model, agent)
# Validate required API key for the chosen model
Expand Down
22 changes: 12 additions & 10 deletions src/benchflow/cli/eval.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
"""`bf eval` — the benchflow eval-runner command group.

The future-facing entry point for running evaluations. Anthropic-style shape:
resource creation, one command, return the result or a job-id.
NOTE: This module is **not wired into the live CLI**. The active
``bench eval create`` command dispatches to ``cli/main.py:eval_create``.
This file is kept as the future-facing design for the eval sub-command
and must not be imported by ``cli/main.py`` (see
``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``).

Design shape — Anthropic-style resource creation:

bf eval create <task-ref> [flags]
One-shot eval — creates an Agent + Environment + Trajectory under
Expand All @@ -14,9 +19,6 @@

bf eval list Show recent eval runs (reads the jobs/ dir)
bf eval retrieve ID Look up a specific trajectory by trial name

Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run`
stays around as a deprecated alias for one release.
"""

from __future__ import annotations
Expand All @@ -29,7 +31,7 @@
from rich.console import Console
from rich.table import Table

from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model

console = Console()

Expand Down Expand Up @@ -228,12 +230,12 @@ def _run_single(
from benchflow.sdk import SDK

sdk = SDK()
effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
eff_model = _effective_model(agent, model)
result = asyncio.run(
sdk.run(
task_path=task_dir,
agent=agent,
model=effective_model,
model=eff_model,
environment=environment,
prompts=cast("list[str | None] | None", prompt),
agent_env=agent_env,
Expand Down Expand Up @@ -269,10 +271,10 @@ def _run_batch(
) -> None:
from benchflow.job import Job, JobConfig, RetryConfig

effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
eff_model = _effective_model(agent, model)
config = JobConfig(
agent=agent,
model=effective_model,
model=eff_model,
environment=environment,
concurrency=concurrency,
retry=RetryConfig(max_retries=max_retries),
Expand Down
17 changes: 9 additions & 8 deletions src/benchflow/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from rich.console import Console
from rich.table import Table

from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
from benchflow.job import DEFAULT_AGENT, effective_model

console = Console()

Expand Down Expand Up @@ -167,7 +167,7 @@ def job(
jobs_dir=jobs_dir,
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=effective_model(agent, model),
environment=environment,
concurrency=concurrency,
retry=RetryConfig(max_retries=max_retries),
Expand Down Expand Up @@ -343,7 +343,7 @@ def eval(
jobs_dir=jobs_dir,
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=effective_model(agent, model),
environment=environment,
concurrency=concurrency,
skills_dir=effective_skills,
Expand Down Expand Up @@ -754,19 +754,20 @@ def eval_create(
f"({result.score:.1%})[/bold], errors={result.errored}"
)
elif tasks_dir:
eff_model = effective_model(agent, model)
# Smart detection: if tasks_dir has task.toml, it's a single task
if (tasks_dir / "task.toml").exists():
from benchflow.trial import Trial, TrialConfig, Scene

config = TrialConfig(
task_path=tasks_dir,
scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL,
scenes=[Scene.single(agent=agent, model=eff_model,
skills_dir=str(skills_dir) if skills_dir else None)],
environment=environment,
sandbox_user=sandbox_user,
jobs_dir=jobs_dir,
agent=agent,
model=model or DEFAULT_MODEL,
model=eff_model,
skills_dir=str(skills_dir) if skills_dir else None,
)

Expand All @@ -777,7 +778,7 @@ async def _run():
run_result = asyncio.run(_run())
reward = (run_result.rewards or {}).get("reward")
console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}")
console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})")
console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})")
console.print(f"[bold]Reward:[/bold] {reward}")
console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}")
if run_result.error:
Expand All @@ -789,7 +790,7 @@ async def _run():
jobs_dir=jobs_dir,
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=eff_model,
environment=environment,
concurrency=concurrency,
sandbox_user=sandbox_user,
Expand Down Expand Up @@ -915,7 +916,7 @@ def train_create(
jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}",
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=effective_model(agent, model),
environment=environment,
concurrency=concurrency,
),
Expand Down
22 changes: 18 additions & 4 deletions src/benchflow/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float:
DEFAULT_MODEL = "claude-haiku-4-5-20251001"


def effective_model(agent: str, model: str | None) -> str | None:
    """Resolve which model an agent should actually run with.

    The oracle agent executes solve.sh directly and never talks to an LLM,
    so it resolves to no model at all rather than a default — keeping
    persisted configs and result summaries honest (model=null instead of a
    fabricated default). Every other agent falls back to ``DEFAULT_MODEL``
    when the caller did not supply one.
    """
    if agent != "oracle":
        return model or DEFAULT_MODEL
    # Oracle: deliberately no model, even if one was passed in.
    return None


@dataclass
class JobConfig:
"""Configuration for a benchmark job."""

agent: str = DEFAULT_AGENT
model: str | None = DEFAULT_MODEL
model: str | None = None
environment: str = "docker"
concurrency: int = 4
prompts: list[str | None] | None = None
Expand Down Expand Up @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
sandbox_user = raw.get("sandbox_user", "agent")
sandbox_locked_paths = raw.get("sandbox_locked_paths")

agent_name = raw.get("agent", DEFAULT_AGENT)
config = JobConfig(
agent=raw.get("agent", DEFAULT_AGENT),
model=raw.get("model", DEFAULT_MODEL),
agent=agent_name,
model=effective_model(agent_name, raw.get("model")),
environment=raw.get("environment", "docker"),
concurrency=raw.get("concurrency", 4),
prompts=prompts,
Expand All @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
agent_name = agent_cfg.get("name", DEFAULT_AGENT)

# Model — keep provider prefix intact for downstream resolution
model = agent_cfg.get("model_name", "") or DEFAULT_MODEL
model = effective_model(agent_name, agent_cfg.get("model_name") or None)

# Environment
env_cfg = raw.get("environment", {})
Expand Down
10 changes: 5 additions & 5 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Tests for `bf eval` — the new eval-runner CLI.
"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module.

The create command takes a positional task reference and routes to either
a single SDK.run or a batch Job based on whether the reference is a single
task dir or a directory of task dirs. We don't exercise the actual runner
here (that's the parity tests' job); we only verify the resolver.
NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active
``bench eval create`` lives in ``cli/main.py``). These tests cover the
task-reference resolver (``_resolve_task_ref``) which will be used once
the module is promoted to the live entry point.
"""

from __future__ import annotations
Expand Down
Loading
Loading