From 30dbe215c55d444f5f6cef285c2c85b74fcbeded Mon Sep 17 00:00:00 2001
From: Yifeng He <yfhe.prsn@gmail.com>
Date: Tue, 21 Apr 2026 14:15:30 -0700
Subject: [PATCH 1/6] fix: skip model/API-key validation for oracle agent

The oracle agent runs solution/solve.sh and never calls an LLM, but
resolve_agent_env() was validating API keys for whatever model the CLI
defaulted to (claude-haiku-4-5-20251001). This made `bench eval create
-a oracle` fail without ANTHROPIC_API_KEY set, even though oracle
doesn't need it.
---
 src/benchflow/_agent_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py
index 8885b5e..5609e6c 100644
--- a/src/benchflow/_agent_env.py
+++ b/src/benchflow/_agent_env.py
@@ -144,7 +144,7 @@ def resolve_agent_env(
     """Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
     agent_env = dict(agent_env or {})
     auto_inherit_env(agent_env)
-    if model:
+    if model and agent != "oracle":
         inject_vertex_credentials(agent_env, model)
         resolve_provider_env(agent_env, model, agent)
         # Validate required API key for the chosen model

From 360c460a33e4758325611a7a4bdd4b0a599a8eb6 Mon Sep 17 00:00:00 2001
From: Yifeng He <yfhe.prsn@gmail.com>
Date: Tue, 21 Apr 2026 14:37:13 -0700
Subject: [PATCH 2/6] fix: don't assign default model to oracle agent

Move the fix from resolve_agent_env to the CLI layer: oracle runs
solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL
at all. Both _run_single and _run_batch now pass model=None for oracle.
Widen JobConfig.model to str | None to support this.
---
 src/benchflow/_agent_env.py | 2 +-
 src/benchflow/cli/eval.py   | 6 ++++--
 src/benchflow/job.py        | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py
index 5609e6c..8885b5e 100644
--- a/src/benchflow/_agent_env.py
+++ b/src/benchflow/_agent_env.py
@@ -144,7 +144,7 @@ def resolve_agent_env(
     """Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
     agent_env = dict(agent_env or {})
     auto_inherit_env(agent_env)
-    if model and agent != "oracle":
+    if model:
         inject_vertex_credentials(agent_env, model)
         resolve_provider_env(agent_env, model, agent)
         # Validate required API key for the chosen model
diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py
index 0b13fd8..a92c0be 100644
--- a/src/benchflow/cli/eval.py
+++ b/src/benchflow/cli/eval.py
@@ -228,11 +228,12 @@ def _run_single(
     from benchflow.sdk import SDK
 
     sdk = SDK()
+    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
     result = asyncio.run(
         sdk.run(
             task_path=task_dir,
             agent=agent,
-            model=model or DEFAULT_MODEL,
+            model=effective_model,
             environment=environment,
             prompts=cast("list[str | None] | None", prompt),
             agent_env=agent_env,
@@ -268,9 +269,10 @@ def _run_batch(
 ) -> None:
     from benchflow.job import Job, JobConfig, RetryConfig
 
+    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
     config = JobConfig(
         agent=agent,
-        model=model or DEFAULT_MODEL,
+        model=effective_model,
         environment=environment,
         concurrency=concurrency,
         retry=RetryConfig(max_retries=max_retries),
diff --git a/src/benchflow/job.py b/src/benchflow/job.py
index 6d8fe9b..95414fe 100644
--- a/src/benchflow/job.py
+++ b/src/benchflow/job.py
@@ -146,7 +146,7 @@ class JobConfig:
     """Configuration for a benchmark job."""
 
     agent: str = DEFAULT_AGENT
-    model: str = DEFAULT_MODEL
+    model: str | None = DEFAULT_MODEL
     environment: str = "docker"
     concurrency: int = 4
     prompts: list[str | None] | None = None

From 9b57b2afb8bade8912918fe3654e2a608779f373 Mon Sep 17 00:00:00 2001
From: Yifeng He <yfhe.prsn@gmail.com>
Date: Tue, 21 Apr 2026 15:32:20 -0700
Subject: [PATCH 3/6] =?UTF-8?q?fix:=20oracle=20agent=20=E2=80=94=20chokepo?=
 =?UTF-8?q?int=20guard,=20drop=20orphan=20eval=20CLI,=20helper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to
cli/eval.py, but cli/eval.py is orphaned (never imported into the live
CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and
trips ANTHROPIC_API_KEY validation. Three changes:

- Restore the `agent != "oracle"` guard in resolve_agent_env so the
  chokepoint defends against any caller that forwards a model.
- Delete the orphan cli/eval.py and its tests — the live eval_create
  lives in cli/main.py and was the actual code path users hit.
- Add effective_model(agent, model) helper, change JobConfig.model
  default to None, replace seven `model or DEFAULT_MODEL` sites in
  cli/main.py and job.py YAML loaders so oracle gets honest model=None
  end-to-end (in result/summary JSON, prints, and downstream Trial).

Regression test in test_resolve_env_helpers.py pins the chokepoint by
calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key
and no host auth — verified to fail on main with the user-facing
ANTHROPIC_API_KEY error and pass after the fix.
---
 src/benchflow/_agent_env.py       |   4 +-
 src/benchflow/cli/eval.py         | 373 ------------------------------
 src/benchflow/cli/main.py         |  17 +-
 src/benchflow/job.py              |  22 +-
 tests/test_eval_cli.py            | 118 ----------
 tests/test_resolve_env_helpers.py |  42 ++++
 6 files changed, 72 insertions(+), 504 deletions(-)
 delete mode 100644 src/benchflow/cli/eval.py
 delete mode 100644 tests/test_eval_cli.py

diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py
index 8885b5e..72d6498 100644
--- a/src/benchflow/_agent_env.py
+++ b/src/benchflow/_agent_env.py
@@ -144,7 +144,9 @@ def resolve_agent_env(
     """Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
     agent_env = dict(agent_env or {})
     auto_inherit_env(agent_env)
-    if model:
+    # Oracle runs solve.sh and never calls an LLM — model env vars and
+    # API-key validation are skipped even if a caller forwards a model.
+    if model and agent != "oracle":
         inject_vertex_credentials(agent_env, model)
         resolve_provider_env(agent_env, model, agent)
         # Validate required API key for the chosen model
diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py
deleted file mode 100644
index a92c0be..0000000
--- a/src/benchflow/cli/eval.py
+++ /dev/null
@@ -1,373 +0,0 @@
-"""`bf eval` — the benchflow eval-runner command group.
-
-The future-facing entry point for running evaluations. Anthropic-style shape:
-resource creation, one command, return the result or a job-id.
-
-    bf eval create <task-ref> [flags]
-        One-shot eval — creates an Agent + Environment + Trajectory under
-        the hood and runs the task. `task-ref` can be:
-          - a path to a task directory (single task)
-          - a path to a directory of task directories (batch)
-          - a `harbor://<name>[@<version>]` ref (full Harbor dataset)
-          - a `harbor://<name>/<task>` ref (single task from a Harbor dataset)
-          - a `benchflow://<name>[@<version>]` ref (benchflow-owned dataset)
-
-    bf eval list          Show recent eval runs (reads the jobs/ dir)
-    bf eval retrieve ID   Look up a specific trajectory by trial name
-
-Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run`
-stays around as a deprecated alias for one release.
-"""
-
-from __future__ import annotations
-
-import asyncio
-from pathlib import Path
-from typing import Annotated
-
-import typer
-from rich.console import Console
-from rich.table import Table
-
-from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
-
-console = Console()
-
-app = typer.Typer(
-    name="eval",
-    help="Run evaluations. `bf eval create <task>` is the main entry point.",
-    no_args_is_help=True,
-)
-
-
-def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]:
-    """Resolve a positional task reference to a local directory.
-
-    Returns `(path, is_batch)`. `is_batch` is True when the reference
-    points at a directory containing multiple task dirs (each with its
-    own `task.toml`), False when the reference is a single task dir.
-    """
-
-    # Registry prefix: fetch the full dataset and treat as batch.
-    if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"):
-        from benchflow.task_download import ensure_tasks
-
-        # Allow `harbor://<name>/<task>` shorthand for a single task within
-        # a dataset. Split off the trailing segment if it matches a task.
-        prefix, _, tail = task_ref.partition("://")
-        head = tail
-        sub_task: str | None = None
-        if "/" in tail and "@" not in tail.split("/", 1)[1]:
-            dataset, sub_task = tail.split("/", 1)
-            head = dataset
-        dataset_ref = f"{prefix}://{head}"
-        dataset_dir = ensure_tasks(dataset_ref)
-        if sub_task is not None:
-            candidate = dataset_dir / sub_task
-            if not candidate.exists() or not (candidate / "task.toml").exists():
-                console.print(
-                    f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]"
-                )
-                raise typer.Exit(1)
-            return candidate, False
-        return dataset_dir, True
-
-    # Filesystem path: single task if task.toml is present, batch otherwise.
-    path = Path(task_ref).expanduser().resolve()
-    if not path.exists():
-        console.print(f"[red]Task reference not found: {task_ref}[/red]")
-        raise typer.Exit(1)
-    if (path / "task.toml").exists():
-        return path, False
-    if any(
-        child.is_dir() and (child / "task.toml").exists() for child in path.iterdir()
-    ):
-        return path, True
-    console.print(
-        f"[red]{path} is neither a single task (no task.toml) "
-        f"nor a directory of tasks.[/red]"
-    )
-    raise typer.Exit(1)
-
-
-@app.command("create")
-def eval_create(
-    task: Annotated[
-        str,
-        typer.Argument(
-            help=(
-                "Task reference. Path to a task dir, a dir of tasks, or a "
-                "registry ref (harbor://name, benchflow://name)."
-            )
-        ),
-    ],
-    agent: Annotated[
-        str, typer.Option("--agent", "-a", help="Agent name from the registry")
-    ] = DEFAULT_AGENT,
-    model: Annotated[
-        str | None,
-        typer.Option("--model", "-m", help="Model id; agent default if unset"),
-    ] = None,
-    environment: Annotated[
-        str,
-        typer.Option(
-            "--environment",
-            "-e",
-            help="docker | daytona | ... (uses the agent's default if unset)",
-        ),
-    ] = "docker",
-    prompt: Annotated[
-        list[str] | None,
-        typer.Option(
-            "--prompt",
-            "-p",
-            help="Prompt text; repeat for multi-turn. Default: instruction.md",
-        ),
-    ] = None,
-    concurrency: Annotated[
-        int,
-        typer.Option(
-            "--concurrency",
-            "-c",
-            help="Max parallel trials when task is a batch",
-        ),
-    ] = 4,
-    max_retries: Annotated[
-        int,
-        typer.Option(
-            "--max-retries",
-            help="Per-trial retry count on transient errors",
-        ),
-    ] = 1,
-    jobs_dir: Annotated[
-        str,
-        typer.Option("--jobs-dir", "-o", help="Where to write result files"),
-    ] = "jobs",
-    skills_dir: Annotated[
-        Path | None,
-        typer.Option(
-            "--skills-dir",
-            "-s",
-            help="Skills directory to mount into the sandbox",
-        ),
-    ] = None,
-    sandbox_user: Annotated[
-        str | None,
-        typer.Option(
-            "--sandbox-user",
-            help="Non-root sandbox user (default 'agent'; 'none' = root)",
-        ),
-    ] = "agent",
-    agent_env: Annotated[
-        list[str] | None,
-        typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"),
-    ] = None,
-) -> None:
-    """Create and run an eval — one-shot.
-
-    Under the hood:
-      1. Resolves `task` to a local directory (fetching from Harbor if needed).
-      2. If it's a single task: runs `SDK.run()` once and prints the reward.
-      3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate.
-
-    This is the idiomatic way to run evals going forward. `bf run` and
-    `bf job` remain as one-release deprecated aliases that forward here.
-    """
-
-    resolved, is_batch = _resolve_task_ref(task)
-
-    parsed_env: dict[str, str] = {}
-    for entry in agent_env or []:
-        if "=" not in entry:
-            console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]")
-            raise typer.Exit(2)
-        k, v = entry.split("=", 1)
-        parsed_env[k] = v
-
-    if is_batch:
-        _run_batch(
-            tasks_dir=resolved,
-            agent=agent,
-            model=model,
-            environment=environment,
-            concurrency=concurrency,
-            max_retries=max_retries,
-            jobs_dir=jobs_dir,
-            skills_dir=skills_dir,
-            sandbox_user=sandbox_user,
-            agent_env=parsed_env,
-        )
-    else:
-        _run_single(
-            task_dir=resolved,
-            agent=agent,
-            model=model,
-            environment=environment,
-            prompt=prompt,
-            jobs_dir=jobs_dir,
-            skills_dir=skills_dir,
-            sandbox_user=sandbox_user,
-            agent_env=parsed_env,
-        )
-
-
-def _run_single(
-    *,
-    task_dir: Path,
-    agent: str,
-    model: str | None,
-    environment: str,
-    prompt: list[str] | None,
-    jobs_dir: str,
-    skills_dir: Path | None,
-    sandbox_user: str | None,
-    agent_env: dict[str, str],
-) -> None:
-    from typing import cast
-
-    from benchflow.sdk import SDK
-
-    sdk = SDK()
-    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
-    result = asyncio.run(
-        sdk.run(
-            task_path=task_dir,
-            agent=agent,
-            model=effective_model,
-            environment=environment,
-            prompts=cast("list[str | None] | None", prompt),
-            agent_env=agent_env,
-            job_name="eval-create",
-            jobs_dir=jobs_dir,
-            skills_dir=skills_dir,
-            sandbox_user=None if sandbox_user == "none" else sandbox_user,
-        )
-    )
-    reward = getattr(result, "reward", None)
-    err = getattr(result, "error", None) or getattr(result, "verifier_error", None)
-    console.print()
-    if err:
-        console.print(f"[red]failed:[/red] {err}")
-        raise typer.Exit(1)
-    console.print(
-        f"[bold]reward={reward}[/bold]  tools={getattr(result, 'n_tool_calls', 0)}"
-    )
-
-
-def _run_batch(
-    *,
-    tasks_dir: Path,
-    agent: str,
-    model: str | None,
-    environment: str,
-    concurrency: int,
-    max_retries: int,
-    jobs_dir: str,
-    skills_dir: Path | None,
-    sandbox_user: str | None,
-    agent_env: dict[str, str],
-) -> None:
-    from benchflow.job import Job, JobConfig, RetryConfig
-
-    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
-    config = JobConfig(
-        agent=agent,
-        model=effective_model,
-        environment=environment,
-        concurrency=concurrency,
-        retry=RetryConfig(max_retries=max_retries),
-        agent_env=agent_env,
-        sandbox_user=None if sandbox_user == "none" else sandbox_user,
-        skills_dir=str(skills_dir) if skills_dir else None,
-    )
-    job = Job(
-        tasks_dir=tasks_dir,
-        jobs_dir=Path(jobs_dir),
-        config=config,
-    )
-    result = asyncio.run(job.run())
-    console.print()
-    console.print(
-        f"[bold]{result.passed}/{result.total} "
-        f"({result.score:.1%})[/bold] errors={result.errored}"
-    )
-
-
-@app.command("list")
-def eval_list(
-    jobs_dir: Annotated[
-        str,
-        typer.Option("--jobs-dir", "-o", help="Results directory to scan"),
-    ] = "jobs",
-    limit: Annotated[
-        int,
-        typer.Option("--limit", "-n", help="Max rows to show"),
-    ] = 20,
-) -> None:
-    """List recent eval runs by scanning the jobs/ dir."""
-
-    root = Path(jobs_dir)
-    if not root.exists():
-        console.print(f"[yellow]{root} does not exist yet.[/yellow]")
-        return
-    runs: list[tuple[str, int, int]] = []
-    for job_root in root.iterdir():
-        if not job_root.is_dir():
-            continue
-        for stamp in job_root.iterdir():
-            if not stamp.is_dir():
-                continue
-            trials = list(stamp.iterdir())
-            passed = 0
-            total = 0
-            for trial in trials:
-                result = trial / "result.json"
-                if not result.exists():
-                    continue
-                total += 1
-                try:
-                    import json
-
-                    data = json.loads(result.read_text())
-                    if (data.get("rewards") or {}).get("reward") == 1.0:
-                        passed += 1
-                except Exception:
-                    continue
-            if total:
-                runs.append((f"{job_root.name}/{stamp.name}", passed, total))
-    runs.sort(reverse=True)
-    table = Table(title=f"Recent evals in {root}")
-    table.add_column("run", style="cyan")
-    table.add_column("passed", justify="right", style="green")
-    table.add_column("total", justify="right")
-    table.add_column("rate", justify="right", style="yellow")
-    for run, passed, total in runs[:limit]:
-        rate = f"{100 * passed / total:.0f}%" if total else "-"
-        table.add_row(run, str(passed), str(total), rate)
-    console.print(table)
-
-
-@app.command("retrieve")
-def eval_retrieve(
-    trial_name: Annotated[
-        str, typer.Argument(help="Trial dir name, e.g. my-task__abc")
-    ],
-    jobs_dir: Annotated[
-        str,
-        typer.Option("--jobs-dir", "-o"),
-    ] = "jobs",
-) -> None:
-    """Print the result.json for a specific trial."""
-
-    import json
-
-    root = Path(jobs_dir)
-    matches = list(root.rglob(f"{trial_name}/result.json"))
-    if not matches:
-        console.print(f"[red]no trial named {trial_name!r} under {root}[/red]")
-        raise typer.Exit(1)
-    console.print(f"[dim]{matches[0]}[/dim]")
-    data = json.loads(matches[0].read_text())
-    from rich import print_json
-
-    print_json(data=data)
diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py
index 9cf1198..96611bb 100644
--- a/src/benchflow/cli/main.py
+++ b/src/benchflow/cli/main.py
@@ -10,7 +10,7 @@
 from rich.console import Console
 from rich.table import Table
 
-from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
+from benchflow.job import DEFAULT_AGENT, effective_model
 
 console = Console()
 
@@ -167,7 +167,7 @@ def job(
             jobs_dir=jobs_dir,
             config=JobConfig(
                 agent=agent,
-                model=model or DEFAULT_MODEL,
+                model=effective_model(agent, model),
                 environment=environment,
                 concurrency=concurrency,
                 retry=RetryConfig(max_retries=max_retries),
@@ -343,7 +343,7 @@ def eval(
         jobs_dir=jobs_dir,
         config=JobConfig(
             agent=agent,
-            model=model or DEFAULT_MODEL,
+            model=effective_model(agent, model),
             environment=environment,
             concurrency=concurrency,
             skills_dir=effective_skills,
@@ -754,19 +754,20 @@ def eval_create(
             f"({result.score:.1%})[/bold], errors={result.errored}"
         )
     elif tasks_dir:
+        eff_model = effective_model(agent, model)
         # Smart detection: if tasks_dir has task.toml, it's a single task
         if (tasks_dir / "task.toml").exists():
             from benchflow.trial import Trial, TrialConfig, Scene
 
             config = TrialConfig(
                 task_path=tasks_dir,
-                scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL,
+                scenes=[Scene.single(agent=agent, model=eff_model,
                                      skills_dir=str(skills_dir) if skills_dir else None)],
                 environment=environment,
                 sandbox_user=sandbox_user,
                 jobs_dir=jobs_dir,
                 agent=agent,
-                model=model or DEFAULT_MODEL,
+                model=eff_model,
                 skills_dir=str(skills_dir) if skills_dir else None,
             )
 
@@ -777,7 +778,7 @@ async def _run():
             run_result = asyncio.run(_run())
             reward = (run_result.rewards or {}).get("reward")
             console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}")
-            console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})")
+            console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})")
             console.print(f"[bold]Reward:[/bold] {reward}")
             console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}")
             if run_result.error:
@@ -789,7 +790,7 @@ async def _run():
                 jobs_dir=jobs_dir,
                 config=JobConfig(
                     agent=agent,
-                    model=model or DEFAULT_MODEL,
+                    model=eff_model,
                     environment=environment,
                     concurrency=concurrency,
                     sandbox_user=sandbox_user,
@@ -915,7 +916,7 @@ def train_create(
                 jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}",
                 config=JobConfig(
                     agent=agent,
-                    model=model or DEFAULT_MODEL,
+                    model=effective_model(agent, model),
                     environment=environment,
                     concurrency=concurrency,
                 ),
diff --git a/src/benchflow/job.py b/src/benchflow/job.py
index 95414fe..ad99ad4 100644
--- a/src/benchflow/job.py
+++ b/src/benchflow/job.py
@@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float:
 DEFAULT_MODEL = "claude-haiku-4-5-20251001"
 
 
+def effective_model(agent: str, model: str | None) -> str | None:
+    """Resolve the model an agent should run with.
+
+    Oracle runs solve.sh and never calls an LLM, so it never receives a model
+    (the chokepoint in resolve_agent_env defends, but callers should also stop
+    materializing DEFAULT_MODEL into oracle configs to keep the data honest —
+    e.g. result-summary JSON shows model=null instead of a bogus default).
+    """
+    if agent == "oracle":
+        return None
+    return model or DEFAULT_MODEL
+
+
 @dataclass
 class JobConfig:
     """Configuration for a benchmark job."""
 
     agent: str = DEFAULT_AGENT
-    model: str | None = DEFAULT_MODEL
+    model: str | None = None
     environment: str = "docker"
     concurrency: int = 4
     prompts: list[str | None] | None = None
@@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
         sandbox_user = raw.get("sandbox_user", "agent")
         sandbox_locked_paths = raw.get("sandbox_locked_paths")
 
+        agent_name = raw.get("agent", DEFAULT_AGENT)
         config = JobConfig(
-            agent=raw.get("agent", DEFAULT_AGENT),
-            model=raw.get("model", DEFAULT_MODEL),
+            agent=agent_name,
+            model=effective_model(agent_name, raw.get("model")),
             environment=raw.get("environment", "docker"),
             concurrency=raw.get("concurrency", 4),
             prompts=prompts,
@@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
         agent_name = agent_cfg.get("name", DEFAULT_AGENT)
 
         # Model — keep provider prefix intact for downstream resolution
-        model = agent_cfg.get("model_name", "") or DEFAULT_MODEL
+        model = effective_model(agent_name, agent_cfg.get("model_name") or None)
 
         # Environment
         env_cfg = raw.get("environment", {})
diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
deleted file mode 100644
index 5a009e7..0000000
--- a/tests/test_eval_cli.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""Tests for `bf eval` — the new eval-runner CLI.
-
-The create command takes a positional task reference and routes to either
-a single SDK.run or a batch Job based on whether the reference is a single
-task dir or a directory of task dirs. We don't exercise the actual runner
-here (that's the parity tests' job); we only verify the resolver.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-import click.exceptions
-import pytest
-
-from benchflow.cli.eval import _resolve_task_ref
-
-
-def _make_task_dir(parent: Path, name: str) -> Path:
-    d = parent / name
-    d.mkdir()
-    (d / "task.toml").write_text('schema_version = "1.1"\n')
-    (d / "instruction.md").write_text("instruction\n")
-    (d / "environment").mkdir()
-    (d / "tests").mkdir()
-    (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n")
-    return d
-
-
-def test_resolve_single_task_dir(tmp_path: Path) -> None:
-    task = _make_task_dir(tmp_path, "my-task")
-    resolved, is_batch = _resolve_task_ref(str(task))
-    assert resolved == task.resolve()
-    assert is_batch is False
-
-
-def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None:
-    _make_task_dir(tmp_path, "task-a")
-    _make_task_dir(tmp_path, "task-b")
-    resolved, is_batch = _resolve_task_ref(str(tmp_path))
-    assert resolved == tmp_path.resolve()
-    assert is_batch is True
-
-
-def test_resolve_missing_path_exits(tmp_path: Path) -> None:
-    with pytest.raises(click.exceptions.Exit):
-        _resolve_task_ref(str(tmp_path / "does-not-exist"))
-
-
-def test_resolve_harbor_ref_routes_through_ensure_tasks(
-    tmp_path: Path, monkeypatch
-) -> None:
-    """`harbor://<name>` resolves to the cached dataset directory (batch)."""
-
-    fake_dataset = tmp_path / "cached-aime"
-    fake_dataset.mkdir()
-    _make_task_dir(fake_dataset, "aime_60")
-    _make_task_dir(fake_dataset, "aime_61")
-
-    captured: dict[str, str] = {}
-
-    def fake_ensure_tasks(ref: str) -> Path:
-        captured["ref"] = ref
-        return fake_dataset
-
-    from benchflow import task_download
-
-    monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks)
-    resolved, is_batch = _resolve_task_ref("harbor://aime@1.0")
-    assert captured["ref"] == "harbor://aime@1.0"
-    assert resolved == fake_dataset
-    assert is_batch is True
-
-
-def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None:
-    """`harbor://<name>/<task>` resolves to a single task from the dataset."""
-
-    fake_dataset = tmp_path / "cached-aime"
-    fake_dataset.mkdir()
-    _make_task_dir(fake_dataset, "aime_60")
-    _make_task_dir(fake_dataset, "aime_61")
-
-    def fake_ensure_tasks(ref: str) -> Path:
-        return fake_dataset
-
-    from benchflow import task_download
-
-    monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks)
-    resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60")
-    assert resolved == fake_dataset / "aime_60"
-    assert is_batch is False
-
-
-def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None:
-    """`benchflow://<name>` works identically to `harbor://`."""
-
-    fake_dataset = tmp_path / "cached-benchjack"
-    fake_dataset.mkdir()
-    _make_task_dir(fake_dataset, "exploit-1")
-
-    def fake_ensure_tasks(ref: str) -> Path:
-        assert ref == "benchflow://benchjack"
-        return fake_dataset
-
-    from benchflow import task_download
-
-    monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks)
-    resolved, is_batch = _resolve_task_ref("benchflow://benchjack")
-    assert resolved == fake_dataset
-    assert is_batch is True
-
-
-def test_resolve_non_task_dir_exits(tmp_path: Path) -> None:
-    """A directory with no task.toml children errors cleanly."""
-
-    (tmp_path / "not-a-task").mkdir()
-    with pytest.raises(click.exceptions.Exit):
-        _resolve_task_ref(str(tmp_path))
diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py
index f699334..d831fe7 100644
--- a/tests/test_resolve_env_helpers.py
+++ b/tests/test_resolve_env_helpers.py
@@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path):
         self._patch_expanduser(monkeypatch, tmp_path)
         result = self._resolve(agent="openclaw", agent_env={})
         assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result
+
+
+class TestResolveAgentEnvOracle:
+    """Oracle runs solve.sh and never calls an LLM — must skip model-related env.
+
+    Regression for the PR #173 follow-up: commit 360c460 removed the
+    `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers
+    would pass model=None for oracle. But cli/main.py:eval_create (the live
+    `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle
+    reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY
+    validation — breaking offline oracle runs that have no API key set.
+    """
+
+    def _patch_expanduser(self, monkeypatch, tmp_path):
+        orig = Path.expanduser
+
+        def fake(self):
+            s = str(self)
+            if s.startswith("~"):
+                return tmp_path / s[2:]
+            return orig(self)
+
+        monkeypatch.setattr(Path, "expanduser", fake)
+
+    def test_oracle_with_default_model_does_not_validate_api_key(
+        self, monkeypatch, tmp_path
+    ):
+        """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise."""
+        for k in (
+            "ANTHROPIC_API_KEY",
+            "OPENAI_API_KEY",
+            "GOOGLE_API_KEY",
+            "GEMINI_API_KEY",
+        ):
+            monkeypatch.delenv(k, raising=False)
+        self._patch_expanduser(monkeypatch, tmp_path)
+
+        result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {})
+
+        # Provider env never resolved — oracle never calls an LLM.
+        assert "BENCHFLOW_PROVIDER_MODEL" not in result
+        assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result

From 796f5896ec6167c84afbc1bcdebb54e84f6a6658 Mon Sep 17 00:00:00 2001
From: Yifeng He <yfhe.prsn@gmail.com>
Date: Tue, 21 Apr 2026 15:45:53 -0700
Subject: [PATCH 4/6] test: regression suite pinning oracle chokepoint + orphan
 removal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer
of the prior fix at the right altitude:

- TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no
  src/ file references benchflow.cli.eval, guarding against a future
  re-introduction that could swallow the next bug fix the same way.
- TestEvalCreateRouting — `bench eval create` callback lives in
  cli/main.py:eval_create. Pins the architectural fact PR #173 missed.
- TestEffectiveModel — unit tests for the helper: oracle drops model,
  non-oracle falls back to DEFAULT_MODEL, empty string treated as unset.
- TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None
  for both native and Harbor formats; non-oracle backwards-compat
  preserved.
- TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle")
  with no API key in env does not raise. Mocks Trial.create and resets
  the asyncio loop after to avoid polluting pre-existing tests that use
  the deprecated asyncio.get_event_loop() pattern.

Verified to fail on main in the right shape: 9 of 14 fail (each pinning
a deleted/added behavior), 5 pass (asserting structural facts already
true). The CLI test fails on main with the user-reported error
"ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…".
---
 tests/test_oracle_chokepoint.py | 260 ++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 tests/test_oracle_chokepoint.py

diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py
new file mode 100644
index 0000000..1a5043a
--- /dev/null
+++ b/tests/test_oracle_chokepoint.py
@@ -0,0 +1,260 @@
+"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint.
+
+Pins the fix from this branch (post-PR #173 follow-up):
+
+  Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the
+            chokepoint defends against any caller that forwards a model.
+  Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix
+            was unreachable because nothing wired it into the live CLI.
+  Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so
+            oracle gets honest model=None end-to-end.
+
+The classes below pin each layer at the right altitude:
+- TestOrphanRemoval       — proves cli/eval.py is gone and stays gone.
+- TestEvalCreateRouting   — proves `bench eval create` dispatches to
+                            cli/main.py (the file that PR #173 missed).
+- TestEffectiveModel      — unit tests for the helper.
+- TestOracleYamlLoaders   — Job.from_yaml(oracle config) → model is None.
+- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle`
+                            and assert no API-key validation error.
+"""
+
+from __future__ import annotations
+
+import importlib
+from pathlib import Path
+from unittest.mock import AsyncMock, patch
+
+import pytest
+from typer.testing import CliRunner
+
+
+class TestOrphanRemoval:
+    """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI).
+
+    Deletion was safe because nothing wired it up. These tests make sure
+    the orphan stays gone — re-introducing it as a parallel `eval_create`
+    is what caused PR #173 to land its fix in dead code.
+    """
+
+    def test_cli_eval_module_is_gone(self):
+        with pytest.raises(ModuleNotFoundError):
+            importlib.import_module("benchflow.cli.eval")
+
+    def test_no_source_file_imports_the_orphan(self):
+        """Catch any future import of `benchflow.cli.eval` from src/."""
+        src = Path(__file__).resolve().parent.parent / "src" / "benchflow"
+        offenders = [
+            p
+            for p in src.rglob("*.py")
+            if "benchflow.cli.eval" in p.read_text()
+        ]
+        assert not offenders, f"Orphan import re-introduced in: {offenders}"
+
+
+class TestEvalCreateRouting:
+    """`bench eval create` must dispatch to cli/main.py:eval_create.
+
+    The pyproject.toml `bench`/`benchflow` scripts both resolve to
+    `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in
+    `benchflow.cli.eval` that no entry point ever loaded — these tests
+    pin the routing so that mistake can't happen again.
+    """
+
+    def test_entry_point_app_is_cli_main(self):
+        from benchflow.cli.main import app
+
+        assert app.info.name == "benchflow"
+
+    def test_eval_create_callback_lives_in_cli_main(self):
+        from benchflow.cli.main import eval_app
+
+        create_cmds = [c for c in eval_app.registered_commands if c.name == "create"]
+        assert len(create_cmds) == 1
+        assert create_cmds[0].callback.__module__ == "benchflow.cli.main"
+
+    def test_bench_eval_create_help_resolves(self):
+        """Smoke test: `bench eval create --help` reaches a real callback."""
+        from benchflow.cli.main import app
+
+        result = CliRunner().invoke(app, ["eval", "create", "--help"])
+        assert result.exit_code == 0
+        assert "tasks-dir" in result.stdout or "task" in result.stdout.lower()
+
+
+class TestEffectiveModel:
+    """The helper introduced in Layer 3 — single source of truth for the rule
+    "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL"."""
+
+    def test_oracle_with_no_model_returns_none(self):
+        from benchflow.job import effective_model
+
+        assert effective_model("oracle", None) is None
+
+    def test_oracle_ignores_explicit_model(self):
+        """Even if a caller forwards a model for oracle, the helper drops it."""
+        from benchflow.job import effective_model
+
+        assert effective_model("oracle", "claude-haiku-4-5-20251001") is None
+
+    def test_non_oracle_with_no_model_returns_default(self):
+        from benchflow.job import DEFAULT_MODEL, effective_model
+
+        assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL
+
+    def test_non_oracle_explicit_model_passes_through(self):
+        from benchflow.job import effective_model
+
+        assert effective_model("codex-acp", "gpt-5") == "gpt-5"
+
+    def test_non_oracle_empty_model_falls_back_to_default(self):
+        """Empty string == "no model" — Harbor YAML can produce this shape."""
+        from benchflow.job import DEFAULT_MODEL, effective_model
+
+        assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL
+
+
+class TestOracleYamlLoaders:
+    """YAML configs for oracle must produce JobConfig.model is None.
+
+    Both loader paths (_from_native_yaml, _from_harbor_yaml) previously
+    coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3
+    routes them through effective_model() so oracle drops the default.
+    """
+
+    def _make_task(self, tmp_path: Path) -> Path:
+        tasks = tmp_path / "tasks" / "task-a"
+        tasks.mkdir(parents=True)
+        (tasks / "task.toml").write_text('schema_version = "1.1"\n')
+        return tmp_path / "tasks"
+
+    def test_native_yaml_oracle_no_model(self, tmp_path: Path):
+        from benchflow.job import Job
+
+        self._make_task(tmp_path)
+        config = tmp_path / "config.yaml"
+        config.write_text("tasks_dir: tasks\nagent: oracle\n")
+        job = Job.from_yaml(config)
+        assert job._config.agent == "oracle"
+        assert job._config.model is None
+
+    def test_harbor_yaml_oracle_no_model(self, tmp_path: Path):
+        from benchflow.job import Job
+
+        self._make_task(tmp_path)
+        config = tmp_path / "config.yaml"
+        config.write_text(
+            "agents:\n"
+            "  - name: oracle\n"
+            "datasets:\n"
+            "  - path: tasks\n"
+        )
+        job = Job.from_yaml(config)
+        assert job._config.agent == "oracle"
+        assert job._config.model is None
+
+    def test_native_yaml_non_oracle_keeps_default_when_omitted(
+        self, tmp_path: Path
+    ):
+        """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL."""
+        from benchflow.job import DEFAULT_MODEL, Job
+
+        self._make_task(tmp_path)
+        config = tmp_path / "config.yaml"
+        config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n")
+        job = Job.from_yaml(config)
+        assert job._config.model == DEFAULT_MODEL
+
+
+class TestEvalCreateOracleCLI:
+    """End-to-end: `bench eval create -a oracle` must not trip API key validation.
+
+    This is the user-visible bug the chokepoint test guards against at the
+    unit level. Here we call the live handler (cli/main.py:eval_create)
+    directly — invoking via Typer's CliRunner triggers asyncio.run()
+    internally, which leaves no current event loop and breaks pre-existing
+    tests in the suite that use the deprecated asyncio.get_event_loop()
+    pattern. Calling the function directly still exercises the full
+    CLI → effective_model → TrialConfig → resolve_agent_env path the bug
+    originally lived in.
+    """
+
+    def _make_task(self, tmp_path: Path) -> Path:
+        task = tmp_path / "task"
+        task.mkdir()
+        (task / "task.toml").write_text('schema_version = "1.1"\n')
+        (task / "instruction.md").write_text("solve\n")
+        return task
+
+    def _strip_api_keys(self, monkeypatch):
+        for k in (
+            "ANTHROPIC_API_KEY",
+            "ANTHROPIC_AUTH_TOKEN",
+            "CLAUDE_CODE_OAUTH_TOKEN",
+            "OPENAI_API_KEY",
+            "GOOGLE_API_KEY",
+            "GEMINI_API_KEY",
+            "LLM_API_KEY",
+        ):
+            monkeypatch.delenv(k, raising=False)
+
+    def test_oracle_single_task_no_api_key_no_error(
+        self, tmp_path: Path, monkeypatch
+    ):
+        """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError."""
+        import asyncio
+
+        from benchflow.cli.main import eval_create
+        from benchflow.models import RunResult
+
+        # The CLI handler internally calls asyncio.run(), which leaves no
+        # current event loop. Pre-existing tests in the suite use the
+        # deprecated asyncio.get_event_loop() and break in that state, so
+        # restore a fresh loop after the test (teardown via finally below).
+
+        self._strip_api_keys(monkeypatch)
+        task = self._make_task(tmp_path)
+        captured: dict = {}
+
+        async def fake_create(config):
+            captured["config"] = config
+            # Exercise the real chokepoint with the config the CLI built —
+            # that's the specific call site the bug manifested at.
+            from benchflow._agent_env import resolve_agent_env
+
+            captured["agent_env"] = resolve_agent_env(
+                config.primary_agent, config.primary_model, config.agent_env
+            )
+            trial = type("FakeTrial", (), {})()
+            trial.run = AsyncMock(
+                return_value=RunResult(
+                    task_name="task",
+                    agent_name="oracle",
+                    rewards={"reward": 1.0},
+                    n_tool_calls=0,
+                )
+            )
+            return trial
+
+        try:
+            with patch("benchflow.trial.Trial.create", new=fake_create):
+                eval_create(
+                    config_file=None,
+                    tasks_dir=task,
+                    agent="oracle",
+                    model=None,
+                    environment="docker",
+                    concurrency=1,
+                    jobs_dir=str(tmp_path / "jobs"),
+                    sandbox_user="agent",
+                    skills_dir=None,
+                )
+        finally:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
+        cfg = captured["config"]
+        # Layer 3: oracle never receives a model, even when CLI defaults exist.
+        assert cfg.primary_agent == "oracle"
+        assert cfg.primary_model is None
+        # Layer 1: chokepoint did not inject provider env or raise.
+        assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"]

From 88bd026e13e1c85b8e8c5040da31c3c2feab742b Mon Sep 17 00:00:00 2001
From: Yifeng He <yfhe.prsn@gmail.com>
Date: Tue, 21 Apr 2026 15:53:15 -0700
Subject: [PATCH 5/6] fix: restore cli/eval.py and test_eval_cli.py, apply
 oracle guard

The previous commit deleted cli/eval.py and its tests as orphans, but
they are intentionally kept. Restore both from main, update eval.py to
use the effective_model() helper for the oracle chokepoint fix, and
replace the "module is gone" regression test with a guard that cli/main.py
does not import cli/eval (the actual invariant).
---
 src/benchflow/cli/eval.py       | 373 ++++++++++++++++++++++++++++++++
 tests/test_eval_cli.py          | 118 ++++++++++
 tests/test_oracle_chokepoint.py |  33 ++-
 3 files changed, 506 insertions(+), 18 deletions(-)
 create mode 100644 src/benchflow/cli/eval.py
 create mode 100644 tests/test_eval_cli.py

diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py
new file mode 100644
index 0000000..ea7af43
--- /dev/null
+++ b/src/benchflow/cli/eval.py
@@ -0,0 +1,373 @@
+"""`bf eval` — the benchflow eval-runner command group.
+
+The future-facing entry point for running evaluations. Anthropic-style shape:
+resource creation, one command, return the result or a job-id.
+
+    bf eval create <task-ref> [flags]
+        One-shot eval — creates an Agent + Environment + Trajectory under
+        the hood and runs the task. `task-ref` can be:
+          - a path to a task directory (single task)
+          - a path to a directory of task directories (batch)
+          - a `harbor://<name>[@<version>]` ref (full Harbor dataset)
+          - a `harbor://<name>/<task>` ref (single task from a Harbor dataset)
+          - a `benchflow://<name>[@<version>]` ref (benchflow-owned dataset)
+
+    bf eval list          Show recent eval runs (reads the jobs/ dir)
+    bf eval retrieve ID   Look up a specific trajectory by trial name
+
+Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run`
+stays around as a deprecated alias for one release.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+from typing import Annotated
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model
+
+console = Console()
+
+app = typer.Typer(
+    name="eval",
+    help="Run evaluations. `bf eval create <task>` is the main entry point.",
+    no_args_is_help=True,
+)
+
+
+def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]:
+    """Resolve a positional task reference to a local directory.
+
+    Returns `(path, is_batch)`. `is_batch` is True when the reference
+    points at a directory containing multiple task dirs (each with its
+    own `task.toml`), False when the reference is a single task dir.
+    """
+
+    # Registry prefix: fetch the full dataset and treat as batch.
+    if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"):
+        from benchflow.task_download import ensure_tasks
+
+        # Allow `harbor://<name>/<task>` shorthand for a single task within
+        # a dataset. Split off the trailing segment if it matches a task.
+        prefix, _, tail = task_ref.partition("://")
+        head = tail
+        sub_task: str | None = None
+        if "/" in tail and "@" not in tail.split("/", 1)[1]:
+            dataset, sub_task = tail.split("/", 1)
+            head = dataset
+        dataset_ref = f"{prefix}://{head}"
+        dataset_dir = ensure_tasks(dataset_ref)
+        if sub_task is not None:
+            candidate = dataset_dir / sub_task
+            if not candidate.exists() or not (candidate / "task.toml").exists():
+                console.print(
+                    f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]"
+                )
+                raise typer.Exit(1)
+            return candidate, False
+        return dataset_dir, True
+
+    # Filesystem path: single task if task.toml is present, batch otherwise.
+    path = Path(task_ref).expanduser().resolve()
+    if not path.exists():
+        console.print(f"[red]Task reference not found: {task_ref}[/red]")
+        raise typer.Exit(1)
+    if (path / "task.toml").exists():
+        return path, False
+    if any(
+        child.is_dir() and (child / "task.toml").exists() for child in path.iterdir()
+    ):
+        return path, True
+    console.print(
+        f"[red]{path} is neither a single task (no task.toml) "
+        f"nor a directory of tasks.[/red]"
+    )
+    raise typer.Exit(1)
+
+
+@app.command("create")
+def eval_create(
+    task: Annotated[
+        str,
+        typer.Argument(
+            help=(
+                "Task reference. Path to a task dir, a dir of tasks, or a "
+                "registry ref (harbor://name, benchflow://name)."
+            )
+        ),
+    ],
+    agent: Annotated[
+        str, typer.Option("--agent", "-a", help="Agent name from the registry")
+    ] = DEFAULT_AGENT,
+    model: Annotated[
+        str | None,
+        typer.Option("--model", "-m", help="Model id; agent default if unset"),
+    ] = None,
+    environment: Annotated[
+        str,
+        typer.Option(
+            "--environment",
+            "-e",
+            help="docker | daytona | ... (uses the agent's default if unset)",
+        ),
+    ] = "docker",
+    prompt: Annotated[
+        list[str] | None,
+        typer.Option(
+            "--prompt",
+            "-p",
+            help="Prompt text; repeat for multi-turn. Default: instruction.md",
+        ),
+    ] = None,
+    concurrency: Annotated[
+        int,
+        typer.Option(
+            "--concurrency",
+            "-c",
+            help="Max parallel trials when task is a batch",
+        ),
+    ] = 4,
+    max_retries: Annotated[
+        int,
+        typer.Option(
+            "--max-retries",
+            help="Per-trial retry count on transient errors",
+        ),
+    ] = 1,
+    jobs_dir: Annotated[
+        str,
+        typer.Option("--jobs-dir", "-o", help="Where to write result files"),
+    ] = "jobs",
+    skills_dir: Annotated[
+        Path | None,
+        typer.Option(
+            "--skills-dir",
+            "-s",
+            help="Skills directory to mount into the sandbox",
+        ),
+    ] = None,
+    sandbox_user: Annotated[
+        str | None,
+        typer.Option(
+            "--sandbox-user",
+            help="Non-root sandbox user (default 'agent'; 'none' = root)",
+        ),
+    ] = "agent",
+    agent_env: Annotated[
+        list[str] | None,
+        typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"),
+    ] = None,
+) -> None:
+    """Create and run an eval — one-shot.
+
+    Under the hood:
+      1. Resolves `task` to a local directory (fetching from Harbor if needed).
+      2. If it's a single task: runs `SDK.run()` once and prints the reward.
+      3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate.
+
+    This is the idiomatic way to run evals going forward. `bf run` and
+    `bf job` remain as one-release deprecated aliases that forward here.
+    """
+
+    resolved, is_batch = _resolve_task_ref(task)
+
+    parsed_env: dict[str, str] = {}
+    for entry in agent_env or []:
+        if "=" not in entry:
+            console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]")
+            raise typer.Exit(2)
+        k, v = entry.split("=", 1)
+        parsed_env[k] = v
+
+    if is_batch:
+        _run_batch(
+            tasks_dir=resolved,
+            agent=agent,
+            model=model,
+            environment=environment,
+            concurrency=concurrency,
+            max_retries=max_retries,
+            jobs_dir=jobs_dir,
+            skills_dir=skills_dir,
+            sandbox_user=sandbox_user,
+            agent_env=parsed_env,
+        )
+    else:
+        _run_single(
+            task_dir=resolved,
+            agent=agent,
+            model=model,
+            environment=environment,
+            prompt=prompt,
+            jobs_dir=jobs_dir,
+            skills_dir=skills_dir,
+            sandbox_user=sandbox_user,
+            agent_env=parsed_env,
+        )
+
+
+def _run_single(
+    *,
+    task_dir: Path,
+    agent: str,
+    model: str | None,
+    environment: str,
+    prompt: list[str] | None,
+    jobs_dir: str,
+    skills_dir: Path | None,
+    sandbox_user: str | None,
+    agent_env: dict[str, str],
+) -> None:
+    from typing import cast
+
+    from benchflow.sdk import SDK
+
+    sdk = SDK()
+    eff_model = _effective_model(agent, model)
+    result = asyncio.run(
+        sdk.run(
+            task_path=task_dir,
+            agent=agent,
+            model=eff_model,
+            environment=environment,
+            prompts=cast("list[str | None] | None", prompt),
+            agent_env=agent_env,
+            job_name="eval-create",
+            jobs_dir=jobs_dir,
+            skills_dir=skills_dir,
+            sandbox_user=None if sandbox_user == "none" else sandbox_user,
+        )
+    )
+    reward = getattr(result, "reward", None)
+    err = getattr(result, "error", None) or getattr(result, "verifier_error", None)
+    console.print()
+    if err:
+        console.print(f"[red]failed:[/red] {err}")
+        raise typer.Exit(1)
+    console.print(
+        f"[bold]reward={reward}[/bold]  tools={getattr(result, 'n_tool_calls', 0)}"
+    )
+
+
+def _run_batch(
+    *,
+    tasks_dir: Path,
+    agent: str,
+    model: str | None,
+    environment: str,
+    concurrency: int,
+    max_retries: int,
+    jobs_dir: str,
+    skills_dir: Path | None,
+    sandbox_user: str | None,
+    agent_env: dict[str, str],
+) -> None:
+    from benchflow.job import Job, JobConfig, RetryConfig
+
+    eff_model = _effective_model(agent, model)
+    config = JobConfig(
+        agent=agent,
+        model=eff_model,
+        environment=environment,
+        concurrency=concurrency,
+        retry=RetryConfig(max_retries=max_retries),
+        agent_env=agent_env,
+        sandbox_user=None if sandbox_user == "none" else sandbox_user,
+        skills_dir=str(skills_dir) if skills_dir else None,
+    )
+    job = Job(
+        tasks_dir=tasks_dir,
+        jobs_dir=Path(jobs_dir),
+        config=config,
+    )
+    result = asyncio.run(job.run())
+    console.print()
+    console.print(
+        f"[bold]{result.passed}/{result.total} "
+        f"({result.score:.1%})[/bold] errors={result.errored}"
+    )
+
+
+@app.command("list")
+def eval_list(
+    jobs_dir: Annotated[
+        str,
+        typer.Option("--jobs-dir", "-o", help="Results directory to scan"),
+    ] = "jobs",
+    limit: Annotated[
+        int,
+        typer.Option("--limit", "-n", help="Max rows to show"),
+    ] = 20,
+) -> None:
+    """List recent eval runs by scanning the jobs/ dir."""
+
+    root = Path(jobs_dir)
+    if not root.exists():
+        console.print(f"[yellow]{root} does not exist yet.[/yellow]")
+        return
+    runs: list[tuple[str, int, int]] = []
+    for job_root in root.iterdir():
+        if not job_root.is_dir():
+            continue
+        for stamp in job_root.iterdir():
+            if not stamp.is_dir():
+                continue
+            trials = list(stamp.iterdir())
+            passed = 0
+            total = 0
+            for trial in trials:
+                result = trial / "result.json"
+                if not result.exists():
+                    continue
+                total += 1
+                try:
+                    import json
+
+                    data = json.loads(result.read_text())
+                    if (data.get("rewards") or {}).get("reward") == 1.0:
+                        passed += 1
+                except Exception:
+                    continue
+            if total:
+                runs.append((f"{job_root.name}/{stamp.name}", passed, total))
+    runs.sort(reverse=True)
+    table = Table(title=f"Recent evals in {root}")
+    table.add_column("run", style="cyan")
+    table.add_column("passed", justify="right", style="green")
+    table.add_column("total", justify="right")
+    table.add_column("rate", justify="right", style="yellow")
+    for run, passed, total in runs[:limit]:
+        rate = f"{100 * passed / total:.0f}%" if total else "-"
+        table.add_row(run, str(passed), str(total), rate)
+    console.print(table)
+
+
+@app.command("retrieve")
+def eval_retrieve(
+    trial_name: Annotated[
+        str, typer.Argument(help="Trial dir name, e.g. my-task__abc")
+    ],
+    jobs_dir: Annotated[
+        str,
+        typer.Option("--jobs-dir", "-o"),
+    ] = "jobs",
+) -> None:
+    """Print the result.json for a specific trial."""
+
+    import json
+
+    root = Path(jobs_dir)
+    matches = list(root.rglob(f"{trial_name}/result.json"))
+    if not matches:
+        console.print(f"[red]no trial named {trial_name!r} under {root}[/red]")
+        raise typer.Exit(1)
+    console.print(f"[dim]{matches[0]}[/dim]")
+    data = json.loads(matches[0].read_text())
+    from rich import print_json
+
+    print_json(data=data)
diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
new file mode 100644
index 0000000..5a009e7
--- /dev/null
+++ b/tests/test_eval_cli.py
@@ -0,0 +1,118 @@
+"""Tests for `bf eval` — the new eval-runner CLI.
+
+The create command takes a positional task reference and routes to either
+a single SDK.run or a batch Job based on whether the reference is a single
+task dir or a directory of task dirs. We don't exercise the actual runner
+here (that's the parity tests' job); we only verify the resolver.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import click.exceptions
+import pytest
+
+from benchflow.cli.eval import _resolve_task_ref
+
+
+def _make_task_dir(parent: Path, name: str) -> Path:
+    d = parent / name
+    d.mkdir()
+    (d / "task.toml").write_text('schema_version = "1.1"\n')
+    (d / "instruction.md").write_text("instruction\n")
+    (d / "environment").mkdir()
+    (d / "tests").mkdir()
+    (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n")
+    return d
+
+
+def test_resolve_single_task_dir(tmp_path: Path) -> None:
+    task = _make_task_dir(tmp_path, "my-task")
+    resolved, is_batch = _resolve_task_ref(str(task))
+    assert resolved == task.resolve()
+    assert is_batch is False
+
+
+def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None:
+    _make_task_dir(tmp_path, "task-a")
+    _make_task_dir(tmp_path, "task-b")
+    resolved, is_batch = _resolve_task_ref(str(tmp_path))
+    assert resolved == tmp_path.resolve()
+    assert is_batch is True
+
+
+def test_resolve_missing_path_exits(tmp_path: Path) -> None:
+    with pytest.raises(click.exceptions.Exit):
+        _resolve_task_ref(str(tmp_path / "does-not-exist"))
+
+
+def test_resolve_harbor_ref_routes_through_ensure_tasks(
+    tmp_path: Path, monkeypatch
+) -> None:
+    """`harbor://<name>` resolves to the cached dataset directory (batch)."""
+
+    fake_dataset = tmp_path / "cached-aime"
+    fake_dataset.mkdir()
+    _make_task_dir(fake_dataset, "aime_60")
+    _make_task_dir(fake_dataset, "aime_61")
+
+    captured: dict[str, str] = {}
+
+    def fake_ensure_tasks(ref: str) -> Path:
+        captured["ref"] = ref
+        return fake_dataset
+
+    from benchflow import task_download
+
+    monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks)
+    resolved, is_batch = _resolve_task_ref("harbor://aime@1.0")
+    assert captured["ref"] == "harbor://aime@1.0"
+    assert resolved == fake_dataset
+    assert is_batch is True
+
+
+def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None:
+    """`harbor://<name>/<task>` resolves to a single task from the dataset."""
+
+    fake_dataset = tmp_path / "cached-aime"
+    fake_dataset.mkdir()
+    _make_task_dir(fake_dataset, "aime_60")
+    _make_task_dir(fake_dataset, "aime_61")
+
+    def fake_ensure_tasks(ref: str) -> Path:
+        return fake_dataset
+
+    from benchflow import task_download
+
+    monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks)
+    resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60")
+    assert resolved == fake_dataset / "aime_60"
+    assert is_batch is False
+
+
+def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None:
+    """`benchflow://<name>` works identically to `harbor://`."""
+
+    fake_dataset = tmp_path / "cached-benchjack"
+    fake_dataset.mkdir()
+    _make_task_dir(fake_dataset, "exploit-1")
+
+    def fake_ensure_tasks(ref: str) -> Path:
+        assert ref == "benchflow://benchjack"
+        return fake_dataset
+
+    from benchflow import task_download
+
+    monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks)
+    resolved, is_batch = _resolve_task_ref("benchflow://benchjack")
+    assert resolved == fake_dataset
+    assert is_batch is True
+
+
+def test_resolve_non_task_dir_exits(tmp_path: Path) -> None:
+    """A directory with no task.toml children errors cleanly."""
+
+    (tmp_path / "not-a-task").mkdir()
+    with pytest.raises(click.exceptions.Exit):
+        _resolve_task_ref(str(tmp_path))
diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py
index 1a5043a..142c063 100644
--- a/tests/test_oracle_chokepoint.py
+++ b/tests/test_oracle_chokepoint.py
@@ -29,27 +29,24 @@
 from typer.testing import CliRunner
 
 
-class TestOrphanRemoval:
-    """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI).
+class TestEvalModuleNotWiredIntoCLI:
+    """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI.
 
-    Deletion was safe because nothing wired it up. These tests make sure
-    the orphan stays gone — re-introducing it as a parallel `eval_create`
-    is what caused PR #173 to land its fix in dead code.
+    The live `bench eval create` dispatches to cli/main.py:eval_create.
+    cli/eval.py is a standalone module (with its own resolver logic and tests)
+    but must not be imported by the CLI entry-point code — doing so is what
+    caused PR #173 to land its fix in dead code.
     """
 
-    def test_cli_eval_module_is_gone(self):
-        with pytest.raises(ModuleNotFoundError):
-            importlib.import_module("benchflow.cli.eval")
-
-    def test_no_source_file_imports_the_orphan(self):
-        """Catch any future import of `benchflow.cli.eval` from src/."""
-        src = Path(__file__).resolve().parent.parent / "src" / "benchflow"
-        offenders = [
-            p
-            for p in src.rglob("*.py")
-            if "benchflow.cli.eval" in p.read_text()
-        ]
-        assert not offenders, f"Orphan import re-introduced in: {offenders}"
+    def test_cli_main_does_not_import_cli_eval(self):
+        """cli/main.py must not import from cli/eval — they are separate."""
+        main_py = (
+            Path(__file__).resolve().parent.parent
+            / "src" / "benchflow" / "cli" / "main.py"
+        )
+        text = main_py.read_text()
+        assert "from benchflow.cli.eval" not in text
+        assert "import benchflow.cli.eval" not in text
 
 
 class TestEvalCreateRouting:

From bc04c59920a6a4608932f39fe9c2172c0b38d6ba Mon Sep 17 00:00:00 2001
From: Yifeng He <yfhe.prsn@gmail.com>
Date: Tue, 21 Apr 2026 15:54:40 -0700
Subject: [PATCH 6/6] docs: clarify cli/eval.py and test_eval_cli.py are not
 wired into live CLI

---
 src/benchflow/cli/eval.py | 12 +++++++-----
 tests/test_eval_cli.py    | 10 +++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py
index ea7af43..58b410f 100644
--- a/src/benchflow/cli/eval.py
+++ b/src/benchflow/cli/eval.py
@@ -1,7 +1,12 @@
 """`bf eval` — the benchflow eval-runner command group.
 
-The future-facing entry point for running evaluations. Anthropic-style shape:
-resource creation, one command, return the result or a job-id.
+NOTE: This module is **not wired into the live CLI**.  The active
+``bench eval create`` command dispatches to ``cli/main.py:eval_create``.
+This file is kept as the future-facing design for the eval sub-command
+and must not be imported by ``cli/main.py`` (see
+``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``).
+
+Design shape — Anthropic-style resource creation:
 
     bf eval create <task-ref> [flags]
         One-shot eval — creates an Agent + Environment + Trajectory under
@@ -14,9 +19,6 @@
 
     bf eval list          Show recent eval runs (reads the jobs/ dir)
     bf eval retrieve ID   Look up a specific trajectory by trial name
-
-Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run`
-stays around as a deprecated alias for one release.
 """
 
 from __future__ import annotations
diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index 5a009e7..dfcb7db 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -1,9 +1,9 @@
-"""Tests for `bf eval` — the new eval-runner CLI.
+"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module.
 
-The create command takes a positional task reference and routes to either
-a single SDK.run or a batch Job based on whether the reference is a single
-task dir or a directory of task dirs. We don't exercise the actual runner
-here (that's the parity tests' job); we only verify the resolver.
+NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active
+``bench eval create`` lives in ``cli/main.py``).  These tests cover the
+task-reference resolver (``_resolve_task_ref``) which will be used once
+the module is promoted to the live entry point.
 """
 
 from __future__ import annotations