From 30dbe215c55d444f5f6cef285c2c85b74fcbeded Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 14:15:30 -0700 Subject: [PATCH 1/6] fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. --- src/benchflow/_agent_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..5609e6c 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,7 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model From 360c460a33e4758325611a7a4bdd4b0a599a8eb6 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 14:37:13 -0700 Subject: [PATCH 2/6] fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. --- src/benchflow/_agent_env.py | 2 +- src/benchflow/cli/eval.py | 6 ++++-- src/benchflow/job.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 5609e6c..8885b5e 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,7 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model and agent != "oracle": + if model: inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 0b13fd8..a92c0be 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -228,11 +228,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -268,9 +269,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) config = JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 6d8fe9b..95414fe 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -146,7 +146,7 @@ class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str = DEFAULT_MODEL + model: str | None = DEFAULT_MODEL environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None From 9b57b2afb8bade8912918fe3654e2a608779f373 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:32:20 -0700 Subject: [PATCH 3/6] =?UTF-8?q?fix:=20oracle=20agent=20=E2=80=94=20chokepo?= =?UTF-8?q?int=20guard,=20drop=20orphan=20eval=20CLI,=20helper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. --- src/benchflow/_agent_env.py | 4 +- src/benchflow/cli/eval.py | 373 ------------------------------ src/benchflow/cli/main.py | 17 +- src/benchflow/job.py | 22 +- tests/test_eval_cli.py | 118 ---------- tests/test_resolve_env_helpers.py | 42 ++++ 6 files changed, 72 insertions(+), 504 deletions(-) delete mode 100644 src/benchflow/cli/eval.py delete mode 100644 tests/test_eval_cli.py diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..72d6498 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,9 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + # Oracle runs solve.sh and never calls an LLM — model env vars and + # API-key validation are skipped even if a caller forwards a model. + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py deleted file mode 100644 index a92c0be..0000000 --- a/src/benchflow/cli/eval.py +++ /dev/null @@ -1,373 +0,0 @@ -"""`bf eval` — the benchflow eval-runner command group. - -The future-facing entry point for running evaluations. Anthropic-style shape: -resource creation, one command, return the result or a job-id. - - bf eval create [flags] - One-shot eval — creates an Agent + Environment + Trajectory under - the hood and runs the task. `task-ref` can be: - - a path to a task directory (single task) - - a path to a directory of task directories (batch) - - a `harbor://[@]` ref (full Harbor dataset) - - a `harbor:///` ref (single task from a Harbor dataset) - - a `benchflow://[@]` ref (benchflow-owned dataset) - - bf eval list Show recent eval runs (reads the jobs/ dir) - bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. -""" - -from __future__ import annotations - -import asyncio -from pathlib import Path -from typing import Annotated - -import typer -from rich.console import Console -from rich.table import Table - -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL - -console = Console() - -app = typer.Typer( - name="eval", - help="Run evaluations. `bf eval create ` is the main entry point.", - no_args_is_help=True, -) - - -def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]: - """Resolve a positional task reference to a local directory. - - Returns `(path, is_batch)`. `is_batch` is True when the reference - points at a directory containing multiple task dirs (each with its - own `task.toml`), False when the reference is a single task dir. - """ - - # Registry prefix: fetch the full dataset and treat as batch. - if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"): - from benchflow.task_download import ensure_tasks - - # Allow `harbor:///` shorthand for a single task within - # a dataset. Split off the trailing segment if it matches a task. - prefix, _, tail = task_ref.partition("://") - head = tail - sub_task: str | None = None - if "/" in tail and "@" not in tail.split("/", 1)[1]: - dataset, sub_task = tail.split("/", 1) - head = dataset - dataset_ref = f"{prefix}://{head}" - dataset_dir = ensure_tasks(dataset_ref) - if sub_task is not None: - candidate = dataset_dir / sub_task - if not candidate.exists() or not (candidate / "task.toml").exists(): - console.print( - f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]" - ) - raise typer.Exit(1) - return candidate, False - return dataset_dir, True - - # Filesystem path: single task if task.toml is present, batch otherwise. - path = Path(task_ref).expanduser().resolve() - if not path.exists(): - console.print(f"[red]Task reference not found: {task_ref}[/red]") - raise typer.Exit(1) - if (path / "task.toml").exists(): - return path, False - if any( - child.is_dir() and (child / "task.toml").exists() for child in path.iterdir() - ): - return path, True - console.print( - f"[red]{path} is neither a single task (no task.toml) " - f"nor a directory of tasks.[/red]" - ) - raise typer.Exit(1) - - -@app.command("create") -def eval_create( - task: Annotated[ - str, - typer.Argument( - help=( - "Task reference. Path to a task dir, a dir of tasks, or a " - "registry ref (harbor://name, benchflow://name)." - ) - ), - ], - agent: Annotated[ - str, typer.Option("--agent", "-a", help="Agent name from the registry") - ] = DEFAULT_AGENT, - model: Annotated[ - str | None, - typer.Option("--model", "-m", help="Model id; agent default if unset"), - ] = None, - environment: Annotated[ - str, - typer.Option( - "--environment", - "-e", - help="docker | daytona | ... (uses the agent's default if unset)", - ), - ] = "docker", - prompt: Annotated[ - list[str] | None, - typer.Option( - "--prompt", - "-p", - help="Prompt text; repeat for multi-turn. Default: instruction.md", - ), - ] = None, - concurrency: Annotated[ - int, - typer.Option( - "--concurrency", - "-c", - help="Max parallel trials when task is a batch", - ), - ] = 4, - max_retries: Annotated[ - int, - typer.Option( - "--max-retries", - help="Per-trial retry count on transient errors", - ), - ] = 1, - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o", help="Where to write result files"), - ] = "jobs", - skills_dir: Annotated[ - Path | None, - typer.Option( - "--skills-dir", - "-s", - help="Skills directory to mount into the sandbox", - ), - ] = None, - sandbox_user: Annotated[ - str | None, - typer.Option( - "--sandbox-user", - help="Non-root sandbox user (default 'agent'; 'none' = root)", - ), - ] = "agent", - agent_env: Annotated[ - list[str] | None, - typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"), - ] = None, -) -> None: - """Create and run an eval — one-shot. - - Under the hood: - 1. Resolves `task` to a local directory (fetching from Harbor if needed). - 2. If it's a single task: runs `SDK.run()` once and prints the reward. - 3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate. - - This is the idiomatic way to run evals going forward. `bf run` and - `bf job` remain as one-release deprecated aliases that forward here. - """ - - resolved, is_batch = _resolve_task_ref(task) - - parsed_env: dict[str, str] = {} - for entry in agent_env or []: - if "=" not in entry: - console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]") - raise typer.Exit(2) - k, v = entry.split("=", 1) - parsed_env[k] = v - - if is_batch: - _run_batch( - tasks_dir=resolved, - agent=agent, - model=model, - environment=environment, - concurrency=concurrency, - max_retries=max_retries, - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=sandbox_user, - agent_env=parsed_env, - ) - else: - _run_single( - task_dir=resolved, - agent=agent, - model=model, - environment=environment, - prompt=prompt, - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=sandbox_user, - agent_env=parsed_env, - ) - - -def _run_single( - *, - task_dir: Path, - agent: str, - model: str | None, - environment: str, - prompt: list[str] | None, - jobs_dir: str, - skills_dir: Path | None, - sandbox_user: str | None, - agent_env: dict[str, str], -) -> None: - from typing import cast - - from benchflow.sdk import SDK - - sdk = SDK() - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) - result = asyncio.run( - sdk.run( - task_path=task_dir, - agent=agent, - model=effective_model, - environment=environment, - prompts=cast("list[str | None] | None", prompt), - agent_env=agent_env, - job_name="eval-create", - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=None if sandbox_user == "none" else sandbox_user, - ) - ) - reward = getattr(result, "reward", None) - err = getattr(result, "error", None) or getattr(result, "verifier_error", None) - console.print() - if err: - console.print(f"[red]failed:[/red] {err}") - raise typer.Exit(1) - console.print( - f"[bold]reward={reward}[/bold] tools={getattr(result, 'n_tool_calls', 0)}" - ) - - -def _run_batch( - *, - tasks_dir: Path, - agent: str, - model: str | None, - environment: str, - concurrency: int, - max_retries: int, - jobs_dir: str, - skills_dir: Path | None, - sandbox_user: str | None, - agent_env: dict[str, str], -) -> None: - from benchflow.job import Job, JobConfig, RetryConfig - - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) - config = JobConfig( - agent=agent, - model=effective_model, - environment=environment, - concurrency=concurrency, - retry=RetryConfig(max_retries=max_retries), - agent_env=agent_env, - sandbox_user=None if sandbox_user == "none" else sandbox_user, - skills_dir=str(skills_dir) if skills_dir else None, - ) - job = Job( - tasks_dir=tasks_dir, - jobs_dir=Path(jobs_dir), - config=config, - ) - result = asyncio.run(job.run()) - console.print() - console.print( - f"[bold]{result.passed}/{result.total} " - f"({result.score:.1%})[/bold] errors={result.errored}" - ) - - -@app.command("list") -def eval_list( - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o", help="Results directory to scan"), - ] = "jobs", - limit: Annotated[ - int, - typer.Option("--limit", "-n", help="Max rows to show"), - ] = 20, -) -> None: - """List recent eval runs by scanning the jobs/ dir.""" - - root = Path(jobs_dir) - if not root.exists(): - console.print(f"[yellow]{root} does not exist yet.[/yellow]") - return - runs: list[tuple[str, int, int]] = [] - for job_root in root.iterdir(): - if not job_root.is_dir(): - continue - for stamp in job_root.iterdir(): - if not stamp.is_dir(): - continue - trials = list(stamp.iterdir()) - passed = 0 - total = 0 - for trial in trials: - result = trial / "result.json" - if not result.exists(): - continue - total += 1 - try: - import json - - data = json.loads(result.read_text()) - if (data.get("rewards") or {}).get("reward") == 1.0: - passed += 1 - except Exception: - continue - if total: - runs.append((f"{job_root.name}/{stamp.name}", passed, total)) - runs.sort(reverse=True) - table = Table(title=f"Recent evals in {root}") - table.add_column("run", style="cyan") - table.add_column("passed", justify="right", style="green") - table.add_column("total", justify="right") - table.add_column("rate", justify="right", style="yellow") - for run, passed, total in runs[:limit]: - rate = f"{100 * passed / total:.0f}%" if total else "-" - table.add_row(run, str(passed), str(total), rate) - console.print(table) - - -@app.command("retrieve") -def eval_retrieve( - trial_name: Annotated[ - str, typer.Argument(help="Trial dir name, e.g. my-task__abc") - ], - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o"), - ] = "jobs", -) -> None: - """Print the result.json for a specific trial.""" - - import json - - root = Path(jobs_dir) - matches = list(root.rglob(f"{trial_name}/result.json")) - if not matches: - console.print(f"[red]no trial named {trial_name!r} under {root}[/red]") - raise typer.Exit(1) - console.print(f"[dim]{matches[0]}[/dim]") - data = json.loads(matches[0].read_text()) - from rich import print_json - - print_json(data=data) diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 9cf1198..96611bb 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -10,7 +10,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model console = Console() @@ -167,7 +167,7 @@ def job( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), @@ -343,7 +343,7 @@ def eval( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, skills_dir=effective_skills, @@ -754,19 +754,20 @@ def eval_create( f"({result.score:.1%})[/bold], errors={result.errored}" ) elif tasks_dir: + eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): from benchflow.trial import Trial, TrialConfig, Scene config = TrialConfig( task_path=tasks_dir, - scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL, + scenes=[Scene.single(agent=agent, model=eff_model, skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, jobs_dir=jobs_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, skills_dir=str(skills_dir) if skills_dir else None, ) @@ -777,7 +778,7 @@ async def _run(): run_result = asyncio.run(_run()) reward = (run_result.rewards or {}).get("reward") console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}") - console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})") + console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})") console.print(f"[bold]Reward:[/bold] {reward}") console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}") if run_result.error: @@ -789,7 +790,7 @@ async def _run(): jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, @@ -915,7 +916,7 @@ def train_create( jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}", config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, ), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 95414fe..ad99ad4 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float: DEFAULT_MODEL = "claude-haiku-4-5-20251001" +def effective_model(agent: str, model: str | None) -> str | None: + """Resolve the model an agent should run with. + + Oracle runs solve.sh and never calls an LLM, so it never receives a model + (the chokepoint in resolve_agent_env defends, but callers should also stop + materializing DEFAULT_MODEL into oracle configs to keep the data honest — + e.g. result-summary JSON shows model=null instead of a bogus default). + """ + if agent == "oracle": + return None + return model or DEFAULT_MODEL + + @dataclass class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str | None = DEFAULT_MODEL + model: str | None = None environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( - agent=raw.get("agent", DEFAULT_AGENT), - model=raw.get("model", DEFAULT_MODEL), + agent=agent_name, + model=effective_model(agent_name, raw.get("model")), environment=raw.get("environment", "docker"), concurrency=raw.get("concurrency", 4), prompts=prompts, @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": agent_name = agent_cfg.get("name", DEFAULT_AGENT) # Model — keep provider prefix intact for downstream resolution - model = agent_cfg.get("model_name", "") or DEFAULT_MODEL + model = effective_model(agent_name, agent_cfg.get("model_name") or None) # Environment env_cfg = raw.get("environment", {}) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py deleted file mode 100644 index 5a009e7..0000000 --- a/tests/test_eval_cli.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Tests for `bf eval` — the new eval-runner CLI. - -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. -""" - -from __future__ import annotations - -from pathlib import Path - -import click.exceptions -import pytest - -from benchflow.cli.eval import _resolve_task_ref - - -def _make_task_dir(parent: Path, name: str) -> Path: - d = parent / name - d.mkdir() - (d / "task.toml").write_text('schema_version = "1.1"\n') - (d / "instruction.md").write_text("instruction\n") - (d / "environment").mkdir() - (d / "tests").mkdir() - (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n") - return d - - -def test_resolve_single_task_dir(tmp_path: Path) -> None: - task = _make_task_dir(tmp_path, "my-task") - resolved, is_batch = _resolve_task_ref(str(task)) - assert resolved == task.resolve() - assert is_batch is False - - -def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None: - _make_task_dir(tmp_path, "task-a") - _make_task_dir(tmp_path, "task-b") - resolved, is_batch = _resolve_task_ref(str(tmp_path)) - assert resolved == tmp_path.resolve() - assert is_batch is True - - -def test_resolve_missing_path_exits(tmp_path: Path) -> None: - with pytest.raises(click.exceptions.Exit): - _resolve_task_ref(str(tmp_path / "does-not-exist")) - - -def test_resolve_harbor_ref_routes_through_ensure_tasks( - tmp_path: Path, monkeypatch -) -> None: - """`harbor://` resolves to the cached dataset directory (batch).""" - - fake_dataset = tmp_path / "cached-aime" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "aime_60") - _make_task_dir(fake_dataset, "aime_61") - - captured: dict[str, str] = {} - - def fake_ensure_tasks(ref: str) -> Path: - captured["ref"] = ref - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("harbor://aime@1.0") - assert captured["ref"] == "harbor://aime@1.0" - assert resolved == fake_dataset - assert is_batch is True - - -def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None: - """`harbor:///` resolves to a single task from the dataset.""" - - fake_dataset = tmp_path / "cached-aime" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "aime_60") - _make_task_dir(fake_dataset, "aime_61") - - def fake_ensure_tasks(ref: str) -> Path: - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60") - assert resolved == fake_dataset / "aime_60" - assert is_batch is False - - -def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None: - """`benchflow://` works identically to `harbor://`.""" - - fake_dataset = tmp_path / "cached-benchjack" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "exploit-1") - - def fake_ensure_tasks(ref: str) -> Path: - assert ref == "benchflow://benchjack" - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("benchflow://benchjack") - assert resolved == fake_dataset - assert is_batch is True - - -def test_resolve_non_task_dir_exits(tmp_path: Path) -> None: - """A directory with no task.toml children errors cleanly.""" - - (tmp_path / "not-a-task").mkdir() - with pytest.raises(click.exceptions.Exit): - _resolve_task_ref(str(tmp_path)) diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py index f699334..d831fe7 100644 --- a/tests/test_resolve_env_helpers.py +++ b/tests/test_resolve_env_helpers.py @@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path): self._patch_expanduser(monkeypatch, tmp_path) result = self._resolve(agent="openclaw", agent_env={}) assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result + + +class TestResolveAgentEnvOracle: + """Oracle runs solve.sh and never calls an LLM — must skip model-related env. + + Regression for the PR #173 follow-up: commit 360c460 removed the + `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers + would pass model=None for oracle. But cli/main.py:eval_create (the live + `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle + reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY + validation — breaking offline oracle runs that have no API key set. + """ + + def _patch_expanduser(self, monkeypatch, tmp_path): + orig = Path.expanduser + + def fake(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig(self) + + monkeypatch.setattr(Path, "expanduser", fake) + + def test_oracle_with_default_model_does_not_validate_api_key( + self, monkeypatch, tmp_path + ): + """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise.""" + for k in ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + self._patch_expanduser(monkeypatch, tmp_path) + + result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {}) + + # Provider env never resolved — oracle never calls an LLM. + assert "BENCHFLOW_PROVIDER_MODEL" not in result + assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result From 796f5896ec6167c84afbc1bcdebb54e84f6a6658 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:45:53 -0700 Subject: [PATCH 4/6] test: regression suite pinning oracle chokepoint + orphan removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. - TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". --- tests/test_oracle_chokepoint.py | 260 ++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 tests/test_oracle_chokepoint.py diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py new file mode 100644 index 0000000..1a5043a --- /dev/null +++ b/tests/test_oracle_chokepoint.py @@ -0,0 +1,260 @@ +"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint. + +Pins the fix from this branch (post-PR #173 follow-up): + + Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the + chokepoint defends against any caller that forwards a model. + Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix + was unreachable because nothing wired it into the live CLI. + Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so + oracle gets honest model=None end-to-end. + +The classes below pin each layer at the right altitude: +- TestOrphanRemoval — proves cli/eval.py is gone and stays gone. +- TestEvalCreateRouting — proves `bench eval create` dispatches to + cli/main.py (the file that PR #173 missed). +- TestEffectiveModel — unit tests for the helper. +- TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None. +- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle` + and assert no API-key validation error. +""" + +from __future__ import annotations + +import importlib +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from typer.testing import CliRunner + + +class TestOrphanRemoval: + """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI). + + Deletion was safe because nothing wired it up. These tests make sure + the orphan stays gone — re-introducing it as a parallel `eval_create` + is what caused PR #173 to land its fix in dead code. + """ + + def test_cli_eval_module_is_gone(self): + with pytest.raises(ModuleNotFoundError): + importlib.import_module("benchflow.cli.eval") + + def test_no_source_file_imports_the_orphan(self): + """Catch any future import of `benchflow.cli.eval` from src/.""" + src = Path(__file__).resolve().parent.parent / "src" / "benchflow" + offenders = [ + p + for p in src.rglob("*.py") + if "benchflow.cli.eval" in p.read_text() + ] + assert not offenders, f"Orphan import re-introduced in: {offenders}" + + +class TestEvalCreateRouting: + """`bench eval create` must dispatch to cli/main.py:eval_create. + + The pyproject.toml `bench`/`benchflow` scripts both resolve to + `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in + `benchflow.cli.eval` that no entry point ever loaded — these tests + pin the routing so that mistake can't happen again. + """ + + def test_entry_point_app_is_cli_main(self): + from benchflow.cli.main import app + + assert app.info.name == "benchflow" + + def test_eval_create_callback_lives_in_cli_main(self): + from benchflow.cli.main import eval_app + + create_cmds = [c for c in eval_app.registered_commands if c.name == "create"] + assert len(create_cmds) == 1 + assert create_cmds[0].callback.__module__ == "benchflow.cli.main" + + def test_bench_eval_create_help_resolves(self): + """Smoke test: `bench eval create --help` reaches a real callback.""" + from benchflow.cli.main import app + + result = CliRunner().invoke(app, ["eval", "create", "--help"]) + assert result.exit_code == 0 + assert "tasks-dir" in result.stdout or "task" in result.stdout.lower() + + +class TestEffectiveModel: + """The helper introduced in Layer 3 — single source of truth for the rule + "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL".""" + + def test_oracle_with_no_model_returns_none(self): + from benchflow.job import effective_model + + assert effective_model("oracle", None) is None + + def test_oracle_ignores_explicit_model(self): + """Even if a caller forwards a model for oracle, the helper drops it.""" + from benchflow.job import effective_model + + assert effective_model("oracle", "claude-haiku-4-5-20251001") is None + + def test_non_oracle_with_no_model_returns_default(self): + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL + + def test_non_oracle_explicit_model_passes_through(self): + from benchflow.job import effective_model + + assert effective_model("codex-acp", "gpt-5") == "gpt-5" + + def test_non_oracle_empty_model_falls_back_to_default(self): + """Empty string == "no model" — Harbor YAML can produce this shape.""" + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL + + +class TestOracleYamlLoaders: + """YAML configs for oracle must produce JobConfig.model is None. + + Both loader paths (_from_native_yaml, _from_harbor_yaml) previously + coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3 + routes them through effective_model() so oracle drops the default. + """ + + def _make_task(self, tmp_path: Path) -> Path: + tasks = tmp_path / "tasks" / "task-a" + tasks.mkdir(parents=True) + (tasks / "task.toml").write_text('schema_version = "1.1"\n') + return tmp_path / "tasks" + + def test_native_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: oracle\n") + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_harbor_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text( + "agents:\n" + " - name: oracle\n" + "datasets:\n" + " - path: tasks\n" + ) + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_native_yaml_non_oracle_keeps_default_when_omitted( + self, tmp_path: Path + ): + """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL.""" + from benchflow.job import DEFAULT_MODEL, Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n") + job = Job.from_yaml(config) + assert job._config.model == DEFAULT_MODEL + + +class TestEvalCreateOracleCLI: + """End-to-end: `bench eval create -a oracle` must not trip API key validation. + + This is the user-visible bug the chokepoint test guards against at the + unit level. Here we call the live handler (cli/main.py:eval_create) + directly — invoking via Typer's CliRunner triggers asyncio.run() + internally, which leaves no current event loop and breaks pre-existing + tests in the suite that use the deprecated asyncio.get_event_loop() + pattern. Calling the function directly still exercises the full + CLI → effective_model → TrialConfig → resolve_agent_env path the bug + originally lived in. + """ + + def _make_task(self, tmp_path: Path) -> Path: + task = tmp_path / "task" + task.mkdir() + (task / "task.toml").write_text('schema_version = "1.1"\n') + (task / "instruction.md").write_text("solve\n") + return task + + def _strip_api_keys(self, monkeypatch): + for k in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + "LLM_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + + def test_oracle_single_task_no_api_key_no_error( + self, tmp_path: Path, monkeypatch + ): + """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError.""" + import asyncio + + from benchflow.cli.main import eval_create + from benchflow.models import RunResult + + # The CLI handler internally calls asyncio.run(), which leaves no + # current event loop. Pre-existing tests in the suite use the + # deprecated asyncio.get_event_loop() and break in that state, so + # restore a fresh loop after the test (teardown via finally below). + + self._strip_api_keys(monkeypatch) + task = self._make_task(tmp_path) + captured: dict = {} + + async def fake_create(config): + captured["config"] = config + # Exercise the real chokepoint with the config the CLI built — + # that's the specific call site the bug manifested at. + from benchflow._agent_env import resolve_agent_env + + captured["agent_env"] = resolve_agent_env( + config.primary_agent, config.primary_model, config.agent_env + ) + trial = type("FakeTrial", (), {})() + trial.run = AsyncMock( + return_value=RunResult( + task_name="task", + agent_name="oracle", + rewards={"reward": 1.0}, + n_tool_calls=0, + ) + ) + return trial + + try: + with patch("benchflow.trial.Trial.create", new=fake_create): + eval_create( + config_file=None, + tasks_dir=task, + agent="oracle", + model=None, + environment="docker", + concurrency=1, + jobs_dir=str(tmp_path / "jobs"), + sandbox_user="agent", + skills_dir=None, + ) + finally: + asyncio.set_event_loop(asyncio.new_event_loop()) + + cfg = captured["config"] + # Layer 3: oracle never receives a model, even when CLI defaults exist. + assert cfg.primary_agent == "oracle" + assert cfg.primary_model is None + # Layer 1: chokepoint did not inject provider env or raise. + assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"] From 88bd026e13e1c85b8e8c5040da31c3c2feab742b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:53:15 -0700 Subject: [PATCH 5/6] fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). --- src/benchflow/cli/eval.py | 373 ++++++++++++++++++++++++++++++++ tests/test_eval_cli.py | 118 ++++++++++ tests/test_oracle_chokepoint.py | 33 ++- 3 files changed, 506 insertions(+), 18 deletions(-) create mode 100644 src/benchflow/cli/eval.py create mode 100644 tests/test_eval_cli.py diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py new file mode 100644 index 0000000..ea7af43 --- /dev/null +++ b/src/benchflow/cli/eval.py @@ -0,0 +1,373 @@ +"""`bf eval` — the benchflow eval-runner command group. + +The future-facing entry point for running evaluations. Anthropic-style shape: +resource creation, one command, return the result or a job-id. + + bf eval create [flags] + One-shot eval — creates an Agent + Environment + Trajectory under + the hood and runs the task. `task-ref` can be: + - a path to a task directory (single task) + - a path to a directory of task directories (batch) + - a `harbor://[@]` ref (full Harbor dataset) + - a `harbor:///` ref (single task from a Harbor dataset) + - a `benchflow://[@]` ref (benchflow-owned dataset) + + bf eval list Show recent eval runs (reads the jobs/ dir) + bf eval retrieve ID Look up a specific trajectory by trial name + +Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` +stays around as a deprecated alias for one release. +""" + +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Annotated + +import typer +from rich.console import Console +from rich.table import Table + +from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model + +console = Console() + +app = typer.Typer( + name="eval", + help="Run evaluations. `bf eval create ` is the main entry point.", + no_args_is_help=True, +) + + +def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]: + """Resolve a positional task reference to a local directory. + + Returns `(path, is_batch)`. `is_batch` is True when the reference + points at a directory containing multiple task dirs (each with its + own `task.toml`), False when the reference is a single task dir. + """ + + # Registry prefix: fetch the full dataset and treat as batch. + if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"): + from benchflow.task_download import ensure_tasks + + # Allow `harbor:///` shorthand for a single task within + # a dataset. Split off the trailing segment if it matches a task. + prefix, _, tail = task_ref.partition("://") + head = tail + sub_task: str | None = None + if "/" in tail and "@" not in tail.split("/", 1)[1]: + dataset, sub_task = tail.split("/", 1) + head = dataset + dataset_ref = f"{prefix}://{head}" + dataset_dir = ensure_tasks(dataset_ref) + if sub_task is not None: + candidate = dataset_dir / sub_task + if not candidate.exists() or not (candidate / "task.toml").exists(): + console.print( + f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]" + ) + raise typer.Exit(1) + return candidate, False + return dataset_dir, True + + # Filesystem path: single task if task.toml is present, batch otherwise. + path = Path(task_ref).expanduser().resolve() + if not path.exists(): + console.print(f"[red]Task reference not found: {task_ref}[/red]") + raise typer.Exit(1) + if (path / "task.toml").exists(): + return path, False + if any( + child.is_dir() and (child / "task.toml").exists() for child in path.iterdir() + ): + return path, True + console.print( + f"[red]{path} is neither a single task (no task.toml) " + f"nor a directory of tasks.[/red]" + ) + raise typer.Exit(1) + + +@app.command("create") +def eval_create( + task: Annotated[ + str, + typer.Argument( + help=( + "Task reference. Path to a task dir, a dir of tasks, or a " + "registry ref (harbor://name, benchflow://name)." + ) + ), + ], + agent: Annotated[ + str, typer.Option("--agent", "-a", help="Agent name from the registry") + ] = DEFAULT_AGENT, + model: Annotated[ + str | None, + typer.Option("--model", "-m", help="Model id; agent default if unset"), + ] = None, + environment: Annotated[ + str, + typer.Option( + "--environment", + "-e", + help="docker | daytona | ... (uses the agent's default if unset)", + ), + ] = "docker", + prompt: Annotated[ + list[str] | None, + typer.Option( + "--prompt", + "-p", + help="Prompt text; repeat for multi-turn. Default: instruction.md", + ), + ] = None, + concurrency: Annotated[ + int, + typer.Option( + "--concurrency", + "-c", + help="Max parallel trials when task is a batch", + ), + ] = 4, + max_retries: Annotated[ + int, + typer.Option( + "--max-retries", + help="Per-trial retry count on transient errors", + ), + ] = 1, + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o", help="Where to write result files"), + ] = "jobs", + skills_dir: Annotated[ + Path | None, + typer.Option( + "--skills-dir", + "-s", + help="Skills directory to mount into the sandbox", + ), + ] = None, + sandbox_user: Annotated[ + str | None, + typer.Option( + "--sandbox-user", + help="Non-root sandbox user (default 'agent'; 'none' = root)", + ), + ] = "agent", + agent_env: Annotated[ + list[str] | None, + typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"), + ] = None, +) -> None: + """Create and run an eval — one-shot. + + Under the hood: + 1. Resolves `task` to a local directory (fetching from Harbor if needed). + 2. If it's a single task: runs `SDK.run()` once and prints the reward. + 3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate. + + This is the idiomatic way to run evals going forward. `bf run` and + `bf job` remain as one-release deprecated aliases that forward here. + """ + + resolved, is_batch = _resolve_task_ref(task) + + parsed_env: dict[str, str] = {} + for entry in agent_env or []: + if "=" not in entry: + console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]") + raise typer.Exit(2) + k, v = entry.split("=", 1) + parsed_env[k] = v + + if is_batch: + _run_batch( + tasks_dir=resolved, + agent=agent, + model=model, + environment=environment, + concurrency=concurrency, + max_retries=max_retries, + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=sandbox_user, + agent_env=parsed_env, + ) + else: + _run_single( + task_dir=resolved, + agent=agent, + model=model, + environment=environment, + prompt=prompt, + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=sandbox_user, + agent_env=parsed_env, + ) + + +def _run_single( + *, + task_dir: Path, + agent: str, + model: str | None, + environment: str, + prompt: list[str] | None, + jobs_dir: str, + skills_dir: Path | None, + sandbox_user: str | None, + agent_env: dict[str, str], +) -> None: + from typing import cast + + from benchflow.sdk import SDK + + sdk = SDK() + eff_model = _effective_model(agent, model) + result = asyncio.run( + sdk.run( + task_path=task_dir, + agent=agent, + model=eff_model, + environment=environment, + prompts=cast("list[str | None] | None", prompt), + agent_env=agent_env, + job_name="eval-create", + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=None if sandbox_user == "none" else sandbox_user, + ) + ) + reward = getattr(result, "reward", None) + err = getattr(result, "error", None) or getattr(result, "verifier_error", None) + console.print() + if err: + console.print(f"[red]failed:[/red] {err}") + raise typer.Exit(1) + console.print( + f"[bold]reward={reward}[/bold] tools={getattr(result, 'n_tool_calls', 0)}" + ) + + +def _run_batch( + *, + tasks_dir: Path, + agent: str, + model: str | None, + environment: str, + concurrency: int, + max_retries: int, + jobs_dir: str, + skills_dir: Path | None, + sandbox_user: str | None, + agent_env: dict[str, str], +) -> None: + from benchflow.job import Job, JobConfig, RetryConfig + + eff_model = _effective_model(agent, model) + config = JobConfig( + agent=agent, + model=eff_model, + environment=environment, + concurrency=concurrency, + retry=RetryConfig(max_retries=max_retries), + agent_env=agent_env, + sandbox_user=None if sandbox_user == "none" else sandbox_user, + skills_dir=str(skills_dir) if skills_dir else None, + ) + job = Job( + tasks_dir=tasks_dir, + jobs_dir=Path(jobs_dir), + config=config, + ) + result = asyncio.run(job.run()) + console.print() + console.print( + f"[bold]{result.passed}/{result.total} " + f"({result.score:.1%})[/bold] errors={result.errored}" + ) + + +@app.command("list") +def eval_list( + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o", help="Results directory to scan"), + ] = "jobs", + limit: Annotated[ + int, + typer.Option("--limit", "-n", help="Max rows to show"), + ] = 20, +) -> None: + """List recent eval runs by scanning the jobs/ dir.""" + + root = Path(jobs_dir) + if not root.exists(): + console.print(f"[yellow]{root} does not exist yet.[/yellow]") + return + runs: list[tuple[str, int, int]] = [] + for job_root in root.iterdir(): + if not job_root.is_dir(): + continue + for stamp in job_root.iterdir(): + if not stamp.is_dir(): + continue + trials = list(stamp.iterdir()) + passed = 0 + total = 0 + for trial in trials: + result = trial / "result.json" + if not result.exists(): + continue + total += 1 + try: + import json + + data = json.loads(result.read_text()) + if (data.get("rewards") or {}).get("reward") == 1.0: + passed += 1 + except Exception: + continue + if total: + runs.append((f"{job_root.name}/{stamp.name}", passed, total)) + runs.sort(reverse=True) + table = Table(title=f"Recent evals in {root}") + table.add_column("run", style="cyan") + table.add_column("passed", justify="right", style="green") + table.add_column("total", justify="right") + table.add_column("rate", justify="right", style="yellow") + for run, passed, total in runs[:limit]: + rate = f"{100 * passed / total:.0f}%" if total else "-" + table.add_row(run, str(passed), str(total), rate) + console.print(table) + + +@app.command("retrieve") +def eval_retrieve( + trial_name: Annotated[ + str, typer.Argument(help="Trial dir name, e.g. my-task__abc") + ], + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o"), + ] = "jobs", +) -> None: + """Print the result.json for a specific trial.""" + + import json + + root = Path(jobs_dir) + matches = list(root.rglob(f"{trial_name}/result.json")) + if not matches: + console.print(f"[red]no trial named {trial_name!r} under {root}[/red]") + raise typer.Exit(1) + console.print(f"[dim]{matches[0]}[/dim]") + data = json.loads(matches[0].read_text()) + from rich import print_json + + print_json(data=data) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py new file mode 100644 index 0000000..5a009e7 --- /dev/null +++ b/tests/test_eval_cli.py @@ -0,0 +1,118 @@ +"""Tests for `bf eval` — the new eval-runner CLI. + +The create command takes a positional task reference and routes to either +a single SDK.run or a batch Job based on whether the reference is a single +task dir or a directory of task dirs. We don't exercise the actual runner +here (that's the parity tests' job); we only verify the resolver. +""" + +from __future__ import annotations + +from pathlib import Path + +import click.exceptions +import pytest + +from benchflow.cli.eval import _resolve_task_ref + + +def _make_task_dir(parent: Path, name: str) -> Path: + d = parent / name + d.mkdir() + (d / "task.toml").write_text('schema_version = "1.1"\n') + (d / "instruction.md").write_text("instruction\n") + (d / "environment").mkdir() + (d / "tests").mkdir() + (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n") + return d + + +def test_resolve_single_task_dir(tmp_path: Path) -> None: + task = _make_task_dir(tmp_path, "my-task") + resolved, is_batch = _resolve_task_ref(str(task)) + assert resolved == task.resolve() + assert is_batch is False + + +def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None: + _make_task_dir(tmp_path, "task-a") + _make_task_dir(tmp_path, "task-b") + resolved, is_batch = _resolve_task_ref(str(tmp_path)) + assert resolved == tmp_path.resolve() + assert is_batch is True + + +def test_resolve_missing_path_exits(tmp_path: Path) -> None: + with pytest.raises(click.exceptions.Exit): + _resolve_task_ref(str(tmp_path / "does-not-exist")) + + +def test_resolve_harbor_ref_routes_through_ensure_tasks( + tmp_path: Path, monkeypatch +) -> None: + """`harbor://` resolves to the cached dataset directory (batch).""" + + fake_dataset = tmp_path / "cached-aime" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "aime_60") + _make_task_dir(fake_dataset, "aime_61") + + captured: dict[str, str] = {} + + def fake_ensure_tasks(ref: str) -> Path: + captured["ref"] = ref + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("harbor://aime@1.0") + assert captured["ref"] == "harbor://aime@1.0" + assert resolved == fake_dataset + assert is_batch is True + + +def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None: + """`harbor:///` resolves to a single task from the dataset.""" + + fake_dataset = tmp_path / "cached-aime" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "aime_60") + _make_task_dir(fake_dataset, "aime_61") + + def fake_ensure_tasks(ref: str) -> Path: + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60") + assert resolved == fake_dataset / "aime_60" + assert is_batch is False + + +def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None: + """`benchflow://` works identically to `harbor://`.""" + + fake_dataset = tmp_path / "cached-benchjack" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "exploit-1") + + def fake_ensure_tasks(ref: str) -> Path: + assert ref == "benchflow://benchjack" + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("benchflow://benchjack") + assert resolved == fake_dataset + assert is_batch is True + + +def test_resolve_non_task_dir_exits(tmp_path: Path) -> None: + """A directory with no task.toml children errors cleanly.""" + + (tmp_path / "not-a-task").mkdir() + with pytest.raises(click.exceptions.Exit): + _resolve_task_ref(str(tmp_path)) diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 1a5043a..142c063 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -29,27 +29,24 @@ from typer.testing import CliRunner -class TestOrphanRemoval: - """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI). +class TestEvalModuleNotWiredIntoCLI: + """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI. - Deletion was safe because nothing wired it up. These tests make sure - the orphan stays gone — re-introducing it as a parallel `eval_create` - is what caused PR #173 to land its fix in dead code. + The live `bench eval create` dispatches to cli/main.py:eval_create. + cli/eval.py is a standalone module (with its own resolver logic and tests) + but must not be imported by the CLI entry-point code — doing so is what + caused PR #173 to land its fix in dead code. """ - def test_cli_eval_module_is_gone(self): - with pytest.raises(ModuleNotFoundError): - importlib.import_module("benchflow.cli.eval") - - def test_no_source_file_imports_the_orphan(self): - """Catch any future import of `benchflow.cli.eval` from src/.""" - src = Path(__file__).resolve().parent.parent / "src" / "benchflow" - offenders = [ - p - for p in src.rglob("*.py") - if "benchflow.cli.eval" in p.read_text() - ] - assert not offenders, f"Orphan import re-introduced in: {offenders}" + def test_cli_main_does_not_import_cli_eval(self): + """cli/main.py must not import from cli/eval — they are separate.""" + main_py = ( + Path(__file__).resolve().parent.parent + / "src" / "benchflow" / "cli" / "main.py" + ) + text = main_py.read_text() + assert "from benchflow.cli.eval" not in text + assert "import benchflow.cli.eval" not in text class TestEvalCreateRouting: From bc04c59920a6a4608932f39fe9c2172c0b38d6ba Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:54:40 -0700 Subject: [PATCH 6/6] docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI --- src/benchflow/cli/eval.py | 12 +++++++----- tests/test_eval_cli.py | 10 +++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index ea7af43..58b410f 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -1,7 +1,12 @@ """`bf eval` — the benchflow eval-runner command group. -The future-facing entry point for running evaluations. Anthropic-style shape: -resource creation, one command, return the result or a job-id. +NOTE: This module is **not wired into the live CLI**. The active +``bench eval create`` command dispatches to ``cli/main.py:eval_create``. +This file is kept as the future-facing design for the eval sub-command +and must not be imported by ``cli/main.py`` (see +``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``). + +Design shape — Anthropic-style resource creation: bf eval create [flags] One-shot eval — creates an Agent + Environment + Trajectory under @@ -14,9 +19,6 @@ bf eval list Show recent eval runs (reads the jobs/ dir) bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. """ from __future__ import annotations diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py index 5a009e7..dfcb7db 100644 --- a/tests/test_eval_cli.py +++ b/tests/test_eval_cli.py @@ -1,9 +1,9 @@ -"""Tests for `bf eval` — the new eval-runner CLI. +"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module. -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. +NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active +``bench eval create`` lives in ``cli/main.py``). These tests cover the +task-reference resolver (``_resolve_task_ref``) which will be used once +the module is promoted to the live entry point. """ from __future__ import annotations