From 30dbe215c55d444f5f6cef285c2c85b74fcbeded Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 14:15:30 -0700 Subject: [PATCH 01/12] fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. --- src/benchflow/_agent_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..5609e6c 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,7 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model From 360c460a33e4758325611a7a4bdd4b0a599a8eb6 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 14:37:13 -0700 Subject: [PATCH 02/12] fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. 
--- src/benchflow/_agent_env.py | 2 +- src/benchflow/cli/eval.py | 6 ++++-- src/benchflow/job.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 5609e6c..8885b5e 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,7 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model and agent != "oracle": + if model: inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 0b13fd8..a92c0be 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -228,11 +228,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -268,9 +269,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) config = JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 6d8fe9b..95414fe 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -146,7 +146,7 @@ class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str = DEFAULT_MODEL + model: str | None = DEFAULT_MODEL environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None From 
9b57b2afb8bade8912918fe3654e2a608779f373 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:32:20 -0700 Subject: [PATCH 03/12] =?UTF-8?q?fix:=20oracle=20agent=20=E2=80=94=20choke?= =?UTF-8?q?point=20guard,=20drop=20orphan=20eval=20CLI,=20helper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. 
--- src/benchflow/_agent_env.py | 4 +- src/benchflow/cli/eval.py | 373 ------------------------------ src/benchflow/cli/main.py | 17 +- src/benchflow/job.py | 22 +- tests/test_eval_cli.py | 118 ---------- tests/test_resolve_env_helpers.py | 42 ++++ 6 files changed, 72 insertions(+), 504 deletions(-) delete mode 100644 src/benchflow/cli/eval.py delete mode 100644 tests/test_eval_cli.py diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..72d6498 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,9 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + # Oracle runs solve.sh and never calls an LLM — model env vars and + # API-key validation are skipped even if a caller forwards a model. + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py deleted file mode 100644 index a92c0be..0000000 --- a/src/benchflow/cli/eval.py +++ /dev/null @@ -1,373 +0,0 @@ -"""`bf eval` — the benchflow eval-runner command group. - -The future-facing entry point for running evaluations. Anthropic-style shape: -resource creation, one command, return the result or a job-id. - - bf eval create [flags] - One-shot eval — creates an Agent + Environment + Trajectory under - the hood and runs the task. 
`task-ref` can be: - - a path to a task directory (single task) - - a path to a directory of task directories (batch) - - a `harbor://[@]` ref (full Harbor dataset) - - a `harbor:///` ref (single task from a Harbor dataset) - - a `benchflow://[@]` ref (benchflow-owned dataset) - - bf eval list Show recent eval runs (reads the jobs/ dir) - bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. -""" - -from __future__ import annotations - -import asyncio -from pathlib import Path -from typing import Annotated - -import typer -from rich.console import Console -from rich.table import Table - -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL - -console = Console() - -app = typer.Typer( - name="eval", - help="Run evaluations. `bf eval create ` is the main entry point.", - no_args_is_help=True, -) - - -def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]: - """Resolve a positional task reference to a local directory. - - Returns `(path, is_batch)`. `is_batch` is True when the reference - points at a directory containing multiple task dirs (each with its - own `task.toml`), False when the reference is a single task dir. - """ - - # Registry prefix: fetch the full dataset and treat as batch. - if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"): - from benchflow.task_download import ensure_tasks - - # Allow `harbor:///` shorthand for a single task within - # a dataset. Split off the trailing segment if it matches a task. 
- prefix, _, tail = task_ref.partition("://") - head = tail - sub_task: str | None = None - if "/" in tail and "@" not in tail.split("/", 1)[1]: - dataset, sub_task = tail.split("/", 1) - head = dataset - dataset_ref = f"{prefix}://{head}" - dataset_dir = ensure_tasks(dataset_ref) - if sub_task is not None: - candidate = dataset_dir / sub_task - if not candidate.exists() or not (candidate / "task.toml").exists(): - console.print( - f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]" - ) - raise typer.Exit(1) - return candidate, False - return dataset_dir, True - - # Filesystem path: single task if task.toml is present, batch otherwise. - path = Path(task_ref).expanduser().resolve() - if not path.exists(): - console.print(f"[red]Task reference not found: {task_ref}[/red]") - raise typer.Exit(1) - if (path / "task.toml").exists(): - return path, False - if any( - child.is_dir() and (child / "task.toml").exists() for child in path.iterdir() - ): - return path, True - console.print( - f"[red]{path} is neither a single task (no task.toml) " - f"nor a directory of tasks.[/red]" - ) - raise typer.Exit(1) - - -@app.command("create") -def eval_create( - task: Annotated[ - str, - typer.Argument( - help=( - "Task reference. Path to a task dir, a dir of tasks, or a " - "registry ref (harbor://name, benchflow://name)." - ) - ), - ], - agent: Annotated[ - str, typer.Option("--agent", "-a", help="Agent name from the registry") - ] = DEFAULT_AGENT, - model: Annotated[ - str | None, - typer.Option("--model", "-m", help="Model id; agent default if unset"), - ] = None, - environment: Annotated[ - str, - typer.Option( - "--environment", - "-e", - help="docker | daytona | ... (uses the agent's default if unset)", - ), - ] = "docker", - prompt: Annotated[ - list[str] | None, - typer.Option( - "--prompt", - "-p", - help="Prompt text; repeat for multi-turn. 
Default: instruction.md", - ), - ] = None, - concurrency: Annotated[ - int, - typer.Option( - "--concurrency", - "-c", - help="Max parallel trials when task is a batch", - ), - ] = 4, - max_retries: Annotated[ - int, - typer.Option( - "--max-retries", - help="Per-trial retry count on transient errors", - ), - ] = 1, - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o", help="Where to write result files"), - ] = "jobs", - skills_dir: Annotated[ - Path | None, - typer.Option( - "--skills-dir", - "-s", - help="Skills directory to mount into the sandbox", - ), - ] = None, - sandbox_user: Annotated[ - str | None, - typer.Option( - "--sandbox-user", - help="Non-root sandbox user (default 'agent'; 'none' = root)", - ), - ] = "agent", - agent_env: Annotated[ - list[str] | None, - typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"), - ] = None, -) -> None: - """Create and run an eval — one-shot. - - Under the hood: - 1. Resolves `task` to a local directory (fetching from Harbor if needed). - 2. If it's a single task: runs `SDK.run()` once and prints the reward. - 3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate. - - This is the idiomatic way to run evals going forward. `bf run` and - `bf job` remain as one-release deprecated aliases that forward here. 
- """ - - resolved, is_batch = _resolve_task_ref(task) - - parsed_env: dict[str, str] = {} - for entry in agent_env or []: - if "=" not in entry: - console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]") - raise typer.Exit(2) - k, v = entry.split("=", 1) - parsed_env[k] = v - - if is_batch: - _run_batch( - tasks_dir=resolved, - agent=agent, - model=model, - environment=environment, - concurrency=concurrency, - max_retries=max_retries, - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=sandbox_user, - agent_env=parsed_env, - ) - else: - _run_single( - task_dir=resolved, - agent=agent, - model=model, - environment=environment, - prompt=prompt, - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=sandbox_user, - agent_env=parsed_env, - ) - - -def _run_single( - *, - task_dir: Path, - agent: str, - model: str | None, - environment: str, - prompt: list[str] | None, - jobs_dir: str, - skills_dir: Path | None, - sandbox_user: str | None, - agent_env: dict[str, str], -) -> None: - from typing import cast - - from benchflow.sdk import SDK - - sdk = SDK() - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) - result = asyncio.run( - sdk.run( - task_path=task_dir, - agent=agent, - model=effective_model, - environment=environment, - prompts=cast("list[str | None] | None", prompt), - agent_env=agent_env, - job_name="eval-create", - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=None if sandbox_user == "none" else sandbox_user, - ) - ) - reward = getattr(result, "reward", None) - err = getattr(result, "error", None) or getattr(result, "verifier_error", None) - console.print() - if err: - console.print(f"[red]failed:[/red] {err}") - raise typer.Exit(1) - console.print( - f"[bold]reward={reward}[/bold] tools={getattr(result, 'n_tool_calls', 0)}" - ) - - -def _run_batch( - *, - tasks_dir: Path, - agent: str, - model: str | None, - environment: str, - concurrency: int, - max_retries: int, - jobs_dir: str, - skills_dir: 
Path | None, - sandbox_user: str | None, - agent_env: dict[str, str], -) -> None: - from benchflow.job import Job, JobConfig, RetryConfig - - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) - config = JobConfig( - agent=agent, - model=effective_model, - environment=environment, - concurrency=concurrency, - retry=RetryConfig(max_retries=max_retries), - agent_env=agent_env, - sandbox_user=None if sandbox_user == "none" else sandbox_user, - skills_dir=str(skills_dir) if skills_dir else None, - ) - job = Job( - tasks_dir=tasks_dir, - jobs_dir=Path(jobs_dir), - config=config, - ) - result = asyncio.run(job.run()) - console.print() - console.print( - f"[bold]{result.passed}/{result.total} " - f"({result.score:.1%})[/bold] errors={result.errored}" - ) - - -@app.command("list") -def eval_list( - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o", help="Results directory to scan"), - ] = "jobs", - limit: Annotated[ - int, - typer.Option("--limit", "-n", help="Max rows to show"), - ] = 20, -) -> None: - """List recent eval runs by scanning the jobs/ dir.""" - - root = Path(jobs_dir) - if not root.exists(): - console.print(f"[yellow]{root} does not exist yet.[/yellow]") - return - runs: list[tuple[str, int, int]] = [] - for job_root in root.iterdir(): - if not job_root.is_dir(): - continue - for stamp in job_root.iterdir(): - if not stamp.is_dir(): - continue - trials = list(stamp.iterdir()) - passed = 0 - total = 0 - for trial in trials: - result = trial / "result.json" - if not result.exists(): - continue - total += 1 - try: - import json - - data = json.loads(result.read_text()) - if (data.get("rewards") or {}).get("reward") == 1.0: - passed += 1 - except Exception: - continue - if total: - runs.append((f"{job_root.name}/{stamp.name}", passed, total)) - runs.sort(reverse=True) - table = Table(title=f"Recent evals in {root}") - table.add_column("run", style="cyan") - table.add_column("passed", justify="right", style="green") - 
table.add_column("total", justify="right") - table.add_column("rate", justify="right", style="yellow") - for run, passed, total in runs[:limit]: - rate = f"{100 * passed / total:.0f}%" if total else "-" - table.add_row(run, str(passed), str(total), rate) - console.print(table) - - -@app.command("retrieve") -def eval_retrieve( - trial_name: Annotated[ - str, typer.Argument(help="Trial dir name, e.g. my-task__abc") - ], - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o"), - ] = "jobs", -) -> None: - """Print the result.json for a specific trial.""" - - import json - - root = Path(jobs_dir) - matches = list(root.rglob(f"{trial_name}/result.json")) - if not matches: - console.print(f"[red]no trial named {trial_name!r} under {root}[/red]") - raise typer.Exit(1) - console.print(f"[dim]{matches[0]}[/dim]") - data = json.loads(matches[0].read_text()) - from rich import print_json - - print_json(data=data) diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 9cf1198..96611bb 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -10,7 +10,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model console = Console() @@ -167,7 +167,7 @@ def job( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), @@ -343,7 +343,7 @@ def eval( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, skills_dir=effective_skills, @@ -754,19 +754,20 @@ def eval_create( f"({result.score:.1%})[/bold], errors={result.errored}" ) elif tasks_dir: + eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if 
(tasks_dir / "task.toml").exists(): from benchflow.trial import Trial, TrialConfig, Scene config = TrialConfig( task_path=tasks_dir, - scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL, + scenes=[Scene.single(agent=agent, model=eff_model, skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, jobs_dir=jobs_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, skills_dir=str(skills_dir) if skills_dir else None, ) @@ -777,7 +778,7 @@ async def _run(): run_result = asyncio.run(_run()) reward = (run_result.rewards or {}).get("reward") console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}") - console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})") + console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})") console.print(f"[bold]Reward:[/bold] {reward}") console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}") if run_result.error: @@ -789,7 +790,7 @@ async def _run(): jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, @@ -915,7 +916,7 @@ def train_create( jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}", config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, ), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 95414fe..ad99ad4 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float: DEFAULT_MODEL = "claude-haiku-4-5-20251001" +def effective_model(agent: str, model: str | None) -> str | None: + """Resolve the model an agent should run with. 
+ + Oracle runs solve.sh and never calls an LLM, so it never receives a model + (the chokepoint in resolve_agent_env defends, but callers should also stop + materializing DEFAULT_MODEL into oracle configs to keep the data honest — + e.g. result-summary JSON shows model=null instead of a bogus default). + """ + if agent == "oracle": + return None + return model or DEFAULT_MODEL + + @dataclass class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str | None = DEFAULT_MODEL + model: str | None = None environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( - agent=raw.get("agent", DEFAULT_AGENT), - model=raw.get("model", DEFAULT_MODEL), + agent=agent_name, + model=effective_model(agent_name, raw.get("model")), environment=raw.get("environment", "docker"), concurrency=raw.get("concurrency", 4), prompts=prompts, @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": agent_name = agent_cfg.get("name", DEFAULT_AGENT) # Model — keep provider prefix intact for downstream resolution - model = agent_cfg.get("model_name", "") or DEFAULT_MODEL + model = effective_model(agent_name, agent_cfg.get("model_name") or None) # Environment env_cfg = raw.get("environment", {}) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py deleted file mode 100644 index 5a009e7..0000000 --- a/tests/test_eval_cli.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Tests for `bf eval` — the new eval-runner CLI. - -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. 
We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. -""" - -from __future__ import annotations - -from pathlib import Path - -import click.exceptions -import pytest - -from benchflow.cli.eval import _resolve_task_ref - - -def _make_task_dir(parent: Path, name: str) -> Path: - d = parent / name - d.mkdir() - (d / "task.toml").write_text('schema_version = "1.1"\n') - (d / "instruction.md").write_text("instruction\n") - (d / "environment").mkdir() - (d / "tests").mkdir() - (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n") - return d - - -def test_resolve_single_task_dir(tmp_path: Path) -> None: - task = _make_task_dir(tmp_path, "my-task") - resolved, is_batch = _resolve_task_ref(str(task)) - assert resolved == task.resolve() - assert is_batch is False - - -def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None: - _make_task_dir(tmp_path, "task-a") - _make_task_dir(tmp_path, "task-b") - resolved, is_batch = _resolve_task_ref(str(tmp_path)) - assert resolved == tmp_path.resolve() - assert is_batch is True - - -def test_resolve_missing_path_exits(tmp_path: Path) -> None: - with pytest.raises(click.exceptions.Exit): - _resolve_task_ref(str(tmp_path / "does-not-exist")) - - -def test_resolve_harbor_ref_routes_through_ensure_tasks( - tmp_path: Path, monkeypatch -) -> None: - """`harbor://` resolves to the cached dataset directory (batch).""" - - fake_dataset = tmp_path / "cached-aime" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "aime_60") - _make_task_dir(fake_dataset, "aime_61") - - captured: dict[str, str] = {} - - def fake_ensure_tasks(ref: str) -> Path: - captured["ref"] = ref - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("harbor://aime@1.0") - assert captured["ref"] == "harbor://aime@1.0" - assert resolved == fake_dataset - assert is_batch is True - - 
-def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None: - """`harbor:///` resolves to a single task from the dataset.""" - - fake_dataset = tmp_path / "cached-aime" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "aime_60") - _make_task_dir(fake_dataset, "aime_61") - - def fake_ensure_tasks(ref: str) -> Path: - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60") - assert resolved == fake_dataset / "aime_60" - assert is_batch is False - - -def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None: - """`benchflow://` works identically to `harbor://`.""" - - fake_dataset = tmp_path / "cached-benchjack" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "exploit-1") - - def fake_ensure_tasks(ref: str) -> Path: - assert ref == "benchflow://benchjack" - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("benchflow://benchjack") - assert resolved == fake_dataset - assert is_batch is True - - -def test_resolve_non_task_dir_exits(tmp_path: Path) -> None: - """A directory with no task.toml children errors cleanly.""" - - (tmp_path / "not-a-task").mkdir() - with pytest.raises(click.exceptions.Exit): - _resolve_task_ref(str(tmp_path)) diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py index f699334..d831fe7 100644 --- a/tests/test_resolve_env_helpers.py +++ b/tests/test_resolve_env_helpers.py @@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path): self._patch_expanduser(monkeypatch, tmp_path) result = self._resolve(agent="openclaw", agent_env={}) assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result + + +class TestResolveAgentEnvOracle: + """Oracle runs solve.sh and never calls an LLM — must skip 
model-related env. + + Regression for the PR #173 follow-up: commit 360c460 removed the + `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers + would pass model=None for oracle. But cli/main.py:eval_create (the live + `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle + reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY + validation — breaking offline oracle runs that have no API key set. + """ + + def _patch_expanduser(self, monkeypatch, tmp_path): + orig = Path.expanduser + + def fake(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig(self) + + monkeypatch.setattr(Path, "expanduser", fake) + + def test_oracle_with_default_model_does_not_validate_api_key( + self, monkeypatch, tmp_path + ): + """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise.""" + for k in ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + self._patch_expanduser(monkeypatch, tmp_path) + + result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {}) + + # Provider env never resolved — oracle never calls an LLM. + assert "BENCHFLOW_PROVIDER_MODEL" not in result + assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result From 796f5896ec6167c84afbc1bcdebb54e84f6a6658 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:45:53 -0700 Subject: [PATCH 04/12] test: regression suite pinning oracle chokepoint + orphan removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. 
- TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". --- tests/test_oracle_chokepoint.py | 260 ++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 tests/test_oracle_chokepoint.py diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py new file mode 100644 index 0000000..1a5043a --- /dev/null +++ b/tests/test_oracle_chokepoint.py @@ -0,0 +1,260 @@ +"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint. + +Pins the fix from this branch (post-PR #173 follow-up): + + Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the + chokepoint defends against any caller that forwards a model. + Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix + was unreachable because nothing wired it into the live CLI. + Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so + oracle gets honest model=None end-to-end. + +The classes below pin each layer at the right altitude: +- TestOrphanRemoval — proves cli/eval.py is gone and stays gone. 
+- TestEvalCreateRouting — proves `bench eval create` dispatches to + cli/main.py (the file that PR #173 missed). +- TestEffectiveModel — unit tests for the helper. +- TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None. +- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle` + and assert no API-key validation error. +""" + +from __future__ import annotations + +import importlib +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from typer.testing import CliRunner + + +class TestOrphanRemoval: + """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI). + + Deletion was safe because nothing wired it up. These tests make sure + the orphan stays gone — re-introducing it as a parallel `eval_create` + is what caused PR #173 to land its fix in dead code. + """ + + def test_cli_eval_module_is_gone(self): + with pytest.raises(ModuleNotFoundError): + importlib.import_module("benchflow.cli.eval") + + def test_no_source_file_imports_the_orphan(self): + """Catch any future import of `benchflow.cli.eval` from src/.""" + src = Path(__file__).resolve().parent.parent / "src" / "benchflow" + offenders = [ + p + for p in src.rglob("*.py") + if "benchflow.cli.eval" in p.read_text() + ] + assert not offenders, f"Orphan import re-introduced in: {offenders}" + + +class TestEvalCreateRouting: + """`bench eval create` must dispatch to cli/main.py:eval_create. + + The pyproject.toml `bench`/`benchflow` scripts both resolve to + `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in + `benchflow.cli.eval` that no entry point ever loaded — these tests + pin the routing so that mistake can't happen again. 
+ """ + + def test_entry_point_app_is_cli_main(self): + from benchflow.cli.main import app + + assert app.info.name == "benchflow" + + def test_eval_create_callback_lives_in_cli_main(self): + from benchflow.cli.main import eval_app + + create_cmds = [c for c in eval_app.registered_commands if c.name == "create"] + assert len(create_cmds) == 1 + assert create_cmds[0].callback.__module__ == "benchflow.cli.main" + + def test_bench_eval_create_help_resolves(self): + """Smoke test: `bench eval create --help` reaches a real callback.""" + from benchflow.cli.main import app + + result = CliRunner().invoke(app, ["eval", "create", "--help"]) + assert result.exit_code == 0 + assert "tasks-dir" in result.stdout or "task" in result.stdout.lower() + + +class TestEffectiveModel: + """The helper introduced in Layer 3 — single source of truth for the rule + "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL".""" + + def test_oracle_with_no_model_returns_none(self): + from benchflow.job import effective_model + + assert effective_model("oracle", None) is None + + def test_oracle_ignores_explicit_model(self): + """Even if a caller forwards a model for oracle, the helper drops it.""" + from benchflow.job import effective_model + + assert effective_model("oracle", "claude-haiku-4-5-20251001") is None + + def test_non_oracle_with_no_model_returns_default(self): + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL + + def test_non_oracle_explicit_model_passes_through(self): + from benchflow.job import effective_model + + assert effective_model("codex-acp", "gpt-5") == "gpt-5" + + def test_non_oracle_empty_model_falls_back_to_default(self): + """Empty string == "no model" — Harbor YAML can produce this shape.""" + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL + + +class TestOracleYamlLoaders: + """YAML configs for 
oracle must produce JobConfig.model is None. + + Both loader paths (_from_native_yaml, _from_harbor_yaml) previously + coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3 + routes them through effective_model() so oracle drops the default. + """ + + def _make_task(self, tmp_path: Path) -> Path: + tasks = tmp_path / "tasks" / "task-a" + tasks.mkdir(parents=True) + (tasks / "task.toml").write_text('schema_version = "1.1"\n') + return tmp_path / "tasks" + + def test_native_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: oracle\n") + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_harbor_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text( + "agents:\n" + " - name: oracle\n" + "datasets:\n" + " - path: tasks\n" + ) + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_native_yaml_non_oracle_keeps_default_when_omitted( + self, tmp_path: Path + ): + """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL.""" + from benchflow.job import DEFAULT_MODEL, Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n") + job = Job.from_yaml(config) + assert job._config.model == DEFAULT_MODEL + + +class TestEvalCreateOracleCLI: + """End-to-end: `bench eval create -a oracle` must not trip API key validation. + + This is the user-visible bug the chokepoint test guards against at the + unit level. 
Here we call the live handler (cli/main.py:eval_create) + directly — invoking via Typer's CliRunner triggers asyncio.run() + internally, which leaves no current event loop and breaks pre-existing + tests in the suite that use the deprecated asyncio.get_event_loop() + pattern. Calling the function directly still exercises the full + CLI → effective_model → TrialConfig → resolve_agent_env path the bug + originally lived in. + """ + + def _make_task(self, tmp_path: Path) -> Path: + task = tmp_path / "task" + task.mkdir() + (task / "task.toml").write_text('schema_version = "1.1"\n') + (task / "instruction.md").write_text("solve\n") + return task + + def _strip_api_keys(self, monkeypatch): + for k in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + "LLM_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + + def test_oracle_single_task_no_api_key_no_error( + self, tmp_path: Path, monkeypatch + ): + """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError.""" + import asyncio + + from benchflow.cli.main import eval_create + from benchflow.models import RunResult + + # The CLI handler internally calls asyncio.run(), which leaves no + # current event loop. Pre-existing tests in the suite use the + # deprecated asyncio.get_event_loop() and break in that state, so + # restore a fresh loop after the test (teardown via finally below). + + self._strip_api_keys(monkeypatch) + task = self._make_task(tmp_path) + captured: dict = {} + + async def fake_create(config): + captured["config"] = config + # Exercise the real chokepoint with the config the CLI built — + # that's the specific call site the bug manifested at. 
+ from benchflow._agent_env import resolve_agent_env + + captured["agent_env"] = resolve_agent_env( + config.primary_agent, config.primary_model, config.agent_env + ) + trial = type("FakeTrial", (), {})() + trial.run = AsyncMock( + return_value=RunResult( + task_name="task", + agent_name="oracle", + rewards={"reward": 1.0}, + n_tool_calls=0, + ) + ) + return trial + + try: + with patch("benchflow.trial.Trial.create", new=fake_create): + eval_create( + config_file=None, + tasks_dir=task, + agent="oracle", + model=None, + environment="docker", + concurrency=1, + jobs_dir=str(tmp_path / "jobs"), + sandbox_user="agent", + skills_dir=None, + ) + finally: + asyncio.set_event_loop(asyncio.new_event_loop()) + + cfg = captured["config"] + # Layer 3: oracle never receives a model, even when CLI defaults exist. + assert cfg.primary_agent == "oracle" + assert cfg.primary_model is None + # Layer 1: chokepoint did not inject provider env or raise. + assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"] From 88bd026e13e1c85b8e8c5040da31c3c2feab742b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:53:15 -0700 Subject: [PATCH 05/12] fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). 
--- src/benchflow/cli/eval.py | 373 ++++++++++++++++++++++++++++++++ tests/test_eval_cli.py | 118 ++++++++++ tests/test_oracle_chokepoint.py | 33 ++- 3 files changed, 506 insertions(+), 18 deletions(-) create mode 100644 src/benchflow/cli/eval.py create mode 100644 tests/test_eval_cli.py diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py new file mode 100644 index 0000000..ea7af43 --- /dev/null +++ b/src/benchflow/cli/eval.py @@ -0,0 +1,373 @@ +"""`bf eval` — the benchflow eval-runner command group. + +The future-facing entry point for running evaluations. Anthropic-style shape: +resource creation, one command, return the result or a job-id. + + bf eval create [flags] + One-shot eval — creates an Agent + Environment + Trajectory under + the hood and runs the task. `task-ref` can be: + - a path to a task directory (single task) + - a path to a directory of task directories (batch) + - a `harbor://[@]` ref (full Harbor dataset) + - a `harbor:///` ref (single task from a Harbor dataset) + - a `benchflow://[@]` ref (benchflow-owned dataset) + + bf eval list Show recent eval runs (reads the jobs/ dir) + bf eval retrieve ID Look up a specific trajectory by trial name + +Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` +stays around as a deprecated alias for one release. +""" + +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Annotated + +import typer +from rich.console import Console +from rich.table import Table + +from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model + +console = Console() + +app = typer.Typer( + name="eval", + help="Run evaluations. `bf eval create ` is the main entry point.", + no_args_is_help=True, +) + + +def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]: + """Resolve a positional task reference to a local directory. + + Returns `(path, is_batch)`. 
`is_batch` is True when the reference + points at a directory containing multiple task dirs (each with its + own `task.toml`), False when the reference is a single task dir. + """ + + # Registry prefix: fetch the full dataset and treat as batch. + if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"): + from benchflow.task_download import ensure_tasks + + # Allow `harbor:///` shorthand for a single task within + # a dataset. Split off the trailing segment if it matches a task. + prefix, _, tail = task_ref.partition("://") + head = tail + sub_task: str | None = None + if "/" in tail and "@" not in tail.split("/", 1)[1]: + dataset, sub_task = tail.split("/", 1) + head = dataset + dataset_ref = f"{prefix}://{head}" + dataset_dir = ensure_tasks(dataset_ref) + if sub_task is not None: + candidate = dataset_dir / sub_task + if not candidate.exists() or not (candidate / "task.toml").exists(): + console.print( + f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]" + ) + raise typer.Exit(1) + return candidate, False + return dataset_dir, True + + # Filesystem path: single task if task.toml is present, batch otherwise. + path = Path(task_ref).expanduser().resolve() + if not path.exists(): + console.print(f"[red]Task reference not found: {task_ref}[/red]") + raise typer.Exit(1) + if (path / "task.toml").exists(): + return path, False + if any( + child.is_dir() and (child / "task.toml").exists() for child in path.iterdir() + ): + return path, True + console.print( + f"[red]{path} is neither a single task (no task.toml) " + f"nor a directory of tasks.[/red]" + ) + raise typer.Exit(1) + + +@app.command("create") +def eval_create( + task: Annotated[ + str, + typer.Argument( + help=( + "Task reference. Path to a task dir, a dir of tasks, or a " + "registry ref (harbor://name, benchflow://name)." 
+ ) + ), + ], + agent: Annotated[ + str, typer.Option("--agent", "-a", help="Agent name from the registry") + ] = DEFAULT_AGENT, + model: Annotated[ + str | None, + typer.Option("--model", "-m", help="Model id; agent default if unset"), + ] = None, + environment: Annotated[ + str, + typer.Option( + "--environment", + "-e", + help="docker | daytona | ... (uses the agent's default if unset)", + ), + ] = "docker", + prompt: Annotated[ + list[str] | None, + typer.Option( + "--prompt", + "-p", + help="Prompt text; repeat for multi-turn. Default: instruction.md", + ), + ] = None, + concurrency: Annotated[ + int, + typer.Option( + "--concurrency", + "-c", + help="Max parallel trials when task is a batch", + ), + ] = 4, + max_retries: Annotated[ + int, + typer.Option( + "--max-retries", + help="Per-trial retry count on transient errors", + ), + ] = 1, + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o", help="Where to write result files"), + ] = "jobs", + skills_dir: Annotated[ + Path | None, + typer.Option( + "--skills-dir", + "-s", + help="Skills directory to mount into the sandbox", + ), + ] = None, + sandbox_user: Annotated[ + str | None, + typer.Option( + "--sandbox-user", + help="Non-root sandbox user (default 'agent'; 'none' = root)", + ), + ] = "agent", + agent_env: Annotated[ + list[str] | None, + typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"), + ] = None, +) -> None: + """Create and run an eval — one-shot. + + Under the hood: + 1. Resolves `task` to a local directory (fetching from Harbor if needed). + 2. If it's a single task: runs `SDK.run()` once and prints the reward. + 3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate. + + This is the idiomatic way to run evals going forward. `bf run` and + `bf job` remain as one-release deprecated aliases that forward here. 
+ """ + + resolved, is_batch = _resolve_task_ref(task) + + parsed_env: dict[str, str] = {} + for entry in agent_env or []: + if "=" not in entry: + console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]") + raise typer.Exit(2) + k, v = entry.split("=", 1) + parsed_env[k] = v + + if is_batch: + _run_batch( + tasks_dir=resolved, + agent=agent, + model=model, + environment=environment, + concurrency=concurrency, + max_retries=max_retries, + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=sandbox_user, + agent_env=parsed_env, + ) + else: + _run_single( + task_dir=resolved, + agent=agent, + model=model, + environment=environment, + prompt=prompt, + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=sandbox_user, + agent_env=parsed_env, + ) + + +def _run_single( + *, + task_dir: Path, + agent: str, + model: str | None, + environment: str, + prompt: list[str] | None, + jobs_dir: str, + skills_dir: Path | None, + sandbox_user: str | None, + agent_env: dict[str, str], +) -> None: + from typing import cast + + from benchflow.sdk import SDK + + sdk = SDK() + eff_model = _effective_model(agent, model) + result = asyncio.run( + sdk.run( + task_path=task_dir, + agent=agent, + model=eff_model, + environment=environment, + prompts=cast("list[str | None] | None", prompt), + agent_env=agent_env, + job_name="eval-create", + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=None if sandbox_user == "none" else sandbox_user, + ) + ) + reward = getattr(result, "reward", None) + err = getattr(result, "error", None) or getattr(result, "verifier_error", None) + console.print() + if err: + console.print(f"[red]failed:[/red] {err}") + raise typer.Exit(1) + console.print( + f"[bold]reward={reward}[/bold] tools={getattr(result, 'n_tool_calls', 0)}" + ) + + +def _run_batch( + *, + tasks_dir: Path, + agent: str, + model: str | None, + environment: str, + concurrency: int, + max_retries: int, + jobs_dir: str, + skills_dir: Path | None, + sandbox_user: str | 
None, + agent_env: dict[str, str], +) -> None: + from benchflow.job import Job, JobConfig, RetryConfig + + eff_model = _effective_model(agent, model) + config = JobConfig( + agent=agent, + model=eff_model, + environment=environment, + concurrency=concurrency, + retry=RetryConfig(max_retries=max_retries), + agent_env=agent_env, + sandbox_user=None if sandbox_user == "none" else sandbox_user, + skills_dir=str(skills_dir) if skills_dir else None, + ) + job = Job( + tasks_dir=tasks_dir, + jobs_dir=Path(jobs_dir), + config=config, + ) + result = asyncio.run(job.run()) + console.print() + console.print( + f"[bold]{result.passed}/{result.total} " + f"({result.score:.1%})[/bold] errors={result.errored}" + ) + + +@app.command("list") +def eval_list( + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o", help="Results directory to scan"), + ] = "jobs", + limit: Annotated[ + int, + typer.Option("--limit", "-n", help="Max rows to show"), + ] = 20, +) -> None: + """List recent eval runs by scanning the jobs/ dir.""" + + root = Path(jobs_dir) + if not root.exists(): + console.print(f"[yellow]{root} does not exist yet.[/yellow]") + return + runs: list[tuple[str, int, int]] = [] + for job_root in root.iterdir(): + if not job_root.is_dir(): + continue + for stamp in job_root.iterdir(): + if not stamp.is_dir(): + continue + trials = list(stamp.iterdir()) + passed = 0 + total = 0 + for trial in trials: + result = trial / "result.json" + if not result.exists(): + continue + total += 1 + try: + import json + + data = json.loads(result.read_text()) + if (data.get("rewards") or {}).get("reward") == 1.0: + passed += 1 + except Exception: + continue + if total: + runs.append((f"{job_root.name}/{stamp.name}", passed, total)) + runs.sort(reverse=True) + table = Table(title=f"Recent evals in {root}") + table.add_column("run", style="cyan") + table.add_column("passed", justify="right", style="green") + table.add_column("total", justify="right") + table.add_column("rate", 
justify="right", style="yellow") + for run, passed, total in runs[:limit]: + rate = f"{100 * passed / total:.0f}%" if total else "-" + table.add_row(run, str(passed), str(total), rate) + console.print(table) + + +@app.command("retrieve") +def eval_retrieve( + trial_name: Annotated[ + str, typer.Argument(help="Trial dir name, e.g. my-task__abc") + ], + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o"), + ] = "jobs", +) -> None: + """Print the result.json for a specific trial.""" + + import json + + root = Path(jobs_dir) + matches = list(root.rglob(f"{trial_name}/result.json")) + if not matches: + console.print(f"[red]no trial named {trial_name!r} under {root}[/red]") + raise typer.Exit(1) + console.print(f"[dim]{matches[0]}[/dim]") + data = json.loads(matches[0].read_text()) + from rich import print_json + + print_json(data=data) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py new file mode 100644 index 0000000..5a009e7 --- /dev/null +++ b/tests/test_eval_cli.py @@ -0,0 +1,118 @@ +"""Tests for `bf eval` — the new eval-runner CLI. + +The create command takes a positional task reference and routes to either +a single SDK.run or a batch Job based on whether the reference is a single +task dir or a directory of task dirs. We don't exercise the actual runner +here (that's the parity tests' job); we only verify the resolver. 
+""" + +from __future__ import annotations + +from pathlib import Path + +import click.exceptions +import pytest + +from benchflow.cli.eval import _resolve_task_ref + + +def _make_task_dir(parent: Path, name: str) -> Path: + d = parent / name + d.mkdir() + (d / "task.toml").write_text('schema_version = "1.1"\n') + (d / "instruction.md").write_text("instruction\n") + (d / "environment").mkdir() + (d / "tests").mkdir() + (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n") + return d + + +def test_resolve_single_task_dir(tmp_path: Path) -> None: + task = _make_task_dir(tmp_path, "my-task") + resolved, is_batch = _resolve_task_ref(str(task)) + assert resolved == task.resolve() + assert is_batch is False + + +def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None: + _make_task_dir(tmp_path, "task-a") + _make_task_dir(tmp_path, "task-b") + resolved, is_batch = _resolve_task_ref(str(tmp_path)) + assert resolved == tmp_path.resolve() + assert is_batch is True + + +def test_resolve_missing_path_exits(tmp_path: Path) -> None: + with pytest.raises(click.exceptions.Exit): + _resolve_task_ref(str(tmp_path / "does-not-exist")) + + +def test_resolve_harbor_ref_routes_through_ensure_tasks( + tmp_path: Path, monkeypatch +) -> None: + """`harbor://` resolves to the cached dataset directory (batch).""" + + fake_dataset = tmp_path / "cached-aime" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "aime_60") + _make_task_dir(fake_dataset, "aime_61") + + captured: dict[str, str] = {} + + def fake_ensure_tasks(ref: str) -> Path: + captured["ref"] = ref + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("harbor://aime@1.0") + assert captured["ref"] == "harbor://aime@1.0" + assert resolved == fake_dataset + assert is_batch is True + + +def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None: + """`harbor:///` resolves 
to a single task from the dataset.""" + + fake_dataset = tmp_path / "cached-aime" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "aime_60") + _make_task_dir(fake_dataset, "aime_61") + + def fake_ensure_tasks(ref: str) -> Path: + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60") + assert resolved == fake_dataset / "aime_60" + assert is_batch is False + + +def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None: + """`benchflow://` works identically to `harbor://`.""" + + fake_dataset = tmp_path / "cached-benchjack" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "exploit-1") + + def fake_ensure_tasks(ref: str) -> Path: + assert ref == "benchflow://benchjack" + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("benchflow://benchjack") + assert resolved == fake_dataset + assert is_batch is True + + +def test_resolve_non_task_dir_exits(tmp_path: Path) -> None: + """A directory with no task.toml children errors cleanly.""" + + (tmp_path / "not-a-task").mkdir() + with pytest.raises(click.exceptions.Exit): + _resolve_task_ref(str(tmp_path)) diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 1a5043a..142c063 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -29,27 +29,24 @@ from typer.testing import CliRunner -class TestOrphanRemoval: - """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI). +class TestEvalModuleNotWiredIntoCLI: + """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI. - Deletion was safe because nothing wired it up. 
These tests make sure - the orphan stays gone — re-introducing it as a parallel `eval_create` - is what caused PR #173 to land its fix in dead code. + The live `bench eval create` dispatches to cli/main.py:eval_create. + cli/eval.py is a standalone module (with its own resolver logic and tests) + but must not be imported by the CLI entry-point code — doing so is what + caused PR #173 to land its fix in dead code. """ - def test_cli_eval_module_is_gone(self): - with pytest.raises(ModuleNotFoundError): - importlib.import_module("benchflow.cli.eval") - - def test_no_source_file_imports_the_orphan(self): - """Catch any future import of `benchflow.cli.eval` from src/.""" - src = Path(__file__).resolve().parent.parent / "src" / "benchflow" - offenders = [ - p - for p in src.rglob("*.py") - if "benchflow.cli.eval" in p.read_text() - ] - assert not offenders, f"Orphan import re-introduced in: {offenders}" + def test_cli_main_does_not_import_cli_eval(self): + """cli/main.py must not import from cli/eval — they are separate.""" + main_py = ( + Path(__file__).resolve().parent.parent + / "src" / "benchflow" / "cli" / "main.py" + ) + text = main_py.read_text() + assert "from benchflow.cli.eval" not in text + assert "import benchflow.cli.eval" not in text class TestEvalCreateRouting: From bc04c59920a6a4608932f39fe9c2172c0b38d6ba Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:54:40 -0700 Subject: [PATCH 06/12] docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI --- src/benchflow/cli/eval.py | 12 +++++++----- tests/test_eval_cli.py | 10 +++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index ea7af43..58b410f 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -1,7 +1,12 @@ """`bf eval` — the benchflow eval-runner command group. -The future-facing entry point for running evaluations. 
Anthropic-style shape: -resource creation, one command, return the result or a job-id. +NOTE: This module is **not wired into the live CLI**. The active +``bench eval create`` command dispatches to ``cli/main.py:eval_create``. +This file is kept as the future-facing design for the eval sub-command +and must not be imported by ``cli/main.py`` (see +``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``). + +Design shape — Anthropic-style resource creation: bf eval create [flags] One-shot eval — creates an Agent + Environment + Trajectory under @@ -14,9 +19,6 @@ bf eval list Show recent eval runs (reads the jobs/ dir) bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. """ from __future__ import annotations diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py index 5a009e7..dfcb7db 100644 --- a/tests/test_eval_cli.py +++ b/tests/test_eval_cli.py @@ -1,9 +1,9 @@ -"""Tests for `bf eval` — the new eval-runner CLI. +"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module. -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. +NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active +``bench eval create`` lives in ``cli/main.py``). These tests cover the +task-reference resolver (``_resolve_task_ref``) which will be used once +the module is promoted to the live entry point. 
""" from __future__ import annotations From 9459a23b4214cedf1f23db49c59b59bf28cd232b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 12:52:48 -0700 Subject: [PATCH 07/12] docs(plan): add plan to fix sandbox io problem --- ...6-04-22-sandbox-setup-shared-tool-paths.md | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md diff --git a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md new file mode 100644 index 0000000..b6a6e35 --- /dev/null +++ b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md @@ -0,0 +1,133 @@ +# Sandbox Setup Shared Tool Paths Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Eliminate per-trial copying of heavyweight tool installations during `setup_sandbox_user()` while keeping sandboxed agents able to read configs, credentials, and installed binaries. + +**Architecture:** Treat tool installations as shared container state, not per-user home state. Keep `setup_sandbox_user()` responsible only for creating the non-root user, preparing small home-scoped config/credential directories, and granting workspace ownership. Add a narrow compatibility path for root-home installs that BenchFlow cannot relocate itself. + +**Tech Stack:** Python 3.12, pytest, Harbor sandbox env exec, BenchFlow agent registry / sandbox setup. + +--- + +## Recommendation + +Pick **alternative 2: shared install paths** as the primary direction, with a **small compatibility fallback from alternative 1** for existing images that already install tools under `/root`. + +Why this is the best fit: + +- `setup_sandbox_user()` currently copies `/root/.nvm`, `/root/.local/bin`, and every derived home dir. 
That is the direct cause of the timeout. +- BenchFlow already installs some helpers into shared locations like `/usr/local/bin`, which matches the right ownership boundary for binaries. +- Pre-creating the user in the Dockerfile (alternative 3) is useful for image authors, but it is not a sufficient product-level fix because BenchFlow must still support arbitrary user-provided images. +- Pure symlinking of `/root` home directories (alternative 1) would reduce I/O, but it keeps the wrong model: heavyweight tool installs still live in a root-only home and `setup_sandbox_user()` still has to know about them. + +So the implementation target should be: + +- **Primary path:** stop relying on root-home installs for agent binaries and shared tooling. +- **Compatibility path:** if an existing image already has a heavy tool dir in `/root` and BenchFlow needs it, link the minimum necessary path instead of copying it. + +## File Map + +- Modify: `src/benchflow/_sandbox.py` + - Remove heavyweight recursive copies from `setup_sandbox_user()`. + - Add a minimal compatibility helper for sharing root-owned tool paths without recursive copy. +- Modify: `src/benchflow/agents/registry.py` + - Clarify the contract for `home_dirs` / `get_sandbox_home_dirs()` so it covers user config, not shared tool installs. +- Modify: `src/benchflow/_agent_setup.py` + - Ensure the install path assumptions match the new sandbox contract. +- Modify: `tests/test_sandbox.py` + - Add contract tests for which home dirs are copied versus treated as shared tooling. +- Create or modify: `tests/test_sandbox_setup.py` + - Add targeted tests for the generated `setup_sandbox_user()` command. +- Modify: `docs/task-authoring.md` + - Document preferred shared install locations for benchmark/task images. +- Modify: `docs/api-reference.md` + - Update the sandbox setup description to match the new behavior. 
+ +### Task 1: Lock the new sandbox contract in tests + +**Files:** + +- Modify: `tests/test_sandbox.py` +- Create: `tests/test_sandbox_setup.py` +- Modify: `src/benchflow/_sandbox.py` + +- [ ] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. +- [ ] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. +- [ ] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. +- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` +- [ ] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. + +### Task 2: Narrow `setup_sandbox_user()` to user state only + +**Files:** + +- Modify: `src/benchflow/_sandbox.py:103-128` +- Test: `tests/test_sandbox_setup.py` + +- [ ] Refactor `setup_sandbox_user()` so it does only three things: + - create the sandbox user if missing, + - prepare only small home-scoped directories BenchFlow actually needs for config/auth, + - grant ownership to the workspace. +- [ ] Remove the unconditional copy of `/root/.nvm`. +- [ ] Remove the special-case copy of `/root/.local/bin`. +- [ ] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. +- [ ] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. +- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` +- [ ] Expected: the new setup contract tests pass. 
+ +### Task 3: Align registry semantics with the new contract + +**Files:** + +- Modify: `src/benchflow/agents/registry.py:338-366` +- Modify: `tests/test_sandbox.py` + +- [ ] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. +- [ ] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: + - if BenchFlow still needs `.local/share` or similar user-scoped state, keep it but stop special-casing `.local/bin` in `_sandbox.py`; + - if BenchFlow only needed `.local` for tool binaries, remove `.local` from the always-include set and update tests accordingly. +- [ ] Prefer the more minimal option based on the actual credential/skill paths in the registry. +- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` +- [ ] Expected: registry contract tests pass with updated semantics. + +### Task 4: Verify agent install assumptions still hold + +**Files:** + +- Modify: `src/benchflow/_agent_setup.py` +- Modify: `src/benchflow/agents/registry.py` +- Test: `tests/test_sandbox_setup.py` + +- [ ] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. +- [ ] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. +- [ ] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. +- [ ] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. 
+
+### Task 5: Document the benchmark image guidance
+
+**Files:**
+
+- Modify: `docs/task-authoring.md`
+- Modify: `docs/api-reference.md`
+
+- [ ] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user.
+- [ ] Document that pre-creating the sandbox user in the Dockerfile is an optional optimization, not the primary compatibility mechanism.
+- [ ] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning.
+
+### Task 6: End-to-end verification
+
+**Files:**
+
+- No code changes required unless verification exposes a gap.
+
+- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q`
+- [ ] Run: `.venv/bin/ty check src/`
+- [ ] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after.
+- [ ] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial.
+
+## Notes / Risks
+
+- The collaborator comment on the issue asks to try shared install paths first. This plan follows that request as the architectural choice.
+- Shared install paths alone cannot retrofit arbitrary existing benchmark images that already baked tools into `/root`. That is why the plan keeps a narrow symlink-based compatibility path instead of a full copy.
+- The key review question before implementation is whether `.local` remains a true user-state directory in BenchFlow, or whether it only existed because of previous tool-install assumptions.
From f18bab6ed06707119df83fc8c160c61916bc9638 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 13:31:01 -0700 Subject: [PATCH 08/12] test: lock sandbox setup contract Plan step 1/6: Lock the new sandbox contract in tests --- tests/test_sandbox.py | 5 +++ tests/test_sandbox_setup.py | 62 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 tests/test_sandbox_setup.py diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 4c7f9f2..1954ac6 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -36,6 +36,11 @@ def test_always_includes_local(self): dirs = get_sandbox_home_dirs() assert ".local" in dirs + def test_only_includes_top_level_home_dirs(self): + """Derived entries stay at $HOME top-level, not nested tool subpaths.""" + dirs = get_sandbox_home_dirs() + assert ".local/bin" not in dirs + def test_new_agent_auto_included(self): """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" AGENTS["_test_agent"] = AgentConfig( diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py new file mode 100644 index 0000000..96cde3b --- /dev/null +++ b/tests/test_sandbox_setup.py @@ -0,0 +1,62 @@ +"""Focused contract tests for setup_sandbox_user() shell command generation.""" + +import re +import shlex +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._sandbox import setup_sandbox_user + + +async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(stdout="", stderr="", exit_code=0)) + + await setup_sandbox_user(env, sandbox_user, workspace) + + env.exec.assert_awaited_once() + return env.exec.call_args.args[0], env.exec.call_args.kwargs + + +def _assert_explicit_symlink(cmd: str, *, source: str, dest: str) -> None: + """Heavy tool dirs must use an explicit symlink compatibility path.""" + assert re.search( + rf"ln -s(?:f|[a-zA-Z-])* 
[\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?", + cmd, + ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" + + +class TestSetupSandboxUser: + @pytest.mark.asyncio + async def test_setup_command_avoids_recursive_root_tool_copies(self): + """Heavy root-owned tool dirs should no longer be recursively copied.""" + cmd, kwargs = await _run_setup_sandbox_user() + + assert "cp -aL /root/.local/bin/." not in cmd + assert "cp -a /root/.nvm/." not in cmd + assert kwargs["timeout_sec"] == 120 + + @pytest.mark.asyncio + async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspace(self): + """The non-copy setup contract still creates the user and grants access.""" + cmd, _ = await _run_setup_sandbox_user() + + assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd + assert "mkdir -p /home/agent/.local/bin" in cmd + assert "chown -R agent:agent /home/agent" in cmd + assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd + + @pytest.mark.asyncio + async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): + """Heavy root-owned tool dirs should use explicit symlinks, not duplication.""" + cmd, _ = await _run_setup_sandbox_user() + + _assert_explicit_symlink( + cmd, + source="/root/.local/bin", + dest="/home/agent/.local/bin", + ) + _assert_explicit_symlink(cmd, source="/root/.nvm", dest="/home/agent/.nvm") + assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd + assert "cp -a /root/.nvm/. 
/home/agent/.nvm/" not in cmd From 0f9b320135672cca1bf2d5f008d9954ea7ed0f2b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 13:41:16 -0700 Subject: [PATCH 09/12] fix: stop copying root tool installs into sandbox home Plan step 2/6: Narrow setup_sandbox_user() to user state only --- src/benchflow/_sandbox.py | 33 ++++++++++++++++++++++---------- tests/test_sandbox_setup.py | 38 ++++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 41ba87b..dbc9a1b 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -1,7 +1,7 @@ """Sandbox user setup, path lockdown, and verifier hardening. Owns the "agent runs as non-root" lifecycle: - - Creating the sandbox user and copying root's tooling into its home + - Creating the sandbox user and preparing minimal home state it needs - Building the privilege-drop wrapper (setpriv / su) for agent launch - Locking down solution/test paths so the sandbox user cannot read them - Hardening the environment before the verifier runs @@ -100,6 +100,20 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: ) +def _legacy_root_tool_link_cmd(source: str, dest: str) -> str: + """Link legacy root-only tool dirs into the sandbox home when needed.""" + src = shlex.quote(source) + dst = shlex.quote(dest) + parent = shlex.quote(str(Path(dest).parent)) + return ( + f"if [ -e {src} ] && [ ! 
-L {dst} ]; then " + f"mkdir -p {parent} && " + f"rmdir {dst} 2>/dev/null || true; " + f"[ -e {dst} ] || ln -s {src} {dst}; " + "fi" + ) + + async def setup_sandbox_user( env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120 ) -> str: @@ -109,18 +123,17 @@ async def setup_sandbox_user( f"Invalid sandbox_user: {sandbox_user!r} (must be alphanumeric)" ) logger.info(f"Setting up sandbox user: {sandbox_user}") + home = f"/home/{sandbox_user}" + home_dirs = sorted(d for d in get_sandbox_home_dirs() if d != ".local") await env.exec( f"id -u {sandbox_user} >/dev/null 2>&1 || " f"useradd -m -s /bin/bash {sandbox_user} && " - f"mkdir -p /home/{sandbox_user}/.local/bin && " - "if [ -d /root/.local/bin ]; then " - f"cp -aL /root/.local/bin/. /home/{sandbox_user}/.local/bin/ 2>/dev/null || true; fi && " - "if [ -d /root/.nvm ]; then " - f"cp -a /root/.nvm/. /home/{sandbox_user}/.nvm/ 2>/dev/null || true; fi && " - f"for d in {' '.join(sorted(get_sandbox_home_dirs()))}; do " - f"if [ -d /root/$d ]; then mkdir -p /home/{sandbox_user}/$d && " - f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && " - f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && " + f"{_legacy_root_tool_link_cmd('/root/.local/bin', f'{home}/.local/bin')} && " + f"{_legacy_root_tool_link_cmd('/root/.nvm', f'{home}/.nvm')} && " + f"for d in {' '.join(home_dirs)}; do " + f"if [ -d /root/$d ]; then mkdir -p {home}/$d && " + f"cp -a /root/$d/. 
{home}/$d/ 2>/dev/null || true; fi; done && " + f"chown -R {sandbox_user}:{sandbox_user} {home} && " f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}", timeout_sec=timeout_sec, ) diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py index 96cde3b..f835ab6 100644 --- a/tests/test_sandbox_setup.py +++ b/tests/test_sandbox_setup.py @@ -7,6 +7,7 @@ import pytest from benchflow._sandbox import setup_sandbox_user +from benchflow.agents.registry import get_sandbox_home_dirs async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): @@ -19,14 +20,21 @@ async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str return env.exec.call_args.args[0], env.exec.call_args.kwargs -def _assert_explicit_symlink(cmd: str, *, source: str, dest: str) -> None: - """Heavy tool dirs must use an explicit symlink compatibility path.""" +def _assert_conditional_legacy_symlink(cmd: str, *, source: str, dest: str) -> None: + """Legacy tool dirs should link only when a root-only install exists.""" assert re.search( - rf"ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?", + rf"if \[ -e [\"']?{re.escape(source)}[\"']? \].*ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? 
[\"']?{re.escape(dest)}[\"']?.*fi", cmd, ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" +def _get_copy_loop_dirs(cmd: str) -> list[str]: + """Extract the general home-dir copy loop payload from the shell command.""" + match = re.search(r"for d in (?P.*?); do", cmd) + assert match, f"expected general home-dir copy loop in setup command: {cmd}" + return match.group("dirs").split() + + class TestSetupSandboxUser: @pytest.mark.asyncio async def test_setup_command_avoids_recursive_root_tool_copies(self): @@ -43,20 +51,36 @@ async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspa cmd, _ = await _run_setup_sandbox_user() assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd - assert "mkdir -p /home/agent/.local/bin" in cmd + assert "mkdir -p /home/agent/.local/bin" not in cmd assert "chown -R agent:agent /home/agent" in cmd assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd @pytest.mark.asyncio async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): - """Heavy root-owned tool dirs should use explicit symlinks, not duplication.""" + """Legacy root-only tool dirs should use conditional symlinks, not duplication.""" cmd, _ = await _run_setup_sandbox_user() - _assert_explicit_symlink( + _assert_conditional_legacy_symlink( cmd, source="/root/.local/bin", dest="/home/agent/.local/bin", ) - _assert_explicit_symlink(cmd, source="/root/.nvm", dest="/home/agent/.nvm") + _assert_conditional_legacy_symlink( + cmd, source="/root/.nvm", dest="/home/agent/.nvm" + ) assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd assert "cp -a /root/.nvm/. 
/home/agent/.nvm/" not in cmd + + @pytest.mark.asyncio + async def test_setup_command_copy_loop_excludes_local_dir(self): + """General home-dir copying should narrow to small config/auth dirs only.""" + cmd, _ = await _run_setup_sandbox_user() + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + + assert copy_loop_dirs == sorted( + d for d in get_sandbox_home_dirs() if d != ".local" + ) + assert ".local" not in copy_loop_dirs + assert "mkdir -p /home/agent/$d" in cmd + assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd From 13f06b716a3c2833d89a4c4d0acd6d181b8e61cc Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 13:47:14 -0700 Subject: [PATCH 10/12] refactor: derive sandbox home dirs from registry config Plan step 3/6: Align registry semantics with the new contract --- src/benchflow/agents/registry.py | 9 ++++---- tests/test_sandbox.py | 38 ++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index f1d26bb..b809b39 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -336,16 +336,15 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: - """Collect all dot-dirs under $HOME that sandbox user setup should copy. + """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. - Derives from three sources across all registered agents: + Derives from four sources across all registered agents: - skill_paths: $HOME/.foo/... → ".foo" - credential_files: {home}/.foo/... → ".foo" + - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") - - Always includes ".local" (pip scripts, etc.). 
""" - dirs: set[str] = {".local"} + dirs: set[str] = set() for cfg in AGENTS.values(): for sp in cfg.skill_paths: if sp.startswith("$HOME/."): diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 1954ac6..cebda60 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -1,8 +1,10 @@ -"""Tests for sandbox user directory derivation from agent registry.""" +"""Tests for sandbox user config/auth directory derivation from agent registry.""" from benchflow.agents.registry import ( AGENTS, AgentConfig, + HostAuthFile, + SubscriptionAuth, get_sandbox_home_dirs, ) @@ -31,16 +33,21 @@ def test_home_dirs_included(self): # openclaw has home_dirs=[".openclaw"] assert ".openclaw" in dirs - def test_always_includes_local(self): - """.local is always in the dir list.""" + def test_does_not_include_legacy_local_tool_dir(self): + """.local is not included unless an agent registry path derives it.""" dirs = get_sandbox_home_dirs() - assert ".local" in dirs + assert ".local" not in dirs def test_only_includes_top_level_home_dirs(self): """Derived entries stay at $HOME top-level, not nested tool subpaths.""" dirs = get_sandbox_home_dirs() assert ".local/bin" not in dirs + def test_dirs_represent_registry_backed_home_config_or_auth(self): + """Returned dirs are registry-derived user home config/auth roots.""" + dirs = get_sandbox_home_dirs() + assert dirs == {".agents", ".claude", ".codex", ".gemini", ".openclaw", ".pi"} + def test_new_agent_auto_included(self): """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" AGENTS["_test_agent"] = AgentConfig( @@ -55,6 +62,29 @@ def test_new_agent_auto_included(self): finally: del AGENTS["_test_agent"] + def test_subscription_auth_file_dirs_included(self): + """Dirs from subscription_auth.files container paths are included.""" + AGENTS["_test_agent_subscription_auth"] = AgentConfig( + name="_test_agent_subscription_auth", + install_cmd="true", + launch_cmd="true", + 
subscription_auth=SubscriptionAuth( + replaces_env="TEST_API_KEY", + detect_file="~/.subauth/login.json", + files=[ + HostAuthFile( + "~/.subauth/login.json", + "{home}/.subauth/login.json", + ) + ], + ), + ) + try: + dirs = get_sandbox_home_dirs() + assert ".subauth" in dirs + finally: + del AGENTS["_test_agent_subscription_auth"] + def test_workspace_paths_excluded(self): """$WORKSPACE paths are not included (only $HOME paths).""" dirs = get_sandbox_home_dirs() From df07322b36a708e607fce809978bf5139c495553 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 14:56:45 -0700 Subject: [PATCH 11/12] refactor: symlink skills into sandbox, enforce shared install prefixes Replace per-trial skill-tree copies with ln -sfn into a shared /skills (or task skills_dir) root, drop skill_paths from get_sandbox_home_dirs(), and add registry + sandbox-setup invariants that keep agent binaries on /usr/local/* rather than /root-only home paths. Updates task-authoring and api-reference docs to describe the new lightweight sandbox contract. 
--- docs/api-reference.md | 4 + ...6-04-22-sandbox-setup-shared-tool-paths.md | 56 +++---- docs/task-authoring.md | 2 + src/benchflow/_agent_setup.py | 80 +++++++--- src/benchflow/agents/registry.py | 10 +- tests/test_agent_setup.py | 145 ++++++++++++++++++ tests/test_registry_invariants.py | 16 ++ tests/test_sandbox.py | 24 ++- tests/test_sandbox_setup.py | 20 +++ 9 files changed, 285 insertions(+), 72 deletions(-) create mode 100644 tests/test_agent_setup.py diff --git a/docs/api-reference.md b/docs/api-reference.md index 4e871fa..e09d142 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -122,6 +122,10 @@ Trial.run() ├─ setup() — resolve config, create env object ├─ start() — spin up sandbox, upload task files, start services ├─ install_agent() — install agent binary, credentials, sandbox user + │ (sandbox user setup: create non-root user, prepare + │ small config/auth dirs, chown the workspace — no + │ recursive copy of /root tool trees; agent binaries + │ must live on shared prefixes like /usr/local/bin) ├─ for scene in scenes: │ └─ _run_scene(scene) │ ├─ connect_as(role) — open ACP session for this role diff --git a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md index b6a6e35..9fc49ea 100644 --- a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md +++ b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md @@ -52,11 +52,11 @@ So the implementation target should be: - Create: `tests/test_sandbox_setup.py` - Modify: `src/benchflow/_sandbox.py` -- [ ] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. -- [ ] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. 
-- [ ] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. -- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` -- [ ] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. +- [x] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. +- [x] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. +- [x] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. +- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` +- [x] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. ### Task 2: Narrow `setup_sandbox_user()` to user state only @@ -65,16 +65,16 @@ So the implementation target should be: - Modify: `src/benchflow/_sandbox.py:103-128` - Test: `tests/test_sandbox_setup.py` -- [ ] Refactor `setup_sandbox_user()` so it does only three things: +- [x] Refactor `setup_sandbox_user()` so it does only three things: - create the sandbox user if missing, - prepare only small home-scoped directories BenchFlow actually needs for config/auth, - grant ownership to the workspace. -- [ ] Remove the unconditional copy of `/root/.nvm`. -- [ ] Remove the special-case copy of `/root/.local/bin`. -- [ ] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. 
-- [ ] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. -- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` -- [ ] Expected: the new setup contract tests pass. +- [x] Remove the unconditional copy of `/root/.nvm`. +- [x] Remove the special-case copy of `/root/.local/bin`. +- [x] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. +- [x] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. +- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` +- [x] Expected: the new setup contract tests pass. ### Task 3: Align registry semantics with the new contract @@ -83,13 +83,13 @@ So the implementation target should be: - Modify: `src/benchflow/agents/registry.py:338-366` - Modify: `tests/test_sandbox.py` -- [ ] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. -- [ ] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: +- [x] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. +- [x] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: - if BenchFlow still needs `.local/share` or similar user-scoped state, keep it but stop special-casing `.local/bin` in `_sandbox.py`; - if BenchFlow only needed `.local` for tool binaries, remove `.local` from the always-include set and update tests accordingly. -- [ ] Prefer the more minimal option based on the actual credential/skill paths in the registry. 
-- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` -- [ ] Expected: registry contract tests pass with updated semantics. +- [x] Prefer the more minimal option based on the actual credential/skill paths in the registry. +- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` +- [x] Expected: registry contract tests pass with updated semantics. ### Task 4: Verify agent install assumptions still hold @@ -99,10 +99,10 @@ So the implementation target should be: - Modify: `src/benchflow/agents/registry.py` - Test: `tests/test_sandbox_setup.py` -- [ ] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. -- [ ] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. -- [ ] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. -- [ ] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. +- [x] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. +- [x] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. +- [x] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. +- [x] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. 
### Task 5: Document the benchmark image guidance @@ -111,9 +111,9 @@ So the implementation target should be: - Modify: `docs/task-authoring.md` - Modify: `docs/api-reference.md` -- [ ] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user. -- [ ] Document that pre-creating the sandbox user in the Dockerfile is optional optimization, not the primary compatibility mechanism. -- [ ] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning. +- [x] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user. +- [x] Document that pre-creating the sandbox user in the Dockerfile is optional optimization, not the primary compatibility mechanism. +- [x] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning. ### Task 6: End-to-end verification @@ -121,10 +121,10 @@ So the implementation target should be: - No code changes required unless verification exposes a gap. -- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q` -- [ ] Run: `.venv/bin/ty check src/` -- [ ] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after. -- [ ] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial. 
+- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q` +- [x] Run: `.venv/bin/ty check src/` +- [x] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after. +- [x] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial. ## Notes / Risks diff --git a/docs/task-authoring.md b/docs/task-authoring.md index adfb641..9081987 100644 --- a/docs/task-authoring.md +++ b/docs/task-authoring.md @@ -50,6 +50,8 @@ env = { OPENAI_API_KEY = "${OPENAI_API_KEY}" } # host vars to injec **Built-in mock services** — if the Dockerfile references a service binary (`claw-gmail`, `claw-slack`, `claw-gcal`, `claw-gdoc`, `claw-gdrive`), BenchFlow starts it automatically. No `[services]` section needed. +**Install tooling to shared prefixes, not `/root`** — when a task image ships Node.js, Python tools, or agent binaries that the sandbox user must execute, install them to `/usr/local/bin`, `/usr/local/lib`, or `/opt`, not `/root/.nvm` or `/root/.local/bin`. `setup_sandbox_user()` creates the non-root user, prepares small config/auth dirs, and chowns the workspace — it does not clone `/root` into the sandbox home. Legacy images that already install tools under `/root` still work via a narrow symlink fallback, but shared prefixes are the supported path. Pre-creating the sandbox user in the Dockerfile is an optional speedup, not a requirement. 
+ --- ## instruction.md diff --git a/src/benchflow/_agent_setup.py b/src/benchflow/_agent_setup.py index 39b1720..5db5308 100644 --- a/src/benchflow/_agent_setup.py +++ b/src/benchflow/_agent_setup.py @@ -30,6 +30,47 @@ logger = logging.getLogger(__name__) +def _skill_link_cmd(source: str, dest: str) -> str: + """Link a shared skills tree into an agent discovery path.""" + if source == dest: + return f"mkdir -p {shlex.quote(dest)}" + + parent = shlex.quote(str(Path(dest).parent)) + q_source = shlex.quote(source) + q_dest = shlex.quote(dest) + return ( + f"mkdir -p {parent} && " + f"rm -rf {q_dest} && " + f"ln -sfn {q_source} {q_dest}" + ) + + +async def _link_skill_paths(env, source: str, skill_paths: list[str], home: str, cwd: str) -> int: + """Link one shared skills tree into each configured discovery path.""" + parts = [] + for sp in skill_paths: + expanded = sp.replace("$HOME", home).replace("$WORKSPACE", cwd) + parts.append(_skill_link_cmd(source, expanded)) + if parts: + cmd = " && ".join(parts) + result = await env.exec(cmd, timeout_sec=15) + if result.return_code != 0: + stdout = (getattr(result, "stdout", "") or "").strip() + stderr = (getattr(result, "stderr", "") or "").strip() + details = [ + f"exit code {result.return_code}", + f"command: {cmd}", + ] + if stdout: + details.append(f"stdout: {stdout}") + if stderr: + details.append(f"stderr: {stderr}") + raise RuntimeError( + f"Failed to link skills from {source}: {'; '.join(details)}" + ) + return len(parts) + + async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: """Install agent in sandbox and return its config.""" agent_base = agent.split()[0] @@ -72,6 +113,9 @@ async def deploy_skills( task: "Task", ) -> None: """Deploy and distribute skills into sandbox.""" + task_skills_dir = task.config.environment.skills_dir + effective_skills = task_skills_dir + # Runtime upload (fallback if not baked into Dockerfile) if skills_dir: dockerfile = task_path / "environment" / 
"Dockerfile" @@ -84,38 +128,24 @@ async def deploy_skills( if skills_path.is_dir(): logger.info(f"Deploying skills via runtime upload from {skills_path}") await env.upload_dir(skills_path, "/skills") - if agent_cfg and agent_cfg.skill_paths: - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", "/root").replace( - "$WORKSPACE", "/app" - ) - parent = str(Path(expanded).parent) - parts.append( - f"mkdir -p '{parent}' && ln -sf /skills '{expanded}'" - ) - await env.exec(" && ".join(parts), timeout_sec=10) - logger.info("Skills deployed to /skills and symlinked") + logger.info("Skills deployed to /skills") + effective_skills = "/skills" else: logger.warning(f"Skills dir not found: {skills_path}") else: logger.info("Skills already injected via Dockerfile") # Distribute to agent-specific discovery paths - task_skills_dir = task.config.environment.skills_dir - effective_skills = "/skills" if skills_dir else task_skills_dir if effective_skills and agent_cfg and agent_cfg.skill_paths: home = f"/home/{sandbox_user}" if sandbox_user else "/root" - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", home).replace("$WORKSPACE", agent_cwd) - q_expanded = shlex.quote(expanded) - q_skills = shlex.quote(effective_skills) - parts.append( - f"mkdir -p {q_expanded} && cp -r {q_skills}/. 
{q_expanded}/ 2>/dev/null" - ) - if parts: - await env.exec("; ".join(parts), timeout_sec=15) + count = await _link_skill_paths( + env, + effective_skills, + agent_cfg.skill_paths, + home, + agent_cwd, + ) + if count: logger.info( - f"Skills distributed to {len(parts)} paths for {agent_cfg.name}" + f"Skills distributed to {count} paths for {agent_cfg.name}" ) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index b809b39..7e3702f 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -338,18 +338,16 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. - Derives from four sources across all registered agents: - - skill_paths: $HOME/.foo/... → ".foo" + Derives from three sources across all registered agents: - credential_files: {home}/.foo/... → ".foo" - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") + + Skill paths are excluded: deploy_skills() now links those paths directly to a + shared skills tree instead of relying on sandbox-home copies. """ dirs: set[str] = set() for cfg in AGENTS.values(): - for sp in cfg.skill_paths: - if sp.startswith("$HOME/."): - dirname = sp.removeprefix("$HOME/").split("/")[0] - dirs.add(dirname) for cf in cfg.credential_files: # path uses {home}/.foo/... 
placeholder path = cf.path diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py new file mode 100644 index 0000000..f277bfb --- /dev/null +++ b/tests/test_agent_setup.py @@ -0,0 +1,145 @@ +"""Tests for agent install and skill deployment setup helpers.""" + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._agent_setup import deploy_skills +from benchflow.agents.registry import AgentConfig + + +def _make_task(skills_dir: str | None): + return SimpleNamespace( + config=SimpleNamespace( + environment=SimpleNamespace( + skills_dir=skills_dir, + ) + ) + ) + + +@pytest.mark.asyncio +async def test_deploy_skills_symlinks_agent_skill_paths_instead_of_copying(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + cmd = env.exec.await_args.args[0] + assert "cp -r" not in cmd + assert " && " in cmd + assert ";" not in cmd + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in cmd + assert "ln -sfn /opt/benchflow/skills /app/skills" in cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_uploads_runtime_skills_and_links_shared_tree(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + skills_dir = tmp_path / "skills" + 
skills_dir.mkdir() + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=skills_dir, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_awaited_once_with(skills_dir, "/skills") + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert " && " in distributed_link_cmd + assert ";" not in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" in distributed_link_cmd + assert "/root/.agents/skills" not in distributed_link_cmd + assert "/app/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_falls_back_when_local_skills_dir_is_missing(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=tmp_path / "missing-skills", + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /opt/benchflow/skills /workspace/skills" in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" not in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_raises_when_skill_linking_fails(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=17, stdout="link failed")) + 
env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills"], + ) + + with pytest.raises(RuntimeError, match="Failed to link skills"): + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) diff --git a/tests/test_registry_invariants.py b/tests/test_registry_invariants.py index 023945b..8cb9fa3 100644 --- a/tests/test_registry_invariants.py +++ b/tests/test_registry_invariants.py @@ -75,6 +75,22 @@ def test_agent_collection_invariants(name, cfg): assert d.startswith("."), f"home_dirs entry {d!r} must start with '.'" +@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) +def test_agent_install_cmd_targets_shared_paths(name, cfg): + """Installed binaries must land in shared prefixes, not a root-only home. + + setup_sandbox_user() no longer recursively copies /root/.nvm or + /root/.local/bin into the sandbox home. If an install_cmd placed its + binary there, the sandbox user would silently lose access to the agent. 
+ """ + forbidden_binary_prefixes = ("/root/.nvm/", "/root/.local/bin/", "$HOME/.nvm/") + for prefix in forbidden_binary_prefixes: + assert prefix not in cfg.install_cmd, ( + f"{name!r} install_cmd writes under {prefix!r}; use /usr/local/bin " + f"or another shared prefix so the sandbox user inherits the tool" + ) + + @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) def test_agent_credential_and_subscription_auth(name, cfg): """Optional credential_files and subscription_auth structures.""" diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index cebda60..36a8c80 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -10,16 +10,12 @@ class TestSandboxDirs: - def test_dirs_derived_from_skill_paths(self): - """All $HOME skill_paths dirs from AGENTS registry are included.""" + def test_skill_only_dirs_not_included(self): + """Skill-only home dirs should not be copied during sandbox user setup.""" dirs = get_sandbox_home_dirs() - # claude-agent-acp has $HOME/.claude/skills - assert ".claude" in dirs - # gemini has $HOME/.gemini/skills - assert ".gemini" in dirs - # pi-acp has $HOME/.pi/agent/skills and $HOME/.agents/skills - assert ".pi" in dirs - assert ".agents" in dirs + + assert ".pi" not in dirs + assert ".agents" not in dirs def test_credential_file_dirs_included(self): """Dirs from credential_files paths are included.""" @@ -46,10 +42,12 @@ def test_only_includes_top_level_home_dirs(self): def test_dirs_represent_registry_backed_home_config_or_auth(self): """Returned dirs are registry-derived user home config/auth roots.""" dirs = get_sandbox_home_dirs() - assert dirs == {".agents", ".claude", ".codex", ".gemini", ".openclaw", ".pi"} + assert {".claude", ".codex", ".gemini", ".openclaw"}.issubset(dirs) + assert ".agents" not in dirs + assert ".pi" not in dirs - def test_new_agent_auto_included(self): - """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" + def 
test_new_agent_skill_path_not_auto_included(self): + """Skill-only home dirs should not become sandbox copy targets.""" AGENTS["_test_agent"] = AgentConfig( name="_test_agent", install_cmd="true", @@ -58,7 +56,7 @@ def test_new_agent_auto_included(self): ) try: dirs = get_sandbox_home_dirs() - assert ".newagent" in dirs + assert ".newagent" not in dirs finally: del AGENTS["_test_agent"] diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py index f835ab6..f81b392 100644 --- a/tests/test_sandbox_setup.py +++ b/tests/test_sandbox_setup.py @@ -84,3 +84,23 @@ async def test_setup_command_copy_loop_excludes_local_dir(self): assert ".local" not in copy_loop_dirs assert "mkdir -p /home/agent/$d" in cmd assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd + + @pytest.mark.asyncio + async def test_setup_command_does_not_copy_heavy_tool_trees_into_home(self): + """BenchFlow-installed agents must not rely on sandbox-home tool copies. + + Agent binaries are placed in /usr/local/bin by the registered install_cmd + values, so setup_sandbox_user() must not bulk-copy heavyweight tool trees + (e.g. /root/.nvm, /root/.local/bin) to make them executable for the user. + """ + cmd, _ = await _run_setup_sandbox_user() + + for heavy_source in ("/root/.nvm", "/root/.local/bin"): + assert f"cp -a {heavy_source}/." not in cmd + assert f"cp -aL {heavy_source}/." 
not in cmd + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + for heavy_dir in (".nvm", ".local"): + assert heavy_dir not in copy_loop_dirs, ( + f"sandbox copy loop must not include heavy tool dir {heavy_dir!r}" + ) From cbca4959b5da993d0e81f79328d5aa0e9b3b79e4 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 15:11:11 -0700 Subject: [PATCH 12/12] chore: remove completed sandbox plan doc --- ...6-04-22-sandbox-setup-shared-tool-paths.md | 133 ------------------ 1 file changed, 133 deletions(-) delete mode 100644 docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md diff --git a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md deleted file mode 100644 index 9fc49ea..0000000 --- a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md +++ /dev/null @@ -1,133 +0,0 @@ -# Sandbox Setup Shared Tool Paths Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Eliminate per-trial copying of heavyweight tool installations during `setup_sandbox_user()` while keeping sandboxed agents able to read configs, credentials, and installed binaries. - -**Architecture:** Treat tool installations as shared container state, not per-user home state. Keep `setup_sandbox_user()` responsible only for creating the non-root user, preparing small home-scoped config/credential directories, and granting workspace ownership. Add a narrow compatibility path for root-home installs that BenchFlow cannot relocate itself. - -**Tech Stack:** Python 3.12, pytest, Harbor sandbox env exec, BenchFlow agent registry / sandbox setup. 
- ---- - -## Recommendation - -Pick **alternative 2: shared install paths** as the primary direction, with a **small compatibility fallback from alternative 1** for existing images that already install tools under `/root`. - -Why this is the best fit: - -- `setup_sandbox_user()` currently copies `/root/.nvm`, `/root/.local/bin`, and every derived home dir. That is the direct cause of the timeout. -- BenchFlow already installs some helpers into shared locations like `/usr/local/bin`, which matches the right ownership boundary for binaries. -- Pre-creating the user in the Dockerfile (alternative 3) is useful for image authors, but it is not a sufficient product-level fix because BenchFlow must still support arbitrary user-provided images. -- Pure symlinking of `/root` home directories (alternative 1) would reduce I/O, but it keeps the wrong model: heavyweight tool installs still live in a root-only home and `setup_sandbox_user()` still has to know about them. - -So the implementation target should be: - -- **Primary path:** stop relying on root-home installs for agent binaries and shared tooling. -- **Compatibility path:** if an existing image already has a heavy tool dir in `/root` and BenchFlow needs it, link the minimum necessary path instead of copying it. - -## File Map - -- Modify: `src/benchflow/_sandbox.py` - - Remove heavyweight recursive copies from `setup_sandbox_user()`. - - Add a minimal compatibility helper for sharing root-owned tool paths without recursive copy. -- Modify: `src/benchflow/agents/registry.py` - - Clarify the contract for `home_dirs` / `get_sandbox_home_dirs()` so it covers user config, not shared tool installs. -- Modify: `src/benchflow/_agent_setup.py` - - Ensure the install path assumptions match the new sandbox contract. -- Modify: `tests/test_sandbox.py` - - Add contract tests for which home dirs are copied versus treated as shared tooling. 
-- Create or modify: `tests/test_sandbox_setup.py` - - Add targeted tests for the generated `setup_sandbox_user()` command. -- Modify: `docs/task-authoring.md` - - Document preferred shared install locations for benchmark/task images. -- Modify: `docs/api-reference.md` - - Update the sandbox setup description to match the new behavior. - -### Task 1: Lock the new sandbox contract in tests - -**Files:** - -- Modify: `tests/test_sandbox.py` -- Create: `tests/test_sandbox_setup.py` -- Modify: `src/benchflow/_sandbox.py` - -- [x] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. -- [x] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. -- [x] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` -- [x] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. - -### Task 2: Narrow `setup_sandbox_user()` to user state only - -**Files:** - -- Modify: `src/benchflow/_sandbox.py:103-128` -- Test: `tests/test_sandbox_setup.py` - -- [x] Refactor `setup_sandbox_user()` so it does only three things: - - create the sandbox user if missing, - - prepare only small home-scoped directories BenchFlow actually needs for config/auth, - - grant ownership to the workspace. -- [x] Remove the unconditional copy of `/root/.nvm`. -- [x] Remove the special-case copy of `/root/.local/bin`. -- [x] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. 
-- [x] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` -- [x] Expected: the new setup contract tests pass. - -### Task 3: Align registry semantics with the new contract - -**Files:** - -- Modify: `src/benchflow/agents/registry.py:338-366` -- Modify: `tests/test_sandbox.py` - -- [x] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. -- [x] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: - - if BenchFlow still needs `.local/share` or similar user-scoped state, keep it but stop special-casing `.local/bin` in `_sandbox.py`; - - if BenchFlow only needed `.local` for tool binaries, remove `.local` from the always-include set and update tests accordingly. -- [x] Prefer the more minimal option based on the actual credential/skill paths in the registry. -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` -- [x] Expected: registry contract tests pass with updated semantics. - -### Task 4: Verify agent install assumptions still hold - -**Files:** - -- Modify: `src/benchflow/_agent_setup.py` -- Modify: `src/benchflow/agents/registry.py` -- Test: `tests/test_sandbox_setup.py` - -- [x] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. -- [x] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. -- [x] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. 
-- [x] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. - -### Task 5: Document the benchmark image guidance - -**Files:** - -- Modify: `docs/task-authoring.md` -- Modify: `docs/api-reference.md` - -- [x] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user. -- [x] Document that pre-creating the sandbox user in the Dockerfile is optional optimization, not the primary compatibility mechanism. -- [x] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning. - -### Task 6: End-to-end verification - -**Files:** - -- No code changes required unless verification exposes a gap. - -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q` -- [x] Run: `.venv/bin/ty check src/` -- [x] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after. -- [x] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial. - -## Notes / Risks - -- The collaborator comment on the issue asks to try shared install paths first. This plan follows that request as the architectural choice. -- Shared install paths alone cannot retrofit arbitrary existing benchmark images that already baked tools into `/root`. That is why the plan keeps a narrow symlink-based compatibility path instead of a full copy. -- The key review question before implementation is whether `.local` remains a true user-state directory in BenchFlow, or whether it only existed because of previous tool-install assumptions.