From 30dbe215c55d444f5f6cef285c2c85b74fcbeded Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 14:15:30 -0700 Subject: [PATCH 01/12] fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. --- src/benchflow/_agent_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..5609e6c 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,7 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model From 360c460a33e4758325611a7a4bdd4b0a599a8eb6 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 14:37:13 -0700 Subject: [PATCH 02/12] fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. 
--- src/benchflow/_agent_env.py | 2 +- src/benchflow/cli/eval.py | 6 ++++-- src/benchflow/job.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 5609e6c..8885b5e 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,7 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model and agent != "oracle": + if model: inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 0b13fd8..a92c0be 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -228,11 +228,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -268,9 +269,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) config = JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 6d8fe9b..95414fe 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -146,7 +146,7 @@ class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str = DEFAULT_MODEL + model: str | None = DEFAULT_MODEL environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None From 
9b57b2afb8bade8912918fe3654e2a608779f373 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:32:20 -0700 Subject: [PATCH 03/12] =?UTF-8?q?fix:=20oracle=20agent=20=E2=80=94=20choke?= =?UTF-8?q?point=20guard,=20drop=20orphan=20eval=20CLI,=20helper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. 
--- src/benchflow/_agent_env.py | 4 +- src/benchflow/cli/eval.py | 373 ------------------------------ src/benchflow/cli/main.py | 17 +- src/benchflow/job.py | 22 +- tests/test_eval_cli.py | 118 ---------- tests/test_resolve_env_helpers.py | 42 ++++ 6 files changed, 72 insertions(+), 504 deletions(-) delete mode 100644 src/benchflow/cli/eval.py delete mode 100644 tests/test_eval_cli.py diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..72d6498 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,9 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + # Oracle runs solve.sh and never calls an LLM — model env vars and + # API-key validation are skipped even if a caller forwards a model. + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py deleted file mode 100644 index a92c0be..0000000 --- a/src/benchflow/cli/eval.py +++ /dev/null @@ -1,373 +0,0 @@ -"""`bf eval` — the benchflow eval-runner command group. - -The future-facing entry point for running evaluations. Anthropic-style shape: -resource creation, one command, return the result or a job-id. - - bf eval create [flags] - One-shot eval — creates an Agent + Environment + Trajectory under - the hood and runs the task. 
`task-ref` can be: - - a path to a task directory (single task) - - a path to a directory of task directories (batch) - - a `harbor://[@]` ref (full Harbor dataset) - - a `harbor:///` ref (single task from a Harbor dataset) - - a `benchflow://[@]` ref (benchflow-owned dataset) - - bf eval list Show recent eval runs (reads the jobs/ dir) - bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. -""" - -from __future__ import annotations - -import asyncio -from pathlib import Path -from typing import Annotated - -import typer -from rich.console import Console -from rich.table import Table - -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL - -console = Console() - -app = typer.Typer( - name="eval", - help="Run evaluations. `bf eval create ` is the main entry point.", - no_args_is_help=True, -) - - -def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]: - """Resolve a positional task reference to a local directory. - - Returns `(path, is_batch)`. `is_batch` is True when the reference - points at a directory containing multiple task dirs (each with its - own `task.toml`), False when the reference is a single task dir. - """ - - # Registry prefix: fetch the full dataset and treat as batch. - if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"): - from benchflow.task_download import ensure_tasks - - # Allow `harbor:///` shorthand for a single task within - # a dataset. Split off the trailing segment if it matches a task. 
- prefix, _, tail = task_ref.partition("://") - head = tail - sub_task: str | None = None - if "/" in tail and "@" not in tail.split("/", 1)[1]: - dataset, sub_task = tail.split("/", 1) - head = dataset - dataset_ref = f"{prefix}://{head}" - dataset_dir = ensure_tasks(dataset_ref) - if sub_task is not None: - candidate = dataset_dir / sub_task - if not candidate.exists() or not (candidate / "task.toml").exists(): - console.print( - f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]" - ) - raise typer.Exit(1) - return candidate, False - return dataset_dir, True - - # Filesystem path: single task if task.toml is present, batch otherwise. - path = Path(task_ref).expanduser().resolve() - if not path.exists(): - console.print(f"[red]Task reference not found: {task_ref}[/red]") - raise typer.Exit(1) - if (path / "task.toml").exists(): - return path, False - if any( - child.is_dir() and (child / "task.toml").exists() for child in path.iterdir() - ): - return path, True - console.print( - f"[red]{path} is neither a single task (no task.toml) " - f"nor a directory of tasks.[/red]" - ) - raise typer.Exit(1) - - -@app.command("create") -def eval_create( - task: Annotated[ - str, - typer.Argument( - help=( - "Task reference. Path to a task dir, a dir of tasks, or a " - "registry ref (harbor://name, benchflow://name)." - ) - ), - ], - agent: Annotated[ - str, typer.Option("--agent", "-a", help="Agent name from the registry") - ] = DEFAULT_AGENT, - model: Annotated[ - str | None, - typer.Option("--model", "-m", help="Model id; agent default if unset"), - ] = None, - environment: Annotated[ - str, - typer.Option( - "--environment", - "-e", - help="docker | daytona | ... (uses the agent's default if unset)", - ), - ] = "docker", - prompt: Annotated[ - list[str] | None, - typer.Option( - "--prompt", - "-p", - help="Prompt text; repeat for multi-turn. 
Default: instruction.md", - ), - ] = None, - concurrency: Annotated[ - int, - typer.Option( - "--concurrency", - "-c", - help="Max parallel trials when task is a batch", - ), - ] = 4, - max_retries: Annotated[ - int, - typer.Option( - "--max-retries", - help="Per-trial retry count on transient errors", - ), - ] = 1, - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o", help="Where to write result files"), - ] = "jobs", - skills_dir: Annotated[ - Path | None, - typer.Option( - "--skills-dir", - "-s", - help="Skills directory to mount into the sandbox", - ), - ] = None, - sandbox_user: Annotated[ - str | None, - typer.Option( - "--sandbox-user", - help="Non-root sandbox user (default 'agent'; 'none' = root)", - ), - ] = "agent", - agent_env: Annotated[ - list[str] | None, - typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"), - ] = None, -) -> None: - """Create and run an eval — one-shot. - - Under the hood: - 1. Resolves `task` to a local directory (fetching from Harbor if needed). - 2. If it's a single task: runs `SDK.run()` once and prints the reward. - 3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate. - - This is the idiomatic way to run evals going forward. `bf run` and - `bf job` remain as one-release deprecated aliases that forward here. 
- """ - - resolved, is_batch = _resolve_task_ref(task) - - parsed_env: dict[str, str] = {} - for entry in agent_env or []: - if "=" not in entry: - console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]") - raise typer.Exit(2) - k, v = entry.split("=", 1) - parsed_env[k] = v - - if is_batch: - _run_batch( - tasks_dir=resolved, - agent=agent, - model=model, - environment=environment, - concurrency=concurrency, - max_retries=max_retries, - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=sandbox_user, - agent_env=parsed_env, - ) - else: - _run_single( - task_dir=resolved, - agent=agent, - model=model, - environment=environment, - prompt=prompt, - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=sandbox_user, - agent_env=parsed_env, - ) - - -def _run_single( - *, - task_dir: Path, - agent: str, - model: str | None, - environment: str, - prompt: list[str] | None, - jobs_dir: str, - skills_dir: Path | None, - sandbox_user: str | None, - agent_env: dict[str, str], -) -> None: - from typing import cast - - from benchflow.sdk import SDK - - sdk = SDK() - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) - result = asyncio.run( - sdk.run( - task_path=task_dir, - agent=agent, - model=effective_model, - environment=environment, - prompts=cast("list[str | None] | None", prompt), - agent_env=agent_env, - job_name="eval-create", - jobs_dir=jobs_dir, - skills_dir=skills_dir, - sandbox_user=None if sandbox_user == "none" else sandbox_user, - ) - ) - reward = getattr(result, "reward", None) - err = getattr(result, "error", None) or getattr(result, "verifier_error", None) - console.print() - if err: - console.print(f"[red]failed:[/red] {err}") - raise typer.Exit(1) - console.print( - f"[bold]reward={reward}[/bold] tools={getattr(result, 'n_tool_calls', 0)}" - ) - - -def _run_batch( - *, - tasks_dir: Path, - agent: str, - model: str | None, - environment: str, - concurrency: int, - max_retries: int, - jobs_dir: str, - skills_dir: 
Path | None, - sandbox_user: str | None, - agent_env: dict[str, str], -) -> None: - from benchflow.job import Job, JobConfig, RetryConfig - - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) - config = JobConfig( - agent=agent, - model=effective_model, - environment=environment, - concurrency=concurrency, - retry=RetryConfig(max_retries=max_retries), - agent_env=agent_env, - sandbox_user=None if sandbox_user == "none" else sandbox_user, - skills_dir=str(skills_dir) if skills_dir else None, - ) - job = Job( - tasks_dir=tasks_dir, - jobs_dir=Path(jobs_dir), - config=config, - ) - result = asyncio.run(job.run()) - console.print() - console.print( - f"[bold]{result.passed}/{result.total} " - f"({result.score:.1%})[/bold] errors={result.errored}" - ) - - -@app.command("list") -def eval_list( - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o", help="Results directory to scan"), - ] = "jobs", - limit: Annotated[ - int, - typer.Option("--limit", "-n", help="Max rows to show"), - ] = 20, -) -> None: - """List recent eval runs by scanning the jobs/ dir.""" - - root = Path(jobs_dir) - if not root.exists(): - console.print(f"[yellow]{root} does not exist yet.[/yellow]") - return - runs: list[tuple[str, int, int]] = [] - for job_root in root.iterdir(): - if not job_root.is_dir(): - continue - for stamp in job_root.iterdir(): - if not stamp.is_dir(): - continue - trials = list(stamp.iterdir()) - passed = 0 - total = 0 - for trial in trials: - result = trial / "result.json" - if not result.exists(): - continue - total += 1 - try: - import json - - data = json.loads(result.read_text()) - if (data.get("rewards") or {}).get("reward") == 1.0: - passed += 1 - except Exception: - continue - if total: - runs.append((f"{job_root.name}/{stamp.name}", passed, total)) - runs.sort(reverse=True) - table = Table(title=f"Recent evals in {root}") - table.add_column("run", style="cyan") - table.add_column("passed", justify="right", style="green") - 
table.add_column("total", justify="right") - table.add_column("rate", justify="right", style="yellow") - for run, passed, total in runs[:limit]: - rate = f"{100 * passed / total:.0f}%" if total else "-" - table.add_row(run, str(passed), str(total), rate) - console.print(table) - - -@app.command("retrieve") -def eval_retrieve( - trial_name: Annotated[ - str, typer.Argument(help="Trial dir name, e.g. my-task__abc") - ], - jobs_dir: Annotated[ - str, - typer.Option("--jobs-dir", "-o"), - ] = "jobs", -) -> None: - """Print the result.json for a specific trial.""" - - import json - - root = Path(jobs_dir) - matches = list(root.rglob(f"{trial_name}/result.json")) - if not matches: - console.print(f"[red]no trial named {trial_name!r} under {root}[/red]") - raise typer.Exit(1) - console.print(f"[dim]{matches[0]}[/dim]") - data = json.loads(matches[0].read_text()) - from rich import print_json - - print_json(data=data) diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 9cf1198..96611bb 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -10,7 +10,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model console = Console() @@ -167,7 +167,7 @@ def job( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), @@ -343,7 +343,7 @@ def eval( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, skills_dir=effective_skills, @@ -754,19 +754,20 @@ def eval_create( f"({result.score:.1%})[/bold], errors={result.errored}" ) elif tasks_dir: + eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if 
(tasks_dir / "task.toml").exists(): from benchflow.trial import Trial, TrialConfig, Scene config = TrialConfig( task_path=tasks_dir, - scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL, + scenes=[Scene.single(agent=agent, model=eff_model, skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, jobs_dir=jobs_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, skills_dir=str(skills_dir) if skills_dir else None, ) @@ -777,7 +778,7 @@ async def _run(): run_result = asyncio.run(_run()) reward = (run_result.rewards or {}).get("reward") console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}") - console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})") + console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})") console.print(f"[bold]Reward:[/bold] {reward}") console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}") if run_result.error: @@ -789,7 +790,7 @@ async def _run(): jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, @@ -915,7 +916,7 @@ def train_create( jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}", config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, ), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 95414fe..ad99ad4 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float: DEFAULT_MODEL = "claude-haiku-4-5-20251001" +def effective_model(agent: str, model: str | None) -> str | None: + """Resolve the model an agent should run with. 
+ + Oracle runs solve.sh and never calls an LLM, so it never receives a model + (the chokepoint in resolve_agent_env defends, but callers should also stop + materializing DEFAULT_MODEL into oracle configs to keep the data honest — + e.g. result-summary JSON shows model=null instead of a bogus default). + """ + if agent == "oracle": + return None + return model or DEFAULT_MODEL + + @dataclass class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str | None = DEFAULT_MODEL + model: str | None = None environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( - agent=raw.get("agent", DEFAULT_AGENT), - model=raw.get("model", DEFAULT_MODEL), + agent=agent_name, + model=effective_model(agent_name, raw.get("model")), environment=raw.get("environment", "docker"), concurrency=raw.get("concurrency", 4), prompts=prompts, @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": agent_name = agent_cfg.get("name", DEFAULT_AGENT) # Model — keep provider prefix intact for downstream resolution - model = agent_cfg.get("model_name", "") or DEFAULT_MODEL + model = effective_model(agent_name, agent_cfg.get("model_name") or None) # Environment env_cfg = raw.get("environment", {}) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py deleted file mode 100644 index 5a009e7..0000000 --- a/tests/test_eval_cli.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Tests for `bf eval` — the new eval-runner CLI. - -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. 
We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. -""" - -from __future__ import annotations - -from pathlib import Path - -import click.exceptions -import pytest - -from benchflow.cli.eval import _resolve_task_ref - - -def _make_task_dir(parent: Path, name: str) -> Path: - d = parent / name - d.mkdir() - (d / "task.toml").write_text('schema_version = "1.1"\n') - (d / "instruction.md").write_text("instruction\n") - (d / "environment").mkdir() - (d / "tests").mkdir() - (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n") - return d - - -def test_resolve_single_task_dir(tmp_path: Path) -> None: - task = _make_task_dir(tmp_path, "my-task") - resolved, is_batch = _resolve_task_ref(str(task)) - assert resolved == task.resolve() - assert is_batch is False - - -def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None: - _make_task_dir(tmp_path, "task-a") - _make_task_dir(tmp_path, "task-b") - resolved, is_batch = _resolve_task_ref(str(tmp_path)) - assert resolved == tmp_path.resolve() - assert is_batch is True - - -def test_resolve_missing_path_exits(tmp_path: Path) -> None: - with pytest.raises(click.exceptions.Exit): - _resolve_task_ref(str(tmp_path / "does-not-exist")) - - -def test_resolve_harbor_ref_routes_through_ensure_tasks( - tmp_path: Path, monkeypatch -) -> None: - """`harbor://` resolves to the cached dataset directory (batch).""" - - fake_dataset = tmp_path / "cached-aime" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "aime_60") - _make_task_dir(fake_dataset, "aime_61") - - captured: dict[str, str] = {} - - def fake_ensure_tasks(ref: str) -> Path: - captured["ref"] = ref - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("harbor://aime@1.0") - assert captured["ref"] == "harbor://aime@1.0" - assert resolved == fake_dataset - assert is_batch is True - - 
-def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None: - """`harbor:///` resolves to a single task from the dataset.""" - - fake_dataset = tmp_path / "cached-aime" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "aime_60") - _make_task_dir(fake_dataset, "aime_61") - - def fake_ensure_tasks(ref: str) -> Path: - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60") - assert resolved == fake_dataset / "aime_60" - assert is_batch is False - - -def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None: - """`benchflow://` works identically to `harbor://`.""" - - fake_dataset = tmp_path / "cached-benchjack" - fake_dataset.mkdir() - _make_task_dir(fake_dataset, "exploit-1") - - def fake_ensure_tasks(ref: str) -> Path: - assert ref == "benchflow://benchjack" - return fake_dataset - - from benchflow import task_download - - monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) - resolved, is_batch = _resolve_task_ref("benchflow://benchjack") - assert resolved == fake_dataset - assert is_batch is True - - -def test_resolve_non_task_dir_exits(tmp_path: Path) -> None: - """A directory with no task.toml children errors cleanly.""" - - (tmp_path / "not-a-task").mkdir() - with pytest.raises(click.exceptions.Exit): - _resolve_task_ref(str(tmp_path)) diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py index f699334..d831fe7 100644 --- a/tests/test_resolve_env_helpers.py +++ b/tests/test_resolve_env_helpers.py @@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path): self._patch_expanduser(monkeypatch, tmp_path) result = self._resolve(agent="openclaw", agent_env={}) assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result + + +class TestResolveAgentEnvOracle: + """Oracle runs solve.sh and never calls an LLM — must skip 
model-related env. + + Regression for the PR #173 follow-up: commit 360c460 removed the + `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers + would pass model=None for oracle. But cli/main.py:eval_create (the live + `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle + reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY + validation — breaking offline oracle runs that have no API key set. + """ + + def _patch_expanduser(self, monkeypatch, tmp_path): + orig = Path.expanduser + + def fake(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig(self) + + monkeypatch.setattr(Path, "expanduser", fake) + + def test_oracle_with_default_model_does_not_validate_api_key( + self, monkeypatch, tmp_path + ): + """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise.""" + for k in ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + self._patch_expanduser(monkeypatch, tmp_path) + + result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {}) + + # Provider env never resolved — oracle never calls an LLM. + assert "BENCHFLOW_PROVIDER_MODEL" not in result + assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result From 796f5896ec6167c84afbc1bcdebb54e84f6a6658 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:45:53 -0700 Subject: [PATCH 04/12] test: regression suite pinning oracle chokepoint + orphan removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. 
- TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". --- tests/test_oracle_chokepoint.py | 260 ++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 tests/test_oracle_chokepoint.py diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py new file mode 100644 index 0000000..1a5043a --- /dev/null +++ b/tests/test_oracle_chokepoint.py @@ -0,0 +1,260 @@ +"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint. + +Pins the fix from this branch (post-PR #173 follow-up): + + Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the + chokepoint defends against any caller that forwards a model. + Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix + was unreachable because nothing wired it into the live CLI. + Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so + oracle gets honest model=None end-to-end. + +The classes below pin each layer at the right altitude: +- TestOrphanRemoval — proves cli/eval.py is gone and stays gone. 
+- TestEvalCreateRouting — proves `bench eval create` dispatches to + cli/main.py (the file that PR #173 missed). +- TestEffectiveModel — unit tests for the helper. +- TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None. +- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle` + and assert no API-key validation error. +""" + +from __future__ import annotations + +import importlib +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from typer.testing import CliRunner + + +class TestOrphanRemoval: + """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI). + + Deletion was safe because nothing wired it up. These tests make sure + the orphan stays gone — re-introducing it as a parallel `eval_create` + is what caused PR #173 to land its fix in dead code. + """ + + def test_cli_eval_module_is_gone(self): + with pytest.raises(ModuleNotFoundError): + importlib.import_module("benchflow.cli.eval") + + def test_no_source_file_imports_the_orphan(self): + """Catch any future import of `benchflow.cli.eval` from src/.""" + src = Path(__file__).resolve().parent.parent / "src" / "benchflow" + offenders = [ + p + for p in src.rglob("*.py") + if "benchflow.cli.eval" in p.read_text() + ] + assert not offenders, f"Orphan import re-introduced in: {offenders}" + + +class TestEvalCreateRouting: + """`bench eval create` must dispatch to cli/main.py:eval_create. + + The pyproject.toml `bench`/`benchflow` scripts both resolve to + `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in + `benchflow.cli.eval` that no entry point ever loaded — these tests + pin the routing so that mistake can't happen again. 
+ """ + + def test_entry_point_app_is_cli_main(self): + from benchflow.cli.main import app + + assert app.info.name == "benchflow" + + def test_eval_create_callback_lives_in_cli_main(self): + from benchflow.cli.main import eval_app + + create_cmds = [c for c in eval_app.registered_commands if c.name == "create"] + assert len(create_cmds) == 1 + assert create_cmds[0].callback.__module__ == "benchflow.cli.main" + + def test_bench_eval_create_help_resolves(self): + """Smoke test: `bench eval create --help` reaches a real callback.""" + from benchflow.cli.main import app + + result = CliRunner().invoke(app, ["eval", "create", "--help"]) + assert result.exit_code == 0 + assert "tasks-dir" in result.stdout or "task" in result.stdout.lower() + + +class TestEffectiveModel: + """The helper introduced in Layer 3 — single source of truth for the rule + "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL".""" + + def test_oracle_with_no_model_returns_none(self): + from benchflow.job import effective_model + + assert effective_model("oracle", None) is None + + def test_oracle_ignores_explicit_model(self): + """Even if a caller forwards a model for oracle, the helper drops it.""" + from benchflow.job import effective_model + + assert effective_model("oracle", "claude-haiku-4-5-20251001") is None + + def test_non_oracle_with_no_model_returns_default(self): + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL + + def test_non_oracle_explicit_model_passes_through(self): + from benchflow.job import effective_model + + assert effective_model("codex-acp", "gpt-5") == "gpt-5" + + def test_non_oracle_empty_model_falls_back_to_default(self): + """Empty string == "no model" — Harbor YAML can produce this shape.""" + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL + + +class TestOracleYamlLoaders: + """YAML configs for 
oracle must produce JobConfig.model is None. + + Both loader paths (_from_native_yaml, _from_harbor_yaml) previously + coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3 + routes them through effective_model() so oracle drops the default. + """ + + def _make_task(self, tmp_path: Path) -> Path: + tasks = tmp_path / "tasks" / "task-a" + tasks.mkdir(parents=True) + (tasks / "task.toml").write_text('schema_version = "1.1"\n') + return tmp_path / "tasks" + + def test_native_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: oracle\n") + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_harbor_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text( + "agents:\n" + " - name: oracle\n" + "datasets:\n" + " - path: tasks\n" + ) + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_native_yaml_non_oracle_keeps_default_when_omitted( + self, tmp_path: Path + ): + """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL.""" + from benchflow.job import DEFAULT_MODEL, Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n") + job = Job.from_yaml(config) + assert job._config.model == DEFAULT_MODEL + + +class TestEvalCreateOracleCLI: + """End-to-end: `bench eval create -a oracle` must not trip API key validation. + + This is the user-visible bug the chokepoint test guards against at the + unit level. 
Here we call the live handler (cli/main.py:eval_create) + directly — invoking via Typer's CliRunner triggers asyncio.run() + internally, which leaves no current event loop and breaks pre-existing + tests in the suite that use the deprecated asyncio.get_event_loop() + pattern. Calling the function directly still exercises the full + CLI → effective_model → TrialConfig → resolve_agent_env path the bug + originally lived in. + """ + + def _make_task(self, tmp_path: Path) -> Path: + task = tmp_path / "task" + task.mkdir() + (task / "task.toml").write_text('schema_version = "1.1"\n') + (task / "instruction.md").write_text("solve\n") + return task + + def _strip_api_keys(self, monkeypatch): + for k in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + "LLM_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + + def test_oracle_single_task_no_api_key_no_error( + self, tmp_path: Path, monkeypatch + ): + """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError.""" + import asyncio + + from benchflow.cli.main import eval_create + from benchflow.models import RunResult + + # The CLI handler internally calls asyncio.run(), which leaves no + # current event loop. Pre-existing tests in the suite use the + # deprecated asyncio.get_event_loop() and break in that state, so + # restore a fresh loop after the test (teardown via finally below). + + self._strip_api_keys(monkeypatch) + task = self._make_task(tmp_path) + captured: dict = {} + + async def fake_create(config): + captured["config"] = config + # Exercise the real chokepoint with the config the CLI built — + # that's the specific call site the bug manifested at. 
+ from benchflow._agent_env import resolve_agent_env + + captured["agent_env"] = resolve_agent_env( + config.primary_agent, config.primary_model, config.agent_env + ) + trial = type("FakeTrial", (), {})() + trial.run = AsyncMock( + return_value=RunResult( + task_name="task", + agent_name="oracle", + rewards={"reward": 1.0}, + n_tool_calls=0, + ) + ) + return trial + + try: + with patch("benchflow.trial.Trial.create", new=fake_create): + eval_create( + config_file=None, + tasks_dir=task, + agent="oracle", + model=None, + environment="docker", + concurrency=1, + jobs_dir=str(tmp_path / "jobs"), + sandbox_user="agent", + skills_dir=None, + ) + finally: + asyncio.set_event_loop(asyncio.new_event_loop()) + + cfg = captured["config"] + # Layer 3: oracle never receives a model, even when CLI defaults exist. + assert cfg.primary_agent == "oracle" + assert cfg.primary_model is None + # Layer 1: chokepoint did not inject provider env or raise. + assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"] From 88bd026e13e1c85b8e8c5040da31c3c2feab742b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:53:15 -0700 Subject: [PATCH 05/12] fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). 
--- src/benchflow/cli/eval.py | 373 ++++++++++++++++++++++++++++++++ tests/test_eval_cli.py | 118 ++++++++++ tests/test_oracle_chokepoint.py | 33 ++- 3 files changed, 506 insertions(+), 18 deletions(-) create mode 100644 src/benchflow/cli/eval.py create mode 100644 tests/test_eval_cli.py diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py new file mode 100644 index 0000000..ea7af43 --- /dev/null +++ b/src/benchflow/cli/eval.py @@ -0,0 +1,373 @@ +"""`bf eval` — the benchflow eval-runner command group. + +The future-facing entry point for running evaluations. Anthropic-style shape: +resource creation, one command, return the result or a job-id. + + bf eval create [flags] + One-shot eval — creates an Agent + Environment + Trajectory under + the hood and runs the task. `task-ref` can be: + - a path to a task directory (single task) + - a path to a directory of task directories (batch) + - a `harbor://[@]` ref (full Harbor dataset) + - a `harbor:///` ref (single task from a Harbor dataset) + - a `benchflow://[@]` ref (benchflow-owned dataset) + + bf eval list Show recent eval runs (reads the jobs/ dir) + bf eval retrieve ID Look up a specific trajectory by trial name + +Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` +stays around as a deprecated alias for one release. +""" + +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Annotated + +import typer +from rich.console import Console +from rich.table import Table + +from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model + +console = Console() + +app = typer.Typer( + name="eval", + help="Run evaluations. `bf eval create ` is the main entry point.", + no_args_is_help=True, +) + + +def _resolve_task_ref(task_ref: str) -> tuple[Path, bool]: + """Resolve a positional task reference to a local directory. + + Returns `(path, is_batch)`. 
`is_batch` is True when the reference + points at a directory containing multiple task dirs (each with its + own `task.toml`), False when the reference is a single task dir. + """ + + # Registry prefix: fetch the full dataset and treat as batch. + if task_ref.startswith("harbor://") or task_ref.startswith("benchflow://"): + from benchflow.task_download import ensure_tasks + + # Allow `harbor:///` shorthand for a single task within + # a dataset. Split off the trailing segment if it matches a task. + prefix, _, tail = task_ref.partition("://") + head = tail + sub_task: str | None = None + if "/" in tail and "@" not in tail.split("/", 1)[1]: + dataset, sub_task = tail.split("/", 1) + head = dataset + dataset_ref = f"{prefix}://{head}" + dataset_dir = ensure_tasks(dataset_ref) + if sub_task is not None: + candidate = dataset_dir / sub_task + if not candidate.exists() or not (candidate / "task.toml").exists(): + console.print( + f"[red]Harbor dataset {head!r} has no task named {sub_task!r}.[/red]" + ) + raise typer.Exit(1) + return candidate, False + return dataset_dir, True + + # Filesystem path: single task if task.toml is present, batch otherwise. + path = Path(task_ref).expanduser().resolve() + if not path.exists(): + console.print(f"[red]Task reference not found: {task_ref}[/red]") + raise typer.Exit(1) + if (path / "task.toml").exists(): + return path, False + if any( + child.is_dir() and (child / "task.toml").exists() for child in path.iterdir() + ): + return path, True + console.print( + f"[red]{path} is neither a single task (no task.toml) " + f"nor a directory of tasks.[/red]" + ) + raise typer.Exit(1) + + +@app.command("create") +def eval_create( + task: Annotated[ + str, + typer.Argument( + help=( + "Task reference. Path to a task dir, a dir of tasks, or a " + "registry ref (harbor://name, benchflow://name)." 
+ ) + ), + ], + agent: Annotated[ + str, typer.Option("--agent", "-a", help="Agent name from the registry") + ] = DEFAULT_AGENT, + model: Annotated[ + str | None, + typer.Option("--model", "-m", help="Model id; agent default if unset"), + ] = None, + environment: Annotated[ + str, + typer.Option( + "--environment", + "-e", + help="docker | daytona | ... (uses the agent's default if unset)", + ), + ] = "docker", + prompt: Annotated[ + list[str] | None, + typer.Option( + "--prompt", + "-p", + help="Prompt text; repeat for multi-turn. Default: instruction.md", + ), + ] = None, + concurrency: Annotated[ + int, + typer.Option( + "--concurrency", + "-c", + help="Max parallel trials when task is a batch", + ), + ] = 4, + max_retries: Annotated[ + int, + typer.Option( + "--max-retries", + help="Per-trial retry count on transient errors", + ), + ] = 1, + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o", help="Where to write result files"), + ] = "jobs", + skills_dir: Annotated[ + Path | None, + typer.Option( + "--skills-dir", + "-s", + help="Skills directory to mount into the sandbox", + ), + ] = None, + sandbox_user: Annotated[ + str | None, + typer.Option( + "--sandbox-user", + help="Non-root sandbox user (default 'agent'; 'none' = root)", + ), + ] = "agent", + agent_env: Annotated[ + list[str] | None, + typer.Option("--agent-env", help="Extra agent env var (KEY=VALUE)"), + ] = None, +) -> None: + """Create and run an eval — one-shot. + + Under the hood: + 1. Resolves `task` to a local directory (fetching from Harbor if needed). + 2. If it's a single task: runs `SDK.run()` once and prints the reward. + 3. If it's a batch: runs a `Job` at `--concurrency` and prints pass rate. + + This is the idiomatic way to run evals going forward. `bf run` and + `bf job` remain as one-release deprecated aliases that forward here. 
+ """ + + resolved, is_batch = _resolve_task_ref(task) + + parsed_env: dict[str, str] = {} + for entry in agent_env or []: + if "=" not in entry: + console.print(f"[red]Invalid --agent-env value: {entry!r}[/red]") + raise typer.Exit(2) + k, v = entry.split("=", 1) + parsed_env[k] = v + + if is_batch: + _run_batch( + tasks_dir=resolved, + agent=agent, + model=model, + environment=environment, + concurrency=concurrency, + max_retries=max_retries, + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=sandbox_user, + agent_env=parsed_env, + ) + else: + _run_single( + task_dir=resolved, + agent=agent, + model=model, + environment=environment, + prompt=prompt, + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=sandbox_user, + agent_env=parsed_env, + ) + + +def _run_single( + *, + task_dir: Path, + agent: str, + model: str | None, + environment: str, + prompt: list[str] | None, + jobs_dir: str, + skills_dir: Path | None, + sandbox_user: str | None, + agent_env: dict[str, str], +) -> None: + from typing import cast + + from benchflow.sdk import SDK + + sdk = SDK() + eff_model = _effective_model(agent, model) + result = asyncio.run( + sdk.run( + task_path=task_dir, + agent=agent, + model=eff_model, + environment=environment, + prompts=cast("list[str | None] | None", prompt), + agent_env=agent_env, + job_name="eval-create", + jobs_dir=jobs_dir, + skills_dir=skills_dir, + sandbox_user=None if sandbox_user == "none" else sandbox_user, + ) + ) + reward = getattr(result, "reward", None) + err = getattr(result, "error", None) or getattr(result, "verifier_error", None) + console.print() + if err: + console.print(f"[red]failed:[/red] {err}") + raise typer.Exit(1) + console.print( + f"[bold]reward={reward}[/bold] tools={getattr(result, 'n_tool_calls', 0)}" + ) + + +def _run_batch( + *, + tasks_dir: Path, + agent: str, + model: str | None, + environment: str, + concurrency: int, + max_retries: int, + jobs_dir: str, + skills_dir: Path | None, + sandbox_user: str | 
None, + agent_env: dict[str, str], +) -> None: + from benchflow.job import Job, JobConfig, RetryConfig + + eff_model = _effective_model(agent, model) + config = JobConfig( + agent=agent, + model=eff_model, + environment=environment, + concurrency=concurrency, + retry=RetryConfig(max_retries=max_retries), + agent_env=agent_env, + sandbox_user=None if sandbox_user == "none" else sandbox_user, + skills_dir=str(skills_dir) if skills_dir else None, + ) + job = Job( + tasks_dir=tasks_dir, + jobs_dir=Path(jobs_dir), + config=config, + ) + result = asyncio.run(job.run()) + console.print() + console.print( + f"[bold]{result.passed}/{result.total} " + f"({result.score:.1%})[/bold] errors={result.errored}" + ) + + +@app.command("list") +def eval_list( + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o", help="Results directory to scan"), + ] = "jobs", + limit: Annotated[ + int, + typer.Option("--limit", "-n", help="Max rows to show"), + ] = 20, +) -> None: + """List recent eval runs by scanning the jobs/ dir.""" + + root = Path(jobs_dir) + if not root.exists(): + console.print(f"[yellow]{root} does not exist yet.[/yellow]") + return + runs: list[tuple[str, int, int]] = [] + for job_root in root.iterdir(): + if not job_root.is_dir(): + continue + for stamp in job_root.iterdir(): + if not stamp.is_dir(): + continue + trials = list(stamp.iterdir()) + passed = 0 + total = 0 + for trial in trials: + result = trial / "result.json" + if not result.exists(): + continue + total += 1 + try: + import json + + data = json.loads(result.read_text()) + if (data.get("rewards") or {}).get("reward") == 1.0: + passed += 1 + except Exception: + continue + if total: + runs.append((f"{job_root.name}/{stamp.name}", passed, total)) + runs.sort(reverse=True) + table = Table(title=f"Recent evals in {root}") + table.add_column("run", style="cyan") + table.add_column("passed", justify="right", style="green") + table.add_column("total", justify="right") + table.add_column("rate", 
justify="right", style="yellow") + for run, passed, total in runs[:limit]: + rate = f"{100 * passed / total:.0f}%" if total else "-" + table.add_row(run, str(passed), str(total), rate) + console.print(table) + + +@app.command("retrieve") +def eval_retrieve( + trial_name: Annotated[ + str, typer.Argument(help="Trial dir name, e.g. my-task__abc") + ], + jobs_dir: Annotated[ + str, + typer.Option("--jobs-dir", "-o"), + ] = "jobs", +) -> None: + """Print the result.json for a specific trial.""" + + import json + + root = Path(jobs_dir) + matches = list(root.rglob(f"{trial_name}/result.json")) + if not matches: + console.print(f"[red]no trial named {trial_name!r} under {root}[/red]") + raise typer.Exit(1) + console.print(f"[dim]{matches[0]}[/dim]") + data = json.loads(matches[0].read_text()) + from rich import print_json + + print_json(data=data) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py new file mode 100644 index 0000000..5a009e7 --- /dev/null +++ b/tests/test_eval_cli.py @@ -0,0 +1,118 @@ +"""Tests for `bf eval` — the new eval-runner CLI. + +The create command takes a positional task reference and routes to either +a single SDK.run or a batch Job based on whether the reference is a single +task dir or a directory of task dirs. We don't exercise the actual runner +here (that's the parity tests' job); we only verify the resolver. 
+""" + +from __future__ import annotations + +from pathlib import Path + +import click.exceptions +import pytest + +from benchflow.cli.eval import _resolve_task_ref + + +def _make_task_dir(parent: Path, name: str) -> Path: + d = parent / name + d.mkdir() + (d / "task.toml").write_text('schema_version = "1.1"\n') + (d / "instruction.md").write_text("instruction\n") + (d / "environment").mkdir() + (d / "tests").mkdir() + (d / "tests" / "test.sh").write_text("#!/bin/bash\necho 1\n") + return d + + +def test_resolve_single_task_dir(tmp_path: Path) -> None: + task = _make_task_dir(tmp_path, "my-task") + resolved, is_batch = _resolve_task_ref(str(task)) + assert resolved == task.resolve() + assert is_batch is False + + +def test_resolve_directory_of_tasks_is_batch(tmp_path: Path) -> None: + _make_task_dir(tmp_path, "task-a") + _make_task_dir(tmp_path, "task-b") + resolved, is_batch = _resolve_task_ref(str(tmp_path)) + assert resolved == tmp_path.resolve() + assert is_batch is True + + +def test_resolve_missing_path_exits(tmp_path: Path) -> None: + with pytest.raises(click.exceptions.Exit): + _resolve_task_ref(str(tmp_path / "does-not-exist")) + + +def test_resolve_harbor_ref_routes_through_ensure_tasks( + tmp_path: Path, monkeypatch +) -> None: + """`harbor://` resolves to the cached dataset directory (batch).""" + + fake_dataset = tmp_path / "cached-aime" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "aime_60") + _make_task_dir(fake_dataset, "aime_61") + + captured: dict[str, str] = {} + + def fake_ensure_tasks(ref: str) -> Path: + captured["ref"] = ref + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("harbor://aime@1.0") + assert captured["ref"] == "harbor://aime@1.0" + assert resolved == fake_dataset + assert is_batch is True + + +def test_resolve_harbor_ref_with_subtask(tmp_path: Path, monkeypatch) -> None: + """`harbor:///` resolves 
to a single task from the dataset.""" + + fake_dataset = tmp_path / "cached-aime" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "aime_60") + _make_task_dir(fake_dataset, "aime_61") + + def fake_ensure_tasks(ref: str) -> Path: + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("harbor://aime/aime_60") + assert resolved == fake_dataset / "aime_60" + assert is_batch is False + + +def test_resolve_benchflow_ref(tmp_path: Path, monkeypatch) -> None: + """`benchflow://` works identically to `harbor://`.""" + + fake_dataset = tmp_path / "cached-benchjack" + fake_dataset.mkdir() + _make_task_dir(fake_dataset, "exploit-1") + + def fake_ensure_tasks(ref: str) -> Path: + assert ref == "benchflow://benchjack" + return fake_dataset + + from benchflow import task_download + + monkeypatch.setattr(task_download, "ensure_tasks", fake_ensure_tasks) + resolved, is_batch = _resolve_task_ref("benchflow://benchjack") + assert resolved == fake_dataset + assert is_batch is True + + +def test_resolve_non_task_dir_exits(tmp_path: Path) -> None: + """A directory with no task.toml children errors cleanly.""" + + (tmp_path / "not-a-task").mkdir() + with pytest.raises(click.exceptions.Exit): + _resolve_task_ref(str(tmp_path)) diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 1a5043a..142c063 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -29,27 +29,24 @@ from typer.testing import CliRunner -class TestOrphanRemoval: - """src/benchflow/cli/eval.py was orphaned (never imported into the live CLI). +class TestEvalModuleNotWiredIntoCLI: + """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI. - Deletion was safe because nothing wired it up. 
These tests make sure - the orphan stays gone — re-introducing it as a parallel `eval_create` - is what caused PR #173 to land its fix in dead code. + The live `bench eval create` dispatches to cli/main.py:eval_create. + cli/eval.py is a standalone module (with its own resolver logic and tests) + but must not be imported by the CLI entry-point code — doing so is what + caused PR #173 to land its fix in dead code. """ - def test_cli_eval_module_is_gone(self): - with pytest.raises(ModuleNotFoundError): - importlib.import_module("benchflow.cli.eval") - - def test_no_source_file_imports_the_orphan(self): - """Catch any future import of `benchflow.cli.eval` from src/.""" - src = Path(__file__).resolve().parent.parent / "src" / "benchflow" - offenders = [ - p - for p in src.rglob("*.py") - if "benchflow.cli.eval" in p.read_text() - ] - assert not offenders, f"Orphan import re-introduced in: {offenders}" + def test_cli_main_does_not_import_cli_eval(self): + """cli/main.py must not import from cli/eval — they are separate.""" + main_py = ( + Path(__file__).resolve().parent.parent + / "src" / "benchflow" / "cli" / "main.py" + ) + text = main_py.read_text() + assert "from benchflow.cli.eval" not in text + assert "import benchflow.cli.eval" not in text class TestEvalCreateRouting: From bc04c59920a6a4608932f39fe9c2172c0b38d6ba Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Tue, 21 Apr 2026 15:54:40 -0700 Subject: [PATCH 06/12] docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI --- src/benchflow/cli/eval.py | 12 +++++++----- tests/test_eval_cli.py | 10 +++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index ea7af43..58b410f 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -1,7 +1,12 @@ """`bf eval` — the benchflow eval-runner command group. -The future-facing entry point for running evaluations. 
Anthropic-style shape: -resource creation, one command, return the result or a job-id. +NOTE: This module is **not wired into the live CLI**. The active +``bench eval create`` command dispatches to ``cli/main.py:eval_create``. +This file is kept as the future-facing design for the eval sub-command +and must not be imported by ``cli/main.py`` (see +``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``). + +Design shape — Anthropic-style resource creation: bf eval create [flags] One-shot eval — creates an Agent + Environment + Trajectory under @@ -14,9 +19,6 @@ bf eval list Show recent eval runs (reads the jobs/ dir) bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. """ from __future__ import annotations diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py index 5a009e7..dfcb7db 100644 --- a/tests/test_eval_cli.py +++ b/tests/test_eval_cli.py @@ -1,9 +1,9 @@ -"""Tests for `bf eval` — the new eval-runner CLI. +"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module. -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. +NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active +``bench eval create`` lives in ``cli/main.py``). These tests cover the +task-reference resolver (``_resolve_task_ref``) which will be used once +the module is promoted to the live entry point. 
""" from __future__ import annotations From 9459a23b4214cedf1f23db49c59b59bf28cd232b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 12:52:48 -0700 Subject: [PATCH 07/12] docs(plan): add plan to fix sandbox io problem --- ...6-04-22-sandbox-setup-shared-tool-paths.md | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md diff --git a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md new file mode 100644 index 0000000..b6a6e35 --- /dev/null +++ b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md @@ -0,0 +1,133 @@ +# Sandbox Setup Shared Tool Paths Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Eliminate per-trial copying of heavyweight tool installations during `setup_sandbox_user()` while keeping sandboxed agents able to read configs, credentials, and installed binaries. + +**Architecture:** Treat tool installations as shared container state, not per-user home state. Keep `setup_sandbox_user()` responsible only for creating the non-root user, preparing small home-scoped config/credential directories, and granting workspace ownership. Add a narrow compatibility path for root-home installs that BenchFlow cannot relocate itself. + +**Tech Stack:** Python 3.12, pytest, Harbor sandbox env exec, BenchFlow agent registry / sandbox setup. + +--- + +## Recommendation + +Pick **alternative 2: shared install paths** as the primary direction, with a **small compatibility fallback from alternative 1** for existing images that already install tools under `/root`. + +Why this is the best fit: + +- `setup_sandbox_user()` currently copies `/root/.nvm`, `/root/.local/bin`, and every derived home dir. 
That is the direct cause of the timeout. +- BenchFlow already installs some helpers into shared locations like `/usr/local/bin`, which matches the right ownership boundary for binaries. +- Pre-creating the user in the Dockerfile (alternative 3) is useful for image authors, but it is not a sufficient product-level fix because BenchFlow must still support arbitrary user-provided images. +- Pure symlinking of `/root` home directories (alternative 1) would reduce I/O, but it keeps the wrong model: heavyweight tool installs still live in a root-only home and `setup_sandbox_user()` still has to know about them. + +So the implementation target should be: + +- **Primary path:** stop relying on root-home installs for agent binaries and shared tooling. +- **Compatibility path:** if an existing image already has a heavy tool dir in `/root` and BenchFlow needs it, link the minimum necessary path instead of copying it. + +## File Map + +- Modify: `src/benchflow/_sandbox.py` + - Remove heavyweight recursive copies from `setup_sandbox_user()`. + - Add a minimal compatibility helper for sharing root-owned tool paths without recursive copy. +- Modify: `src/benchflow/agents/registry.py` + - Clarify the contract for `home_dirs` / `get_sandbox_home_dirs()` so it covers user config, not shared tool installs. +- Modify: `src/benchflow/_agent_setup.py` + - Ensure the install path assumptions match the new sandbox contract. +- Modify: `tests/test_sandbox.py` + - Add contract tests for which home dirs are copied versus treated as shared tooling. +- Create or modify: `tests/test_sandbox_setup.py` + - Add targeted tests for the generated `setup_sandbox_user()` command. +- Modify: `docs/task-authoring.md` + - Document preferred shared install locations for benchmark/task images. +- Modify: `docs/api-reference.md` + - Update the sandbox setup description to match the new behavior. 
+ +### Task 1: Lock the new sandbox contract in tests + +**Files:** + +- Modify: `tests/test_sandbox.py` +- Create: `tests/test_sandbox_setup.py` +- Modify: `src/benchflow/_sandbox.py` + +- [ ] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. +- [ ] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. +- [ ] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. +- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` +- [ ] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. + +### Task 2: Narrow `setup_sandbox_user()` to user state only + +**Files:** + +- Modify: `src/benchflow/_sandbox.py:103-128` +- Test: `tests/test_sandbox_setup.py` + +- [ ] Refactor `setup_sandbox_user()` so it does only three things: + - create the sandbox user if missing, + - prepare only small home-scoped directories BenchFlow actually needs for config/auth, + - grant ownership to the workspace. +- [ ] Remove the unconditional copy of `/root/.nvm`. +- [ ] Remove the special-case copy of `/root/.local/bin`. +- [ ] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. +- [ ] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. +- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` +- [ ] Expected: the new setup contract tests pass. 
+ +### Task 3: Align registry semantics with the new contract + +**Files:** + +- Modify: `src/benchflow/agents/registry.py:338-366` +- Modify: `tests/test_sandbox.py` + +- [ ] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. +- [ ] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: + - if BenchFlow still needs `.local/share` or similar user-scoped state, keep it but stop special-casing `.local/bin` in `_sandbox.py`; + - if BenchFlow only needed `.local` for tool binaries, remove `.local` from the always-include set and update tests accordingly. +- [ ] Prefer the more minimal option based on the actual credential/skill paths in the registry. +- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` +- [ ] Expected: registry contract tests pass with updated semantics. + +### Task 4: Verify agent install assumptions still hold + +**Files:** + +- Modify: `src/benchflow/_agent_setup.py` +- Modify: `src/benchflow/agents/registry.py` +- Test: `tests/test_sandbox_setup.py` + +- [ ] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. +- [ ] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. +- [ ] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. +- [ ] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. 
+
+### Task 5: Document the benchmark image guidance
+
+**Files:**
+
+- Modify: `docs/task-authoring.md`
+- Modify: `docs/api-reference.md`
+
+- [ ] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user.
+- [ ] Document that pre-creating the sandbox user in the Dockerfile is an optional optimization, not the primary compatibility mechanism.
+- [ ] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning.
+
+### Task 6: End-to-end verification
+
+**Files:**
+
+- No code changes required unless verification exposes a gap.
+
+- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q`
+- [ ] Run: `.venv/bin/ty check src/`
+- [ ] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after.
+- [ ] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial.
+
+## Notes / Risks
+
+- The collaborator comment on the issue asks to try shared install paths first. This plan follows that request as the architectural choice.
+- Shared install paths alone cannot retrofit arbitrary existing benchmark images that already baked tools into `/root`. That is why the plan keeps a narrow symlink-based compatibility path instead of a full copy.
+- The key review question before implementation is whether `.local` remains a true user-state directory in BenchFlow, or whether it only existed because of previous tool-install assumptions.
From f18bab6ed06707119df83fc8c160c61916bc9638 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 13:31:01 -0700 Subject: [PATCH 08/12] test: lock sandbox setup contract Plan step 1/6: Lock the new sandbox contract in tests --- tests/test_sandbox.py | 5 +++ tests/test_sandbox_setup.py | 62 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 tests/test_sandbox_setup.py diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 4c7f9f2..1954ac6 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -36,6 +36,11 @@ def test_always_includes_local(self): dirs = get_sandbox_home_dirs() assert ".local" in dirs + def test_only_includes_top_level_home_dirs(self): + """Derived entries stay at $HOME top-level, not nested tool subpaths.""" + dirs = get_sandbox_home_dirs() + assert ".local/bin" not in dirs + def test_new_agent_auto_included(self): """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" AGENTS["_test_agent"] = AgentConfig( diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py new file mode 100644 index 0000000..96cde3b --- /dev/null +++ b/tests/test_sandbox_setup.py @@ -0,0 +1,62 @@ +"""Focused contract tests for setup_sandbox_user() shell command generation.""" + +import re +import shlex +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._sandbox import setup_sandbox_user + + +async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(stdout="", stderr="", exit_code=0)) + + await setup_sandbox_user(env, sandbox_user, workspace) + + env.exec.assert_awaited_once() + return env.exec.call_args.args[0], env.exec.call_args.kwargs + + +def _assert_explicit_symlink(cmd: str, *, source: str, dest: str) -> None: + """Heavy tool dirs must use an explicit symlink compatibility path.""" + assert re.search( + rf"ln -s(?:f|[a-zA-Z-])* 
[\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?", + cmd, + ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" + + +class TestSetupSandboxUser: + @pytest.mark.asyncio + async def test_setup_command_avoids_recursive_root_tool_copies(self): + """Heavy root-owned tool dirs should no longer be recursively copied.""" + cmd, kwargs = await _run_setup_sandbox_user() + + assert "cp -aL /root/.local/bin/." not in cmd + assert "cp -a /root/.nvm/." not in cmd + assert kwargs["timeout_sec"] == 120 + + @pytest.mark.asyncio + async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspace(self): + """The non-copy setup contract still creates the user and grants access.""" + cmd, _ = await _run_setup_sandbox_user() + + assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd + assert "mkdir -p /home/agent/.local/bin" in cmd + assert "chown -R agent:agent /home/agent" in cmd + assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd + + @pytest.mark.asyncio + async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): + """Heavy root-owned tool dirs should use explicit symlinks, not duplication.""" + cmd, _ = await _run_setup_sandbox_user() + + _assert_explicit_symlink( + cmd, + source="/root/.local/bin", + dest="/home/agent/.local/bin", + ) + _assert_explicit_symlink(cmd, source="/root/.nvm", dest="/home/agent/.nvm") + assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd + assert "cp -a /root/.nvm/. 
/home/agent/.nvm/" not in cmd From 0f9b320135672cca1bf2d5f008d9954ea7ed0f2b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 13:41:16 -0700 Subject: [PATCH 09/12] fix: stop copying root tool installs into sandbox home Plan step 2/6: Narrow setup_sandbox_user() to user state only --- src/benchflow/_sandbox.py | 33 ++++++++++++++++++++++---------- tests/test_sandbox_setup.py | 38 ++++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 41ba87b..dbc9a1b 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -1,7 +1,7 @@ """Sandbox user setup, path lockdown, and verifier hardening. Owns the "agent runs as non-root" lifecycle: - - Creating the sandbox user and copying root's tooling into its home + - Creating the sandbox user and preparing minimal home state it needs - Building the privilege-drop wrapper (setpriv / su) for agent launch - Locking down solution/test paths so the sandbox user cannot read them - Hardening the environment before the verifier runs @@ -100,6 +100,20 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: ) +def _legacy_root_tool_link_cmd(source: str, dest: str) -> str: + """Link legacy root-only tool dirs into the sandbox home when needed.""" + src = shlex.quote(source) + dst = shlex.quote(dest) + parent = shlex.quote(str(Path(dest).parent)) + return ( + f"if [ -e {src} ] && [ ! 
-L {dst} ]; then " + f"mkdir -p {parent} && " + f"rmdir {dst} 2>/dev/null || true; " + f"[ -e {dst} ] || ln -s {src} {dst}; " + "fi" + ) + + async def setup_sandbox_user( env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120 ) -> str: @@ -109,18 +123,17 @@ async def setup_sandbox_user( f"Invalid sandbox_user: {sandbox_user!r} (must be alphanumeric)" ) logger.info(f"Setting up sandbox user: {sandbox_user}") + home = f"/home/{sandbox_user}" + home_dirs = sorted(d for d in get_sandbox_home_dirs() if d != ".local") await env.exec( f"id -u {sandbox_user} >/dev/null 2>&1 || " f"useradd -m -s /bin/bash {sandbox_user} && " - f"mkdir -p /home/{sandbox_user}/.local/bin && " - "if [ -d /root/.local/bin ]; then " - f"cp -aL /root/.local/bin/. /home/{sandbox_user}/.local/bin/ 2>/dev/null || true; fi && " - "if [ -d /root/.nvm ]; then " - f"cp -a /root/.nvm/. /home/{sandbox_user}/.nvm/ 2>/dev/null || true; fi && " - f"for d in {' '.join(sorted(get_sandbox_home_dirs()))}; do " - f"if [ -d /root/$d ]; then mkdir -p /home/{sandbox_user}/$d && " - f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && " - f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && " + f"{_legacy_root_tool_link_cmd('/root/.local/bin', f'{home}/.local/bin')} && " + f"{_legacy_root_tool_link_cmd('/root/.nvm', f'{home}/.nvm')} && " + f"for d in {' '.join(home_dirs)}; do " + f"if [ -d /root/$d ]; then mkdir -p {home}/$d && " + f"cp -a /root/$d/. 
{home}/$d/ 2>/dev/null || true; fi; done && " + f"chown -R {sandbox_user}:{sandbox_user} {home} && " f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}", timeout_sec=timeout_sec, ) diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py index 96cde3b..f835ab6 100644 --- a/tests/test_sandbox_setup.py +++ b/tests/test_sandbox_setup.py @@ -7,6 +7,7 @@ import pytest from benchflow._sandbox import setup_sandbox_user +from benchflow.agents.registry import get_sandbox_home_dirs async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): @@ -19,14 +20,21 @@ async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str return env.exec.call_args.args[0], env.exec.call_args.kwargs -def _assert_explicit_symlink(cmd: str, *, source: str, dest: str) -> None: - """Heavy tool dirs must use an explicit symlink compatibility path.""" +def _assert_conditional_legacy_symlink(cmd: str, *, source: str, dest: str) -> None: + """Legacy tool dirs should link only when a root-only install exists.""" assert re.search( - rf"ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?", + rf"if \[ -e [\"']?{re.escape(source)}[\"']? \].*ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? 
[\"']?{re.escape(dest)}[\"']?.*fi", cmd, ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" +def _get_copy_loop_dirs(cmd: str) -> list[str]: + """Extract the general home-dir copy loop payload from the shell command.""" + match = re.search(r"for d in (?P.*?); do", cmd) + assert match, f"expected general home-dir copy loop in setup command: {cmd}" + return match.group("dirs").split() + + class TestSetupSandboxUser: @pytest.mark.asyncio async def test_setup_command_avoids_recursive_root_tool_copies(self): @@ -43,20 +51,36 @@ async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspa cmd, _ = await _run_setup_sandbox_user() assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd - assert "mkdir -p /home/agent/.local/bin" in cmd + assert "mkdir -p /home/agent/.local/bin" not in cmd assert "chown -R agent:agent /home/agent" in cmd assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd @pytest.mark.asyncio async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): - """Heavy root-owned tool dirs should use explicit symlinks, not duplication.""" + """Legacy root-only tool dirs should use conditional symlinks, not duplication.""" cmd, _ = await _run_setup_sandbox_user() - _assert_explicit_symlink( + _assert_conditional_legacy_symlink( cmd, source="/root/.local/bin", dest="/home/agent/.local/bin", ) - _assert_explicit_symlink(cmd, source="/root/.nvm", dest="/home/agent/.nvm") + _assert_conditional_legacy_symlink( + cmd, source="/root/.nvm", dest="/home/agent/.nvm" + ) assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd assert "cp -a /root/.nvm/. 
/home/agent/.nvm/" not in cmd + + @pytest.mark.asyncio + async def test_setup_command_copy_loop_excludes_local_dir(self): + """General home-dir copying should narrow to small config/auth dirs only.""" + cmd, _ = await _run_setup_sandbox_user() + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + + assert copy_loop_dirs == sorted( + d for d in get_sandbox_home_dirs() if d != ".local" + ) + assert ".local" not in copy_loop_dirs + assert "mkdir -p /home/agent/$d" in cmd + assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd From 13f06b716a3c2833d89a4c4d0acd6d181b8e61cc Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 13:47:14 -0700 Subject: [PATCH 10/12] refactor: derive sandbox home dirs from registry config Plan step 3/6: Align registry semantics with the new contract --- src/benchflow/agents/registry.py | 9 ++++---- tests/test_sandbox.py | 38 ++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index f1d26bb..b809b39 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -336,16 +336,15 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: - """Collect all dot-dirs under $HOME that sandbox user setup should copy. + """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. - Derives from three sources across all registered agents: + Derives from four sources across all registered agents: - skill_paths: $HOME/.foo/... → ".foo" - credential_files: {home}/.foo/... → ".foo" + - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") - - Always includes ".local" (pip scripts, etc.). 
""" - dirs: set[str] = {".local"} + dirs: set[str] = set() for cfg in AGENTS.values(): for sp in cfg.skill_paths: if sp.startswith("$HOME/."): diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 1954ac6..cebda60 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -1,8 +1,10 @@ -"""Tests for sandbox user directory derivation from agent registry.""" +"""Tests for sandbox user config/auth directory derivation from agent registry.""" from benchflow.agents.registry import ( AGENTS, AgentConfig, + HostAuthFile, + SubscriptionAuth, get_sandbox_home_dirs, ) @@ -31,16 +33,21 @@ def test_home_dirs_included(self): # openclaw has home_dirs=[".openclaw"] assert ".openclaw" in dirs - def test_always_includes_local(self): - """.local is always in the dir list.""" + def test_does_not_include_legacy_local_tool_dir(self): + """.local is not included unless an agent registry path derives it.""" dirs = get_sandbox_home_dirs() - assert ".local" in dirs + assert ".local" not in dirs def test_only_includes_top_level_home_dirs(self): """Derived entries stay at $HOME top-level, not nested tool subpaths.""" dirs = get_sandbox_home_dirs() assert ".local/bin" not in dirs + def test_dirs_represent_registry_backed_home_config_or_auth(self): + """Returned dirs are registry-derived user home config/auth roots.""" + dirs = get_sandbox_home_dirs() + assert dirs == {".agents", ".claude", ".codex", ".gemini", ".openclaw", ".pi"} + def test_new_agent_auto_included(self): """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" AGENTS["_test_agent"] = AgentConfig( @@ -55,6 +62,29 @@ def test_new_agent_auto_included(self): finally: del AGENTS["_test_agent"] + def test_subscription_auth_file_dirs_included(self): + """Dirs from subscription_auth.files container paths are included.""" + AGENTS["_test_agent_subscription_auth"] = AgentConfig( + name="_test_agent_subscription_auth", + install_cmd="true", + launch_cmd="true", + 
subscription_auth=SubscriptionAuth( + replaces_env="TEST_API_KEY", + detect_file="~/.subauth/login.json", + files=[ + HostAuthFile( + "~/.subauth/login.json", + "{home}/.subauth/login.json", + ) + ], + ), + ) + try: + dirs = get_sandbox_home_dirs() + assert ".subauth" in dirs + finally: + del AGENTS["_test_agent_subscription_auth"] + def test_workspace_paths_excluded(self): """$WORKSPACE paths are not included (only $HOME paths).""" dirs = get_sandbox_home_dirs() From df07322b36a708e607fce809978bf5139c495553 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 14:56:45 -0700 Subject: [PATCH 11/12] refactor: symlink skills into sandbox, enforce shared install prefixes Replace per-trial skill-tree copies with ln -sfn into a shared /skills (or task skills_dir) root, drop skill_paths from get_sandbox_home_dirs(), and add registry + sandbox-setup invariants that keep agent binaries on /usr/local/* rather than /root-only home paths. Updates task-authoring and api-reference docs to describe the new lightweight sandbox contract. 
--- docs/api-reference.md | 4 + ...6-04-22-sandbox-setup-shared-tool-paths.md | 56 +++---- docs/task-authoring.md | 2 + src/benchflow/_agent_setup.py | 80 +++++++--- src/benchflow/agents/registry.py | 10 +- tests/test_agent_setup.py | 145 ++++++++++++++++++ tests/test_registry_invariants.py | 16 ++ tests/test_sandbox.py | 24 ++- tests/test_sandbox_setup.py | 20 +++ 9 files changed, 285 insertions(+), 72 deletions(-) create mode 100644 tests/test_agent_setup.py diff --git a/docs/api-reference.md b/docs/api-reference.md index 4e871fa..e09d142 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -122,6 +122,10 @@ Trial.run() ├─ setup() — resolve config, create env object ├─ start() — spin up sandbox, upload task files, start services ├─ install_agent() — install agent binary, credentials, sandbox user + │ (sandbox user setup: create non-root user, prepare + │ small config/auth dirs, chown the workspace — no + │ recursive copy of /root tool trees; agent binaries + │ must live on shared prefixes like /usr/local/bin) ├─ for scene in scenes: │ └─ _run_scene(scene) │ ├─ connect_as(role) — open ACP session for this role diff --git a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md index b6a6e35..9fc49ea 100644 --- a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md +++ b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md @@ -52,11 +52,11 @@ So the implementation target should be: - Create: `tests/test_sandbox_setup.py` - Modify: `src/benchflow/_sandbox.py` -- [ ] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. -- [ ] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. 
-- [ ] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. -- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` -- [ ] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. +- [x] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. +- [x] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. +- [x] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. +- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` +- [x] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. ### Task 2: Narrow `setup_sandbox_user()` to user state only @@ -65,16 +65,16 @@ So the implementation target should be: - Modify: `src/benchflow/_sandbox.py:103-128` - Test: `tests/test_sandbox_setup.py` -- [ ] Refactor `setup_sandbox_user()` so it does only three things: +- [x] Refactor `setup_sandbox_user()` so it does only three things: - create the sandbox user if missing, - prepare only small home-scoped directories BenchFlow actually needs for config/auth, - grant ownership to the workspace. -- [ ] Remove the unconditional copy of `/root/.nvm`. -- [ ] Remove the special-case copy of `/root/.local/bin`. -- [ ] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. 
-- [ ] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. -- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` -- [ ] Expected: the new setup contract tests pass. +- [x] Remove the unconditional copy of `/root/.nvm`. +- [x] Remove the special-case copy of `/root/.local/bin`. +- [x] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. +- [x] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. +- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` +- [x] Expected: the new setup contract tests pass. ### Task 3: Align registry semantics with the new contract @@ -83,13 +83,13 @@ So the implementation target should be: - Modify: `src/benchflow/agents/registry.py:338-366` - Modify: `tests/test_sandbox.py` -- [ ] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. -- [ ] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: +- [x] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. +- [x] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: - if BenchFlow still needs `.local/share` or similar user-scoped state, keep it but stop special-casing `.local/bin` in `_sandbox.py`; - if BenchFlow only needed `.local` for tool binaries, remove `.local` from the always-include set and update tests accordingly. -- [ ] Prefer the more minimal option based on the actual credential/skill paths in the registry. 
-- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` -- [ ] Expected: registry contract tests pass with updated semantics. +- [x] Prefer the more minimal option based on the actual credential/skill paths in the registry. +- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` +- [x] Expected: registry contract tests pass with updated semantics. ### Task 4: Verify agent install assumptions still hold @@ -99,10 +99,10 @@ So the implementation target should be: - Modify: `src/benchflow/agents/registry.py` - Test: `tests/test_sandbox_setup.py` -- [ ] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. -- [ ] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. -- [ ] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. -- [ ] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. +- [x] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. +- [x] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. +- [x] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. +- [x] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. 
### Task 5: Document the benchmark image guidance @@ -111,9 +111,9 @@ So the implementation target should be: - Modify: `docs/task-authoring.md` - Modify: `docs/api-reference.md` -- [ ] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user. -- [ ] Document that pre-creating the sandbox user in the Dockerfile is optional optimization, not the primary compatibility mechanism. -- [ ] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning. +- [x] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user. +- [x] Document that pre-creating the sandbox user in the Dockerfile is optional optimization, not the primary compatibility mechanism. +- [x] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning. ### Task 6: End-to-end verification @@ -121,10 +121,10 @@ So the implementation target should be: - No code changes required unless verification exposes a gap. -- [ ] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q` -- [ ] Run: `.venv/bin/ty check src/` -- [ ] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after. -- [ ] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial. 
+- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q` +- [x] Run: `.venv/bin/ty check src/` +- [x] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after. +- [x] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial. ## Notes / Risks diff --git a/docs/task-authoring.md b/docs/task-authoring.md index adfb641..9081987 100644 --- a/docs/task-authoring.md +++ b/docs/task-authoring.md @@ -50,6 +50,8 @@ env = { OPENAI_API_KEY = "${OPENAI_API_KEY}" } # host vars to injec **Built-in mock services** — if the Dockerfile references a service binary (`claw-gmail`, `claw-slack`, `claw-gcal`, `claw-gdoc`, `claw-gdrive`), BenchFlow starts it automatically. No `[services]` section needed. +**Install tooling to shared prefixes, not `/root`** — when a task image ships Node.js, Python tools, or agent binaries that the sandbox user must execute, install them to `/usr/local/bin`, `/usr/local/lib`, or `/opt`, not `/root/.nvm` or `/root/.local/bin`. `setup_sandbox_user()` creates the non-root user, prepares small config/auth dirs, and chowns the workspace — it does not clone `/root` into the sandbox home. Legacy images that already install tools under `/root` still work via a narrow symlink fallback, but shared prefixes are the supported path. Pre-creating the sandbox user in the Dockerfile is an optional speedup, not a requirement. 
+ --- ## instruction.md diff --git a/src/benchflow/_agent_setup.py b/src/benchflow/_agent_setup.py index 39b1720..5db5308 100644 --- a/src/benchflow/_agent_setup.py +++ b/src/benchflow/_agent_setup.py @@ -30,6 +30,47 @@ logger = logging.getLogger(__name__) +def _skill_link_cmd(source: str, dest: str) -> str: + """Link a shared skills tree into an agent discovery path.""" + if source == dest: + return f"mkdir -p {shlex.quote(dest)}" + + parent = shlex.quote(str(Path(dest).parent)) + q_source = shlex.quote(source) + q_dest = shlex.quote(dest) + return ( + f"mkdir -p {parent} && " + f"rm -rf {q_dest} && " + f"ln -sfn {q_source} {q_dest}" + ) + + +async def _link_skill_paths(env, source: str, skill_paths: list[str], home: str, cwd: str) -> int: + """Link one shared skills tree into each configured discovery path.""" + parts = [] + for sp in skill_paths: + expanded = sp.replace("$HOME", home).replace("$WORKSPACE", cwd) + parts.append(_skill_link_cmd(source, expanded)) + if parts: + cmd = " && ".join(parts) + result = await env.exec(cmd, timeout_sec=15) + if result.return_code != 0: + stdout = (getattr(result, "stdout", "") or "").strip() + stderr = (getattr(result, "stderr", "") or "").strip() + details = [ + f"exit code {result.return_code}", + f"command: {cmd}", + ] + if stdout: + details.append(f"stdout: {stdout}") + if stderr: + details.append(f"stderr: {stderr}") + raise RuntimeError( + f"Failed to link skills from {source}: {'; '.join(details)}" + ) + return len(parts) + + async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: """Install agent in sandbox and return its config.""" agent_base = agent.split()[0] @@ -72,6 +113,9 @@ async def deploy_skills( task: "Task", ) -> None: """Deploy and distribute skills into sandbox.""" + task_skills_dir = task.config.environment.skills_dir + effective_skills = task_skills_dir + # Runtime upload (fallback if not baked into Dockerfile) if skills_dir: dockerfile = task_path / "environment" / 
"Dockerfile" @@ -84,38 +128,24 @@ async def deploy_skills( if skills_path.is_dir(): logger.info(f"Deploying skills via runtime upload from {skills_path}") await env.upload_dir(skills_path, "/skills") - if agent_cfg and agent_cfg.skill_paths: - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", "/root").replace( - "$WORKSPACE", "/app" - ) - parent = str(Path(expanded).parent) - parts.append( - f"mkdir -p '{parent}' && ln -sf /skills '{expanded}'" - ) - await env.exec(" && ".join(parts), timeout_sec=10) - logger.info("Skills deployed to /skills and symlinked") + logger.info("Skills deployed to /skills") + effective_skills = "/skills" else: logger.warning(f"Skills dir not found: {skills_path}") else: logger.info("Skills already injected via Dockerfile") # Distribute to agent-specific discovery paths - task_skills_dir = task.config.environment.skills_dir - effective_skills = "/skills" if skills_dir else task_skills_dir if effective_skills and agent_cfg and agent_cfg.skill_paths: home = f"/home/{sandbox_user}" if sandbox_user else "/root" - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", home).replace("$WORKSPACE", agent_cwd) - q_expanded = shlex.quote(expanded) - q_skills = shlex.quote(effective_skills) - parts.append( - f"mkdir -p {q_expanded} && cp -r {q_skills}/. 
{q_expanded}/ 2>/dev/null" - ) - if parts: - await env.exec("; ".join(parts), timeout_sec=15) + count = await _link_skill_paths( + env, + effective_skills, + agent_cfg.skill_paths, + home, + agent_cwd, + ) + if count: logger.info( - f"Skills distributed to {len(parts)} paths for {agent_cfg.name}" + f"Skills distributed to {count} paths for {agent_cfg.name}" ) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index b809b39..7e3702f 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -338,18 +338,16 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. - Derives from four sources across all registered agents: - - skill_paths: $HOME/.foo/... → ".foo" + Derives from three sources across all registered agents: - credential_files: {home}/.foo/... → ".foo" - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") + + Skill paths are excluded: deploy_skills() now links those paths directly to a + shared skills tree instead of relying on sandbox-home copies. """ dirs: set[str] = set() for cfg in AGENTS.values(): - for sp in cfg.skill_paths: - if sp.startswith("$HOME/."): - dirname = sp.removeprefix("$HOME/").split("/")[0] - dirs.add(dirname) for cf in cfg.credential_files: # path uses {home}/.foo/... 
placeholder path = cf.path diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py new file mode 100644 index 0000000..f277bfb --- /dev/null +++ b/tests/test_agent_setup.py @@ -0,0 +1,145 @@ +"""Tests for agent install and skill deployment setup helpers.""" + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._agent_setup import deploy_skills +from benchflow.agents.registry import AgentConfig + + +def _make_task(skills_dir: str | None): + return SimpleNamespace( + config=SimpleNamespace( + environment=SimpleNamespace( + skills_dir=skills_dir, + ) + ) + ) + + +@pytest.mark.asyncio +async def test_deploy_skills_symlinks_agent_skill_paths_instead_of_copying(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + cmd = env.exec.await_args.args[0] + assert "cp -r" not in cmd + assert " && " in cmd + assert ";" not in cmd + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in cmd + assert "ln -sfn /opt/benchflow/skills /app/skills" in cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_uploads_runtime_skills_and_links_shared_tree(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + skills_dir = tmp_path / "skills" + 
skills_dir.mkdir() + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=skills_dir, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_awaited_once_with(skills_dir, "/skills") + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert " && " in distributed_link_cmd + assert ";" not in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" in distributed_link_cmd + assert "/root/.agents/skills" not in distributed_link_cmd + assert "/app/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_falls_back_when_local_skills_dir_is_missing(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=tmp_path / "missing-skills", + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /opt/benchflow/skills /workspace/skills" in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" not in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_raises_when_skill_linking_fails(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=17, stdout="link failed")) + 
env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills"], + ) + + with pytest.raises(RuntimeError, match="Failed to link skills"): + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) diff --git a/tests/test_registry_invariants.py b/tests/test_registry_invariants.py index 023945b..8cb9fa3 100644 --- a/tests/test_registry_invariants.py +++ b/tests/test_registry_invariants.py @@ -75,6 +75,22 @@ def test_agent_collection_invariants(name, cfg): assert d.startswith("."), f"home_dirs entry {d!r} must start with '.'" +@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) +def test_agent_install_cmd_targets_shared_paths(name, cfg): + """Installed binaries must land in shared prefixes, not a root-only home. + + setup_sandbox_user() no longer recursively copies /root/.nvm or + /root/.local/bin into the sandbox home. If an install_cmd placed its + binary there, the sandbox user would silently lose access to the agent. 
+ """ + forbidden_binary_prefixes = ("/root/.nvm/", "/root/.local/bin/", "$HOME/.nvm/") + for prefix in forbidden_binary_prefixes: + assert prefix not in cfg.install_cmd, ( + f"{name!r} install_cmd writes under {prefix!r}; use /usr/local/bin " + f"or another shared prefix so the sandbox user inherits the tool" + ) + + @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) def test_agent_credential_and_subscription_auth(name, cfg): """Optional credential_files and subscription_auth structures.""" diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index cebda60..36a8c80 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -10,16 +10,12 @@ class TestSandboxDirs: - def test_dirs_derived_from_skill_paths(self): - """All $HOME skill_paths dirs from AGENTS registry are included.""" + def test_skill_only_dirs_not_included(self): + """Skill-only home dirs should not be copied during sandbox user setup.""" dirs = get_sandbox_home_dirs() - # claude-agent-acp has $HOME/.claude/skills - assert ".claude" in dirs - # gemini has $HOME/.gemini/skills - assert ".gemini" in dirs - # pi-acp has $HOME/.pi/agent/skills and $HOME/.agents/skills - assert ".pi" in dirs - assert ".agents" in dirs + + assert ".pi" not in dirs + assert ".agents" not in dirs def test_credential_file_dirs_included(self): """Dirs from credential_files paths are included.""" @@ -46,10 +42,12 @@ def test_only_includes_top_level_home_dirs(self): def test_dirs_represent_registry_backed_home_config_or_auth(self): """Returned dirs are registry-derived user home config/auth roots.""" dirs = get_sandbox_home_dirs() - assert dirs == {".agents", ".claude", ".codex", ".gemini", ".openclaw", ".pi"} + assert {".claude", ".codex", ".gemini", ".openclaw"}.issubset(dirs) + assert ".agents" not in dirs + assert ".pi" not in dirs - def test_new_agent_auto_included(self): - """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" + def 
test_new_agent_skill_path_not_auto_included(self): + """Skill-only home dirs should not become sandbox copy targets.""" AGENTS["_test_agent"] = AgentConfig( name="_test_agent", install_cmd="true", @@ -58,7 +56,7 @@ def test_new_agent_auto_included(self): ) try: dirs = get_sandbox_home_dirs() - assert ".newagent" in dirs + assert ".newagent" not in dirs finally: del AGENTS["_test_agent"] diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py index f835ab6..f81b392 100644 --- a/tests/test_sandbox_setup.py +++ b/tests/test_sandbox_setup.py @@ -84,3 +84,23 @@ async def test_setup_command_copy_loop_excludes_local_dir(self): assert ".local" not in copy_loop_dirs assert "mkdir -p /home/agent/$d" in cmd assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd + + @pytest.mark.asyncio + async def test_setup_command_does_not_copy_heavy_tool_trees_into_home(self): + """BenchFlow-installed agents must not rely on sandbox-home tool copies. + + Agent binaries are placed in /usr/local/bin by the registered install_cmd + values, so setup_sandbox_user() must not bulk-copy heavyweight tool trees + (e.g. /root/.nvm, /root/.local/bin) to make them executable for the user. + """ + cmd, _ = await _run_setup_sandbox_user() + + for heavy_source in ("/root/.nvm", "/root/.local/bin"): + assert f"cp -a {heavy_source}/." not in cmd + assert f"cp -aL {heavy_source}/." 
not in cmd + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + for heavy_dir in (".nvm", ".local"): + assert heavy_dir not in copy_loop_dirs, ( + f"sandbox copy loop must not include heavy tool dir {heavy_dir!r}" + ) From cbca4959b5da993d0e81f79328d5aa0e9b3b79e4 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Wed, 22 Apr 2026 15:11:11 -0700 Subject: [PATCH 12/12] chore: remove completed sandbox plan doc --- ...6-04-22-sandbox-setup-shared-tool-paths.md | 133 ------------------ 1 file changed, 133 deletions(-) delete mode 100644 docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md diff --git a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md b/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md deleted file mode 100644 index 9fc49ea..0000000 --- a/docs/plans/2026-04-22-sandbox-setup-shared-tool-paths.md +++ /dev/null @@ -1,133 +0,0 @@ -# Sandbox Setup Shared Tool Paths Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Eliminate per-trial copying of heavyweight tool installations during `setup_sandbox_user()` while keeping sandboxed agents able to read configs, credentials, and installed binaries. - -**Architecture:** Treat tool installations as shared container state, not per-user home state. Keep `setup_sandbox_user()` responsible only for creating the non-root user, preparing small home-scoped config/credential directories, and granting workspace ownership. Add a narrow compatibility path for root-home installs that BenchFlow cannot relocate itself. - -**Tech Stack:** Python 3.12, pytest, Harbor sandbox env exec, BenchFlow agent registry / sandbox setup. 
- ---- - -## Recommendation - -Pick **alternative 2: shared install paths** as the primary direction, with a **small compatibility fallback from alternative 1** for existing images that already install tools under `/root`. - -Why this is the best fit: - -- `setup_sandbox_user()` currently copies `/root/.nvm`, `/root/.local/bin`, and every derived home dir. That is the direct cause of the timeout. -- BenchFlow already installs some helpers into shared locations like `/usr/local/bin`, which matches the right ownership boundary for binaries. -- Pre-creating the user in the Dockerfile (alternative 3) is useful for image authors, but it is not a sufficient product-level fix because BenchFlow must still support arbitrary user-provided images. -- Pure symlinking of `/root` home directories (alternative 1) would reduce I/O, but it keeps the wrong model: heavyweight tool installs still live in a root-only home and `setup_sandbox_user()` still has to know about them. - -So the implementation target should be: - -- **Primary path:** stop relying on root-home installs for agent binaries and shared tooling. -- **Compatibility path:** if an existing image already has a heavy tool dir in `/root` and BenchFlow needs it, link the minimum necessary path instead of copying it. - -## File Map - -- Modify: `src/benchflow/_sandbox.py` - - Remove heavyweight recursive copies from `setup_sandbox_user()`. - - Add a minimal compatibility helper for sharing root-owned tool paths without recursive copy. -- Modify: `src/benchflow/agents/registry.py` - - Clarify the contract for `home_dirs` / `get_sandbox_home_dirs()` so it covers user config, not shared tool installs. -- Modify: `src/benchflow/_agent_setup.py` - - Ensure the install path assumptions match the new sandbox contract. -- Modify: `tests/test_sandbox.py` - - Add contract tests for which home dirs are copied versus treated as shared tooling. 
-- Create or modify: `tests/test_sandbox_setup.py` - - Add targeted tests for the generated `setup_sandbox_user()` command. -- Modify: `docs/task-authoring.md` - - Document preferred shared install locations for benchmark/task images. -- Modify: `docs/api-reference.md` - - Update the sandbox setup description to match the new behavior. - -### Task 1: Lock the new sandbox contract in tests - -**Files:** - -- Modify: `tests/test_sandbox.py` -- Create: `tests/test_sandbox_setup.py` -- Modify: `src/benchflow/_sandbox.py` - -- [x] Add a focused async test file for `setup_sandbox_user()` that mocks `env.exec`, calls `setup_sandbox_user(env, "agent", "/app")`, and asserts the command no longer contains recursive copies of `/root/.nvm` or `/root/.local/bin`. -- [x] In the same test file, assert the command still creates the user, prepares the target home directory, and `chown`s the workspace. -- [x] Add a compatibility test that asserts the command uses a link-or-shared-path strategy for heavyweight root-owned tool dirs rather than `cp -a`. -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py -q` -- [x] Expected: new tests fail on current `main` because the command still contains `cp -a` / `cp -aL` for heavyweight tool dirs. - -### Task 2: Narrow `setup_sandbox_user()` to user state only - -**Files:** - -- Modify: `src/benchflow/_sandbox.py:103-128` -- Test: `tests/test_sandbox_setup.py` - -- [x] Refactor `setup_sandbox_user()` so it does only three things: - - create the sandbox user if missing, - - prepare only small home-scoped directories BenchFlow actually needs for config/auth, - - grant ownership to the workspace. -- [x] Remove the unconditional copy of `/root/.nvm`. -- [x] Remove the special-case copy of `/root/.local/bin`. -- [x] Keep copying only the small dirs derived from `get_sandbox_home_dirs()` that represent config/auth state, not tool installations. 
-- [x] Add a minimal helper or inline shell snippet that, when a legacy image exposes required tool paths only under `/root`, creates a symlink to the required path instead of copying the directory tree. -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox_setup.py -q` -- [x] Expected: the new setup contract tests pass. - -### Task 3: Align registry semantics with the new contract - -**Files:** - -- Modify: `src/benchflow/agents/registry.py:338-366` -- Modify: `tests/test_sandbox.py` - -- [x] Update the `get_sandbox_home_dirs()` docstring to state that the returned dirs are user home config/auth dirs that BenchFlow may need to materialize for the sandbox user. -- [x] Decide whether `.local` should remain in `get_sandbox_home_dirs()`: - - if BenchFlow still needs `.local/share` or similar user-scoped state, keep it but stop special-casing `.local/bin` in `_sandbox.py`; - - if BenchFlow only needed `.local` for tool binaries, remove `.local` from the always-include set and update tests accordingly. -- [x] Prefer the more minimal option based on the actual credential/skill paths in the registry. -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py -q` -- [x] Expected: registry contract tests pass with updated semantics. - -### Task 4: Verify agent install assumptions still hold - -**Files:** - -- Modify: `src/benchflow/_agent_setup.py` -- Modify: `src/benchflow/agents/registry.py` -- Test: `tests/test_sandbox_setup.py` - -- [x] Review each registry `install_cmd` and confirm BenchFlow-installed binaries resolve from shared paths already on `PATH` for the sandbox user. -- [x] If any BenchFlow-managed install still lands in a root-home path, change that install command to a shared location such as `/usr/local/bin`, `/usr/local/lib`, or another non-home prefix. -- [x] Add or adjust a test that asserts the sandbox user setup no longer depends on home-copying to make BenchFlow-installed agents executable. 
-- [x] Run targeted tests covering sandbox setup and any registry invariant tests touched by the change. - -### Task 5: Document the benchmark image guidance - -**Files:** - -- Modify: `docs/task-authoring.md` -- Modify: `docs/api-reference.md` - -- [x] Add a short section to `docs/task-authoring.md` explaining that benchmark images should install shared tooling into shared prefixes like `/usr/local` or `/opt`, not `/root/.nvm` or `/root/.local/bin`, when the tools must be usable by a sandbox user. -- [x] Document that pre-creating the sandbox user in the Dockerfile is optional optimization, not the primary compatibility mechanism. -- [x] Update `docs/api-reference.md` to describe `setup_sandbox_user()` as lightweight user setup plus workspace ownership, not recursive home cloning. - -### Task 6: End-to-end verification - -**Files:** - -- No code changes required unless verification exposes a gap. - -- [x] Run: `.venv/bin/python -m pytest tests/test_sandbox.py tests/test_sandbox_setup.py tests/test_sandbox_hardening.py tests/test_sandbox_verifier_workspace.py -q` -- [x] Run: `.venv/bin/ty check src/` -- [x] If available, run one trial startup path that exercises `Trial.install_agent()` with `sandbox_user="agent"` and capture setup timing before/after. -- [x] Confirm the setup command no longer performs recursive copies of heavyweight tool trees per trial. - -## Notes / Risks - -- The collaborator comment on the issue asks to try shared install paths first. This plan follows that request as the architectural choice. -- Shared install paths alone cannot retrofit arbitrary existing benchmark images that already baked tools into `/root`. That is why the plan keeps a narrow symlink-based compatibility path instead of a full copy. -- The key review question before implementation is whether `.local` remains a true user-state directory in BenchFlow, or whether it only existed because of previous tool-install assumptions.