diff --git a/.claude/skills/benchflow/SKILL.md b/.claude/skills/benchflow/SKILL.md index 1f21b5fd..c740791b 100644 --- a/.claude/skills/benchflow/SKILL.md +++ b/.claude/skills/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -199,7 +199,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md b/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md index 7d3817bf..57af0d81 100644 --- a/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md +++ b/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -182,7 +182,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . 
(from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md b/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md index 7d3817bf..57af0d81 100644 --- a/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md +++ b/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -182,7 +182,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/CLAUDE.md b/CLAUDE.md index 7695ddc1..b089ffe3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ Multi-turn agent benchmarking with ACP. -Docs: `docs/quickstart.md`, `docs/cli-reference.md`, `docs/api-reference.md`, `docs/task-authoring.md`, `docs/use-cases.md`. +Docs: `docs/quickstart.md`, `docs/cli-reference.md`, `docs/api-reference.md`, `docs/task-authoring.md`, `docs/use-cases.md`, `docs/progressive-disclosure.md`. ## Setup diff --git a/README.md b/README.md index b750a519..e56d4553 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It s ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` -Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`. +Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). For cloud sandboxes, set `DAYTONA_API_KEY`. 
## Quick Start @@ -129,6 +129,21 @@ bench environment create Spin up sandbox from task dir bench environment list List active sandboxes ``` +## Terminology + +| Term | Definition | Example | +|------|-----------|---------| +| **Turn** | One prompt in one ACP session — one role acts | Coder writes a regex | +| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent | +| **Round** | One A→B exchange between different roles | Coder → Reviewer | +| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder | +| **Scene** | Interaction region with roles + turns | A code-review scene | +| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve | + +**Inter-role messaging:** In multi-role scenes, agents communicate via outbox files. +An agent writes `/app/.outbox/{recipient}.json` with `{"to": "role", "content": "..."}`. +The scheduler reads these after each turn and injects the message into the next role's prompt. + ## Architecture ``` @@ -143,6 +158,10 @@ bf.run(config) → trial.start() # spin up sandbox, upload task files → for scene in config.scenes: → trial._run_scene(scene) # connect/execute/disconnect per role + → setup /app/.outbox/ # (multi-role scenes only) + → for turn in scene.turns: + → read outbox → inject messages into prompt + → connect as role → execute → disconnect → trial.verify() # run verifier, score → trial.cleanup() # stop sandbox ``` diff --git a/benchmarks/followup-bench/runner.py b/benchmarks/followup-bench/runner.py index e3558d51..d942ea6c 100644 --- a/benchmarks/followup-bench/runner.py +++ b/benchmarks/followup-bench/runner.py @@ -28,7 +28,7 @@ from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_setup import install_agent from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.runtime import Environment logger = logging.getLogger(__name__) diff 
--git a/docs/api-reference.md b/docs/api-reference.md index 4e871fa8..00325e6c 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -5,7 +5,7 @@ The Trial/Scene API is the primary way to run agent benchmarks programmatically. ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Quick Start @@ -34,6 +34,7 @@ config = TrialConfig( task_path=Path("tasks/my-task"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=120, ) # Multi-scene BYOS (skill-gen → solve) @@ -46,9 +47,13 @@ config = TrialConfig( turns=[Turn("solver")]), ], environment="daytona", + sandbox_setup_timeout=120, ) ``` +Set `sandbox_setup_timeout` when sandbox user setup needs more than the default 120 seconds. +The same field is also available on `JobConfig` and `RuntimeConfig`. + ### Scene One interaction region — roles take turns executing prompts. @@ -57,7 +62,9 @@ One interaction region — roles take turns executing prompts. # Single-role shortcut scene = Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview") -# Multi-role with turn order +# Multi-role with turn order (coder-reviewer pattern) +# Agents communicate via outbox: write /app/.outbox/{recipient}.json +# Scheduler reads outbox after each turn, injects into next role's prompt scene = Scene( name="coder-reviewer", roles=[ @@ -66,8 +73,9 @@ scene = Scene( ], turns=[ Turn("coder"), # None prompt = instruction.md - Turn("reviewer", "Review..."), - Turn("coder", "Fix issues..."), + Turn("reviewer", "Review the code. Write feedback to " + '/app/.outbox/coder.json as {"to":"coder","content":"..."}'), + Turn("coder", "Fix the issues."), # reviewer's feedback auto-injected ], ) ``` @@ -95,6 +103,20 @@ await trial.verify() await trial.cleanup() ``` +### RuntimeConfig + +Runtime-level configuration for the `Agent + Environment` execution path. 
+ +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +config = RuntimeConfig(sandbox_setup_timeout=300) +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/X", backend="daytona") +runtime = Runtime(env, agent, config=config) +result = await runtime.execute() +``` + ### bf.run() Convenience function — multiple calling conventions: @@ -108,10 +130,16 @@ result = await bf.run(config) # 2. Agent + Environment (0.3 style) agent = bf.Agent("gemini", model="gemini-3.1-flash-lite-preview") env = bf.Environment.from_task("tasks/X", backend="daytona") -result = await bf.run(agent, env) +runtime_config = bf.RuntimeConfig(sandbox_setup_timeout=300) +result = await bf.run(agent, env, runtime_config) # 3. String shortcut (simplest) -result = await bf.run("gemini", task_path="tasks/X", model="gemini-3.1-flash-lite-preview") +result = await bf.run( + "gemini", + task_path="tasks/X", + model="gemini-3.1-flash-lite-preview", + config=bf.RuntimeConfig(sandbox_setup_timeout=300), +) ``` ## Trial Lifecycle @@ -122,17 +150,38 @@ Trial.run() ├─ setup() — resolve config, create env object ├─ start() — spin up sandbox, upload task files, start services ├─ install_agent() — install agent binary, credentials, sandbox user + │ (sandbox user setup: create non-root user, prepare + │ small config/auth dirs, chown the workspace — no + │ recursive copy of /root tool trees; agent binaries + │ must live on shared prefixes like /usr/local/bin) ├─ for scene in scenes: │ └─ _run_scene(scene) - │ ├─ connect_as(role) — open ACP session for this role - │ ├─ execute(prompts) — send prompts, collect trajectory - │ └─ disconnect() — kill agent process, clean up + │ ├─ setup /app/.outbox/ — (multi-role scenes only) + │ └─ for turn in scene.turns: + │ ├─ read outbox — inject messages into prompt + │ ├─ connect_as(role) — open ACP session for this role + │ ├─ execute(prompts) — send prompts, collect trajectory + │ └─ 
disconnect() — kill agent process, clean up ├─ verify() — run verifier, collect rewards └─ cleanup() — stop sandbox ``` Key: `disconnect()` kills the agent process between scenes to prevent context bleed. Each scene gets a fresh agent session. +## Multi-Turn vs Multi-Round + +| Pattern | Roles | Turns | Communication | Example | +|---------|-------|-------|---------------|---------| +| **Single-turn** | 1 | 1 | — | Baseline benchmark | +| **Multi-turn** | 1 | 2+ | Same session, sequential prompts | Self-review | +| **Multi-round** | 2+ | 2+ | Outbox files between roles | Coder + Reviewer | + +**Multi-turn** = same agent gets multiple prompts. Use when a second pass catches errors (self-review, iterative refinement). The agent keeps its context across turns. + +**Multi-round** = different agents exchange turns. Use when tasks need multiple perspectives (code review, client-advisor). The scheduler reads outbox files and injects messages. + +Both use the same API — `TrialConfig` with different `Scene` configurations. + ## Multi-Agent Patterns ### Coder + Reviewer (followup-bench) @@ -169,6 +218,17 @@ config = TrialConfig( ) ``` +## 0.3 Limitations + +The Scene API in 0.3 covers coder-reviewer and multi-turn patterns. It does **not** yet support: + +- **Dynamic termination** — turn count is fixed at config time. A "user" role cannot decide to stop early based on agent output. Workaround: use `max_rounds` in the standalone `_scene.py` scheduler. +- **Oracle access** — no mechanism for a "user" role to read `/solution` during setup. +- **Per-round verification** — `verify()` runs once after all scenes complete, not between rounds. +- **Inter-round trajectory inspection** — a "user" role cannot read the agent's trajectory between turns. + +These are tracked for 0.4. See the [Harbor PR #1462 mapping](notebooks/scene-patterns.ipynb) for details.
+ ## YAML Trial Configs ```python diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 86c702e6..4b387889 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -40,7 +40,8 @@ bench eval create \ -a gemini \ -m gemini-3.1-flash-lite-preview \ -e daytona \ - -c 64 + -c 64 \ + --sandbox-setup-timeout 300 ``` | Flag | Default | Description | @@ -53,6 +54,7 @@ bench eval create \ | `--concurrency`, `-c` | `4` | Max concurrent tasks (batch mode only) | | `--jobs-dir`, `-o` | `jobs` | Output directory | | `--sandbox-user` | `agent` | Sandbox user (null for root) | +| `--sandbox-setup-timeout` | `120` | Timeout in seconds for sandbox user setup | ### bench eval list @@ -145,6 +147,7 @@ bench environment list task_dir: .ref/terminal-bench-2 environment: daytona concurrency: 64 +sandbox_setup_timeout: 300 scenes: - name: solve @@ -165,6 +168,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ### Multi-scene (BYOS skill generation) @@ -173,6 +177,7 @@ max_retries: 2 task_dir: tasks/ environment: daytona concurrency: 10 +sandbox_setup_timeout: 300 scenes: - name: skill-gen diff --git a/docs/notebooks/coder-reviewer-demo.py b/docs/notebooks/coder-reviewer-demo.py new file mode 100644 index 00000000..010b492a --- /dev/null +++ b/docs/notebooks/coder-reviewer-demo.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Coder-Reviewer demo — a simple two-agent Scene run via bf.run(). + +Demonstrates: + - Multi-role Scene (coder + reviewer) in a shared sandbox + - Outbox-based message passing between roles + - Standard bf.run(TrialConfig) API — same path for single or multi-agent + +Requirements: + - uv tool install benchflow + - GEMINI_API_KEY or DAYTONA_API_KEY set + - A Harbor-format task directory (e.g.
.ref/terminal-bench-2/regex-log) + +Usage: + python docs/notebooks/coder-reviewer-demo.py --task .ref/terminal-bench-2/regex-log + python docs/notebooks/coder-reviewer-demo.py --task .ref/terminal-bench-2/regex-log --env docker + +Terminology: + - Turn: One prompt → one ACP session (one role acts) + - Multi-turn: Same role, multiple turns (e.g. self-review: agent → agent) + - Round: One A→B exchange between different roles + - Multi-round: Different roles exchanging turns (e.g. coder → reviewer → coder) +""" + +from __future__ import annotations + +import argparse +import asyncio +import logging +import sys +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Role, Scene, TrialConfig, Turn + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", +) +logger = logging.getLogger("coder-reviewer-demo") + + +# --------------------------------------------------------------------------- +# Scene definitions +# --------------------------------------------------------------------------- + +def baseline_config(task_path: Path, env: str, agent: str, model: str) -> TrialConfig: + """Pattern 1: Single agent, single turn — the baseline.""" + return TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=agent, model=model)], + environment=env, + ) + + +def coder_reviewer_config( + task_path: Path, + env: str, + coder_agent: str = "gemini", + coder_model: str = "gemini-3.1-flash-lite-preview", + reviewer_agent: str = "gemini", + reviewer_model: str = "gemini-3.1-flash-lite-preview", +) -> TrialConfig: + """Pattern 3: Coder + Reviewer — multi-round with outbox messaging. + + Flow: + 1. Coder attempts the task + 2. Reviewer reads coder's work, writes feedback to /app/.outbox/coder.json + 3. 
Coder receives feedback (injected by scheduler), revises solution + + The outbox convention: + - Agent writes: /app/.outbox/{recipient}.json + - Format: {"to": "role_name", "content": "your message"} + - Scheduler reads, clears, and injects into next role's prompt + """ + return TrialConfig( + task_path=task_path, + scenes=[Scene( + name="code-review", + roles=[ + Role("coder", coder_agent, coder_model), + Role("reviewer", reviewer_agent, reviewer_model), + ], + turns=[ + Turn("coder"), + Turn("reviewer", + "Review the code in /app/. Check for correctness, edge cases, " + "and adherence to the task requirements in /app/instruction.md. " + "Write your feedback to /app/.outbox/coder.json as: " + '{"to": "coder", "content": "Your specific feedback here."}'), + Turn("coder", + "Read the reviewer's feedback and fix the issues. " + "Focus only on what was flagged — don't start over."), + ], + )], + environment=env, + ) + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- + +async def run_comparison( + task_path: Path, + env: str, + agent: str, + model: str, +) -> None: + """Run baseline vs coder-reviewer and compare.""" + logger.info("=== Baseline (single agent, single turn) ===") + baseline = baseline_config(task_path, env, agent, model) + baseline_result = await bf.run(baseline) + baseline_reward = (baseline_result.rewards or {}).get("reward") + logger.info(f"Baseline: reward={baseline_reward}, tools={baseline_result.n_tool_calls}") + + logger.info("=== Coder + Reviewer (multi-round) ===") + review = coder_reviewer_config(task_path, env, agent, model, agent, model) + review_result = await bf.run(review) + review_reward = (review_result.rewards or {}).get("reward") + logger.info(f"Reviewed: reward={review_reward}, tools={review_result.n_tool_calls}") + + print("\n" + "=" * 60) + print("Results") + print("=" * 60) + print(f" Baseline: 
reward={baseline_reward} tool_calls={baseline_result.n_tool_calls}") + print(f" Reviewed: reward={review_reward} tool_calls={review_result.n_tool_calls}") + if baseline_reward is not None and review_reward is not None: + lift = review_reward - baseline_reward + print(f" Lift: {lift:+.2f}") + print("=" * 60) + + +async def run_single(task_path: Path, env: str, agent: str, model: str) -> None: + """Run coder-reviewer only.""" + config = coder_reviewer_config(task_path, env, agent, model, agent, model) + result = await bf.run(config) + reward = (result.rewards or {}).get("reward") + print(f"reward={reward} tool_calls={result.n_tool_calls} error={result.error}") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Coder-Reviewer demo") + parser.add_argument("--task", type=Path, required=True, help="Path to task directory") + parser.add_argument("--env", default="daytona", choices=["daytona", "docker"]) + parser.add_argument("--agent", default="gemini") + parser.add_argument("--model", default="gemini-3.1-flash-lite-preview") + parser.add_argument("--compare", action="store_true", + help="Run baseline and coder-reviewer side by side") + args = parser.parse_args() + + if not args.task.exists(): + print(f"Task directory not found: {args.task}", file=sys.stderr) + sys.exit(1) + + if args.compare: + asyncio.run(run_comparison(args.task, args.env, args.agent, args.model)) + else: + asyncio.run(run_single(args.task, args.env, args.agent, args.model)) + + +if __name__ == "__main__": + main() diff --git a/docs/notebooks/nanofirm-task/tests/evaluate.py b/docs/notebooks/nanofirm-task/tests/evaluate.py index 2f4fd75f..254f70d3 100644 --- a/docs/notebooks/nanofirm-task/tests/evaluate.py +++ b/docs/notebooks/nanofirm-task/tests/evaluate.py @@ -1,6 +1,5 @@ """Evaluate contract risk analysis quality.""" import json -import sys ANALYSIS_PATH = "/app/analysis.json" REWARD_PATH = "/logs/verifier/reward.txt" diff --git a/docs/notebooks/scene-patterns.ipynb 
b/docs/notebooks/scene-patterns.ipynb index 3f9277cf..488869f2 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -6,299 +6,270 @@ "source": [ "# BenchFlow Scene Patterns\n", "\n", - "Six evaluation patterns — multi-turn, multi-round, multi-scene — expressed as Scene configs.\n", + "Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n", "\n", - "| Term | Definition |\n", - "|------|------------|\n", - "| **Turn** | One prompt in one ACP session |\n", - "| **Round** | One A→B exchange between roles |\n", - "| **Scene** | Interaction region with roles + turns |\n", - "| **Trial** | Sequence of scenes in shared sandbox |" + "| Term | Definition | Example |\n", + "|------|-----------|--------|\n", + "| **Turn** | One prompt in one ACP session | Coder writes code |\n", + "| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n", + "| **Round** | One A→B exchange between roles | Coder → Reviewer |\n", + "| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n", + "| **Scene** | Interaction region with roles + turns | A code-review scene |\n", + "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n", + "\n", + "**Prerequisites:**\n", + "- `uv tool install benchflow`\n", + "- `GEMINI_API_KEY` or `GOOGLE_API_KEY` set\n", + "- `DAYTONA_API_KEY` set (or Docker daemon running for `--env docker`)\n", + "- A task directory (this notebook uses `.ref/terminal-bench-2/regex-log`)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "from benchflow.trial import TrialConfig, Scene, Role, Turn\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pattern 1: Single-Agent Baseline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, "outputs": [ { "name": "stdout",
"output_type": "stream", "text": [ - "1 scene, 1 role, 1 turn\n" + "Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: gemini-3.1-flash-lite-preview\n" ] } ], "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[Scene.single(agent=\"gemini\", model=\"gemini-3.1-flash-lite-preview\")],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} role, {len(s.turns)} turn\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 3 tool calls. Agent wrote a regex but missed edge cases." + "import logging\n", + "from pathlib import Path\n", + "\n", + "import benchflow as bf\n", + "from benchflow.trial import Role, Scene, TrialConfig, Turn\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(name)s %(message)s\")\n", + "\n", + "TASK = Path(\".ref/terminal-bench-2/regex-log\")\n", + "ENV = \"daytona\"\n", + "AGENT = \"gemini\"\n", + "MODEL = \"gemini-3.1-flash-lite-preview\"\n", + "\n", + "results = {} # pattern_name -> RunResult\n", + "print(f\"Task: {TASK} Env: {ENV} Agent: {AGENT} Model: {MODEL}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 2: Single-Agent Multi-Turn\n", - "Same agent, two prompts. Self-review variant." + "## Pattern 1: Single-Agent Baseline\n", + "\n", + "One agent, one turn. The simplest configuration." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 1 role, 2 turns\n" + "Reward: 1.0\n", + "Tool calls: 3\n", + "Error: None\n" ] } ], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[Scene(\n", - " name=\"self-review\",\n", - " roles=[Role(\"agent\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[\n", - " Turn(\"agent\"),\n", - " Turn(\"agent\", \"Review your solution. Check edge cases and fix issues.\"),\n", - " ],\n", - " )],\n", - " environment=\"daytona\",\n", + " task_path=TASK,\n", + " scenes=[Scene.single(agent=AGENT, model=MODEL)],\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} role, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=1.0, 12 tool calls. Self-review caught the regex bug." + "\n", + "result = await bf.run(config)\n", + "results[\"baseline\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 3: Multi-Round Code Review\n", - "Two roles — coder and reviewer — take turns." + "## Pattern 2: Multi-Turn Self-Review\n", + "\n", + "Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n", + "\n", + "Use when: a second pass catches what the first missed. Cost: 2x baseline." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 2 roles, 3 turns\n" + "Reward: 1.0\n", + "Tool calls: 7\n", + "Error: None\n" ] } ], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", + " task_path=TASK,\n", " scenes=[Scene(\n", - " name=\"code-review\",\n", - " roles=[\n", - " Role(\"coder\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"reviewer\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " ],\n", + " name=\"self-review\",\n", + " roles=[Role(\"agent\", AGENT, MODEL)],\n", " turns=[\n", - " Turn(\"coder\"),\n", - " Turn(\"reviewer\", \"Review the code. Write feedback to /app/.outbox/coder.json\"),\n", - " Turn(\"coder\", \"Read feedback and fix issues.\"),\n", + " Turn(\"agent\"),\n", + " Turn(\"agent\", \"Review your solution. Check edge cases, \"\n", + " \"boundary conditions, and adherence to the requirements. Fix issues.\"),\n", " ],\n", " )],\n", - " environment=\"daytona\",\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 16 tool calls.\n", "\n", - "**At scale (267 trials on TB2):** baseline 9.0% → reviewer 19.4% win rate." + "result = await bf.run(config)\n", + "results[\"self_review\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 4: Interactive User Simulation\n", - "A \"user\" role reveals info gradually. Agent responds." 
+ "## Pattern 3: Coder + Reviewer (Multi-Round)\n", + "\n", + "Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n", + "\n", + "Communication uses the **outbox convention**:\n", + "1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n", + "2. Scheduler reads the outbox after the reviewer's turn\n", + "3. Feedback is automatically injected into the coder's next prompt\n", + "\n", + "Use when: tasks benefit from independent review. Cost: 3x baseline." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 2 roles, 2 turns\n" + "Reward: 0.0\n", + "Tool calls: 13\n", + "Error: None\n" ] } ], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", + " task_path=TASK,\n", " scenes=[Scene(\n", - " name=\"interactive\",\n", + " name=\"code-review\",\n", " roles=[\n", - " Role(\"user\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"agent\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", + " Role(\"coder\", AGENT, MODEL),\n", + " Role(\"reviewer\", AGENT, MODEL),\n", " ],\n", " turns=[\n", - " Turn(\"user\", \"Give the agent a vague version of the task...\"),\n", - " Turn(\"agent\"),\n", + " Turn(\"coder\"),\n", + " Turn(\"reviewer\",\n", + " \"Review the code in /app/. Read /app/instruction.md for requirements. \"\n", + " \"Check for correctness, edge cases, and missed requirements. \"\n", + " \"Write your feedback to /app/.outbox/coder.json as: \"\n", + " '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n", + " Turn(\"coder\",\n", + " \"Read the reviewer's feedback and fix the issues. 
\"\n", + " \"Focus only on what was flagged — don't start over.\"),\n", " ],\n", " )],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 2 tool calls. Vague instruction wasn't enough." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pattern 5: Multi-Scene BYOS\n", - "Two scenes — skill generation (unscored) then solve (scored)." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 scenes: skill-gen (1 turn), solve (1 turn)\n" - ] - } - ], - "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[\n", - " Scene(name=\"skill-gen\",\n", - " roles=[Role(\"gen\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[Turn(\"gen\", \"Write a skill document to /app/generated-skill.md\")]),\n", - " Scene(name=\"solve\",\n", - " roles=[Role(\"solver\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[Turn(\"solver\")]),\n", - " ],\n", - " environment=\"daytona\",\n", + " environment=ENV,\n", ")\n", - "print(f\"{len(config.effective_scenes)} scenes: {', '.join(f'{s.name} ({len(s.turns)} turn)' for s in config.effective_scenes)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 5 tool calls. Self-generated skills = 0pp lift (consistent with paper)." 
+ "\n", + "result = await bf.run(config)\n", + "results[\"coder_reviewer\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 6: Contract Consultation\n", - "Realistic multi-round: client CTO + advisor reviewing a vendor contract." + "## Comparison" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 2 roles, 4 turns (2 rounds)\n" + "Pattern Reward Tools Error\n", + "------------------------------------------------------------\n", + "baseline 1.0 3 —\n", + "self_review 1.0 7 —\n", + "coder_reviewer 0.0 13 —\n" ] } ], "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/contract-review\"),\n", - " scenes=[Scene(\n", - " name=\"consultation\",\n", - " roles=[\n", - " Role(\"client\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"advisor\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " ],\n", - " turns=[\n", - " Turn(\"client\"), # shares situation\n", - " Turn(\"advisor\"), # analyzes contract\n", - " Turn(\"client\", \"Clarify your top 2 priorities.\"),\n", - " Turn(\"advisor\", \"Give final redline recommendations.\"),\n", - " ],\n", - " )],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns ({len(s.turns)//2} rounds)\")" + "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", + "print(\"-\" * 60)\n", + "for name, r in results.items():\n", + " reward = (r.rewards or {}).get(\"reward\", \"—\")\n", + " err = r.error or \"—\"\n", + " if len(err) > 30:\n", + " err = err[:27] + \"...\"\n", + " print(f\"{name:<20} {reward!s:>8} {r.n_tool_calls:>6} 
{err}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Each pattern is a TrialConfig — same API, same lifecycle, same verifier. The only difference is the Scene configuration." + "## Mapping to Harbor PR #1462 Concepts\n", + "\n", + "For users coming from Harbor's multi-turn proposal:\n", + "\n", + "| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n", + "|-------------------|---------------|--------|\n", + "| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent | **Shipped** |\n", + "| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role | **Shipped** |\n", + "| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n", + "| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n", + "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n", + "| `User.run() → None` (stop) | Fixed turn count only — no dynamic termination | **Gap** |\n", + "| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n", + "| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n", + "| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n", + "\n", + "**What BenchFlow 0.3 delivers:** Multi-role scenes with outbox messaging, message persistence,\n", + "and the same API for single-agent and multi-agent runs. N roles per scene, sequential multi-scene trials.\n", + "\n", + "**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n", + "for user roles, per-round verification, or inter-round trajectory inspection. These require\n", + "extending the Scene scheduler with callbacks — tracked for 0.4." 
] } ], @@ -315,4 +286,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/notebooks/scene-patterns.py b/docs/notebooks/scene-patterns.py index a95369b8..961bd09a 100644 --- a/docs/notebooks/scene-patterns.py +++ b/docs/notebooks/scene-patterns.py @@ -18,9 +18,6 @@ """ import os -import sys -import json -from pathlib import Path # ── The contract (constructed here, no external files) ────────────── diff --git a/docs/progressive-disclosure.md b/docs/progressive-disclosure.md new file mode 100644 index 00000000..13e3c781 --- /dev/null +++ b/docs/progressive-disclosure.md @@ -0,0 +1,214 @@ +# Progressive Disclosure with `BaseUser` + +A pattern for multi-round agent runs where a Python callback drives the loop, deciding what to tell the agent next based on what happened in the previous round. + +This is BenchFlow's lightweight alternative to multi-agent "user simulation" Scenes (see [use-cases.md](./use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)). Use a `BaseUser` callback when: + +- You need programmatic control over the loop (e.g. terse prompt → hints on test failure → stop on pass). +- You don't want to spin up a second LLM just to play the "user" role. +- Your "user" logic is rule-based or oracle-guided rather than open-ended. + +For comparison: a Scene-based simulated user is another LLM with its own tool access, useful for nuanced feedback. A `BaseUser` is a sync/async Python function, useful for deterministic, scriptable progressive disclosure. + +--- + +## Why this exists + +This was built for [Josh's SWE-bench Pro use case](https://github.com/swe-bench-pro/swe-bench-pro): the dataset's instructions are long structured specs that overwhelm agents in a single turn. A `BaseUser` lets you compress the spec to a terse prompt for round 0, watch which tests fail, then disclose hints from the spec on subsequent rounds. 
+ +It is also benchflow's parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case — Harbor's proposal required a FastMCP sidecar; BenchFlow's `BaseUser` is in-process Python. + +--- + +## Quick start + +```python +import asyncio +from pathlib import Path +import benchflow as bf +from benchflow import FunctionUser, RoundResult +from benchflow.trial import TrialConfig, Scene + + +def my_user(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + # Round 0: terse prompt, no hints + return instruction.split("\n")[0] + if rr and rr.rewards and rr.rewards.get("reward", 0) >= 1.0: + return None # passed, stop + if round >= 3: + return None # cap at 3 rounds + # Otherwise: show the failing tests as a hint for next round + return ( + f"The previous attempt failed these tests:\n{rr.verifier_output}\n" + f"Here is the full spec for context:\n{instruction}" + ) + + +config = TrialConfig( + task_path=Path(".ref/swebenchpro/instance_flipt-io__flipt-..."), + scenes=[Scene.single(agent="opencode", model="anthropic/claude-sonnet-4-6")], + user=FunctionUser(my_user), + max_user_rounds=3, + environment="daytona", +) +result = asyncio.run(bf.run(config)) +``` + +--- + +## API + +### `BaseUser` + +Subclass and override `run()`. Optionally override `setup()` for one-time initialization. + +```python +from benchflow import BaseUser, RoundResult + + +class MyUser(BaseUser): + async def setup(self, instruction: str, solution: str | None = None) -> None: + """Called once before the first round. + + instruction — the original task instruction (from instruction.md) + solution — the gold answer if oracle_access=True, else None + """ + self.spec_lines = instruction.split("\n") + self.gold = solution # only set if oracle_access=True + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + """Produce the next prompt, or None to stop the loop. 
+ + round — 0-indexed round number + instruction — the original task instruction + round_result — None on round 0; previous round's outcome on subsequent rounds + """ + ... # return prompt str or None +``` + +### `RoundResult` + +Dataclass passed to `run()` from round 1 onward. + +```python +@dataclass +class RoundResult: + round: int # 0-indexed + trajectory: list[dict] # ACP events from this round only + rewards: dict[str, Any] | None # verifier rewards (None if verifier crashed) + verifier_output: str | None # raw verifier stdout/log content + verifier_error: str | None # exception message if verifier failed + n_tool_calls: int # tool calls in this round +``` + +### `PassthroughUser` + +Sends the instruction unchanged on round 0, stops on round 1. Backward-compatible single-round behavior. + +### `FunctionUser` + +Wraps a plain function as a `BaseUser`. Sync and async both supported (via `inspect.isawaitable`). + +```python +def fn(round, instruction, rr): return None if round > 0 else instruction +user = FunctionUser(fn) + +async def afn(round, instruction, rr): ... +user = FunctionUser(afn) +``` + +### `TrialConfig` fields + +```python +user: BaseUser | None = None # the callback +max_user_rounds: int = 5 # hard cap on rounds (loop stops earlier if user returns None) +oracle_access: bool = False # expose gold solution to user.setup() +``` + +A `User` requires a single-scene, single-role config. Multi-scene or multi-role configs raise `ValueError`. + +--- + +## Oracle access + +When `oracle_access=True`, the trial: + +1. Reads `/solution/solve.sh` before the agent starts and passes its content to `user.setup(instruction, solution=...)`. +2. Moves `/solution` → `/solution_oracle_backup` so the agent cannot read it during its rounds. +3. Temporarily restores `/solution` for `soft_verify()` between rounds (and re-hides it). +4. Restores `/solution` permanently before the final `verify()`. 
+ +Step 4 is wrapped in a `try/finally`, so if a round throws, the restore still runs. + +> ⚠️ Setting `oracle_access=True` without a `User` is a misconfiguration — the solution stays exposed to the agent for the entire trial. BenchFlow logs a `WARNING` at setup time when this happens. + +Use cases for oracle access: +- Dataset generation: have the user generate optimal prompts based on knowing the answer. +- Curriculum learning: progressively reveal hints from the gold solution. +- Research: study how much oracle information is needed for an agent to succeed. + +--- + +## Per-task hardening opt-outs + +The verifier's pre-run cleanup deletes `conftest.py` files outside `/tests/` to prevent agent reward-hacking. Some tasks (e.g. qutebrowser) ship legitimate `conftest.py` that sets up Python's import order to break a real circular dependency. The default cleanup deletes them, breaking pytest collection. + +Tasks declare opt-outs in `task.toml`: + +```toml +[verifier] +timeout_sec = 3000 + +[verifier.hardening] +cleanup_conftests = false +``` + +Available flags (all default `true` — secure-by-default): + +| Flag | Effect when `false` | +|------|---------------------| +| `cleanup_conftests` | Don't delete `conftest.py` outside `/tests/` before verify | + +Other cleanup steps (`sitecustomize.py`, `.pth` files, `*.py` in `/tmp`) always run — they have no legitimate use case in repo source trees and broaden the attack surface if disabled. + +Unknown keys in `[verifier.hardening]` are logged as warnings and ignored. String values for boolean flags are rejected (must be TOML `true` / `false`). + +--- + +## Failure modes + +The user loop catches exceptions from `user.run()` and logs them as the trial error, breaking out of the loop: + +```python +[User] round 2: prompt='Try again, focusing on...' 
+ERROR: user.run() failed at round 2: KeyError: 'spec_section' +``` + +`soft_verify()` between rounds catches its own timeouts and crashes — they surface as `RoundResult.verifier_error`, not as trial-level failures. The next round still runs; the user sees the error and decides whether to continue. + +Trajectory and tool counts are sliced per round from `Trial._trajectory`. The session counters reset on `disconnect()` between rounds, so each round's `RoundResult.trajectory` and `n_tool_calls` reflect only that round's events. + +--- + +## Worked example + +See [`examples/swebench_pro_progressive_disclosure.ipynb`](../examples/swebench_pro_progressive_disclosure.ipynb) for a 5-task SWE-bench Pro comparison: oracle vs single-round baseline vs 3-round progressive disclosure on flipt and openlibrary. + +For a minimal end-to-end script, see [`examples/user_dogfood.py`](../examples/user_dogfood.py). + +--- + +## Comparison with multi-agent Scene-based user simulation + +| Pattern | When to use | +|---------|-------------| +| `BaseUser` callback (this doc) | Programmatic, rule-based, deterministic. No second LLM. Cheap. | +| Multi-role Scene with simulated-user role ([use-cases.md §1](./use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)) | Open-ended, conversational. The "user" is another LLM with full tool access. Better for nuanced human-like interaction. | + +Both patterns coexist. Choose `BaseUser` for the lighter-weight case; choose Scenes when you actually want a second agent in the loop. diff --git a/docs/quickstart.md b/docs/quickstart.md index e087ea74..502fc94b 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -4,14 +4,14 @@ Get a benchmark result in under 5 minutes. ## Prerequisites -- Python 3.12+ +- Python 3.12+ and [uv](https://docs.astral.sh/uv/) - A Daytona API key (`DAYTONA_API_KEY`) for cloud sandboxes - An agent API key (e.g. 
`GEMINI_API_KEY` for Gemini) ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Run your first evaluation @@ -26,7 +26,8 @@ bench eval create \ -t .ref/terminal-bench-2/regex-log \ -a gemini \ -m gemini-3.1-flash-lite-preview \ - -e daytona + -e daytona \ + --sandbox-setup-timeout 300 ``` BenchFlow will: @@ -52,6 +53,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ## Python API @@ -70,11 +72,23 @@ config = TrialConfig( task_path=Path("tasks/regex-log"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=300, ) trial = await Trial.create(config) result = await trial.run() ``` +If you are using the `Agent + Environment` path directly, pass the timeout through `RuntimeConfig`: + +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/regex-log", backend="daytona") +runtime = Runtime(env, agent, config=RuntimeConfig(sandbox_setup_timeout=300)) +result = await runtime.execute() +``` + ## Multi-agent (reviewer pattern) ```python @@ -95,6 +109,7 @@ config = TrialConfig( ]), ], environment="daytona", + sandbox_setup_timeout=300, ) result = await bf.run(config) ``` diff --git a/docs/skill-eval-guide.md b/docs/skill-eval-guide.md index f0a04c68..1268222e 100644 --- a/docs/skill-eval-guide.md +++ b/docs/skill-eval-guide.md @@ -5,7 +5,7 @@ Test whether your agent skill actually helps agents perform better. ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Overview @@ -382,7 +382,7 @@ BenchFlow generates everything ephemeral — only results persist. **CI integration:** ```bash # In your skill's CI pipeline -pip install benchflow==0.3.0a3 +uv tool install benchflow bench skills eval . 
-a claude-agent-acp --no-baseline # Exit code 1 if any case scores < 0.5 ``` diff --git a/docs/task-authoring.md b/docs/task-authoring.md index adfb6415..9081987a 100644 --- a/docs/task-authoring.md +++ b/docs/task-authoring.md @@ -50,6 +50,8 @@ env = { OPENAI_API_KEY = "${OPENAI_API_KEY}" } # host vars to injec **Built-in mock services** — if the Dockerfile references a service binary (`claw-gmail`, `claw-slack`, `claw-gcal`, `claw-gdoc`, `claw-gdrive`), BenchFlow starts it automatically. No `[services]` section needed. +**Install tooling to shared prefixes, not `/root`** — when a task image ships Node.js, Python tools, or agent binaries that the sandbox user must execute, install them to `/usr/local/bin`, `/usr/local/lib`, or `/opt`, not `/root/.nvm` or `/root/.local/bin`. `setup_sandbox_user()` creates the non-root user, prepares small config/auth dirs, and chowns the workspace — it does not clone `/root` into the sandbox home. Legacy images that already install tools under `/root` still work via a narrow symlink fallback, but shared prefixes are the supported path. Pre-creating the sandbox user in the Dockerfile is an optional speedup, not a requirement. + --- ## instruction.md diff --git a/docs/use-cases.md b/docs/use-cases.md index 63fa2432..fbdf108c 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -84,6 +84,10 @@ result = await bf.run(config) - The user agent is a real LLM with full tool access -- it can read files, check outputs, and give nuanced feedback, not just templated responses. - Same task folder works for single-turn (baseline) and interactive (with user) via different YAML configs. +### Lighter-weight alternative: `BaseUser` callback + +When you don't need a second LLM and your "user" logic is rule-based or oracle-guided (e.g. compress instruction → show test failures as hints → stop on pass), use a `BaseUser` Python callback instead of a multi-role Scene. See [progressive-disclosure.md](./progressive-disclosure.md). 
This is benchflow's no-second-LLM parity answer to Harbor #1316 and was built for the SWE-bench Pro progressive-disclosure use case. + --- ## 2. Code Review Loop (followup-bench) diff --git a/examples/swebench_pro_progressive_disclosure.ipynb b/examples/swebench_pro_progressive_disclosure.ipynb new file mode 100644 index 00000000..3b6fe2cd --- /dev/null +++ b/examples/swebench_pro_progressive_disclosure.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Progressive Disclosure on SWE-bench Pro\n", + "\n", + "Demo of the `BaseUser` abstraction for multi-round agent runs on SWE-bench Pro.\n", + "\n", + "**Tasks:** ansible, flipt, openlibrary, navidrome, qutebrowser (5 oracle-passing tasks).\n", + "**Baseline agent:** Gemini 3.1 Pro Preview, single-round.\n", + "**Progressive:** `BaseUser` callback, up to 3 rounds, hints disclosed on test failure.\n", + "\n", + "Built for [Josh's GitHub/Microsoft SWE-bench Pro use case](https://github.com/swe-bench-pro/swe-bench-pro). Parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case — see [`docs/progressive-disclosure.md`](../docs/progressive-disclosure.md).\n", + "\n", + "## Setup history (2026-04-24)\n", + "\n", + "| Issue | Fix |\n", + "|-------|-----|\n", + "| `--rootdir` removed → test IDs came out as `../dev/::test_name`, verifier saw 0 passes despite all tests passing | Restored `--rootdir=/app` in `PYTEST_ADDOPTS` |\n", + "| `conftest.py` cleanup deleted qutebrowser's legitimate import-order conftests, breaking pytest collection (circular import) | Per-task `[verifier.hardening]` opt-out in `task.toml`: `cleanup_conftests = false` |\n", + "\n", + "Result: oracle 2/4 → 5/5." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
The `BaseUser` API" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:06.729067Z", + "iopub.status.busy": "2026-04-25T01:10:06.728964Z", + "iopub.status.idle": "2026-04-25T01:10:10.646349Z", + "shell.execute_reply": "2026-04-25T01:10:10.645629Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BaseUser: (self, round: 'int', instruction: 'str', round_result: 'RoundResult | None' = None) -> 'str | None'\n", + "RoundResult fields: ['round', 'trajectory', 'rewards', 'verifier_output', 'verifier_error', 'n_tool_calls']\n", + "TrialConfig user fields: ['user', 'max_user_rounds', 'oracle_access']\n" + ] + } + ], + "source": [ + "import inspect\n", + "import os\n", + "\n", + "os.chdir('/workspace/repos/benchflow')\n", + "\n", + "from benchflow import BaseUser, FunctionUser, RoundResult\n", + "from benchflow.trial import TrialConfig\n", + "\n", + "print('BaseUser:', inspect.signature(BaseUser.run))\n", + "print('RoundResult fields:', list(RoundResult.__dataclass_fields__))\n", + "print('TrialConfig user fields:', [f for f in TrialConfig.__dataclass_fields__\n", + " if f in ('user', 'max_user_rounds', 'oracle_access')])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. A minimal progressive-disclosure user\n", + "\n", + "Three-round progressive disclosure: terse prompt → hints with failing tests → full spec." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.667007Z", + "iopub.status.busy": "2026-04-25T01:10:10.666654Z", + "iopub.status.idle": "2026-04-25T01:10:10.670952Z", + "shell.execute_reply": "2026-04-25T01:10:10.670403Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User type: FunctionUser\n" + ] + } + ], + "source": [ + "def progressive(round, instruction, rr):\n", + " if round == 0:\n", + " first_line = instruction.split('\\n', 1)[0].strip()\n", + " return f'{first_line}\\n\\nRead /app/ to understand the codebase, then implement the fix. Run tests when you think you are done.'\n", + "\n", + " if rr and rr.rewards:\n", + " score = rr.rewards.get('reward', rr.rewards.get('exact_match', 0))\n", + " if score >= 1.0:\n", + " return None # passed, stop\n", + "\n", + " if round == 1:\n", + " half = len(instruction) // 2\n", + " return (\n", + " f'Tests failed:\\n{(rr.verifier_output or \"\")[:1500]}\\n\\n'\n", + " f'First half of spec:\\n{instruction[:half]}'\n", + " )\n", + "\n", + " if round == 2:\n", + " return f'Full spec:\\n{instruction}'\n", + "\n", + " return None\n", + "\n", + "user = FunctionUser(progressive)\n", + "print(f'User type: {type(user).__name__}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Oracle validation: 5/5 SWE-bench Pro tasks pass\n", + "\n", + "Verifies the gold solution (`solve.sh`) reaches reward=1.0 on every task. Required before trusting baseline/progressive numbers — without a working oracle we can't tell if low rewards mean agent failure or verifier failure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.672135Z", + "iopub.status.busy": "2026-04-25T01:10:10.672040Z", + "iopub.status.idle": "2026-04-25T01:10:10.676265Z", + "shell.execute_reply": "2026-04-25T01:10:10.675540Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Reward Time(s) Notes\n", + "------------------------------------------------------------\n", + "ansible 1.0 54.3 \n", + "flipt 1.0 480.5 \n", + "openlibrary 1.0 147.7 \n", + "qutebrowser 1.0 24.5 after [verifier.hardening] cleanup_conftests=false\n", + "navidrome 1.0 ~120 verified separately\n" + ] + } + ], + "source": [ + "import csv\n", + "from pathlib import Path\n", + "\n", + "results = list(csv.DictReader(open('experiments/swebench-pro-results.csv')))\n", + "\n", + "# qutebrowser oracle in CSV is from BEFORE the hardening opt-out fix.\n", + "# Post-fix oracle: 1.0 (verified separately on 2026-04-24).\n", + "qutebrowser_post_fix = {'task': 'qutebrowser', 'experiment': 'oracle',\n", + " 'reward': '1.0', 'elapsed_s': '24.5',\n", + " 'note': 'after [verifier.hardening] cleanup_conftests=false'}\n", + "navidrome_oracle = {'task': 'navidrome', 'experiment': 'oracle',\n", + " 'reward': '1.0', 'elapsed_s': '~120',\n", + " 'note': 'verified separately'}\n", + "\n", + "oracle = [r for r in results if r['experiment'] == 'oracle' and r['task'] != 'qutebrowser']\n", + "for r in oracle:\n", + " r['note'] = ''\n", + "oracle.append(qutebrowser_post_fix)\n", + "oracle.append(navidrome_oracle)\n", + "\n", + "print(f'{\"Task\":<14} {\"Reward\":<8} {\"Time(s)\":<8} Notes')\n", + "print('-' * 60)\n", + "for r in oracle:\n", + " print(f'{r[\"task\"]:<14} {r[\"reward\"]:<8} {r[\"elapsed_s\"]:<8} {r[\"note\"]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Baseline: Gemini 3.1 Pro single-round\n", + "\n", + "Each task gets the full SWE-bench Pro instruction in one shot. No user, no rounds." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.677398Z", + "iopub.status.busy": "2026-04-25T01:10:10.677287Z", + "iopub.status.idle": "2026-04-25T01:10:10.681221Z", + "shell.execute_reply": "2026-04-25T01:10:10.680559Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Reward Tools Time(s) \n", + "------------------------------------------\n", + "ansible 1.0 23 206.8 \n", + "flipt 0.0 61 1443.7 \n", + "openlibrary 1.0 32 340.0 \n", + "qutebrowser 0.0 72 543.6 \n", + "\n", + "Baseline (Gemini 3.1 Pro, 1 round): 2/4 passed\n" + ] + } + ], + "source": [ + "baseline = [r for r in results if r['experiment'] == 'baseline']\n", + "\n", + "print(f'{\"Task\":<14} {\"Reward\":<8} {\"Tools\":<6} {\"Time(s)\":<8}')\n", + "print('-' * 42)\n", + "for r in baseline:\n", + " print(f'{r[\"task\"]:<14} {r[\"reward\"]:<8} {r[\"n_tool_calls\"]:<6} {r[\"elapsed_s\"]:<8}')\n", + "\n", + "passed = sum(1 for r in baseline if float(r['reward']) >= 1.0)\n", + "print(f'\\nBaseline (Gemini 3.1 Pro, 1 round): {passed}/{len(baseline)} passed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Progressive disclosure: where it should help\n", + "\n", + "**flipt** is the interesting case: oracle passes (gold `solve.sh` works) but Gemini's single-round baseline fails after 61 tool calls and 24 minutes. The agent had the full spec but didn't solve it. Hypothesis: progressive disclosure with failing-test feedback could let the agent course-correct.\n", + "\n", + "**openlibrary** is the regression test: baseline already passes, so progressive disclosure should also pass. 
If progressive breaks something the baseline solves, that's a bug in the user loop.\n", + "\n", + "Run the script:\n", + "```bash\n", + "GEMINI_API_KEY=... python examples/swebench_pro_user_dogfood.py --task flipt --max-rounds 3\n", + "```\n", + "\n", + "Results land in `/tmp/swebench-pro-jobs/progressive///`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.682639Z", + "iopub.status.busy": "2026-04-25T01:10:10.682546Z", + "iopub.status.idle": "2026-04-25T01:10:10.686008Z", + "shell.execute_reply": "2026-04-25T01:10:10.685533Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latest trial: /tmp/swebench-pro-jobs/progressive/2026-04-24__18-07-33/instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1__0684d916\n", + " (no user_rounds.jsonl yet — run is in progress or failed before any round)\n" + ] + } + ], + "source": [ + "# Once you have a progressive run, point at its trial dir and load round results.\n", + "# Each round's outcome is logged to user_rounds.jsonl.\n", + "\n", + "import json\n", + "\n", + "PROGRESSIVE_ROOT = Path('/tmp/swebench-pro-jobs/progressive')\n", + "\n", + "if PROGRESSIVE_ROOT.exists():\n", + " runs = sorted(PROGRESSIVE_ROOT.iterdir())\n", + " if runs:\n", + " latest_job = runs[-1]\n", + " instances = list(latest_job.iterdir())\n", + " if instances:\n", + " trial = instances[0]\n", + " print(f'Latest trial: {trial}')\n", + " rounds_log = trial / 'user_rounds.jsonl'\n", + " if rounds_log.exists():\n", + " rounds = [json.loads(l) for l in rounds_log.read_text().splitlines()]\n", + " print(f'\\n{\"Round\":<7} {\"Reward\":<10} {\"Tools\":<7} Verifier error')\n", + " print('-' * 60)\n", + " for r in rounds:\n", + " score = (r.get('rewards') or {}).get('reward', '?')\n", + " err = (r.get('verifier_error') or '')[:30]\n", + " print(f'{r[\"round\"]:<7} {score!s:<10} {r[\"n_tool_calls\"]:<7} {err}')\n", + " 
else:\n", + " print(' (no user_rounds.jsonl yet — run is in progress or failed before any round)')\n", + "else:\n", + " print(f'No progressive runs at {PROGRESSIVE_ROOT} yet.')\n", + " print('Run examples/swebench_pro_user_dogfood.py to generate one.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Per-task hardening opt-outs\n", + "\n", + "qutebrowser ships legitimate `conftest.py` files that fix a circular import in its own source. The default verifier hardening deletes them, breaking pytest collection. The task opts out:\n", + "\n", + "```toml\n", + "# .ref/swebenchpro/instance_qutebrowser__.../task.toml\n", + "[verifier.hardening]\n", + "cleanup_conftests = false\n", + "```\n", + "\n", + "Available flags (all default `true` — secure-by-default):\n", + "\n", + "| Flag | Effect when `false` |\n", + "|------|---------------------|\n", + "| `cleanup_conftests` | Don't delete `conftest.py` outside `/tests/` before verify |\n", + "\n", + "See [`docs/progressive-disclosure.md`](../docs/progressive-disclosure.md) for the full design rationale." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Comparison with multi-agent simulated user\n", + "\n", + "BenchFlow has two patterns for multi-round agent runs:\n", + "\n", + "| Pattern | When to use |\n", + "|---------|-------------|\n", + "| `BaseUser` callback (this notebook) | Programmatic, rule-based, deterministic. No second LLM. Cheap. |\n", + "| Multi-role Scene with simulated-user role ([use-cases.md §1](../docs/use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)) | Open-ended, conversational. The 'user' is another LLM with full tool access. |\n", + "\n", + "Both are functionally at parity with [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316), avoiding the FastMCP sidecar requirement. Choose based on whether you want a deterministic callback or another LLM agent." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/swebench_pro_user_dogfood.py b/examples/swebench_pro_user_dogfood.py new file mode 100644 index 00000000..997ccc8b --- /dev/null +++ b/examples/swebench_pro_user_dogfood.py @@ -0,0 +1,135 @@ +"""Dogfood: SWE-bench Pro progressive disclosure with BaseUser. + +Demonstrates the BaseUser abstraction on a SWE-bench Pro task — the original +motivation for this feature (Josh's GitHub/Microsoft use case). + +The user: + Round 0: terse problem description (one sentence from the spec). + Round 1+: if tests failed, append the failing test names and a section of + the original spec as a hint. + Stops when reward >= 1.0 or max_user_rounds hit. + +Usage: + GEMINI_API_KEY=... python examples/swebench_pro_user_dogfood.py + GEMINI_API_KEY=... 
python examples/swebench_pro_user_dogfood.py --task openlibrary + +Tasks available (oracle-validated 5/5 on 2026-04-24): + ansible, flipt, openlibrary, navidrome, qutebrowser +""" + +import argparse +import asyncio +import logging +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig +from benchflow.user import FunctionUser, RoundResult + +logging.basicConfig(level=logging.INFO, format="%(name)s %(message)s") + + +SWEBENCH_PRO_TASKS = { + "ansible": "instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2", + "flipt": "instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1", + "openlibrary": "instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26", + "navidrome": "instance_navidrome__navidrome-0130c6dc13438b48cf0fdfab08a89e357b5517c9", + "qutebrowser": "instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171", +} + + +def make_progressive_user() -> FunctionUser: + """User that compresses the instruction on round 0 and discloses hints on failure.""" + + def progressive( + round: int, instruction: str, rr: RoundResult | None + ) -> str | None: + # Round 0: terse — first line of the spec. + if round == 0: + first_line = instruction.split("\n", 1)[0].strip() + return ( + f"{first_line}\n\n" + "Read /app/ to understand the codebase, then implement the fix. " + "Run tests when you think you're done." + ) + + # Stop on success. + if rr and rr.rewards: + score = rr.rewards.get("reward", rr.rewards.get("exact_match", 0)) + if score >= 1.0: + print(f" [User] reward={score} at round {round}, stopping.") + return None + + # Round 1: show failing tests + first half of spec. 
+ if round == 1: + half = len(instruction) // 2 + return ( + "The verifier reported these issues:\n\n" + f"{(rr.verifier_output or '')[:1500]}\n\n" + "Here is the first half of the spec for context:\n\n" + f"{instruction[:half]}\n\n" + "Continue working in /app/." + ) + + # Round 2: full spec. + if round == 2: + return ( + "Still failing. Full specification:\n\n" + f"{instruction}\n\n" + "Address every requirement listed above." + ) + + return None + + return FunctionUser(progressive) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="flipt", + choices=list(SWEBENCH_PRO_TASKS), + help="Which SWE-bench Pro task to run", + ) + parser.add_argument("--agent", default="gemini") + parser.add_argument("--model", default="gemini-3.1-pro-preview") + parser.add_argument("--backend", default="daytona") + parser.add_argument("--max-rounds", type=int, default=3) + args = parser.parse_args() + + task_dir = SWEBENCH_PRO_TASKS[args.task] + task_path = Path(".ref/swebenchpro") / task_dir + if not task_path.exists(): + print(f"Task not found: {task_path}") + return + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=args.agent, model=args.model)], + environment=args.backend, + sandbox_user="agent", + user=make_progressive_user(), + max_user_rounds=args.max_rounds, + jobs_dir="/tmp/swebench-pro-jobs/progressive", + ) + + print(f"Progressive disclosure on {args.task}") + print(f" Agent: {args.agent} / {args.model}") + print(f" Backend: {args.backend}") + print(f" Rounds: up to {args.max_rounds}") + print() + + result = await bf.run(config) + + print("\n=== RESULT ===") + print(f" Task: {result.task_name}") + print(f" Rewards: {result.rewards}") + print(f" Tool calls: {result.n_tool_calls}") + print(f" Error: {result.error}") + if result.trial_dir: + print(f" Trial dir: {result.trial_dir}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/user_dogfood.py 
b/examples/user_dogfood.py new file mode 100644 index 00000000..517afc0c --- /dev/null +++ b/examples/user_dogfood.py @@ -0,0 +1,93 @@ +"""Dogfood: run regex-log task with progressive-disclosure User. + +Demonstrates the BaseUser abstraction — a FunctionUser that: + Round 0: gives a terse version of the instruction + Round 1+: if tests failed, gives a hint based on the full instruction + Stops when tests pass or max_rounds hit. + +Usage: + GEMINI_API_KEY=... python examples/user_dogfood.py +""" + +import asyncio +import logging +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig +from benchflow.user import FunctionUser, RoundResult + +logging.basicConfig(level=logging.INFO, format="%(name)s %(message)s") + + +def progressive_user( + round: int, instruction: str, rr: RoundResult | None +) -> str | None: + if round == 0: + return ( + "Write a regex that matches dates (YYYY-MM-DD) in log lines " + "containing an IPv4 address. Save it to /app/regex.txt" + ) + + if rr and rr.rewards: + score = rr.rewards.get("exact_match", rr.rewards.get("reward", 0)) + if score >= 1.0: + print(f" [User] Tests passed at round {round}! Stopping.") + return None + + if round == 1: + return ( + "The tests failed. Important details you may have missed:\n" + "- Match only the LAST date in each line\n" + "- Feb can have up to 29 days\n" + "- Dates/IPs must not be preceded/followed by alphanumeric chars\n" + "- Use re.findall with re.MULTILINE\n" + "Fix /app/regex.txt" + ) + + if round == 2: + return ( + "Still failing. 
Here's the full instruction:\n\n" + instruction + ) + + return None + + +async def main(): + task_path = Path(".ref/terminal-bench-2/regex-log") + if not task_path.exists(): + print(f"Task not found at {task_path}") + return + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent="gemini", model="gemini-2.5-flash")], + environment="daytona", + user=FunctionUser(progressive_user), + max_user_rounds=4, + ) + + print("Running progressive-disclosure trial on regex-log...") + print(" Agent: gemini/flash") + print(f" Max rounds: {config.max_user_rounds}") + print(f" Environment: {config.environment}") + print() + + result = await bf.run(config) + + print("\n=== RESULT ===") + print(f" Task: {result.task_name}") + print(f" Rewards: {result.rewards}") + print(f" Error: {result.error}") + print(f" Tool calls: {result.n_tool_calls}") + + if result.rewards: + score = result.rewards.get("exact_match", result.rewards.get("reward", 0)) + if score >= 1.0: + print(" PASSED (final verify)") + else: + print(f" FAILED (score={score})") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/user_dogfood_results/result.json b/examples/user_dogfood_results/result.json new file mode 100644 index 00000000..0f360595 --- /dev/null +++ b/examples/user_dogfood_results/result.json @@ -0,0 +1,25 @@ +{ + "task_name": "regex-log", + "trial_name": "regex-log__bab1cbf6", + "rewards": { + "reward": 0.0 + }, + "agent": "gemini", + "agent_name": "gemini-cli", + "model": "gemini-2.5-flash", + "n_tool_calls": 8, + "n_prompts": 1, + "error": null, + "verifier_error": null, + "partial_trajectory": false, + "trajectory_source": "acp", + "started_at": "2026-04-22 22:37:42.034339", + "finished_at": "2026-04-22 22:40:58.748485", + "timing": { + "environment_setup": 1.3, + "agent_setup": 3.1, + "agent_execution": 15.6, + "verifier": 6.4, + "total": 196.7 + } +} \ No newline at end of file diff --git a/examples/user_dogfood_results/user_rounds.jsonl 
b/examples/user_dogfood_results/user_rounds.jsonl new file mode 100644 index 00000000..258dfaff --- /dev/null +++ b/examples/user_dogfood_results/user_rounds.jsonl @@ -0,0 +1,3 @@ +{"round": 0, "prompt": "Write a regex that matches dates (YYYY-MM-DD) in log lines containing an IPv4 address. Save it to /app/regex.txt", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 2, "n_trajectory_events": 4} +{"round": 1, "prompt": "The tests failed. Important details you may have missed:\n- Match only the LAST date in each line\n- Feb can have up to 29 days\n- Dates/IPs must not be preceded/followed by alphanumeric chars\n- Use re.findall with re.MULTILINE\nFix /app/regex.txt", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 3, "n_trajectory_events": 5} +{"round": 2, "prompt": "Still failing. Here's the full instruction:\n\nWrite a regex expression that matches dates in the format YYYY-MM-DD appearing in lines that contain an IPv4 address in a log file.\nIf multiple dates are present in a line, the regex should match only the last date in that line.\nAssume that February can have up to 29 days in all years, without distinguishing leap years from non-leap years.\nIPv4 addresses use normal decimal notation without leading zeros in each octet.\n\nNote: Be careful that there might be text in the log that looks similar to dates or IPv4 addresses but is not (e.g., user 1134-12-1234). 
\nTo avoid false matches, ensure that valid dates and IPv4 addresses are not immediately preceded or followed by alphanumeric characters.\n\nSave your regex in /app/regex.txt\nThe regex will be read from the file and applied to the log file contents using Python's re.findall with the re.MULTILINE flag.\nExample Python usage:\n```\nimport re\n\nwith open(\"/app/regex.txt\") as f:\n pattern = f.read().strip()\n\nmatches = re.findall(pattern, log_text, re.MULTILINE)\n```", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 3, "n_trajectory_events": 5} diff --git a/experiments/reviewer_ablation.py b/experiments/reviewer_ablation.py index 5acaecab..8ab0caec 100644 --- a/experiments/reviewer_ablation.py +++ b/experiments/reviewer_ablation.py @@ -17,25 +17,26 @@ import os import sys import time -from datetime import datetime from pathlib import Path logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) +import contextlib + +from harbor.models.task.task import Task +from harbor.models.trial.paths import TrialPaths +from harbor.verifier.verifier import Verifier + from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_env import resolve_agent_env from benchflow._agent_setup import install_agent from benchflow._credentials import upload_subscription_auth, write_credential_files from benchflow._env_setup import _create_environment from benchflow._sandbox import setup_sandbox_user -from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.sdk import SDK -from harbor.models.task.task import Task -from harbor.verifier.verifier import Verifier -from harbor.models.trial.paths import TrialPaths logger = logging.getLogger(__name__) @@ -141,10 +142,8 @@ async def _run_acp(env, prompt: str, trial_dir: Path, timeout: 
int = 600) -> tup try: _, n_tools = await execute_prompts(acp_client, session, [prompt], timeout=timeout) finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass return n_tools, int(time.time() - t0) @@ -204,7 +203,7 @@ async def run_reviewer(task_path: Path, task_name: str, condition: str) -> dict: total_tools += n_tools # Read coder's outbox - outbox_result = await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") + await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") await env.exec("rm -rf /app/.outbox/*") # Phase 2: Reviewer diff --git a/experiments/swebench-pro-results.csv b/experiments/swebench-pro-results.csv new file mode 100644 index 00000000..2c10f0b2 --- /dev/null +++ b/experiments/swebench-pro-results.csv @@ -0,0 +1,9 @@ +agent,elapsed_s,error,experiment,model,n_tool_calls,reward,task,task_full +,54.3,,oracle,,0,1.0,ansible,instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2 +,480.5,,oracle,,0,1.0,flipt,instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1 +,147.7,,oracle,,0,1.0,openlibrary,instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26 +,24.3,,oracle,,0,0.0,qutebrowser,instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171 +gemini,206.8,,baseline,gemini-3.1-pro-preview,23,1.0,ansible,instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2 +gemini,1443.7,,baseline,gemini-3.1-pro-preview,61,0.0,flipt,instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1 +gemini,340.0,,baseline,gemini-3.1-pro-preview,32,1.0,openlibrary,instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26 
+gemini,543.6,,baseline,gemini-3.1-pro-preview,72,0.0,qutebrowser,instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171 diff --git a/experiments/swebench_pro_oracle_and_baseline.py b/experiments/swebench_pro_oracle_and_baseline.py new file mode 100644 index 00000000..00b865db --- /dev/null +++ b/experiments/swebench_pro_oracle_and_baseline.py @@ -0,0 +1,263 @@ +"""SWE-bench Pro: oracle validation + Harbor baseline. + +Two experiments on the 4 testable SWE-bench Pro tasks: + + 1) Oracle validation — runs gold solution (solve.sh), verifies reward=1.0 + for all 4 tasks. Confirms the --rootdir=/tests fix resolved qutebrowser + and openlibrary failures. + + 2) Harbor baseline — single-round agent evaluation (no progressive + disclosure). Runs the same 4 tasks with a real agent to establish + baseline pass rates for comparison with progressive disclosure. + +Usage: + # Oracle only (no API key needed): + python experiments/swebench_pro_oracle_and_baseline.py --oracle-only + + # Full run (oracle + baseline): + GEMINI_API_KEY=... python experiments/swebench_pro_oracle_and_baseline.py + + # Customize agent/model/backend: + GEMINI_API_KEY=... 
python experiments/swebench_pro_oracle_and_baseline.py \ + --agent gemini --model gemini-3.1-pro-preview --backend daytona + +Results → experiments/swebench-pro-results.csv +""" + +import argparse +import asyncio +import csv +import logging +import os +import sys +import time +from pathlib import Path + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s %(message)s", +) +logger = logging.getLogger(__name__) + +sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig + +SWEBENCH_PRO_ROOT = Path(__file__).resolve().parents[1] / ".ref" / "swebenchpro" + +TASKS = [ + "instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2", + "instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1", + "instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26", + "instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171", +] + +TASK_LABELS = { + "instance_ansible": "ansible", + "instance_flipt": "flipt", + "instance_internet": "openlibrary", + "instance_qutebrowser": "qutebrowser", +} + +RESULTS_FILE = Path(__file__).parent / "swebench-pro-results.csv" +JOBS_DIR = Path("/tmp/swebench-pro-jobs") + + +def task_label(task_name: str) -> str: + for prefix, label in TASK_LABELS.items(): + if task_name.startswith(prefix): + return label + return task_name[:30] + + +async def run_oracle(task_path: Path, backend: str) -> dict: + """Run oracle (gold solution) on a single task. 
Returns result dict.""" + label = task_label(task_path.name) + logger.info(f"[oracle] {label}: starting") + t0 = time.time() + + config = TrialConfig( + task_path=task_path, + agent="oracle", + environment=backend, + sandbox_user="agent", + jobs_dir=str(JOBS_DIR / "oracle"), + ) + + try: + result = await bf.run(config) + elapsed = time.time() - t0 + reward = None + if result.rewards: + reward = result.rewards.get("reward", result.rewards.get("exact_match")) + logger.info( + f"[oracle] {label}: reward={reward} " + f"error={result.error!r} ({elapsed:.0f}s)" + ) + return { + "experiment": "oracle", + "task": label, + "task_full": task_path.name, + "reward": reward, + "error": result.error or "", + "elapsed_s": round(elapsed, 1), + "n_tool_calls": result.n_tool_calls, + } + except Exception as e: + elapsed = time.time() - t0 + logger.error(f"[oracle] {label}: crashed: {e}", exc_info=True) + return { + "experiment": "oracle", + "task": label, + "task_full": task_path.name, + "reward": None, + "error": str(e), + "elapsed_s": round(elapsed, 1), + "n_tool_calls": 0, + } + + +async def run_baseline( + task_path: Path, agent: str, model: str, backend: str +) -> dict: + """Run single-round baseline (no progressive disclosure) on a single task.""" + label = task_label(task_path.name) + logger.info(f"[baseline] {label}: starting ({agent}/{model})") + t0 = time.time() + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=agent, model=model)], + environment=backend, + sandbox_user="agent", + jobs_dir=str(JOBS_DIR / "baseline"), + ) + + try: + result = await bf.run(config) + elapsed = time.time() - t0 + reward = None + if result.rewards: + reward = result.rewards.get("reward", result.rewards.get("exact_match")) + logger.info( + f"[baseline] {label}: reward={reward} " + f"tools={result.n_tool_calls} error={result.error!r} ({elapsed:.0f}s)" + ) + return { + "experiment": "baseline", + "task": label, + "task_full": task_path.name, + "agent": agent, + "model": 
model, + "reward": reward, + "error": result.error or "", + "elapsed_s": round(elapsed, 1), + "n_tool_calls": result.n_tool_calls, + } + except Exception as e: + elapsed = time.time() - t0 + logger.error(f"[baseline] {label}: crashed: {e}", exc_info=True) + return { + "experiment": "baseline", + "task": label, + "task_full": task_path.name, + "agent": agent, + "model": model, + "reward": None, + "error": str(e), + "elapsed_s": round(elapsed, 1), + "n_tool_calls": 0, + } + + +async def main(): + parser = argparse.ArgumentParser(description="SWE-bench Pro oracle + baseline") + parser.add_argument("--oracle-only", action="store_true", help="Skip baseline") + parser.add_argument("--agent", default=os.environ.get("AGENT", "gemini")) + parser.add_argument("--model", default=os.environ.get("MODEL", "gemini-3.1-pro-preview")) + parser.add_argument("--backend", default=os.environ.get("BACKEND", "docker")) + parser.add_argument("--concurrency", type=int, default=2) + args = parser.parse_args() + + task_paths = [SWEBENCH_PRO_ROOT / t for t in TASKS] + missing = [p for p in task_paths if not p.exists()] + if missing: + logger.error(f"Missing tasks: {[p.name for p in missing]}") + sys.exit(1) + + JOBS_DIR.mkdir(parents=True, exist_ok=True) + results: list[dict] = [] + + # Phase 1: Oracle validation (all 4 tasks in parallel) + logger.info("=" * 60) + logger.info("Phase 1: Oracle validation (gold solution)") + logger.info("=" * 60) + + sem = asyncio.Semaphore(args.concurrency) + + async def bounded_oracle(tp): + async with sem: + return await run_oracle(tp, args.backend) + + oracle_results = await asyncio.gather( + *[bounded_oracle(tp) for tp in task_paths] + ) + results.extend(oracle_results) + + oracle_pass = sum(1 for r in oracle_results if r["reward"] == 1.0) + logger.info(f"Oracle: {oracle_pass}/{len(oracle_results)} passed") + + if oracle_pass < len(oracle_results): + failed = [r["task"] for r in oracle_results if r["reward"] != 1.0] + logger.warning(f"Oracle failures: 
{failed}") + + # Phase 2: Baseline (single-round agent) + if not args.oracle_only: + logger.info("") + logger.info("=" * 60) + logger.info(f"Phase 2: Baseline ({args.agent}/{args.model})") + logger.info("=" * 60) + + async def bounded_baseline(tp): + async with sem: + return await run_baseline(tp, args.agent, args.model, args.backend) + + baseline_results = await asyncio.gather( + *[bounded_baseline(tp) for tp in task_paths] + ) + results.extend(baseline_results) + + baseline_pass = sum(1 for r in baseline_results if r.get("reward", 0) == 1.0) + logger.info(f"Baseline: {baseline_pass}/{len(baseline_results)} passed") + + # Write CSV + if results: + fieldnames = list(results[0].keys()) + all_keys = set() + for r in results: + all_keys.update(r.keys()) + fieldnames = sorted(all_keys) + + with open(RESULTS_FILE, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + logger.info(f"Results written to {RESULTS_FILE}") + + # Summary table + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"{'Experiment':<12} {'Task':<15} {'Reward':<10} {'Tools':<8} {'Time':<8} {'Error'}") + print("-" * 80) + for r in results: + reward_str = str(r.get("reward", "—")) + tools_str = str(r.get("n_tool_calls", "—")) + time_str = f"{r.get('elapsed_s', 0):.0f}s" + error_str = (r.get("error", "") or "")[:40] + print(f"{r['experiment']:<12} {r['task']:<15} {reward_str:<10} {tools_str:<8} {time_str:<8} {error_str}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/experiments/validate_multi_scene_lifecycle.py b/experiments/validate_multi_scene_lifecycle.py index e55eea26..4e6bf877 100644 --- a/experiments/validate_multi_scene_lifecycle.py +++ b/experiments/validate_multi_scene_lifecycle.py @@ -162,7 +162,7 @@ async def main(): r3 = await test_followup_bench() results["followup"] = r3.rewards - logger.info(f"\n=== DOGFOOD RESULTS ===") + logger.info("\n=== DOGFOOD RESULTS ===") for name, 
rewards in results.items(): reward = (rewards or {}).get("reward", "N/A") logger.info(f" {name}: reward={reward}") diff --git a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py index 1c85417f..df887aa9 100644 --- a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py @@ -6,7 +6,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py index 6debbc31..a4742c25 100644 --- a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py @@ -3,7 +3,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/run_comparison.py b/labs/benchjack-sandbox-hardening/run_comparison.py index e01b1df8..04963bb8 100755 --- a/labs/benchjack-sandbox-hardening/run_comparison.py +++ b/labs/benchjack-sandbox-hardening/run_comparison.py @@ -136,7 +136,7 @@ def main() -> int: return 1 n = len(PATTERNS) - total_steps = 2 + 2 * n # 2 venvs + 2 versions × n patterns + total_steps = 2 + 2 * n # 2 venvs + 2 versions x n patterns print(f"[1/{total_steps}] venv: benchflow==0.2.0 (PyPI)") _create_venv(VENVS_DIR / "bf-0.2.0", ["benchflow==0.2.0"]) diff --git a/labs/reward-hack-matrix/exploits/conftest_payload.py b/labs/reward-hack-matrix/exploits/conftest_payload.py index 30fe23f7..7397066a 100644 --- a/labs/reward-hack-matrix/exploits/conftest_payload.py +++ b/labs/reward-hack-matrix/exploits/conftest_payload.py @@ -16,7 +16,7 @@ 
terminal-bench-2 if its verifier ever touches conftest.py). """ -import pytest # noqa: F401 — pytest must see this as a real plugin module +import pytest def pytest_collection_modifyitems(config, items): diff --git a/labs/reward-hack-matrix/run_matrix.py b/labs/reward-hack-matrix/run_matrix.py index d5dfc439..53a86b04 100644 --- a/labs/reward-hack-matrix/run_matrix.py +++ b/labs/reward-hack-matrix/run_matrix.py @@ -30,6 +30,7 @@ import argparse import asyncio +import contextlib import json import os import re @@ -447,13 +448,11 @@ async def shutdown(self) -> None: if self.proc is None: return if self.proc.stdin and not self.proc.stdin.is_closing(): - try: + with contextlib.suppress(OSError, BrokenPipeError): self.proc.stdin.close() - except (OSError, BrokenPipeError): - pass try: await asyncio.wait_for(self.proc.wait(), timeout=30) - except asyncio.TimeoutError: + except TimeoutError: self.proc.kill() await self.proc.wait() if self.reader_task: @@ -717,7 +716,7 @@ def main() -> int: if args.sweep: summary_path = Path(args.summary_path) if args.summary_path else (JOBS_DIR / "matrix_sweep.json") print( - f"[sweep] {len(cells)} cells × {len(VERSIONS)} versions = " + f"[sweep] {len(cells)} cells x {len(VERSIONS)} versions = " f"{len(cells) * len(VERSIONS)} trials, concurrency={args.concurrency}" ) results = asyncio.run( diff --git a/pyproject.toml b/pyproject.toml index ff627c43..62f8f468 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "benchflow" -version = "0.3.0a10" +version = "0.3.2" description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." 
readme = "README.md" requires-python = ">=3.12" @@ -96,6 +96,16 @@ ignore = [ "RUF022", # __all__ unsorted — grouped by section for agent-friendliness ] +[tool.ruff.lint.per-file-ignores] +# Standalone scripts — sys.path manipulation before imports is intentional +"experiments/*.py" = ["E402"] +"tests/conformance/*.py" = ["E402"] +# Notebooks: cell-local imports + short loop vars are notebook conventions +"examples/*.ipynb" = ["E402", "E741", "SIM115"] +# Forward references resolved via __future__ annotations — ruff flags them +# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads. +"src/benchflow/runtime.py" = ["F821"] + [tool.ty.environment] python-version = "3.12" diff --git a/src/benchflow/__init__.py b/src/benchflow/__init__.py index 809292d7..1d9129a0 100644 --- a/src/benchflow/__init__.py +++ b/src/benchflow/__init__.py @@ -19,13 +19,14 @@ ExecResult, Task, TaskConfig, - Trial, Verifier, VerifierResult, ) # benchflow's additions from benchflow._env_setup import stage_dockerfile_deps +from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene +from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.acp.client import ACPClient from benchflow.acp.session import ACPSession from benchflow.agents.registry import ( @@ -53,16 +54,16 @@ RuntimeResult, run, # bf.run(agent, env) — the primary 0.3 API ) -from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene -from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.sdk import SDK -from benchflow.trial import Trial, TrialConfig -from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn -from benchflow.trial_yaml import trial_config_from_yaml from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill from benchflow.trajectories.otel import OTelCollector from benchflow.trajectories.proxy import TrajectoryProxy from benchflow.trajectories.types import 
Trajectory +from benchflow.trial import Role as TrialRole +from benchflow.trial import Scene as TrialScene +from benchflow.trial import Trial, TrialConfig, Turn +from benchflow.trial_yaml import trial_config_from_yaml +from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult # Public API surface. Anything not in this list is implementation detail and # may change without notice. Names are grouped by source module to match the @@ -123,6 +124,12 @@ "TrialRole", "TrialScene", "Turn", + "trial_config_from_yaml", + # User abstraction (progressive disclosure) + "BaseUser", + "FunctionUser", + "PassthroughUser", + "RoundResult", # SDK (backwards compat) "SDK", # Environments / dep staging diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index f1b04c7b..58b95bed 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -18,6 +18,7 @@ """ import asyncio +import contextlib import logging from pathlib import Path @@ -26,7 +27,8 @@ from benchflow.acp.client import ACPClient from benchflow.acp.container_transport import ContainerTransport from benchflow.agents.providers import strip_provider_prefix -from benchflow.process import DaytonaProcess, DockerProcess +from benchflow.agents.registry import AGENTS +from benchflow.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess logger = logging.getLogger(__name__) @@ -34,6 +36,52 @@ _ACP_CONNECT_MAX_RETRIES = 3 _ACP_CONNECT_BASE_DELAY = 2.0 +# models.dev provider inference — used when acp_model_format="provider/model" +# to reconstruct "provider/model" from a bare model name. 
+_MODELSDEV_PROVIDER_HEURISTICS: list[tuple[str, str]] = [ + # (substring in model name, models.dev provider ID) + ("gemini", "google"), + ("gemma", "google"), + ("gpt", "openai"), + ("o1", "openai"), + ("o3", "openai"), + ("o4", "openai"), + ("claude", "anthropic"), + ("haiku", "anthropic"), + ("sonnet", "anthropic"), + ("opus", "anthropic"), + ("mistral", "mistral"), + ("codestral", "mistral"), +] + + +def _format_acp_model(model: str, agent: str) -> str: + """Format a model ID for ACP session/set_model based on agent requirements. + + Most agents expect a bare model name (e.g. "claude-sonnet-4-6"). + Agents with acp_model_format="provider/model" (e.g. opencode) need the + models.dev provider prefix (e.g. "google/gemini-3.1-pro-preview"). + + Strips benchflow's custom provider prefixes first, then re-adds the + models.dev provider prefix when the agent requires it. + """ + bare = strip_provider_prefix(model) + agent_cfg = AGENTS.get(agent) + if not agent_cfg or agent_cfg.acp_model_format != "provider/model": + return bare + # Already has a slash — assume it's provider/model already + if "/" in bare: + return bare + # Infer the models.dev provider from the bare model name + m = bare.lower() + for substring, provider in _MODELSDEV_PROVIDER_HEURISTICS: + if substring in m: + return f"{provider}/{bare}" + logger.warning( + "Cannot infer models.dev provider for %r — defaulting to anthropic/", bare + ) + return f"anthropic/{bare}" + async def connect_acp( env, @@ -66,7 +114,6 @@ async def connect_acp( agent_launch = build_priv_drop_cmd(agent_launch, sandbox_user) logger.info(f"Agent sandboxed as: {sandbox_user}") - last_err: Exception | None = None acp_client: ACPClient | None = None for attempt in range(_ACP_CONNECT_MAX_RETRIES + 1): if attempt > 0: @@ -78,7 +125,12 @@ async def connect_acp( if environment == "docker": live_proc = DockerProcess.from_harbor_env(env) else: - live_proc = await DaytonaProcess.from_harbor_env(env) + is_dind = hasattr(env, "_strategy") and 
hasattr(env._strategy, "_compose_cmd") + if is_dind: + live_proc = await DaytonaPtyProcess.from_harbor_env(env) + logger.info("Using PTY transport for DinD compose task") + else: + live_proc = await DaytonaProcess.from_harbor_env(env) agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt" transport = ContainerTransport( @@ -101,12 +153,9 @@ async def connect_acp( except ConnectionError as e: # Close the failed client before retrying if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass acp_client = None - last_err = e if attempt == _ACP_CONNECT_MAX_RETRIES: raise logger.warning(f"ACP connect failed (attempt {attempt + 1}): {e}") @@ -114,19 +163,20 @@ async def connect_acp( except Exception: # Non-retryable error — close client to prevent leak if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass raise - if model: - acp_model_id = strip_provider_prefix(model) + agent_cfg = AGENTS.get(agent) + if model and (agent_cfg is None or agent_cfg.supports_acp_set_model): + acp_model_id = _format_acp_model(model, agent) try: await asyncio.wait_for(acp_client.set_model(acp_model_id), timeout=60) logger.info(f"Model set to: {acp_model_id} (from {model})") except Exception as e: logger.warning(f"Failed to set model via ACP: {e}") + elif model: + logger.info(f"Skipping ACP set_model for {agent} — launch/env config owns model selection") return acp_client, session, agent_name diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 72d64986..0229b173 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -26,6 +26,32 @@ logger = logging.getLogger(__name__) +_AUTH_CONTEXT_GROUPS = ( + frozenset({"ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN"}), + frozenset({"GEMINI_API_KEY", "GOOGLE_API_KEY"}), +) +_EXPLICIT_AGENT_NATIVE_BRIDGE_KEYS = frozenset({"LLM_API_KEY"}) + + +def 
_normalize_openhands_model(model: str) -> str: + """Translate benchflow model IDs to OpenHands/LiteLLM model IDs. + + OpenHands expects provider-qualified model names for some providers even + when benchflow uses bare model IDs or its own provider prefixes. + """ + from benchflow.agents.providers import strip_provider_prefix + from benchflow.agents.registry import is_vertex_model + + if model.startswith(("gemini/", "vertex_ai/", "openhands/")): + return model + stripped = strip_provider_prefix(model) + lower = model.lower() + if is_vertex_model(model) and "gemini" in lower: + return f"vertex_ai/{stripped}" + if "gemini" in lower: + return f"gemini/{stripped}" + return stripped + def auto_inherit_env(agent_env: dict[str, str]) -> None: """Copy well-known API keys from host os.environ into agent_env.""" @@ -38,6 +64,7 @@ def auto_inherit_env(agent_env: dict[str, str]) -> None: "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", "GOOGLE_CLOUD_PROJECT", "GOOGLE_CLOUD_LOCATION", "LLM_API_KEY", @@ -54,6 +81,9 @@ def auto_inherit_env(agent_env: dict[str, str]) -> None: # Mirror GEMINI_API_KEY as GOOGLE_API_KEY (some agents expect one or the other) if "GEMINI_API_KEY" in agent_env and "GOOGLE_API_KEY" not in agent_env: agent_env["GOOGLE_API_KEY"] = agent_env["GEMINI_API_KEY"] + # Mirror GEMINI_API_KEY as GOOGLE_GENERATIVE_AI_API_KEY (opencode/models.dev convention) + if "GEMINI_API_KEY" in agent_env and "GOOGLE_GENERATIVE_AI_API_KEY" not in agent_env: + agent_env["GOOGLE_GENERATIVE_AI_API_KEY"] = agent_env["GEMINI_API_KEY"] # CLAUDE_CODE_OAUTH_TOKEN is a separate auth path — Claude CLI reads it # directly. Don't map to ANTHROPIC_API_KEY (different auth mechanism). 
@@ -118,11 +148,22 @@ def resolve_provider_env( _key = agent_env.get(_prov_cfg.auth_env, "") if _key: agent_env.setdefault("BENCHFLOW_PROVIDER_API_KEY", _key) + else: + # No registered provider prefix — bridge the model's well-known API key + # to BENCHFLOW_PROVIDER_API_KEY so env_mapping can translate it to + # agent-native vars (e.g. GEMINI_API_KEY → LLM_API_KEY for openhands). + from benchflow.agents.registry import infer_env_key_for_model + + _inferred = infer_env_key_for_model(model) + if _inferred and _inferred in agent_env: + agent_env.setdefault("BENCHFLOW_PROVIDER_API_KEY", agent_env[_inferred]) # Apply agent env_mapping: translate BENCHFLOW_PROVIDER_* → agent-native vars if agent_cfg and agent_cfg.env_mapping: for src, dst in agent_cfg.env_mapping.items(): if src in agent_env: agent_env.setdefault(dst, agent_env[src]) + if agent == "openhands": + agent_env.setdefault("LLM_MODEL", _normalize_openhands_model(model)) def check_subscription_auth(agent: str, required_key: str) -> bool: @@ -136,6 +177,18 @@ def check_subscription_auth(agent: str, required_key: str) -> bool: return Path(sa.detect_file).expanduser().is_file() +def _shares_auth_context(required_key: str | None, candidate_key: str | None) -> bool: + """True when both keys represent the same provider auth context.""" + if not required_key or not candidate_key: + return False + if required_key == candidate_key: + return True + return any( + required_key in group and candidate_key in group + for group in _AUTH_CONTEXT_GROUPS + ) + + def resolve_agent_env( agent: str, model: str | None, @@ -143,19 +196,61 @@ def resolve_agent_env( ) -> dict[str, str]: """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) + explicit_agent_env_keys = set(agent_env) auto_inherit_env(agent_env) + pre_provider_env = dict(agent_env) + agent_cfg = AGENTS.get(agent) # Oracle runs solve.sh and never calls an LLM — model env vars and # API-key validation are skipped 
even if a caller forwards a model. if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) + if agent_cfg and agent_cfg.env_mapping: + for src, dst in agent_cfg.env_mapping.items(): + if src in agent_env and dst not in explicit_agent_env_keys: + # Provider resolution must override unrelated host-native + # vars auto-inherited from the environment, but preserve + # explicit agent_env overrides supplied by the caller. + agent_env[dst] = agent_env[src] # Validate required API key for the chosen model from benchflow.agents.registry import infer_env_key_for_model required_key = infer_env_key_for_model(model) - # CLAUDE_CODE_OAUTH_TOKEN is an alternative auth path for Claude agents - has_oauth = "CLAUDE_CODE_OAUTH_TOKEN" in agent_env or "ANTHROPIC_AUTH_TOKEN" in agent_env - if required_key and required_key not in agent_env and not has_oauth: + mapped_provider_key = ( + agent_cfg.env_mapping.get("BENCHFLOW_PROVIDER_API_KEY") + if agent_cfg + else None + ) + has_agent_native_bridge_key = bool( + mapped_provider_key + and pre_provider_env.get(mapped_provider_key) + and ( + _shares_auth_context(required_key, mapped_provider_key) + or ( + mapped_provider_key in _EXPLICIT_AGENT_NATIVE_BRIDGE_KEYS + and mapped_provider_key in explicit_agent_env_keys + ) + ) + ) + if has_agent_native_bridge_key: + # Only pre-existing same-provider aliases or explicit generic bridge + # keys can satisfy provider auth. Values synthesized by env_mapping + # or inherited from another provider context must not bypass the + # model's required credential. 
+ agent_env.setdefault( + "BENCHFLOW_PROVIDER_API_KEY", + pre_provider_env[mapped_provider_key], + ) + has_oauth = any( + key in agent_env and _shares_auth_context(required_key, key) + for key in ("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_AUTH_TOKEN") + ) + if ( + required_key + and required_key not in agent_env + and not has_oauth + and not has_agent_native_bridge_key + ): if check_subscription_auth(agent, required_key): agent_env["_BENCHFLOW_SUBSCRIPTION_AUTH"] = "1" logger.info( @@ -183,7 +278,6 @@ def resolve_agent_env( ) else: # No model specified — still check subscription auth for required env vars - agent_cfg = AGENTS.get(agent) if agent_cfg: for req_key in agent_cfg.requires_env: if req_key not in agent_env and check_subscription_auth(agent, req_key): diff --git a/src/benchflow/_agent_setup.py b/src/benchflow/_agent_setup.py index 39b1720c..0c04a5a8 100644 --- a/src/benchflow/_agent_setup.py +++ b/src/benchflow/_agent_setup.py @@ -30,21 +30,76 @@ logger = logging.getLogger(__name__) +def _skill_link_cmd(source: str, dest: str) -> str: + """Link a shared skills tree into an agent discovery path.""" + if source == dest: + return f"mkdir -p {shlex.quote(dest)}" + + parent = shlex.quote(str(Path(dest).parent)) + q_source = shlex.quote(source) + q_dest = shlex.quote(dest) + return ( + f"mkdir -p {parent} && " + f"rm -rf {q_dest} && " + f"ln -sfn {q_source} {q_dest}" + ) + + +async def _link_skill_paths(env, source: str, skill_paths: list[str], home: str, cwd: str) -> int: + """Link one shared skills tree into each configured discovery path.""" + parts = [] + for sp in skill_paths: + expanded = sp.replace("$HOME", home).replace("$WORKSPACE", cwd) + parts.append(_skill_link_cmd(source, expanded)) + if parts: + cmd = " && ".join(parts) + result = await env.exec(cmd, timeout_sec=15) + if result.return_code != 0: + stdout = (getattr(result, "stdout", "") or "").strip() + stderr = (getattr(result, "stderr", "") or "").strip() + details = [ + f"exit code 
{result.return_code}", + f"command: {cmd}", + ] + if stdout: + details.append(f"stdout: {stdout}") + if stderr: + details.append(f"stderr: {stderr}") + raise RuntimeError( + f"Failed to link skills from {source}: {'; '.join(details)}" + ) + return len(parts) + + async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: """Install agent in sandbox and return its config.""" agent_base = agent.split()[0] agent_cfg = AGENTS.get(agent_base) if agent_base not in AGENT_INSTALLERS: return agent_cfg + install_cmd = AGENT_INSTALLERS[agent_base] install_timeout = agent_cfg.install_timeout if agent_cfg else 900 logger.info(f"Installing {agent_base} in sandbox (timeout={install_timeout}s)...") install_result = await env.exec( - AGENT_INSTALLERS[agent_base], + install_cmd, timeout_sec=install_timeout, ) install_log = trial_dir / "agent" / "install-stdout.txt" install_log.parent.mkdir(parents=True, exist_ok=True) - install_log.write_text(install_result.stdout or "") + stdout = install_result.stdout or "" + stderr = install_result.stderr or "" + parts = [f"$ {install_cmd}\n"] + if stdout: + parts.append("=== stdout ===\n") + parts.append(stdout) + if not stdout.endswith("\n"): + parts.append("\n") + if stderr: + parts.append("=== stderr ===\n") + parts.append(stderr) + if not stderr.endswith("\n"): + parts.append("\n") + install_log.write_text("".join(parts)) if install_result.return_code != 0: diag = await env.exec( "echo 'OS:' && cat /etc/os-release 2>/dev/null | head -2; " @@ -55,7 +110,7 @@ async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: raise AgentInstallError( agent=agent_base, return_code=install_result.return_code, - stdout=install_result.stdout or "", + stdout="".join(parts), diagnostics=diag.stdout or "", log_path=str(install_log), ) @@ -72,6 +127,9 @@ async def deploy_skills( task: "Task", ) -> None: """Deploy and distribute skills into sandbox.""" + task_skills_dir = task.config.environment.skills_dir + 
effective_skills = task_skills_dir + # Runtime upload (fallback if not baked into Dockerfile) if skills_dir: dockerfile = task_path / "environment" / "Dockerfile" @@ -84,38 +142,24 @@ async def deploy_skills( if skills_path.is_dir(): logger.info(f"Deploying skills via runtime upload from {skills_path}") await env.upload_dir(skills_path, "/skills") - if agent_cfg and agent_cfg.skill_paths: - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", "/root").replace( - "$WORKSPACE", "/app" - ) - parent = str(Path(expanded).parent) - parts.append( - f"mkdir -p '{parent}' && ln -sf /skills '{expanded}'" - ) - await env.exec(" && ".join(parts), timeout_sec=10) - logger.info("Skills deployed to /skills and symlinked") + logger.info("Skills deployed to /skills") + effective_skills = "/skills" else: logger.warning(f"Skills dir not found: {skills_path}") else: logger.info("Skills already injected via Dockerfile") # Distribute to agent-specific discovery paths - task_skills_dir = task.config.environment.skills_dir - effective_skills = "/skills" if skills_dir else task_skills_dir if effective_skills and agent_cfg and agent_cfg.skill_paths: home = f"/home/{sandbox_user}" if sandbox_user else "/root" - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", home).replace("$WORKSPACE", agent_cwd) - q_expanded = shlex.quote(expanded) - q_skills = shlex.quote(effective_skills) - parts.append( - f"mkdir -p {q_expanded} && cp -r {q_skills}/. 
{q_expanded}/ 2>/dev/null" - ) - if parts: - await env.exec("; ".join(parts), timeout_sec=15) + count = await _link_skill_paths( + env, + effective_skills, + agent_cfg.skill_paths, + home, + agent_cwd, + ) + if count: logger.info( - f"Skills distributed to {len(parts)} paths for {agent_cfg.name}" + f"Skills distributed to {count} paths for {agent_cfg.name}" ) diff --git a/src/benchflow/_daytona_patches.py b/src/benchflow/_daytona_patches.py new file mode 100644 index 00000000..56abe783 --- /dev/null +++ b/src/benchflow/_daytona_patches.py @@ -0,0 +1,102 @@ +"""Idempotent patches to the Daytona SDK to work around upstream bugs. + +Imported (and applied) lazily from ``_env_setup._create_environment`` so the +SDK is only touched when a Daytona environment is actually being built. +""" + +import asyncio +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +_PATCHED = False + + +def apply() -> None: + """Install workarounds for known Daytona SDK bugs. + + Currently: + * ``AsyncProcess.get_session_command_logs`` occasionally raises + ``pydantic.ValidationError`` because the server returns an empty + string instead of a JSON object for ``SessionCommandLogsResponse``. + Reproduces in SDK 0.168.x and 0.169.x. Wrap with a small bounded + retry that returns an empty-but-valid response if every attempt + fails — callers can still observe the command's exit_code via + ``get_session_command``, so a missing logs payload is recoverable. 
+ """ + global _PATCHED + if _PATCHED: + return + + try: + from daytona._async.process import AsyncProcess + from daytona.common.errors import DaytonaError + from daytona.common.process import SessionCommandLogsResponse + except Exception: # pragma: no cover — SDK not installed / layout changed + logger.debug("daytona SDK not importable; skipping patches", exc_info=True) + return + + try: + from pydantic import ValidationError + except Exception: # pragma: no cover + return + + # AsyncProcess.get_session_command_logs is decorated by intercept_errors + # at class definition, which converts every inner exception (including + # the pydantic ValidationError we care about) into a DaytonaError. The + # decorated bound method is what we capture here, so we have to match + # on the wrapped DaytonaError shape too — not just ValidationError. + original = AsyncProcess.get_session_command_logs + + _MALFORMED_MARKER = "SessionCommandLogsResponse" + + def _is_malformed_logs_error(exc: BaseException) -> bool: + if isinstance(exc, ValidationError): + return True + return isinstance(exc, DaytonaError) and _MALFORMED_MARKER in str(exc) + + async def _patched_get_session_command_logs( + self: Any, session_id: str, command_id: str + ) -> SessionCommandLogsResponse: + # Harbor already wraps this call in tenacity (3 attempts), so + # additional retries here are usually wasted on a deterministic + # malformed payload. Try once more with a small delay in case it + # IS transient, then return an empty-but-valid response so the + # caller can still observe the command's exit_code via + # get_session_command. Original error is logged for triage. 
+ attempts = 2 + delay = 0.5 + last_exc: BaseException | None = None + for attempt in range(1, attempts + 1): + try: + return await original(self, session_id, command_id) + except (ValidationError, DaytonaError) as exc: + if not _is_malformed_logs_error(exc): + raise + last_exc = exc + logger.warning( + "daytona get_session_command_logs malformed payload " + "(attempt %d/%d) for session=%s command=%s: %s", + attempt, + attempts, + session_id, + command_id, + exc, + ) + if attempt < attempts: + await asyncio.sleep(delay) + + logger.error( + "daytona get_session_command_logs malformed %d times for " + "session=%s command=%s; falling back to empty logs (%s)", + attempts, + session_id, + command_id, + last_exc, + ) + return SessionCommandLogsResponse(output="", stdout="", stderr="") + + AsyncProcess.get_session_command_logs = _patched_get_session_command_logs # type: ignore[method-assign] + _PATCHED = True + logger.debug("daytona SDK patches applied") diff --git a/src/benchflow/_env_setup.py b/src/benchflow/_env_setup.py index f3aa7cd8..9800ffc7 100644 --- a/src/benchflow/_env_setup.py +++ b/src/benchflow/_env_setup.py @@ -20,6 +20,7 @@ # build) instead of erroring out. Override via env if running on a paid tier. 
_DAYTONA_MAX_CPUS = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_CPUS", "4")) _DAYTONA_MAX_MEMORY_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_MEMORY_MB", "8192")) +_DAYTONA_MAX_STORAGE_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_STORAGE_MB", "10240")) # Directories to ignore when copying deps _IGNORE_DIRS = { @@ -260,6 +261,10 @@ def _create_environment( elif environment_type == "daytona": from harbor.environments.daytona import DaytonaEnvironment + from benchflow._daytona_patches import apply as _apply_daytona_patches + + _apply_daytona_patches() + env_config = task.config.environment if env_config.cpus > _DAYTONA_MAX_CPUS: logger.warning( @@ -275,6 +280,13 @@ def _create_environment( _DAYTONA_MAX_MEMORY_MB, ) env_config.memory_mb = _DAYTONA_MAX_MEMORY_MB + if env_config.storage_mb > _DAYTONA_MAX_STORAGE_MB: + logger.warning( + "Clamping storage_mb %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_STORAGE_MB)", + env_config.storage_mb, + _DAYTONA_MAX_STORAGE_MB, + ) + env_config.storage_mb = _DAYTONA_MAX_STORAGE_MB return DaytonaEnvironment( environment_dir=task.paths.environment_dir, diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 41ba87b3..7294a3d5 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -1,7 +1,7 @@ """Sandbox user setup, path lockdown, and verifier hardening. 
Owns the "agent runs as non-root" lifecycle: - - Creating the sandbox user and copying root's tooling into its home + - Creating the sandbox user and preparing minimal home state it needs - Building the privilege-drop wrapper (setpriv / su) for agent launch - Locking down solution/test paths so the sandbox user cannot read them - Hardening the environment before the verifier runs @@ -16,7 +16,6 @@ import os import re import shlex -import tomllib from pathlib import Path from typing import TYPE_CHECKING @@ -87,9 +86,7 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: setpriv (util-linux) execs directly; su -l is the fallback for Alpine/BusyBox. No outer sh -c wrapper — DockerProcess wraps in bash -c already. """ - inner = ( - f"export HOME=/home/{sandbox_user} && cd /home/{sandbox_user} && {agent_launch}" - ) + inner = f"export HOME=/home/{sandbox_user} && {agent_launch}" quoted = shlex.quote(inner) return ( f"if setpriv --help 2>&1 | grep -q reuid; then" @@ -100,6 +97,20 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: ) +def _legacy_root_tool_link_cmd(source: str, dest: str) -> str: + """Link legacy root-only tool dirs into the sandbox home when needed.""" + src = shlex.quote(source) + dst = shlex.quote(dest) + parent = shlex.quote(str(Path(dest).parent)) + return ( + f"if [ -e {src} ] && [ ! 
-L {dst} ]; then " + f"mkdir -p {parent} && " + f"rmdir {dst} 2>/dev/null || true; " + f"[ -e {dst} ] || ln -s {src} {dst}; " + "fi" + ) + + async def setup_sandbox_user( env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120 ) -> str: @@ -109,18 +120,17 @@ async def setup_sandbox_user( f"Invalid sandbox_user: {sandbox_user!r} (must be alphanumeric)" ) logger.info(f"Setting up sandbox user: {sandbox_user}") + home = f"/home/{sandbox_user}" + home_dirs = sorted(d for d in get_sandbox_home_dirs() if d != ".local") await env.exec( f"id -u {sandbox_user} >/dev/null 2>&1 || " f"useradd -m -s /bin/bash {sandbox_user} && " - f"mkdir -p /home/{sandbox_user}/.local/bin && " - "if [ -d /root/.local/bin ]; then " - f"cp -aL /root/.local/bin/. /home/{sandbox_user}/.local/bin/ 2>/dev/null || true; fi && " - "if [ -d /root/.nvm ]; then " - f"cp -a /root/.nvm/. /home/{sandbox_user}/.nvm/ 2>/dev/null || true; fi && " - f"for d in {' '.join(sorted(get_sandbox_home_dirs()))}; do " - f"if [ -d /root/$d ]; then mkdir -p /home/{sandbox_user}/$d && " - f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && " - f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && " + f"{_legacy_root_tool_link_cmd('/root/.local/bin', f'{home}/.local/bin')} && " + f"{_legacy_root_tool_link_cmd('/root/.nvm', f'{home}/.nvm')} && " + f"for d in {' '.join(home_dirs)}; do " + f"if [ -d /root/$d ]; then mkdir -p {home}/$d && " + f"cp -a /root/$d/. 
{home}/$d/ 2>/dev/null || true; fi; done && " + f"chown -R {sandbox_user}:{sandbox_user} {home} && " f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}", timeout_sec=timeout_sec, ) @@ -294,7 +304,7 @@ async def _refresh_verifier_workspace(env, workspace: str) -> None: "PYTEST_ADDOPTS": ( "-c /dev/null " # block pyproject.toml/pytest.ini/tox.ini/setup.cfg discovery "--confcutdir=/tests " # block conftest.py walk-up beyond /tests - "--rootdir=/tests " + "--rootdir=/app " # anchor test node IDs to repo root (not /dev from -c) "-p no:cacheprovider" ), # Block pytest11 entry-point plugins. An agent can modify a pre-installed @@ -336,34 +346,24 @@ async def _refresh_verifier_workspace(env, workspace: str) -> None: _SAFE_VERIFIER_PATH_PARTS = tuple(_SAFE_VERIFIER_PATH.split(":")) _RUNTIME_PATH_PREFIXES = ("/tmp", "/var/tmp", "/logs", "/testbed") -# pytest plugin names are not always the same as the PyPI distribution name -# or the option they register. These aliases cover the common benchmark -# verifier plugins while preserving PYTEST_DISABLE_PLUGIN_AUTOLOAD=1. -_PYTEST_PLUGIN_ALIASES = { - "ctrf": "ctrf", - "pytest-json-ctrf": "ctrf", - "pytest_json_ctrf": "ctrf", - "pytest_json_ctrf.plugin": "ctrf", - "pytest-json-report": "pytest_jsonreport", - "pytest_json_report": "pytest_jsonreport", - "pytest_jsonreport": "pytest_jsonreport", - "pytest_jsonreport.plugin": "pytest_jsonreport", -} -_PYTEST_OPTION_PLUGINS = { - "--ctrf": "ctrf", - "--json-report": "pytest_jsonreport", - "--json-report-file": "pytest_jsonreport", -} - -# Pytest plugins worth auto-loading when test.sh pip-installs them but the -# task author forgot to declare pytest_plugins in task.toml. Map distribution -# name (as it appears in `pip install pytest-foo`) to importable plugin name. 
-_PYTEST_INSTALLED_PLUGINS = { - "pytest-asyncio": "pytest_asyncio", - "pytest-anyio": "anyio.pytest_plugin", - "pytest-trio": "pytest_trio", -} -_PIP_INSTALL_RE = re.compile(r"\bpip3?\s+install\b[^\n;|&]*", re.IGNORECASE) +# Container-side script to enumerate pre-installed pytest11 entry points. +# Runs after sandbox_user processes are killed, so the agent cannot install +# new packages between enumeration and verification. The sandbox_user cannot +# pip install (not root), so all discovered plugins are image-authored. +_DISCOVER_PYTEST_PLUGINS_SCRIPT = r""" +import json, sys +try: + from importlib.metadata import entry_points + try: + eps = list(entry_points(group='pytest11')) + except TypeError: + eps = list(entry_points().get('pytest11', [])) + names = sorted(set(ep.name for ep in eps)) + print(json.dumps(names)) +except Exception as e: + print(json.dumps({"error": str(e)}), file=sys.stderr) + print("[]") +""".strip() def _under_path(path: str, prefix: str) -> bool: @@ -383,6 +383,21 @@ def _blocked_verifier_path_prefixes( return tuple(dict.fromkeys(prefixes)) +def _blocked_verifier_pythonpath_prefixes( + sandbox_user: str | None, +) -> tuple[str, ...]: + """Paths blocked from verifier PYTHONPATH. + + Unlike PATH, the workspace is NOT blocked: PYTHONPATH entries like /app + are set by the Dockerfile for project imports, and the workspace is + already importable via CWD/pytest sys.path insertion regardless. 
+ """ + prefixes = list(_RUNTIME_PATH_PREFIXES) + if sandbox_user: + prefixes.append(f"/home/{sandbox_user}") + return tuple(dict.fromkeys(prefixes)) + + def _merge_trusted_verifier_path(extras: list[str]) -> str: """Prepend validated image PATH entries to the verifier allowlist.""" kept: list[str] = [] @@ -451,76 +466,38 @@ def _trusted_path_extras_cmd(raw_path: str, blocked_prefixes: tuple[str, ...]) - ) -def _normalize_pytest_plugin(name: object) -> str | None: - """Return the importable pytest plugin name for a task declaration.""" - if not isinstance(name, str): - return None - clean = name.strip() - if not clean: - return None - return _PYTEST_PLUGIN_ALIASES.get(clean, clean) - +async def _discover_pytest_plugin_flags(env, task: "Task") -> str: + """Auto-discover pytest plugins from root-owned system packages. -def _plugins_from_verifier_script(task: "Task") -> list[str]: - """Infer known pytest plugins needed by legacy verifier scripts. - - Older SkillsBench/TB2 tasks predate task-level pytest plugin metadata and - call options such as --ctrf directly from tests/test.sh. With pytest entry - point autoload disabled, those options must be backed by explicit -p flags. + Runs a container-side script that enumerates pytest11 entry points and + filters to only those whose dist-info is in a root-owned directory. + Falls back to task.toml pytest_plugins declarations if discovery fails. + Replaces the previous hand-curated whitelist mechanism. """ - task_dir = getattr(task, "task_dir", None) - if not isinstance(task_dir, (str, os.PathLike)): - return [] - test_sh = Path(task_dir) / "tests" / "test.sh" - try: - content = test_sh.read_text() - except OSError: - return [] - plugins: list[str] = [] - for option, plugin in _PYTEST_OPTION_PLUGINS.items(): - if option in content and plugin not in plugins: - plugins.append(plugin) - # Detect pip-installed pytest plugins so PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 - # doesn't silently drop them. 
Only matches the exact installer line so - # arbitrary text mentioning the plugin name is ignored. - for match in _PIP_INSTALL_RE.findall(content): - for dist, plugin in _PYTEST_INSTALLED_PLUGINS.items(): - if dist in match and plugin not in plugins: - plugins.append(plugin) - return plugins - - -def _declared_pytest_plugins(task: "Task") -> list[object]: - """Return pytest_plugins from the model, falling back to raw task.toml.""" - declared = getattr(task.config.verifier, "pytest_plugins", None) - if declared: - return list(declared) - task_dir = getattr(task, "task_dir", None) - if not isinstance(task_dir, (str, os.PathLike)): - return [] - config_path = Path(task_dir) / "task.toml" + # Container-side auto-discovery try: - data = tomllib.loads(config_path.read_text()) - except (OSError, tomllib.TOMLDecodeError): - return [] - plugins = data.get("verifier", {}).get("pytest_plugins", []) - if isinstance(plugins, list): - return plugins - return [] + result = await env.exec( + f"python3 -c {shlex.quote(_DISCOVER_PYTEST_PLUGINS_SCRIPT)}", + user="root", timeout_sec=15, + ) + if result.stderr: + logger.debug(f"Plugin discovery stderr: {result.stderr.strip()}") + discovered = _json.loads(result.stdout or "[]") + if isinstance(discovered, list): + plugins.extend(p for p in discovered if isinstance(p, str) and p.strip()) + logger.info(f"Discovered {len(plugins)} pytest plugins from container") + except Exception as e: + logger.warning(f"Pytest plugin discovery failed, using task.toml fallback: {e}") + # Merge task.toml declarations as fallback + declared = getattr(task.config.verifier, "pytest_plugins", None) + if declared: + for name in declared: + if isinstance(name, str) and name.strip() and name.strip() not in plugins: + plugins.append(name.strip()) -def _pytest_plugin_flags(task: "Task") -> str: - """Build deterministic -p flags for inferred and declared pytest plugins.""" - plugins: list[str] = [] - for plugin in _plugins_from_verifier_script(task): - if plugin not 
in plugins: - plugins.append(plugin) - for plugin in _declared_pytest_plugins(task): - normalized = _normalize_pytest_plugin(plugin) - if normalized and normalized not in plugins: - plugins.append(normalized) return " ".join(f"-p {shlex.quote(p)}" for p in plugins) @@ -585,6 +562,31 @@ async def _trusted_verifier_path( return _merge_trusted_verifier_path([e for e in extras if isinstance(e, str)]) +async def _trusted_verifier_pythonpath( + env, sandbox_user: str | None, +) -> str: + """Return filtered PYTHONPATH preserving only trusted image entries. + + Same root-owned, non-world-writable validation as PATH, but does not + block the workspace — it is already importable via CWD/pytest and + is chowned to root before verification. + """ + pp_result = await env.exec("printenv PYTHONPATH 2>/dev/null || true", user="root", timeout_sec=10) + raw_pp = (pp_result.stdout or "").strip() + if not raw_pp: + return "" + blocked = _blocked_verifier_pythonpath_prefixes(sandbox_user) + cmd = _trusted_path_extras_cmd(raw_pp, blocked) + result = await env.exec(cmd, user="root", timeout_sec=10) + try: + extras = _json.loads(result.stdout or "[]") + except _json.JSONDecodeError: + return "" + if not isinstance(extras, list): + return "" + return ":".join(e for e in extras if isinstance(e, str)) + + # Wipe and recreate /logs/verifier/ before the verifier runs. # rm -rf severs hardlinks, removes symlink replacements, and eliminates # variant filenames/subdirs the agent may have pre-staged. @@ -592,26 +594,87 @@ async def _trusted_verifier_path( "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier" ) -# Remove injected conftest.py, sitecustomize.py/usercustomize.py, and .pth -# files from writable sys.path entries (preserves /usr/lib, /usr/local/lib). -# Also purge *.py from temp dirs: covers module-shadow via non-workspace cwd. 
-CLEANUP_CMD = ( - "find / -name conftest.py -not -path '/tests/*' -delete 2>/dev/null; " - "find /tmp /var/tmp -name '*.py' -delete 2>/dev/null; " - 'python3 -c "' - "import sys,os;" - "[os.remove(os.path.join(d,f)) " - " for d in sys.path " - " for f in ('sitecustomize.py','usercustomize.py') " - " if d and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " - " and os.path.isfile(os.path.join(d,f))];" - "[os.remove(os.path.join(d,f)) " - " for d in sys.path if d and os.path.isdir(d) " - " for f in os.listdir(d) if f.endswith('.pth') " - " and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " - " and os.path.isfile(os.path.join(d,f))]" - '" 2>/dev/null || true' -) +# Per-task hardening opt-outs. Tasks declare these in task.toml under +# [verifier.hardening] when their legitimate test setup conflicts with the +# default cleanup (e.g. qutebrowser ships a real conftest.py that the cleanup +# would otherwise delete, breaking pytest collection). +# +# Defaults are secure (all True). Tasks opt out individually: +# +# [verifier.hardening] +# cleanup_conftests = false # don't delete conftest.py before verify +HARDENING_DEFAULTS: dict[str, bool] = { + "cleanup_conftests": True, +} + + +def _read_hardening_config(task_dir: "Path | str | None") -> dict[str, bool]: + """Read [verifier.hardening] section from task.toml. 
Returns merged defaults.""" + import tomllib + from pathlib import Path as _Path + + result = dict(HARDENING_DEFAULTS) + if task_dir is None: + return result + toml_path = _Path(task_dir) / "task.toml" + if not toml_path.exists(): + return result + try: + with open(toml_path, "rb") as f: + data = tomllib.load(f) + except Exception as e: + logger.warning(f"task.toml parse error in {task_dir}: {e}") + return result + overrides = data.get("verifier", {}).get("hardening", {}) + for k, v in overrides.items(): + if k in result and isinstance(v, bool): + result[k] = v + else: + logger.warning( + f"task.toml [verifier.hardening] unknown/invalid: {k}={v!r}" + ) + return result + + +def _build_cleanup_cmd(hardening: dict[str, bool] | None = None) -> str: + """Build the cleanup shell command, honoring per-task hardening opt-outs. + + Steps: + - conftest.py removal outside /tests (skippable via cleanup_conftests=false) + - *.py purge from /tmp /var/tmp (always — covers module-shadow via cwd) + - sitecustomize.py/usercustomize.py removal from writable sys.path + - .pth removal from writable sys.path + + sitecustomize/usercustomize/.pth always run — opt-outs there would broaden + the attack surface beyond what real-world tasks need. 
+ """ + h = hardening or HARDENING_DEFAULTS + parts: list[str] = [] + if h.get("cleanup_conftests", True): + parts.append( + "find / -name conftest.py -not -path '/tests/*' -delete 2>/dev/null" + ) + parts.append("find /tmp /var/tmp -name '*.py' -delete 2>/dev/null") + parts.append( + 'python3 -c "' + "import sys,os;" + "[os.remove(os.path.join(d,f)) " + " for d in sys.path " + " for f in ('sitecustomize.py','usercustomize.py') " + " if d and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " + " and os.path.isfile(os.path.join(d,f))];" + "[os.remove(os.path.join(d,f)) " + " for d in sys.path if d and os.path.isdir(d) " + " for f in os.listdir(d) if f.endswith('.pth') " + " and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " + " and os.path.isfile(os.path.join(d,f))]" + '" 2>/dev/null || true' + ) + return "; ".join(parts) + + +# Backward-compat: the all-defaults cleanup command. +CLEANUP_CMD = _build_cleanup_cmd() async def harden_before_verify( @@ -711,9 +774,11 @@ async def harden_before_verify( f"chown -R root:root {shlex.quote(workspace)}", user="root", ) - await env.exec(CLEANUP_CMD, user="root", timeout_sec=10) + hardening = _read_hardening_config(getattr(task, "task_dir", None)) + await env.exec(_build_cleanup_cmd(hardening), user="root", timeout_sec=10) hardened_path = await _trusted_verifier_path(env, sandbox_user, workspace) + hardened_pythonpath = await _trusted_verifier_pythonpath(env, sandbox_user) distro_env = await _distro_pip_env(env) verifier_env = dict(VERIFIER_ENV) @@ -725,16 +790,16 @@ async def harden_before_verify( # plugin loading, or inject code via breakpoint()/coverage/Django/Celery # startup hooks. 
verifier_env["PATH"] = hardened_path + verifier_env["PYTHONPATH"] = hardened_pythonpath verifier_env["PYTEST_DISABLE_PLUGIN_AUTOLOAD"] = "1" verifier_env["PYTHONBREAKPOINT"] = "0" verifier_env["COVERAGE_PROCESS_START"] = "" verifier_env["DJANGO_SETTINGS_MODULE"] = "" verifier_env["CELERY_CONFIG_MODULE"] = "" - # Re-enable known verifier plugins by appending -p flags to the hardened - # base — never to a task-supplied PYTEST_ADDOPTS. Legacy task sets are - # inferred from tests/test.sh; newer tasks may declare pytest_plugins. + # Auto-discover pytest plugins from root-owned system packages and + # task.toml declarations. Appends -p flags to the hardened base. base_addopts = VERIFIER_ENV["PYTEST_ADDOPTS"] - flags = _pytest_plugin_flags(task) + flags = await _discover_pytest_plugin_flags(env, task) if flags: verifier_env["PYTEST_ADDOPTS"] = base_addopts + f" {flags}" else: diff --git a/src/benchflow/acp/client.py b/src/benchflow/acp/client.py index 5fa3a658..57a55eb9 100644 --- a/src/benchflow/acp/client.py +++ b/src/benchflow/acp/client.py @@ -88,8 +88,9 @@ async def _read_until_response(self, request_id: int) -> dict[str, Any]: f"has_result={'result' in msg} has_error={'error' in msg}" ) - # It's a response to our request - if "id" in msg and msg["id"] == request_id: + # It's a response to our request (has id, no method — distinguishes + # from echoed requests when running through a PTY) + if "id" in msg and msg["id"] == request_id and "method" not in msg: if msg.get("error"): raise ACPError( msg["error"].get("code", -1), @@ -97,6 +98,7 @@ async def _read_until_response(self, request_id: int) -> dict[str, Any]: ) return msg.get("result", {}) + # It's a notification (no id) if "method" in msg and "id" not in msg: try: diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index f1d26bb7..83bf2c13 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -157,9 +157,19 @@ class AgentConfig: home_dirs: list[str] = 
field(default_factory=list) # Extra dot-dirs under $HOME to copy to sandbox user (for dirs not # derivable from skill_paths or credential_files, e.g. ".openclaw"). + acp_model_format: str = "bare" + # How the agent expects the modelId in session/set_model: + # "bare" — just the model name (e.g. "claude-sonnet-4-6"). + # Default; works for claude-agent-acp, codex-acp. + # "provider/model" — models.dev convention (e.g. "google/gemini-3.1-pro-preview"). + # Required by opencode, which uses Provider.parseModel() + # to split on "/" and treats the first segment as provider ID. subscription_auth: SubscriptionAuth | None = None # Host CLI login that can substitute for an API key (e.g. OAuth tokens # from `claude login`). Detected automatically; API keys take precedence. + supports_acp_set_model: bool = True + # Some ACP agents configure the model through env/config at launch time and + # do not implement session/set_model (e.g. OpenHands CLI ACP). # Agent registry — all supported agents @@ -308,16 +318,72 @@ class AgentConfig: ], ), ), + "opencode": AgentConfig( + name="opencode", + description="OpenCode via ACP — open-source coding agent (TypeScript)", + skill_paths=["$HOME/.opencode/skills"], + home_dirs=[".opencode"], + install_cmd=( + f"{_NODE_INSTALL} && " + "( command -v opencode >/dev/null 2>&1 || " + "npm install -g opencode-ai@latest >/dev/null 2>&1 ) && " + "command -v opencode >/dev/null 2>&1" + ), + launch_cmd="opencode acp", + protocol="acp", + requires_env=[], # inferred from --model at runtime + acp_model_format="provider/model", + # OpenCode uses models.dev provider IDs — its parseModel() splits + # modelId on "/" so set_model must send "google/gemini-3.1-pro-preview", + # not just "gemini-3.1-pro-preview". 
+ env_mapping={ + "BENCHFLOW_PROVIDER_BASE_URL": "OPENAI_BASE_URL", + }, + ), "openhands": AgentConfig( name="openhands", description="OpenHands agent via ACP (multi-model, Python-based)", - skill_paths=[], + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/.agents/skills"], + home_dirs=[".openhands"], install_cmd=( - "( command -v openhands >/dev/null 2>&1 || " - "pip install openhands >/dev/null 2>&1 ) && " + "export DEBIAN_FRONTEND=noninteractive && " + "export PATH=\"$HOME/.local/bin:$PATH\" && " + "( command -v curl >/dev/null 2>&1 || " + " ( apt-get update -qq && " + " apt-get install -y -qq curl ca-certificates >/dev/null 2>&1 ) ) && " + "( command -v openhands >/dev/null 2>&1 || ( " + " UV_OK=0; " + " if command -v uv >/dev/null 2>&1; then " + " UV_VER=$(uv --version 2>/dev/null | awk '{print $2}'); " + " if [ -n \"$UV_VER\" ] && " + " [ \"$(printf '%s\\n' 0.11.6 \"$UV_VER\" | sort -V | head -n1)\" = \"0.11.6\" ]; then " + " UV_OK=1; " + " fi; " + " fi; " + " if [ \"$UV_OK\" = 0 ]; then " + " curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && " + " export PATH=\"$HOME/.local/bin:$PATH\"; " + " fi && " + " ( uv tool list 2>/dev/null | grep -q '^openhands\\b' || " + " uv tool install openhands --python 3.12 >/dev/null 2>&1 || " + " curl -fsSL https://install.openhands.dev/install.sh | sh >/dev/null 2>&1 ) " + ") ) && " + # Let sandbox user traverse to uv-managed Python interpreter path. + "chmod o+x /root /root/.local /root/.local/share " + "/root/.local/share/uv /root/.local/share/uv/tools 2>/dev/null; " + # Seed config so OpenHands ACP auth check passes before env override. 
+ "mkdir -p ~/.openhands && " + "echo '{\"llm\":{\"model\":\"placeholder\",\"api_key\":\"placeholder\"}}' " + "> ~/.openhands/agent_settings.json && " "command -v openhands >/dev/null 2>&1" ), - launch_cmd="openhands acp --always-approve --override-with-envs", + launch_cmd=( + "export PATH=\"$HOME/.local/bin:$PATH\" && " + "mkdir -p ~/.openhands && " + "printf '{\"llm\":{\"model\":\"%s\",\"api_key\":\"%s\"}}' " + "\"$LLM_MODEL\" \"$LLM_API_KEY\" > ~/.openhands/agent_settings.json && " + "openhands acp --always-approve --override-with-envs" + ), protocol="acp", requires_env=["LLM_API_KEY"], api_protocol="", @@ -325,6 +391,7 @@ class AgentConfig: "BENCHFLOW_PROVIDER_BASE_URL": "LLM_BASE_URL", "BENCHFLOW_PROVIDER_API_KEY": "LLM_API_KEY", }, + supports_acp_set_model=False, ), } @@ -336,21 +403,18 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: - """Collect all dot-dirs under $HOME that sandbox user setup should copy. + """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. Derives from three sources across all registered agents: - - skill_paths: $HOME/.foo/... → ".foo" - credential_files: {home}/.foo/... → ".foo" + - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") - Always includes ".local" (pip scripts, etc.). + Skill paths are excluded: deploy_skills() now links those paths directly to a + shared skills tree instead of relying on sandbox-home copies. """ - dirs: set[str] = {".local"} + dirs: set[str] = set() for cfg in AGENTS.values(): - for sp in cfg.skill_paths: - if sp.startswith("$HOME/."): - dirname = sp.removeprefix("$HOME/").split("/")[0] - dirs.add(dirname) for cf in cfg.credential_files: # path uses {home}/.foo/... 
placeholder path = cf.path @@ -496,6 +560,8 @@ def register_agent( credential_files: list[CredentialFile] | None = None, home_dirs: list[str] | None = None, subscription_auth: SubscriptionAuth | None = None, + acp_model_format: str = "bare", + supports_acp_set_model: bool = True, ) -> AgentConfig: """Register a custom agent at runtime. @@ -525,6 +591,8 @@ def register_agent( credential_files=credential_files or [], home_dirs=home_dirs or [], subscription_auth=subscription_auth, + acp_model_format=acp_model_format, + supports_acp_set_model=supports_acp_set_model, ) AGENTS[name] = config AGENT_INSTALLERS[name] = install_cmd diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 58b410fc..17cea1dd 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -31,7 +31,8 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model +from benchflow.job import DEFAULT_AGENT +from benchflow.job import effective_model as _effective_model console = Console() diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 96611bbd..9c9b7f23 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -433,9 +433,9 @@ def skills_eval( typer.Argument(help="Path to skill directory containing evals/evals.json"), ], agent: Annotated[ - list[str], + list[str] | None, typer.Option("--agent", "-a", help="Agent(s) to evaluate (repeatable)"), - ] = ["claude-agent-acp"], + ] = None, model: Annotated[ list[str] | None, typer.Option("--model", "-m", help="Model(s) (matched 1:1 with agents)"), @@ -473,6 +473,8 @@ def skills_eval( """ from benchflow.skill_eval import SkillEvaluator, export_gepa_traces + if agent is None: + agent = ["claude-agent-acp"] if not (skill_dir / "evals" / "evals.json").exists(): console.print( f"[red]No evals/evals.json found in {skill_dir}[/red]\n" @@ -738,6 +740,13 @@ def eval_create( str | None, typer.Option("--sandbox-user", 
help="Sandbox user (null for root)"), ] = "agent", + sandbox_setup_timeout: Annotated[ + int, + typer.Option( + "--sandbox-setup-timeout", + help="Timeout (seconds) for sandbox user setup inside the environment.", + ), + ] = 120, skills_dir: Annotated[ Path | None, typer.Option("--skills-dir", "-s", help="Skills directory to deploy"), @@ -757,7 +766,7 @@ def eval_create( eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): - from benchflow.trial import Trial, TrialConfig, Scene + from benchflow.trial import Scene, Trial, TrialConfig config = TrialConfig( task_path=tasks_dir, @@ -765,6 +774,7 @@ def eval_create( skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, jobs_dir=jobs_dir, agent=agent, model=eff_model, @@ -794,6 +804,7 @@ async def _run(): environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, skills_dir=str(skills_dir) if skills_dir else None, ), ) diff --git a/src/benchflow/job.py b/src/benchflow/job.py index ad99ad49..bd53a0b0 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -126,9 +126,7 @@ def should_retry(self, error: str | None) -> bool: return True if self.retry_on_pipe and category == PIPE_CLOSED: return True - if self.retry_on_acp and category == ACP_ERROR: - return True - return False + return bool(self.retry_on_acp and category == ACP_ERROR) def backoff_delay(self, attempt: int) -> float: """Exponential backoff delay for retry attempt.""" @@ -168,6 +166,7 @@ class JobConfig: skills_dir: str | None = None sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + sandbox_setup_timeout: int = 120 context_root: str | None = None exclude_tasks: set[str] = field(default_factory=set) @@ -293,6 +292,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> 
"Job": exclude = set(raw.get("exclude", [])) sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( @@ -306,6 +306,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, exclude_tasks=exclude, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -352,6 +353,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) config = JobConfig( agent=agent_name, @@ -363,6 +365,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -407,6 +410,13 @@ def _prune_docker(self): except Exception as e: logger.warning(f"Docker prune failed: {e}") + def _resolve_skills_dir(self, task_dir: Path, skills_dir: str | None) -> str | None: + """Resolve skills_dir — 'auto' means per-task environment/skills/.""" + if skills_dir == "auto": + candidate = task_dir / "environment" / "skills" + return str(candidate) if candidate.is_dir() else None + return skills_dir + async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: """Execute one trial via Trial.""" from benchflow.trial import Trial, TrialConfig @@ -420,9 +430,10 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: 
job_name=self._job_name, jobs_dir=str(self._jobs_dir), environment=cfg.environment, - skills_dir=cfg.skills_dir, + skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) trial = await Trial.create(trial_config) @@ -439,9 +450,10 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe job_name=self._job_name, jobs_dir=str(self._jobs_dir), environment=cfg.environment, - skills_dir=cfg.skills_dir, + skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) diff --git a/src/benchflow/mcp/reviewer_server.py b/src/benchflow/mcp/reviewer_server.py index 9c2dedcc..210746b9 100644 --- a/src/benchflow/mcp/reviewer_server.py +++ b/src/benchflow/mcp/reviewer_server.py @@ -24,8 +24,6 @@ import json import logging import os -import subprocess -import sys from pathlib import Path logger = logging.getLogger(__name__) @@ -52,11 +50,11 @@ def create_reviewer_server( """ try: from fastmcp import FastMCP - except ImportError: + except ImportError as e: raise ImportError( "fastmcp required for MCP reviewer server. 
" "Install with: pip install fastmcp" - ) + ) from e mcp = FastMCP("benchflow-reviewer") prompt = review_prompt or os.environ.get("REVIEWER_PROMPT", DEFAULT_REVIEW_PROMPT) diff --git a/src/benchflow/metrics.py b/src/benchflow/metrics.py index 43bd4450..413eb7fe 100644 --- a/src/benchflow/metrics.py +++ b/src/benchflow/metrics.py @@ -180,7 +180,7 @@ def collect_metrics( results_dir = Path(results_dir) best: dict[str, dict] = {} - for rfile in results_dir.rglob("result.json"): + for rfile in sorted(results_dir.rglob("result.json")): try: r = json.loads(rfile.read_text()) task = r["task_name"] diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 4c86f3b6..b9d2c212 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -12,6 +12,7 @@ import logging import os import shlex +import uuid from abc import ABC, abstractmethod from typing import Any @@ -256,11 +257,16 @@ class DaytonaProcess(LiveProcess): """ def __init__( - self, sandbox: Any, is_dind: bool = False, compose_cmd_prefix: str = "" + self, + sandbox: Any, + is_dind: bool = False, + compose_cmd_prefix: str = "", + compose_cmd_base: str = "", ): self._sandbox = sandbox self._is_dind = is_dind self._compose_cmd_prefix = compose_cmd_prefix + self._compose_cmd_base = compose_cmd_base @classmethod async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": @@ -273,6 +279,7 @@ async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": is_dind = hasattr(env, "_strategy") and hasattr(env._strategy, "_compose_cmd") compose_cmd_prefix = "" + compose_cmd_base = "" if is_dind: # Build compose env vars and command prefix for DinD strategy = env._strategy @@ -280,9 +287,16 @@ async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": f"{k}={shlex.quote(v)}" for k, v in strategy._compose_env_vars().items() ) compose_cmd_prefix = compose_env + # Extract the full compose base command with project/file flags + # (e.g. 
"docker compose -p NAME --project-directory DIR -f F1 -f F2") + # so that `docker compose exec` can find the running project. + compose_cmd_base = strategy._compose_cmd([]) return cls( - sandbox=sandbox, is_dind=is_dind, compose_cmd_prefix=compose_cmd_prefix + sandbox=sandbox, + is_dind=is_dind, + compose_cmd_prefix=compose_cmd_prefix, + compose_cmd_base=compose_cmd_base, ) async def start( @@ -296,19 +310,30 @@ async def start( ssh_target = f"{ssh_access.token}@ssh.app.daytona.io" if self._is_dind: - # Build the docker compose exec command to run inside the DinD VM - inner_parts = ["docker", "compose", "exec", "-i", "-T"] + # Build the docker compose exec command to run inside the DinD VM. + # Use the full compose base command (with -p, --project-directory, + # and -f flags) so that exec can find the running project. + if self._compose_cmd_base: + inner_parts = [*shlex.split(self._compose_cmd_base), "exec", "-i", "-T"] + else: + inner_parts = ["docker", "compose", "exec", "-i", "-T"] if cwd: inner_parts.extend(["-w", cwd]) # Write env vars to a temp file on the remote VM instead of passing # as -e K=V args (which are visible in ps aux on the remote host). + # Use a Python-generated unique suffix instead of a shell `$$` + # expansion: shlex.join() (below) single-quotes the --env-file arg, + # so `$$` would survive as a literal in the docker compose call + # while the cat > ... heredoc would expand it — the file would be + # written to one path and read from another. uuid.uuid4 sidesteps + # the entire shell-expansion-vs-quoting problem. 
remote_env_path = None if env: - remote_env_path = "/tmp/benchflow_env_$$.env" + remote_env_path = f"/tmp/benchflow_env_{uuid.uuid4().hex[:16]}.env" env_lines = "\n".join(f"{k}={v}" for k, v in env.items()) inner_parts.extend(["--env-file", remote_env_path]) - inner_parts.extend(["main", "bash", "-c", shlex.quote(command)]) - inner_cmd = " ".join(inner_parts) + inner_parts.extend(["main", "bash", "-c", command]) + inner_cmd = shlex.join(inner_parts) if self._compose_cmd_prefix: remote_cmd = f"{self._compose_cmd_prefix} {inner_cmd}" @@ -331,7 +356,9 @@ async def start( env_prefix = "" remote_env_path = None if env: - remote_env_path = "/tmp/benchflow_env_$$.env" + # Python-generated unique suffix; see DinD branch above for why + # $$ shell expansion is fragile across quoting boundaries. + remote_env_path = f"/tmp/benchflow_env_{uuid.uuid4().hex[:16]}.env" env_lines = "\n".join( f"export {k}={shlex.quote(v)}" for k, v in env.items() ) @@ -371,3 +398,131 @@ async def start( limit=_BUFFER_LIMIT, ) logger.info(f"Daytona process started (pid={self._process.pid})") + + +class DaytonaPtyProcess(LiveProcess): + """Live stdin/stdout via Daytona PTY WebSocket API. + + Uses the Daytona SDK's PTY session (WebSocket) instead of SSH, which + maintains long-lived interactive pipes through DinD compose layers. + Falls back to this for DinD sandboxes where SSH pipes break. 
+ """ + + _process = None # Not used — override readline/writeline/close + + def __init__(self, sandbox: Any, compose_cmd_prefix: str, compose_cmd_base: str): + self._sandbox = sandbox + self._compose_cmd_prefix = compose_cmd_prefix + self._compose_cmd_base = compose_cmd_base + self._pty = None + self._line_buffer = asyncio.Queue() + self._partial = b"" + self._closed = False + + @classmethod + async def from_harbor_env(cls, env: Any) -> "DaytonaPtyProcess": + sandbox = env._sandbox + if not sandbox: + raise RuntimeError("Daytona sandbox not started") + strategy = env._strategy + compose_env = " ".join( + f"{k}={shlex.quote(v)}" for k, v in strategy._compose_env_vars().items() + ) + compose_cmd_base = strategy._compose_cmd([]) + return cls(sandbox=sandbox, compose_cmd_prefix=compose_env, compose_cmd_base=compose_cmd_base) + + async def _on_pty_data(self, data: bytes) -> None: + self._partial += data + while b"\n" in self._partial: + line, self._partial = self._partial.split(b"\n", 1) + line = line.replace(b"\r", b"") + await self._line_buffer.put(line + b"\n") + + async def start( + self, + command: str, + env: dict[str, str] | None = None, + cwd: str | None = None, + ) -> None: + import uuid + session_id = f"acp-{uuid.uuid4().hex[:8]}" + pty_env = {} + if self._compose_cmd_prefix: + for part in shlex.split(self._compose_cmd_prefix): + if "=" in part: + k, v = part.split("=", 1) + pty_env[k] = v + + self._pty = await self._sandbox.process.create_pty_session( + id=session_id, + on_data=self._on_pty_data, + envs=pty_env if pty_env else None, + ) + await self._pty.wait_for_connection() + logger.info(f"DaytonaPtyProcess: PTY connected (session={session_id})") + + compose_parts = shlex.split(self._compose_cmd_base) if self._compose_cmd_base else ["docker", "compose"] + exec_parts = [*compose_parts, "exec", "-i", "-T"] + if cwd: + exec_parts.extend(["-w", cwd]) + # Write env vars to a file inside the container (not visible in ps aux), + # matching the approach in 
DaytonaProcess.start(). + env_file_cmd = "" + if env: + env_lines = "\n".join(f"export {k}={shlex.quote(v)}" for k, v in env.items()) + env_file_cmd = ( + f"cat > /tmp/.benchflow_env <<'__EOF__'\n{env_lines}\n__EOF__\n" + f". /tmp/.benchflow_env && rm -f /tmp/.benchflow_env && " + ) + exec_parts.extend(["main", "bash", "-lc", f"{env_file_cmd}{command}"]) + exec_cmd = shlex.join(exec_parts) + + # Use a marker + stty to cleanly hand over the PTY to the agent. + # 1. Disable echo so typed commands don't appear in output + # 2. Print marker so we know when to start reading ACP output + # 3. exec into compose exec so the agent owns the PTY + marker = f"__BENCHFLOW_ACP_{session_id}__" + setup = f"stty -echo 2>/dev/null; echo '{marker}'; exec {exec_cmd}\n" + await self._pty.send_input(setup) + logger.info("DaytonaPtyProcess: sent setup, waiting for marker...") + + while True: + try: + line = await asyncio.wait_for(self._line_buffer.get(), timeout=120) + decoded = line.decode(errors="replace").strip() + logger.debug(f"DaytonaPtyProcess drain: {decoded[:120]}") + if marker in decoded: + break + except TimeoutError as e: + raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") from e + + logger.info("DaytonaPtyProcess: marker seen, agent starting") + + async def readline(self) -> bytes: + if self._closed: + raise ConnectionError("PTY closed") + try: + line = await asyncio.wait_for(self._line_buffer.get(), timeout=900) + return line + except TimeoutError as e: + raise ConnectionError("PTY readline timeout (900s)") from e + except Exception as e: + raise ConnectionError(f"PTY readline error: {e}") from e + + async def writeline(self, data: str) -> None: + if not self._pty or self._closed: + raise RuntimeError("PTY not started") + await self._pty.send_input(data + "\n") + + async def close(self) -> None: + self._closed = True + if self._pty: + with contextlib.suppress(Exception): + await self._pty.kill() + with contextlib.suppress(Exception): + await 
self._pty.disconnect() + logger.info("DaytonaPtyProcess terminated") + + @property + def is_running(self) -> bool: + return self._pty is not None and not self._closed diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py index 1e55f0c7..69be399c 100644 --- a/src/benchflow/runtime.py +++ b/src/benchflow/runtime.py @@ -142,6 +142,7 @@ class RuntimeConfig: """Configuration for a Runtime execution.""" sandbox_user: str | None = "agent" + sandbox_setup_timeout: int = 120 max_rounds: int = 10 snapshot_policy: str = "none" reward_stream: bool = True @@ -263,6 +264,7 @@ async def execute(self) -> RuntimeResult: environment=self.env.backend, sandbox_user=config.sandbox_user, sandbox_locked_paths=config.sandbox_locked_paths, + sandbox_setup_timeout=config.sandbox_setup_timeout, jobs_dir=config.jobs_dir, context_root=config.context_root, pre_agent_hooks=config.pre_agent_hooks, @@ -291,13 +293,13 @@ async def execute(self) -> RuntimeResult: async def run( - subject: "Agent | TrialConfig | str", - env: "Environment | str | None" = None, + subject: Agent | TrialConfig | str, + env: Environment | str | None = None, config: RuntimeConfig | None = None, *, - task_path: "str | Path | None" = None, + task_path: str | Path | None = None, model: str | None = None, -) -> "RuntimeResult | RunResult": +) -> RuntimeResult | RunResult: """Primary user-facing API — multiple calling conventions. Usage:: @@ -313,7 +315,6 @@ async def run( # 3. 
Agent name string (simplest) result = await bf.run("gemini", task_path="tasks/X") """ - from benchflow.models import RunResult from benchflow.trial import Scene, Trial, TrialConfig if isinstance(subject, TrialConfig): @@ -339,6 +340,7 @@ async def run( environment=env if isinstance(env, str) else "docker", sandbox_user=rc.sandbox_user, sandbox_locked_paths=rc.sandbox_locked_paths, + sandbox_setup_timeout=rc.sandbox_setup_timeout, jobs_dir=rc.jobs_dir, context_root=rc.context_root, pre_agent_hooks=rc.pre_agent_hooks, diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py index bd10a2d0..b55a72f7 100644 --- a/src/benchflow/sdk.py +++ b/src/benchflow/sdk.py @@ -98,33 +98,12 @@ from harbor.models.trial.paths import TrialPaths from harbor.verifier.verifier import Verifier -from benchflow._acp_run import connect_acp, execute_prompts -from benchflow._agent_env import resolve_agent_env -from benchflow._agent_setup import deploy_skills, install_agent -from benchflow._credentials import ( - upload_subscription_auth, - write_credential_files, -) from benchflow._env_setup import ( - _create_environment, - _inject_skills_into_dockerfile, _patch_harbor_dind, - stage_dockerfile_deps, ) from benchflow._sandbox import ( - _resolve_locked_paths, - _seed_verifier_workspace, - _snapshot_build_config, harden_before_verify, - lockdown_paths, - setup_sandbox_user, -) -from benchflow._trajectory import ( - _capture_session_trajectory, - _scrape_agent_trajectory, ) -from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource logger = logging.getLogger(__name__) @@ -243,6 +222,7 @@ def _write_config( sandbox_user: str | None, context_root: str | Path | None, sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, timeout: int, started_at: datetime, agent_env: dict[str, str], @@ -262,6 +242,7 @@ def _write_config( "skills_dir": str(skills_dir) if skills_dir else 
None, "sandbox_user": sandbox_user, "sandbox_locked_paths": sandbox_locked_paths, + "sandbox_setup_timeout": sandbox_setup_timeout, "context_root": str(context_root) if context_root else None, "timeout_sec": timeout, "started_at": str(started_at), @@ -463,6 +444,7 @@ async def run( skills_dir: str | Path | None = None, sandbox_user: str | None = "agent", sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, pre_agent_hooks: list | None = None, context_root: str | Path | None = None, ) -> RunResult: @@ -511,6 +493,7 @@ async def run( skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, pre_agent_hooks=pre_agent_hooks, context_root=context_root, ) diff --git a/src/benchflow/skill_eval.py b/src/benchflow/skill_eval.py index b848135e..82e96869 100644 --- a/src/benchflow/skill_eval.py +++ b/src/benchflow/skill_eval.py @@ -6,6 +6,7 @@ result = await evaluator.run(agents=["claude-agent-acp"], environment="docker") """ +import contextlib import json import logging import shutil @@ -431,7 +432,7 @@ async def run( all_results: list[CaseResult] = [] # Run each agent - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): agent_label = agent.split("/")[-1] if "/" in agent else agent # With-skill run @@ -486,9 +487,9 @@ async def _run_job( with_skill: bool, ) -> list[CaseResult]: """Run a batch of tasks using Job for concurrency and retries.""" - from benchflow.job import Job, JobConfig, RetryConfig - import os + + from benchflow.job import Job, JobConfig, RetryConfig judge_env = {} for key in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY"): if os.environ.get(key): @@ -506,7 +507,7 @@ async def _run_job( agent_env=judge_env, ), ) - job_result = await j.run() + await j.run() results = [] # Walk the jobs directory to collect per-case results @@ -539,10 +540,8 @@ async def _run_job( # Read judge 
rubric details if available judge_file = trial_dir / "verifier" / "judge_result.json" if judge_file.exists(): - try: + with contextlib.suppress(json.JSONDecodeError, KeyError): rubric_results = json.loads(judge_file.read_text()).get("items") - except (json.JSONDecodeError, KeyError): - pass results.append( CaseResult( @@ -579,7 +578,7 @@ def _compute_lifts( ) -> list[AgentLift]: """Compute per-agent lift from case results.""" lifts = [] - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): with_results = [r for r in all_results if r.agent == agent and r.with_skill] baseline_results = [ r for r in all_results if r.agent == agent and not r.with_skill diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 88034199..b2551054 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -36,7 +36,10 @@ from __future__ import annotations import asyncio +import contextlib +import json import logging +import shlex from dataclasses import dataclass, field from datetime import datetime from pathlib import Path @@ -55,7 +58,6 @@ _resolve_locked_paths, _seed_verifier_workspace, _snapshot_build_config, - harden_before_verify, lockdown_paths, setup_sandbox_user, ) @@ -64,8 +66,9 @@ _scrape_agent_trajectory, ) from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH, AGENTS +from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource +from benchflow.user import BaseUser, RoundResult logger = logging.getLogger(__name__) @@ -108,7 +111,7 @@ def single( prompts: list[str | None] | None = None, role_name: str = "agent", skills_dir: str | Path | None = None, - ) -> "Scene": + ) -> Scene: """Shortcut for single-agent, single-role scene.""" prompts = prompts or [None] return cls( @@ -131,6 +134,7 @@ class TrialConfig: environment: str = "docker" sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + 
sandbox_setup_timeout: int = 120 services: list[str] | None = None job_name: str | None = None trial_name: str | None = None @@ -138,6 +142,11 @@ class TrialConfig: context_root: str | Path | None = None pre_agent_hooks: list | None = None + # User-driven progressive-disclosure loop + user: BaseUser | None = None + max_user_rounds: int = 5 + oracle_access: bool = False + # Legacy compat fields — used by SDK.run() shim. Ignored when scenes is set. agent: str = "claude-agent-acp" prompts: list[str | None] | None = None @@ -155,7 +164,7 @@ def from_legacy( prompts: list[str | None] | None = None, skills_dir: str | Path | None = None, **kwargs, - ) -> "TrialConfig": + ) -> TrialConfig: """Construct from flat SDK.run()-style args.""" return cls( task_path=task_path, @@ -273,6 +282,11 @@ async def setup(self) -> None: logger.warning( "sandbox_user=None — agent runs as root with no path lockdown." ) + if cfg.oracle_access and cfg.user is None: + logger.warning( + "oracle_access=True without a User — /solution stays visible " + "to the agent for the entire trial." 
+ ) self._effective_locked = _resolve_locked_paths( cfg.sandbox_user, cfg.sandbox_locked_paths @@ -328,6 +342,7 @@ async def setup(self) -> None: sandbox_user=cfg.sandbox_user, context_root=cfg.context_root, sandbox_locked_paths=self._effective_locked, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, timeout=self._timeout, started_at=self._started_at, agent_env=self._agent_env, @@ -367,7 +382,10 @@ async def install_agent(self) -> None: if cfg.primary_agent == "oracle": if cfg.sandbox_user: await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) @@ -389,7 +407,10 @@ async def install_agent(self) -> None: if cfg.sandbox_user: self._agent_cwd = await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) @@ -438,10 +459,8 @@ async def disconnect(self) -> None: # Kill any lingering agent processes to prevent context bleed between scenes if self._env and self._agent_launch.strip(): agent_cmd = self._agent_launch.split()[0].split("/")[-1] - try: + with contextlib.suppress(Exception): await self._env.exec(f"pkill -f '{agent_cmd}' || true", timeout_sec=10) - except Exception: - pass self._session_tool_count = 0 self._session_traj_count = 0 self._phase = "installed" @@ -510,6 +529,69 @@ async def verify(self) -> dict | None: self._phase = "verified" return self._rewards + async def soft_verify(self) -> tuple[dict | None, str | None, str | None]: + """Run the verifier without full hardening 
— for intermediate feedback. + + Skips process kill and workspace restore/chown (so the sandbox + stays usable for the next round), but DOES purge agent-injected + conftest.py / sitecustomize.py / .pth files to prevent the agent + from gaming intermediate test results. + + Returns (rewards, verifier_output, verifier_error). The final + verify() still does full hardening. + """ + from harbor import Verifier + + from benchflow._sandbox import _build_cleanup_cmd, _read_hardening_config + + self._trial_paths.verifier_dir.mkdir(parents=True, exist_ok=True) + # Clean verifier output dir — chmod 777 so non-root verifier processes can write + await self._env.exec( + "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier", + user="root", timeout_sec=10, + ) + # Purge agent-injected conftest/sitecustomize/.pth without + # killing processes or restoring workspace. + # Honor per-task [verifier.hardening] opt-outs from task.toml. + hardening = _read_hardening_config(getattr(self._task, "task_dir", None)) + await self._env.exec( + _build_cleanup_cmd(hardening), user="root", timeout_sec=10 + ) + + rewards = None + verifier_output = None + verifier_error = None + try: + verifier = Verifier( + task=self._task, + trial_paths=self._trial_paths, + environment=self._env, + ) + verifier_result = await asyncio.wait_for( + verifier.verify(), + timeout=self._task.config.verifier.timeout_sec, + ) + rewards = verifier_result.rewards + # Capture raw verifier output for the user + cat = await self._env.exec( + "cat /logs/verifier/*.log 2>/dev/null || " + "cat /logs/verifier/output.txt 2>/dev/null || true", + timeout_sec=10, + ) + verifier_output = (cat.stdout or "").strip() or None + logger.info(f"[soft_verify] rewards={rewards}") + except TimeoutError: + verifier_error = ( + f"soft verifier timed out after " + f"{self._task.config.verifier.timeout_sec}s" + ) + logger.warning(verifier_error) + except Exception as e: + verifier_error = f"soft verifier crashed: {e}" + 
logger.warning(verifier_error) + + return rewards, verifier_output, verifier_error + # ── Phase 5: CLEANUP ── async def cleanup(self) -> None: @@ -568,8 +650,18 @@ async def run(self) -> RunResult: ) else: await self.install_agent() - for scene in cfg.effective_scenes: - await self._run_scene(scene) + try: + if cfg.user is not None: + await self._run_user_loop() + else: + for scene in cfg.effective_scenes: + await self._run_scene(scene) + finally: + if cfg.oracle_access: + await self._env.exec( + "mv /solution_oracle_backup /solution 2>/dev/null || true", + user="root", timeout_sec=10, + ) await self.verify() @@ -598,20 +690,41 @@ async def run(self) -> RunResult: # ── Scene execution ── + _OUTBOX_DIR = "/app/.outbox" + async def _run_scene(self, scene: Scene) -> None: - """Execute one scene: for each turn, connect as the turn's role, execute, disconnect.""" + """Execute one scene: for each turn, connect as the turn's role, execute, disconnect. + + For multi-role scenes, agents communicate via outbox files: + an agent writes ``/app/.outbox/{recipient}.json`` with + ``{"to": "role_name", "content": "..."}`` and the scheduler + injects received messages into the next turn's prompt. + + Inter-role messages are persisted to ``trial_dir/scene_messages.jsonl``. 
+ """ cfg = self._config logger.info(f"[Scene] {scene.name} — {len(scene.turns)} turns, {len(scene.roles)} roles") role_map = {r.name: r for r in scene.roles} current_role: str | None = None + multi_role = len(scene.roles) > 1 + scene_messages: list[dict] = [] + + if multi_role: + setup_cmd = f"rm -rf {self._OUTBOX_DIR} && mkdir -p {self._OUTBOX_DIR}" + if cfg.sandbox_user: + user = shlex.quote(cfg.sandbox_user) + setup_cmd += f" && chown {user}:{user} {self._OUTBOX_DIR}" + await self._env.exec(setup_cmd, timeout_sec=10) - for i, turn in enumerate(scene.turns): + inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} + turn_counter = 0 + + for _i, turn in enumerate(scene.turns): role = role_map.get(turn.role) if not role: raise ValueError(f"Turn references unknown role {turn.role!r}") - # Reconnect if role changed or first turn if current_role != turn.role: if current_role is not None: await self.disconnect() @@ -619,26 +732,236 @@ async def _run_scene(self, scene: Scene) -> None: current_role = turn.role if turn.prompt: - prompts = [turn.prompt] + base_prompt = turn.prompt elif self._resolved_prompts: - prompts = [self._resolved_prompts[0]] + base_prompt = self._resolved_prompts[0] + else: + base_prompt = "Solve the task described in /app/instruction.md" + + pending = inbox.get(turn.role, []) + if pending: + parts = [base_prompt, "\n---\nMessages from other agents:\n"] + parts.extend(pending) + prompts = ["\n".join(parts)] + inbox[turn.role] = [] else: - prompts = ["Solve the task described in /app/instruction.md"] + prompts = [base_prompt] + await self.execute(prompts=prompts) + if multi_role: + for recipient, content in await self._read_scene_outbox(current_role): + turn_counter += 1 + inbox.setdefault(recipient, []).append( + f"**From {current_role}:** {content}" + ) + scene_messages.append({ + "scene": scene.name, + "turn": turn_counter, + "sender": current_role, + "recipient": recipient, + "content": content, + }) + if current_role is not None: await 
self.disconnect() + if scene_messages and self._trial_dir: + msg_path = self._trial_dir / "scene_messages.jsonl" + with msg_path.open("a") as f: + for m in scene_messages: + f.write(json.dumps(m) + "\n") + logger.info( + f"[Scene] {scene.name}: {len(scene_messages)} messages → {msg_path}" + ) + + async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: + """Read and clear outbox files left by *sender*. Returns [(recipient, content), ...].""" + result = await self._env.exec( + f"ls {self._OUTBOX_DIR}/*.json 2>/dev/null || true", timeout_sec=10, + ) + files = [f.strip() for f in (result.stdout or "").strip().splitlines() if f.strip()] + messages: list[tuple[str, str]] = [] + for fpath in files: + quoted = shlex.quote(fpath) + cat = await self._env.exec(f"cat {quoted}", timeout_sec=10) + try: + data = json.loads(cat.stdout or "{}") + recipient = data.get("to", "") + content = data.get("content", "") + if recipient and content: + messages.append((recipient, content)) + logger.info(f"[Scene] outbox: {sender} → {recipient}: {content[:80]}") + except json.JSONDecodeError: + logger.warning(f"[Scene] invalid JSON in outbox: {fpath}") + await self._env.exec(f"rm -f {quoted}", timeout_sec=10) + return messages + + async def _run_user_loop(self) -> None: + """Execute a user-driven progressive-disclosure loop. + + Each round: user.run() → connect → agent.execute() → disconnect → + soft_verify() → build RoundResult → repeat. Stops when user.run() + returns None or max_user_rounds is reached. + """ + cfg = self._config + user = cfg.user + assert user is not None + + if len(cfg.effective_scenes) > 1: + raise ValueError( + "User-driven loops operate on a single scene. " + f"Got {len(cfg.effective_scenes)} scenes." + ) + scene = cfg.effective_scenes[0] + if len(scene.roles) != 1: + raise ValueError( + "User-driven loops require a single-role scene. " + f"Got {len(scene.roles)} roles." 
+ ) + role = scene.roles[0] + + instruction = self._resolved_prompts[0] if self._resolved_prompts else ( + "Solve the task described in /app/instruction.md" + ) + + # Oracle access: read /solution before the agent runs, then remove it + solution: str | None = None + if cfg.oracle_access: + cat = await self._env.exec( + "cat /solution/solve.sh 2>/dev/null || true", + user="root", timeout_sec=10, + ) + solution = (cat.stdout or "").strip() or None + + await user.setup(instruction, solution) + + # Hide oracle files from agent — move rather than delete so the + # final verify() can still access /solution if the verifier needs it. + if cfg.oracle_access: + await self._env.exec( + "mv /solution /solution_oracle_backup 2>/dev/null || true", + user="root", timeout_sec=10, + ) + + round_result: RoundResult | None = None + rounds_log: list[dict] = [] + + for round_num in range(cfg.max_user_rounds): + try: + prompt = await user.run(round_num, instruction, round_result) + except Exception as e: + self._error = f"user.run() failed at round {round_num}: {e}" + logger.error(self._error, exc_info=True) + break + + if prompt is None: + logger.info(f"[User] stopped at round {round_num}") + break + + logger.info( + f"[User] round {round_num}: prompt={prompt[:80]!r}..." + if len(prompt) > 80 else f"[User] round {round_num}: prompt={prompt!r}" + ) + + # Fresh ACP session each round — agent starts clean but sees + # its previous workspace changes in the shared sandbox. + traj_before = len(self._trajectory) + try: + await self.connect_as(role) + await self.execute(prompts=[prompt]) + finally: + await self.disconnect() + + round_trajectory = self._trajectory[traj_before:] + round_tools = sum( + 1 for e in round_trajectory + if isinstance(e, dict) and e.get("type") == "tool_call" + ) + + # Soft verify: run tests after agent disconnected but before + # next round. Temporarily restore /solution so the verifier can + # access it, then re-hide before the next agent round. 
+ if cfg.oracle_access: + await self._env.exec( + "mv /solution_oracle_backup /solution 2>/dev/null || true", + user="root", timeout_sec=10, + ) + try: + rewards, verifier_output, verifier_error = await self.soft_verify() + finally: + if cfg.oracle_access: + await self._env.exec( + "mv /solution /solution_oracle_backup 2>/dev/null || true", + user="root", timeout_sec=10, + ) + + round_result = RoundResult( + round=round_num, + trajectory=round_trajectory, + rewards=rewards, + verifier_output=verifier_output, + verifier_error=verifier_error, + n_tool_calls=round_tools, + ) + + rounds_log.append({ + "round": round_num, + "prompt": prompt, + "rewards": rewards, + "verifier_error": verifier_error, + "n_tool_calls": round_tools, + "n_trajectory_events": len(round_trajectory), + }) + + logger.info( + f"[User] round {round_num} done: " + f"rewards={rewards}, tools={round_tools}" + ) + + # Persist round log + if rounds_log and self._trial_dir: + log_path = self._trial_dir / "user_rounds.jsonl" + with log_path.open("w") as f: + for entry in rounds_log: + f.write(json.dumps(entry) + "\n") + logger.info(f"[User] {len(rounds_log)} rounds → {log_path}") + async def connect_as(self, role: Role) -> None: - """Open an ACP connection for a specific role.""" + """Open an ACP connection for a specific role. + + Installs the role's agent binary and credentials if it differs + from the primary agent (which was set up in install_agent()). + Updates _agent_launch so disconnect() kills the correct process. + """ cfg = self._config t0 = datetime.now() + agent_launch = AGENT_LAUNCH.get(role.agent, role.agent) + # Merge cfg.agent_env (config-level) with role.env (role-specific) so + # provider creds from YAML reach the agent. role.env wins on overlap. 
+ agent_env = resolve_agent_env( + role.agent, role.model, + {**(cfg.agent_env or {}), **(role.env or {})}, + ) + + if role.agent != cfg.primary_agent: + agent_cfg = await install_agent(self._env, role.agent, self._trial_dir) + cred_home = f"/home/{cfg.sandbox_user}" if cfg.sandbox_user else "/root" + await write_credential_files( + self._env, role.agent, agent_env, + agent_cfg, role.model, cred_home, + ) + if agent_env.get("_BENCHFLOW_SUBSCRIPTION_AUTH"): + await upload_subscription_auth(self._env, role.agent, cred_home) + + self._agent_launch = agent_launch + self._acp_client, self._session, self._agent_name = await connect_acp( env=self._env, agent=role.agent, - agent_launch=AGENT_LAUNCH.get(role.agent, role.agent), - agent_env=resolve_agent_env(role.agent, role.model, role.env or None), + agent_launch=agent_launch, + agent_env=agent_env, sandbox_user=cfg.sandbox_user, model=role.model, trial_dir=self._trial_dir, diff --git a/src/benchflow/trial_yaml.py b/src/benchflow/trial_yaml.py index 5b8ca4ae..707978b1 100644 --- a/src/benchflow/trial_yaml.py +++ b/src/benchflow/trial_yaml.py @@ -103,6 +103,7 @@ def trial_config_from_dict( environment=raw.get("environment", "docker"), sandbox_user=raw.get("sandbox_user", "agent"), sandbox_locked_paths=raw.get("sandbox_locked_paths"), + sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120), job_name=raw.get("job_name"), trial_name=raw.get("trial_name"), jobs_dir=raw.get("jobs_dir", "jobs"), diff --git a/src/benchflow/user.py b/src/benchflow/user.py new file mode 100644 index 00000000..2c8fde47 --- /dev/null +++ b/src/benchflow/user.py @@ -0,0 +1,101 @@ +"""User abstraction for progressive-disclosure trial loops. + +A User is a Python callback that participates in the trial loop alongside the +agent. Each round: user.run() produces a prompt → agent executes → verifier +checks → user sees the result and decides what to say next (or stops). 
+ +This is distinct from multi-role scenes (PR #179) where multiple ACP agents +collaborate via outbox files. The User runs in the scheduler process, not in +the sandbox. +""" + +from __future__ import annotations + +import inspect +from collections.abc import Awaitable, Callable +from dataclasses import dataclass, field +from typing import Any, cast + + +@dataclass +class RoundResult: + """Outcome of one agent round, passed to User.run() for the next decision.""" + + round: int + trajectory: list[dict] = field(default_factory=list) + rewards: dict[str, Any] | None = None + verifier_output: str | None = None + verifier_error: str | None = None + n_tool_calls: int = 0 + + +class BaseUser: + """Abstract user that drives a progressive-disclosure trial loop. + + Subclass and implement ``run()`` to control what prompt the agent sees + each round. Return ``None`` from ``run()`` to stop the loop early. + """ + + async def setup(self, instruction: str, solution: str | None = None) -> None: + """Called once before the first round. + + ``instruction`` is the original task instruction (from instruction.md). + ``solution`` is the gold answer if oracle access is enabled, else None. + """ + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + """Produce the next prompt for the agent, or None to stop. + + ``round`` starts at 0. ``round_result`` is None on the first call + and contains the previous round's outcome on subsequent calls. + """ + raise NotImplementedError + + +class PassthroughUser(BaseUser): + """Sends the original instruction unchanged. Single-round, backward compatible.""" + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + if round == 0: + return instruction + return None + + +class FunctionUser(BaseUser): + """Wraps a plain function as a User — for lightweight one-off use. 
+ + The function signature matches ``BaseUser.run()``: + fn(round, instruction, round_result) -> str | None + + Both sync and async functions are supported. + """ + + def __init__( + self, + fn: Callable[ + [int, str, RoundResult | None], + str | None | Awaitable[str | None], + ], + ) -> None: + self._fn = fn + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + result = self._fn(round, instruction, round_result) + if inspect.isawaitable(result): + return cast(str | None, await result) + return cast(str | None, result) diff --git a/tests/conformance/proof_multi_agent.py b/tests/conformance/proof_multi_agent.py index 902d2ca3..9aa2729c 100644 --- a/tests/conformance/proof_multi_agent.py +++ b/tests/conformance/proof_multi_agent.py @@ -18,6 +18,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src")) +import contextlib + from harbor.models.task.task import Task from benchflow._acp_run import connect_acp, execute_prompts @@ -73,7 +75,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: trial_dir = Path(f"/tmp/multi-agent-proof/{role.name}") trial_dir.mkdir(parents=True, exist_ok=True) launch_cmd = AGENT_LAUNCH.get(role.agent, role.agent) - acp_client, session, agent_name = await connect_acp( + acp_client, session, _agent_name = await connect_acp( env=env, agent=role.agent, agent_launch=launch_cmd, @@ -85,7 +87,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: agent_cwd="/app", ) try: - trajectory, n_tools = await execute_prompts( + _trajectory, n_tools = await execute_prompts( acp_client, session, [prompt], @@ -93,10 +95,8 @@ async def role_runner(env, role: Role, prompt: str) -> None: ) logging.info(f"[{role.name}] finished: {n_tools} tool calls") finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass async def main() -> None: diff --git a/tests/conformance/run_conformance.py 
b/tests/conformance/run_conformance.py index b8aa5038..1f758a13 100644 --- a/tests/conformance/run_conformance.py +++ b/tests/conformance/run_conformance.py @@ -53,9 +53,7 @@ def has_creds(agent_name: str) -> bool: if any(os.environ.get(k) for k in keys): return True sub_file = SUBSCRIPTION_AUTH_FILES.get(agent_name) - if sub_file and Path(sub_file).expanduser().exists(): - return True - return False + return bool(sub_file and Path(sub_file).expanduser().exists()) async def run_one(agent_name: str) -> dict: diff --git a/tests/conftest.py b/tests/conftest.py index a3246522..6f5ee12f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,17 @@ """Test fixtures.""" import json +import sys from datetime import datetime from pathlib import Path import pytest REPO_ROOT = Path(__file__).parent.parent +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + REF_TASKS = REPO_ROOT / ".ref" / "harbor" / "examples" / "tasks" diff --git a/tests/test_acp.py b/tests/test_acp.py index e0f4729f..cea79c57 100644 --- a/tests/test_acp.py +++ b/tests/test_acp.py @@ -347,3 +347,31 @@ async def test_model_id_selection(self, model_in, expected_model, tmp_path): ) mock_acp.set_model.assert_awaited_once_with(expected_model) + + @pytest.mark.asyncio + async def test_openhands_skips_set_model(self, tmp_path): + from benchflow._acp_run import connect_acp + + mock_acp = self._make_mocks() + mock_env = AsyncMock() + with ( + patch( + "benchflow._acp_run.DockerProcess.from_harbor_env", + return_value=MagicMock(), + ), + patch("benchflow._acp_run.ContainerTransport", return_value=MagicMock()), + patch("benchflow._acp_run.ACPClient", return_value=mock_acp), + ): + await connect_acp( + env=mock_env, + agent="openhands", + agent_launch="openhands acp --always-approve --override-with-envs", + agent_env={}, + sandbox_user=None, + model="gemini-3.1-flash-lite-preview", + trial_dir=tmp_path, + environment="docker", + agent_cwd="/app", + ) + + 
mock_acp.set_model.assert_not_awaited() diff --git a/tests/test_agent_registry.py b/tests/test_agent_registry.py index 4fe59047..ab06cf15 100644 --- a/tests/test_agent_registry.py +++ b/tests/test_agent_registry.py @@ -5,6 +5,7 @@ test_registry_invariants.py — search there for the consolidated tripwire. """ +from benchflow._agent_env import resolve_provider_env from benchflow.agents.providers import PROVIDERS from benchflow.agents.registry import AGENTS @@ -33,6 +34,39 @@ def test_gemini_has_mapping(self): assert cfg.env_mapping["BENCHFLOW_PROVIDER_BASE_URL"] == "GEMINI_API_BASE_URL" assert cfg.env_mapping["BENCHFLOW_PROVIDER_API_KEY"] == "GOOGLE_API_KEY" + def test_openhands_has_mapping(self): + cfg = AGENTS["openhands"] + assert cfg.env_mapping["BENCHFLOW_PROVIDER_BASE_URL"] == "LLM_BASE_URL" + assert cfg.env_mapping["BENCHFLOW_PROVIDER_API_KEY"] == "LLM_API_KEY" + # OpenHands model is normalized in _normalize_openhands_model(). + assert "BENCHFLOW_PROVIDER_MODEL" not in cfg.env_mapping + + def test_openhands_normalizes_model(self): + env = {} + resolve_provider_env( + agent="openhands", + model="zai/glm-5", + agent_env=env, + ) + + assert env["LLM_MODEL"] == "glm-5" + +class TestOpenHandsConfig: + def test_openhands_uses_agentskills_paths(self): + cfg = AGENTS["openhands"] + assert "$HOME/.agents/skills" in cfg.skill_paths + assert "$WORKSPACE/.agents/skills" in cfg.skill_paths + + def test_openhands_install_cmd_has_uv_and_binary_fallbacks(self): + cfg = AGENTS["openhands"] + assert "apt-get install -y -qq curl ca-certificates" in cfg.install_cmd + assert "uv tool install openhands --python 3.12" in cfg.install_cmd + assert "install.openhands.dev/install.sh" in cfg.install_cmd + + def test_openhands_skips_acp_set_model(self): + cfg = AGENTS["openhands"] + assert cfg.supports_acp_set_model is False + class TestAgentCredentialFiles: def test_codex_has_auth_json(self): diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py new file mode 100644 index 
00000000..3b99ce64 --- /dev/null +++ b/tests/test_agent_setup.py @@ -0,0 +1,176 @@ +"""Tests for agent install and skill deployment setup helpers.""" + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._agent_setup import deploy_skills, install_agent +from benchflow.agents.registry import AgentConfig +from benchflow.models import AgentInstallError + + +def _make_task(skills_dir: str | None): + return SimpleNamespace( + config=SimpleNamespace( + environment=SimpleNamespace( + skills_dir=skills_dir, + ) + ) + ) + + +@pytest.mark.asyncio +async def test_deploy_skills_symlinks_agent_skill_paths_instead_of_copying(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + cmd = env.exec.await_args.args[0] + assert "cp -r" not in cmd + assert " && " in cmd + assert ";" not in cmd + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in cmd + assert "ln -sfn /opt/benchflow/skills /app/skills" in cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_uploads_runtime_skills_and_links_shared_tree(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + skills_dir = tmp_path / "skills" + skills_dir.mkdir() + + await deploy_skills( + env=env, + 
task_path=tmp_path, + skills_dir=skills_dir, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_awaited_once_with(skills_dir, "/skills") + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert " && " in distributed_link_cmd + assert ";" not in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" in distributed_link_cmd + assert "/root/.agents/skills" not in distributed_link_cmd + assert "/app/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_falls_back_when_local_skills_dir_is_missing(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=tmp_path / "missing-skills", + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /opt/benchflow/skills /workspace/skills" in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" not in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_raises_when_skill_linking_fails(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=17, stdout="link failed")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + 
name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills"], + ) + + with pytest.raises(RuntimeError, match="Failed to link skills"): + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + +@pytest.mark.asyncio +async def test_install_agent_writes_command_stdout_and_stderr_on_failure(tmp_path: Path): + env = SimpleNamespace() + env.exec = AsyncMock( + side_effect=[ + SimpleNamespace(return_code=1, stdout="", stderr="uv: command not found\n"), + SimpleNamespace( + stdout="OS:\nID=ubuntu\nNode:\nv22.0.0\nAgent:\nnot found\n", + stderr="", + return_code=0, + ), + ] + ) + + with pytest.raises(AgentInstallError) as exc_info: + await install_agent(env, "openhands", tmp_path) + + err = exc_info.value + log_path = Path(err.log_path) + assert log_path == tmp_path / "agent" / "install-stdout.txt" + assert log_path.exists() + log_text = log_path.read_text() + assert log_text.startswith("$ ") + assert "uv tool install openhands --python 3.12" in log_text + assert "=== stderr ===" in log_text + assert "uv: command not found" in log_text + assert err.stdout == log_text + assert "ID=ubuntu" in err.diagnostics diff --git a/tests/test_connect_as_env.py b/tests/test_connect_as_env.py new file mode 100644 index 00000000..97ab4232 --- /dev/null +++ b/tests/test_connect_as_env.py @@ -0,0 +1,140 @@ +"""Regression tests for connect_as() agent_env merging (issue #2). + +connect_as() must merge cfg.agent_env (config-level) with role.env +(role-level), with role-level keys winning on overlap. Before the fix, +role.env={} was truthy so resolve_agent_env received an empty dict, +discarding all config-level env vars. 
+""" + +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, TrialConfig + + +def _make_config(agent_env=None, role_env=None): + """Build a minimal TrialConfig with one scene.""" + role = Role(name="agent", agent="claude-agent-acp", model="test-model") + if role_env is not None: + role = Role( + name="agent", agent="claude-agent-acp", model="test-model", env=role_env + ) + scene = Scene(roles=[role]) + return TrialConfig( + task_path=Path("/fake/task"), + scenes=[scene], + agent_env=agent_env, + ) + + +class TestConnectAsEnvMerge: + """Verify connect_as() merges cfg.agent_env with role.env correctly.""" + + @pytest.fixture() + def _mock_trial(self, tmp_path): + """Return a Trial stub wired to capture the agent_env passed to connect_acp.""" + from benchflow.trial import Trial + + cfg = _make_config( + agent_env={"BENCHFLOW_PROVIDER_BASE_URL": "http://localhost:8080/v1"}, + ) + trial = Trial.__new__(Trial) + trial._config = cfg + trial._env = {} + trial._trial_dir = tmp_path + trial._timing = {} + trial._agent_cwd = None + trial._phase = "idle" + return trial + + @pytest.mark.asyncio + async def test_config_env_propagated_through_empty_role_env(self, _mock_trial): + """cfg.agent_env vars reach resolve_agent_env when role.env is {}.""" + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + assert "BENCHFLOW_PROVIDER_BASE_URL" in captured["env"] + assert captured["env"]["BENCHFLOW_PROVIDER_BASE_URL"] == "http://localhost:8080/v1" + + @pytest.mark.asyncio + async def test_role_env_overrides_config_env(self, _mock_trial): + """Role-level 
env wins over config-level on key overlap.""" + _mock_trial._config = _make_config( + agent_env={"KEY": "from-config", "SHARED": "config-val"}, + role_env={"SHARED": "role-val", "ROLE_ONLY": "yes"}, + ) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + env = captured["env"] + assert env["KEY"] == "from-config" + assert env["SHARED"] == "role-val" + assert env["ROLE_ONLY"] == "yes" + + @pytest.mark.asyncio + async def test_all_keys_present_in_merge(self, _mock_trial): + """Non-overlapping keys from both dicts are all present.""" + _mock_trial._config = _make_config( + agent_env={"A": "1", "B": "2"}, + role_env={"C": "3", "D": "4"}, + ) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + env = captured["env"] + assert env == {"A": "1", "B": "2", "C": "3", "D": "4"} + + @pytest.mark.asyncio + async def test_none_config_env_with_empty_role_env(self, _mock_trial): + """cfg.agent_env=None + empty role.env does not crash.""" + _mock_trial._config = _make_config(agent_env=None, role_env={}) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + 
mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + assert captured["env"] == {} diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 142c0634..a61333da 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -21,11 +21,9 @@ from __future__ import annotations -import importlib from pathlib import Path from unittest.mock import AsyncMock, patch -import pytest from typer.testing import CliRunner diff --git a/tests/test_process.py b/tests/test_process.py index c604a580..68ea0a0c 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -202,3 +202,99 @@ async def capture_communicate(data=None): assert ( f"export SINGLE_QUOTE={shlex.quote('it' + chr(39) + 's a test')}" in content ) + + +class TestDaytonaProcessEnvFilePath: + """Regression: env-file path must be unique without relying on shell `$$` expansion. + + Guards the fix from PR #198 against the regression introduced by PR #193 + (DinD compose ACP via Daytona PTY WebSocket, commit cdccac7). + + The DinD branch builds an inner `docker compose exec --env-file PATH ...` + command and runs it through `shlex.join()`, which single-quotes any `$$` + (preventing remote shell expansion). The `cat > PATH` heredoc that writes + the file uses raw f-string interpolation where `$$` IS expanded. If the + path contains `$$`, the file is written to one path and read from another + — env vars silently disappear. + + The direct (non-DinD) branch uses raw f-string in both write and read, so + `$$` would expand consistently — but uuid is robust against future quoting + changes. Pin both branches. 
+ """ + + @pytest.mark.asyncio + async def test_dind_env_file_path_does_not_use_shell_pid_expansion(self): + """DinD path must not use $$ — shlex.join would quote it literally.""" + from unittest.mock import MagicMock + + from benchflow.process import DaytonaProcess + + sandbox = MagicMock() + sandbox.create_ssh_access = AsyncMock( + return_value=MagicMock(token="abc") + ) + proc = DaytonaProcess( + sandbox=sandbox, + is_dind=True, + compose_cmd_prefix="", + compose_cmd_base="docker compose -p test", + ) + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.append(list(args)) + mock_proc = AsyncMock() + mock_proc.pid = 12345 + mock_proc.returncode = 0 + mock_proc.stdin = AsyncMock() + mock_proc.stdout = AsyncMock() + mock_proc.stderr = AsyncMock() + mock_proc.communicate = AsyncMock(return_value=(b"", b"")) + return mock_proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + await proc.start(command="echo hi", env={"FOO": "bar"}) + + # Last arg of ssh is the remote command. Search it for $$ + remote_cmd = captured[0][-1] + assert "$$" not in remote_cmd, ( + "$$ in remote command — shlex.join() will quote it, mismatching " + f"the cat heredoc that does expand it. 
Got: {remote_cmd[:200]!r}" + ) + # And: a real path was used (literal hex suffix, no shell variable) + assert "/tmp/benchflow_env_" in remote_cmd + assert "--env-file" in remote_cmd + + @pytest.mark.asyncio + async def test_direct_sandbox_env_file_path_does_not_use_shell_pid_expansion(self): + """Direct (non-DinD) path is currently safe with $$, but pin the uuid form for robustness.""" + from unittest.mock import MagicMock + + from benchflow.process import DaytonaProcess + + sandbox = MagicMock() + sandbox.create_ssh_access = AsyncMock( + return_value=MagicMock(token="abc") + ) + proc = DaytonaProcess(sandbox=sandbox, is_dind=False) + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.append(list(args)) + mock_proc = AsyncMock() + mock_proc.pid = 12345 + mock_proc.returncode = 0 + mock_proc.stdin = AsyncMock() + mock_proc.stdout = AsyncMock() + mock_proc.stderr = AsyncMock() + mock_proc.communicate = AsyncMock(return_value=(b"", b"")) + return mock_proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + await proc.start(command="echo hi", env={"FOO": "bar"}) + + remote_cmd = captured[0][-1] + assert "$$" not in remote_cmd + assert "/tmp/benchflow_env_" in remote_cmd diff --git a/tests/test_registry_invariants.py b/tests/test_registry_invariants.py index 023945bc..868ffb3c 100644 --- a/tests/test_registry_invariants.py +++ b/tests/test_registry_invariants.py @@ -49,6 +49,7 @@ def test_agent_field_shapes(name, cfg): f"api_protocol={cfg.api_protocol!r} not in {VALID_API_PROTOCOLS}" ) assert isinstance(cfg.install_timeout, int) and cfg.install_timeout > 0 + assert isinstance(cfg.supports_acp_set_model, bool) @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) @@ -75,6 +76,22 @@ def test_agent_collection_invariants(name, cfg): assert d.startswith("."), f"home_dirs entry {d!r} must start with '.'" +@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) +def 
test_agent_install_cmd_targets_shared_paths(name, cfg): + """Installed binaries must land in shared prefixes, not a root-only home. + + setup_sandbox_user() no longer recursively copies /root/.nvm or + /root/.local/bin into the sandbox home. If an install_cmd placed its + binary there, the sandbox user would silently lose access to the agent. + """ + forbidden_binary_prefixes = ("/root/.nvm/", "/root/.local/bin/", "$HOME/.nvm/") + for prefix in forbidden_binary_prefixes: + assert prefix not in cfg.install_cmd, ( + f"{name!r} install_cmd writes under {prefix!r}; use /usr/local/bin " + f"or another shared prefix so the sandbox user inherits the tool" + ) + + @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) def test_agent_credential_and_subscription_auth(name, cfg): """Optional credential_files and subscription_auth structures.""" diff --git a/tests/test_rewards_jsonl.py b/tests/test_rewards_jsonl.py index e81f3164..bbc029df 100644 --- a/tests/test_rewards_jsonl.py +++ b/tests/test_rewards_jsonl.py @@ -93,7 +93,7 @@ def test_rubric_plus_terminal_no_rubric_in_meta(tmp_path: Path) -> None: json.loads(ln) for ln in (tmp_path / "rewards.jsonl").read_text().strip().splitlines() ] - terminal = [ln for ln in lines if ln["type"] == "terminal"][0] + terminal = next(ln for ln in lines if ln["type"] == "terminal") assert "rubric" not in terminal["meta"] diff --git a/tests/test_runtime.py b/tests/test_runtime.py index 16046fed..37819f71 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -36,6 +36,7 @@ def test_agent_env_default_empty() -> None: def test_runtime_config_defaults() -> None: c = RuntimeConfig() assert c.sandbox_user == "agent" + assert c.sandbox_setup_timeout == 120 assert c.max_rounds == 10 assert c.snapshot_policy == "none" assert c.reward_stream is True @@ -143,9 +144,10 @@ def test_runtime_custom_config() -> None: if not (task_path / "task.toml").exists(): return env = Environment.from_task(task_path, backend="daytona") 
- config = RuntimeConfig(sandbox_user=None, timeout=1800) + config = RuntimeConfig(sandbox_user=None, timeout=1800, sandbox_setup_timeout=45) runtime = Runtime(env, agent, config) assert runtime.config.sandbox_user is None + assert runtime.config.sandbox_setup_timeout == 45 assert runtime.config.timeout == 1800 diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 4c7f9f2f..36a8c805 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -1,23 +1,21 @@ -"""Tests for sandbox user directory derivation from agent registry.""" +"""Tests for sandbox user config/auth directory derivation from agent registry.""" from benchflow.agents.registry import ( AGENTS, AgentConfig, + HostAuthFile, + SubscriptionAuth, get_sandbox_home_dirs, ) class TestSandboxDirs: - def test_dirs_derived_from_skill_paths(self): - """All $HOME skill_paths dirs from AGENTS registry are included.""" + def test_skill_only_dirs_not_included(self): + """Skill-only home dirs should not be copied during sandbox user setup.""" dirs = get_sandbox_home_dirs() - # claude-agent-acp has $HOME/.claude/skills - assert ".claude" in dirs - # gemini has $HOME/.gemini/skills - assert ".gemini" in dirs - # pi-acp has $HOME/.pi/agent/skills and $HOME/.agents/skills - assert ".pi" in dirs - assert ".agents" in dirs + + assert ".pi" not in dirs + assert ".agents" not in dirs def test_credential_file_dirs_included(self): """Dirs from credential_files paths are included.""" @@ -31,13 +29,25 @@ def test_home_dirs_included(self): # openclaw has home_dirs=[".openclaw"] assert ".openclaw" in dirs - def test_always_includes_local(self): - """.local is always in the dir list.""" + def test_does_not_include_legacy_local_tool_dir(self): + """.local is not included unless an agent registry path derives it.""" + dirs = get_sandbox_home_dirs() + assert ".local" not in dirs + + def test_only_includes_top_level_home_dirs(self): + """Derived entries stay at $HOME top-level, not nested tool subpaths.""" dirs = 
get_sandbox_home_dirs() - assert ".local" in dirs + assert ".local/bin" not in dirs - def test_new_agent_auto_included(self): - """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" + def test_dirs_represent_registry_backed_home_config_or_auth(self): + """Returned dirs are registry-derived user home config/auth roots.""" + dirs = get_sandbox_home_dirs() + assert {".claude", ".codex", ".gemini", ".openclaw"}.issubset(dirs) + assert ".agents" not in dirs + assert ".pi" not in dirs + + def test_new_agent_skill_path_not_auto_included(self): + """Skill-only home dirs should not become sandbox copy targets.""" AGENTS["_test_agent"] = AgentConfig( name="_test_agent", install_cmd="true", @@ -46,10 +56,33 @@ def test_new_agent_auto_included(self): ) try: dirs = get_sandbox_home_dirs() - assert ".newagent" in dirs + assert ".newagent" not in dirs finally: del AGENTS["_test_agent"] + def test_subscription_auth_file_dirs_included(self): + """Dirs from subscription_auth.files container paths are included.""" + AGENTS["_test_agent_subscription_auth"] = AgentConfig( + name="_test_agent_subscription_auth", + install_cmd="true", + launch_cmd="true", + subscription_auth=SubscriptionAuth( + replaces_env="TEST_API_KEY", + detect_file="~/.subauth/login.json", + files=[ + HostAuthFile( + "~/.subauth/login.json", + "{home}/.subauth/login.json", + ) + ], + ), + ) + try: + dirs = get_sandbox_home_dirs() + assert ".subauth" in dirs + finally: + del AGENTS["_test_agent_subscription_auth"] + def test_workspace_paths_excluded(self): """$WORKSPACE paths are not included (only $HOME paths).""" dirs = get_sandbox_home_dirs() diff --git a/tests/test_sandbox_hardening.py b/tests/test_sandbox_hardening.py index c12b9574..6396c4ee 100644 --- a/tests/test_sandbox_hardening.py +++ b/tests/test_sandbox_hardening.py @@ -152,7 +152,7 @@ async def test_with_sandbox_user(self, harness): assert "sitecustomize.py" in cleanup_cmd and ".pth" in cleanup_cmd assert "-not -path 
'/tests/*'" in cleanup_cmd injected = task.config.verifier.env - assert "--rootdir=/tests" in injected["PYTEST_ADDOPTS"] + assert "--rootdir=/app" in injected["PYTEST_ADDOPTS"] assert "-p no:cacheprovider" in injected["PYTEST_ADDOPTS"] assert injected["PYTHONPATH"] == "" assert "PYTHONHOME" not in injected # breaks Py_Initialize if set to "" @@ -171,7 +171,7 @@ async def test_without_sandbox_user(self, harness): assert all("pkill" not in c for c in cmds) assert any("conftest.py" in c for c in cmds) addopts = task.config.verifier.env["PYTEST_ADDOPTS"] - assert "--rootdir=/tests" in addopts + assert "--rootdir=/app" in addopts assert "-p no:cacheprovider" in addopts @pytest.mark.asyncio @@ -629,7 +629,7 @@ def test_env_contract(self): assert "-c /dev/null" in addopts assert "--confcutdir=/tests" in addopts - assert "--rootdir=/tests" in addopts + assert "--rootdir=/app" in addopts assert "-p no:cacheprovider" in addopts assert ( "PYTHONSAFEPATH" not in VERIFIER_ENV @@ -662,34 +662,41 @@ async def test_distro_pip_env_fedora(self): ) assert await _distro_pip_env(env) == {"PIP_PREFIX": "/usr/local"} - def test_pip_installed_pytest_plugin_inferred(self, tmp_path): - """`pip install pytest-asyncio` in test.sh auto-loads pytest_asyncio.""" - from benchflow._sandbox import _plugins_from_verifier_script + @pytest.mark.asyncio + async def test_container_plugin_discovery_merged_into_addopts(self): + """Plugins discovered from root-owned container packages appear as -p flags.""" + from benchflow._sandbox import harden_before_verify - task_dir = tmp_path / "task" - (task_dir / "tests").mkdir(parents=True) - (task_dir / "tests" / "test.sh").write_text( - "pip3 install --break-system-packages pytest==8.3.4 pytest-asyncio==0.24.0\n" - "pytest /tests/test_perf.py\n" - ) - task = MagicMock() - task.task_dir = str(task_dir) - plugins = _plugins_from_verifier_script(task) - assert "pytest_asyncio" in plugins + def side_effect(cmd, **kwargs): + if "_DISCOVER_PYTEST" in str(cmd) or 
"importlib.metadata" in str(cmd): + return MagicMock( + stdout='["benchmark", "xdist"]', stderr="", exit_code=0 + ) + return MagicMock(stdout="", stderr="", exit_code=0) - def test_pip_install_unrelated_mention_not_inferred(self, tmp_path): - """A bare mention of 'pytest-asyncio' in a comment must not auto-load it.""" - from benchflow._sandbox import _plugins_from_verifier_script + env = _make_env(side_effect=side_effect) + task = _make_task() + await harden_before_verify(env, task, sandbox_user=None) - task_dir = tmp_path / "task" - (task_dir / "tests").mkdir(parents=True) - (task_dir / "tests" / "test.sh").write_text( - "# tests using pytest-asyncio decorators (already installed in image)\n" - "pytest /tests/test_perf.py\n" - ) - task = MagicMock() - task.task_dir = str(task_dir) - assert _plugins_from_verifier_script(task) == [] + addopts = task.config.verifier.env["PYTEST_ADDOPTS"] + assert "-p benchmark" in addopts + assert "-p xdist" in addopts + + @pytest.mark.asyncio + async def test_plugin_discovery_failure_graceful(self): + """If container-side discovery fails, hardening proceeds without extra plugins.""" + from benchflow._sandbox import VERIFIER_ENV, harden_before_verify + + def side_effect(cmd, **kwargs): + if "importlib.metadata" in str(cmd): + raise RuntimeError("no python3") + return MagicMock(stdout="", stderr="", exit_code=0) + + env = _make_env(side_effect=side_effect) + task = _make_task() + await harden_before_verify(env, task, sandbox_user=None) + + assert task.config.verifier.env["PYTEST_ADDOPTS"] == VERIFIER_ENV["PYTEST_ADDOPTS"] @pytest.mark.asyncio async def test_distro_pip_env_ubuntu(self): @@ -831,77 +838,20 @@ async def test_plugin_autoload_disabled_survives_task_env_override(self): @pytest.mark.asyncio async def test_per_task_plugins_appended_to_addopts(self): - """pytest_plugins are translated to -p flags; PYTEST_DISABLE_PLUGIN_AUTOLOAD must survive.""" + """pytest_plugins from task.toml are translated to -p flags.""" from 
benchflow._sandbox import harden_before_verify env = _make_env(side_effect=_manifest_env(_blank_manifest())) task = _make_task() - task.config.verifier.pytest_plugins = ["pytest-json-ctrf", "myplug"] + task.config.verifier.pytest_plugins = ["ctrf", "myplug"] await harden_before_verify(env, task, sandbox_user=None, workspace=None) final_env = task.config.verifier.env addopts = final_env.get("PYTEST_ADDOPTS", "") assert "-p ctrf" in addopts assert "-p myplug" in addopts - # The security flag must still be present after the plugin flags are appended. assert final_env.get("PYTEST_DISABLE_PLUGIN_AUTOLOAD") == "1" - @pytest.mark.asyncio - async def test_legacy_ctrf_script_gets_allowlisted_plugin(self, tmp_path): - """Legacy verifier scripts using --ctrf get -p ctrf without pytest autoload.""" - from benchflow._sandbox import harden_before_verify - - tests_dir = tmp_path / "tests" - tests_dir.mkdir() - (tests_dir / "test.sh").write_text( - "uvx --with pytest-json-ctrf==0.3.5 " - "pytest --ctrf /logs/verifier/ctrf.json /tests/test_outputs.py\n" - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - final_env = task.config.verifier.env - assert "-p ctrf" in final_env["PYTEST_ADDOPTS"] - assert final_env.get("PYTEST_DISABLE_PLUGIN_AUTOLOAD") == "1" - - @pytest.mark.asyncio - async def test_legacy_json_report_script_gets_allowlisted_plugin(self, tmp_path): - """Legacy verifier scripts using --json-report get the right import name.""" - from benchflow._sandbox import harden_before_verify - - tests_dir = tmp_path / "tests" - tests_dir.mkdir() - (tests_dir / "test.sh").write_text( - "pytest --json-report " - "--json-report-file=/logs/verifier/report.json /tests/test_outputs.py\n" - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - await harden_before_verify(env, task, 
sandbox_user=None, workspace=None) - - assert "-p pytest_jsonreport" in task.config.verifier.env["PYTEST_ADDOPTS"] - - @pytest.mark.asyncio - async def test_task_toml_pytest_plugins_fallback(self, tmp_path): - """Raw task.toml pytest_plugins work even if Harbor ignores the extra field.""" - from benchflow._sandbox import harden_before_verify - - (tmp_path / "task.toml").write_text( - '[verifier]\npytest_plugins = ["pytest_json_ctrf", "pytest-json-report"]\n' - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - task.config.verifier.pytest_plugins = None - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - addopts = task.config.verifier.env["PYTEST_ADDOPTS"] - assert "-p ctrf" in addopts - assert "-p pytest_jsonreport" in addopts - @pytest.mark.asyncio @pytest.mark.parametrize("plugins", [None, []]) async def test_no_extra_addopts_when_no_plugins(self, plugins): @@ -964,7 +914,7 @@ async def test_pytest_addopts_task_override_with_plugins(self): env = _make_env() task = _make_task() task.config.verifier.env = {"PYTEST_ADDOPTS": "--rootdir=/evil"} - task.config.verifier.pytest_plugins = ["pytest-json-ctrf"] + task.config.verifier.pytest_plugins = ["ctrf"] await harden_before_verify(env, task, sandbox_user=None) addopts = task.config.verifier.env["PYTEST_ADDOPTS"] @@ -1163,22 +1113,83 @@ def test_devnull_blocks_hostile_pyproject(self, tmp_path): ) -class TestSandboxFailureModes: - """Recovery paths when untrusted inputs (task.toml, PATH extras) are malformed.""" +class TestHardeningOptOuts: + """Per-task [verifier.hardening] opt-outs from task.toml.""" + + def test_defaults_when_no_task_dir(self): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config - def test_harden_before_verify_survives_manifest_read_failure(self, tmp_path): - """Truncated task.toml must not propagate TOMLDecodeError; treat as no plugins.""" - from benchflow._sandbox import 
_declared_pytest_plugins + assert _read_hardening_config(None) == HARDENING_DEFAULTS + + def test_defaults_when_no_hardening_section(self, tmp_path): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config - # Truncated inline-table → tomllib.TOMLDecodeError (tmp_path / "task.toml").write_text( - "[verifier]\npytest_plugins = [\n", + "[verifier]\ntimeout_sec = 60\n" ) - task = _make_task() - task.task_dir = str(tmp_path) - task.config.verifier.pytest_plugins = None + assert _read_hardening_config(tmp_path) == HARDENING_DEFAULTS + + def test_opt_out_cleanup_conftests(self, tmp_path): + from benchflow._sandbox import _read_hardening_config - assert _declared_pytest_plugins(task) == [] + (tmp_path / "task.toml").write_text( + "[verifier.hardening]\ncleanup_conftests = false\n" + ) + cfg = _read_hardening_config(tmp_path) + assert cfg["cleanup_conftests"] is False + + def test_unknown_key_logged_not_applied(self, tmp_path, caplog): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config + + (tmp_path / "task.toml").write_text( + "[verifier.hardening]\nbogus_flag = true\n" + ) + cfg = _read_hardening_config(tmp_path) + assert cfg == HARDENING_DEFAULTS # bogus key ignored + assert any("bogus_flag" in r.message for r in caplog.records) + + def test_invalid_value_type_rejected(self, tmp_path, caplog): + from benchflow._sandbox import _read_hardening_config + + (tmp_path / "task.toml").write_text( + '[verifier.hardening]\ncleanup_conftests = "false"\n' + ) + cfg = _read_hardening_config(tmp_path) + # String "false" is not bool — rejected, default applied + assert cfg["cleanup_conftests"] is True + + def test_build_cleanup_includes_conftest_by_default(self): + from benchflow._sandbox import _build_cleanup_cmd + + cmd = _build_cleanup_cmd() + assert "find / -name conftest.py" in cmd + assert "find /tmp /var/tmp" in cmd + assert "sitecustomize.py" in cmd + + def test_build_cleanup_skips_conftest_when_disabled(self): + from 
benchflow._sandbox import _build_cleanup_cmd + + cmd = _build_cleanup_cmd({"cleanup_conftests": False}) + assert "find / -name conftest.py" not in cmd + # Other steps still run + assert "find /tmp /var/tmp" in cmd + assert "sitecustomize.py" in cmd + + +class TestSandboxFailureModes: + """Recovery paths when untrusted inputs (task.toml, PATH extras) are malformed.""" + + @pytest.mark.asyncio + async def test_plugin_discovery_bad_json_graceful(self): + """Malformed JSON from container plugin discovery falls back gracefully.""" + from benchflow._sandbox import _discover_pytest_plugin_flags + + env = _make_env(side_effect=lambda cmd, **kw: MagicMock( + stdout="not valid json", stderr="", exit_code=0 + )) + task = _make_task() + flags = await _discover_pytest_plugin_flags(env, task) + assert flags == "" @pytest.mark.asyncio async def test_trusted_path_extras_malformed_json_falls_back(self): diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py new file mode 100644 index 00000000..f81b392e --- /dev/null +++ b/tests/test_sandbox_setup.py @@ -0,0 +1,106 @@ +"""Focused contract tests for setup_sandbox_user() shell command generation.""" + +import re +import shlex +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._sandbox import setup_sandbox_user +from benchflow.agents.registry import get_sandbox_home_dirs + + +async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(stdout="", stderr="", exit_code=0)) + + await setup_sandbox_user(env, sandbox_user, workspace) + + env.exec.assert_awaited_once() + return env.exec.call_args.args[0], env.exec.call_args.kwargs + + +def _assert_conditional_legacy_symlink(cmd: str, *, source: str, dest: str) -> None: + """Legacy tool dirs should link only when a root-only install exists.""" + assert re.search( + rf"if \[ -e [\"']?{re.escape(source)}[\"']? 
\].*ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?.*fi", + cmd, + ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" + + +def _get_copy_loop_dirs(cmd: str) -> list[str]: + """Extract the general home-dir copy loop payload from the shell command.""" + match = re.search(r"for d in (?P.*?); do", cmd) + assert match, f"expected general home-dir copy loop in setup command: {cmd}" + return match.group("dirs").split() + + +class TestSetupSandboxUser: + @pytest.mark.asyncio + async def test_setup_command_avoids_recursive_root_tool_copies(self): + """Heavy root-owned tool dirs should no longer be recursively copied.""" + cmd, kwargs = await _run_setup_sandbox_user() + + assert "cp -aL /root/.local/bin/." not in cmd + assert "cp -a /root/.nvm/." not in cmd + assert kwargs["timeout_sec"] == 120 + + @pytest.mark.asyncio + async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspace(self): + """The non-copy setup contract still creates the user and grants access.""" + cmd, _ = await _run_setup_sandbox_user() + + assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd + assert "mkdir -p /home/agent/.local/bin" not in cmd + assert "chown -R agent:agent /home/agent" in cmd + assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd + + @pytest.mark.asyncio + async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): + """Legacy root-only tool dirs should use conditional symlinks, not duplication.""" + cmd, _ = await _run_setup_sandbox_user() + + _assert_conditional_legacy_symlink( + cmd, + source="/root/.local/bin", + dest="/home/agent/.local/bin", + ) + _assert_conditional_legacy_symlink( + cmd, source="/root/.nvm", dest="/home/agent/.nvm" + ) + assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd + assert "cp -a /root/.nvm/. 
/home/agent/.nvm/" not in cmd + + @pytest.mark.asyncio + async def test_setup_command_copy_loop_excludes_local_dir(self): + """General home-dir copying should narrow to small config/auth dirs only.""" + cmd, _ = await _run_setup_sandbox_user() + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + + assert copy_loop_dirs == sorted( + d for d in get_sandbox_home_dirs() if d != ".local" + ) + assert ".local" not in copy_loop_dirs + assert "mkdir -p /home/agent/$d" in cmd + assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd + + @pytest.mark.asyncio + async def test_setup_command_does_not_copy_heavy_tool_trees_into_home(self): + """BenchFlow-installed agents must not rely on sandbox-home tool copies. + + Agent binaries are placed in /usr/local/bin by the registered install_cmd + values, so setup_sandbox_user() must not bulk-copy heavyweight tool trees + (e.g. /root/.nvm, /root/.local/bin) to make them executable for the user. + """ + cmd, _ = await _run_setup_sandbox_user() + + for heavy_source in ("/root/.nvm", "/root/.local/bin"): + assert f"cp -a {heavy_source}/." not in cmd + assert f"cp -aL {heavy_source}/." not in cmd + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + for heavy_dir in (".nvm", ".local"): + assert heavy_dir not in copy_loop_dirs, ( + f"sandbox copy loop must not include heavy tool dir {heavy_dir!r}" + ) diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py new file mode 100644 index 00000000..5b6f07ad --- /dev/null +++ b/tests/test_scene_outbox_trial.py @@ -0,0 +1,341 @@ +"""Tests for outbox-based inter-role messaging in Trial._run_scene(). + +Verifies that when bf.run(TrialConfig) executes a multi-role Scene, +outbox files written by one role are read and injected into the next +role's prompt — bridging the _scene.py outbox convention with the +Trial lifecycle. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import AsyncMock + +import pytest + +from benchflow.trial import Role, Scene, Trial, TrialConfig, Turn + + +@dataclass +class FakeExecResult: + stdout: str = "" + stderr: str = "" + return_code: int = 0 + + +class FakeEnv: + """Minimal sandbox mock that tracks outbox files.""" + + def __init__(self) -> None: + self._files: dict[str, str] = {} + self._exec_log: list[str] = [] + + async def exec(self, cmd: str, **kwargs) -> FakeExecResult: + self._exec_log.append(cmd) + if "rm -rf /app/.outbox" in cmd: + self._files = {k: v for k, v in self._files.items() + if not k.startswith("/app/.outbox/")} + return FakeExecResult() + if "ls /app/.outbox/" in cmd: + files = [f for f in self._files if f.startswith("/app/.outbox/")] + return FakeExecResult(stdout="\n".join(files)) + if cmd.startswith("cat "): + path = cmd.split(" ", 1)[1] + return FakeExecResult(stdout=self._files.get(path, "{}")) + if cmd.startswith("rm -f "): + path = cmd.split()[-1] + self._files.pop(path, None) + return FakeExecResult() + return FakeExecResult() + + def stage_outbox(self, recipient: str, content: str) -> None: + self._files[f"/app/.outbox/{recipient}.json"] = json.dumps( + {"to": recipient, "content": content} + ) + + +def _make_trial(scene: Scene) -> Trial: + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[scene], + environment="docker", + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task"] + return trial + + +@pytest.fixture +def coder_reviewer_scene() -> Scene: + return Scene( + name="code-review", + roles=[ + Role("coder", "gemini", "gemini-3.1-flash-lite-preview"), + Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"), + ], + turns=[ + Turn("coder"), + Turn("reviewer", "Review the code. 
Write feedback to /app/.outbox/coder.json"), + Turn("coder", "Read feedback and fix issues."), + ], + ) + + +@pytest.fixture +def self_review_scene() -> Scene: + return Scene( + name="self-review", + roles=[Role("agent", "gemini", "gemini-3.1-flash-lite-preview")], + turns=[ + Turn("agent"), + Turn("agent", "Review your solution and fix edge cases."), + ], + ) + + +async def test_outbox_setup_for_multi_role(coder_reviewer_scene: Scene) -> None: + """Multi-role scenes set up /app/.outbox with correct ownership.""" + trial = _make_trial(coder_reviewer_scene) + prompts_received: list[str] = [] + + async def fake_execute(prompts=None): + prompts_received.extend(prompts or []) + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + outbox_setup = [c for c in trial._env._exec_log if "mkdir -p /app/.outbox" in c] + assert len(outbox_setup) == 1 + assert "chown agent:agent /app/.outbox" in outbox_setup[0] + + +async def test_no_outbox_setup_for_single_role(self_review_scene: Scene) -> None: + """Single-role scenes skip outbox setup (no inter-role messaging needed).""" + trial = _make_trial(self_review_scene) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(self_review_scene) + + outbox_cmds = [c for c in trial._env._exec_log if "outbox" in c] + assert len(outbox_cmds) == 0 + + +async def test_outbox_messages_injected_into_prompt(coder_reviewer_scene: Scene) -> None: + """Outbox messages from coder are injected into reviewer's prompt.""" + trial = _make_trial(coder_reviewer_scene) + prompts_received: list[tuple[str, list[str]]] = [] + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + # Track which role got which prompt + role = coder_reviewer_scene.turns[call_count].role + prompts_received.append((role, 
prompts or [])) + # Coder writes to reviewer outbox on first turn + if call_count == 0: + trial._env.stage_outbox("reviewer", "Please review my regex implementation") + # Reviewer writes feedback to coder outbox on second turn + elif call_count == 1: + trial._env.stage_outbox("coder", "Edge case: empty string input not handled") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + assert len(prompts_received) == 3 + + # Turn 0: coder gets base prompt (no messages yet) + assert prompts_received[0][0] == "coder" + assert "Messages from other agents" not in prompts_received[0][1][0] + + # Turn 1: reviewer gets its prompt + coder's outbox message + assert prompts_received[1][0] == "reviewer" + assert "Please review my regex implementation" in prompts_received[1][1][0] + assert "From coder" in prompts_received[1][1][0] + + # Turn 2: coder gets its prompt + reviewer's feedback + assert prompts_received[2][0] == "coder" + assert "Edge case: empty string input not handled" in prompts_received[2][1][0] + assert "From reviewer" in prompts_received[2][1][0] + + +async def test_outbox_files_cleared_after_read(coder_reviewer_scene: Scene) -> None: + """Outbox files are removed after reading so they don't repeat.""" + trial = _make_trial(coder_reviewer_scene) + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env.stage_outbox("reviewer", "msg1") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + remaining = [f for f in trial._env._files if f.startswith("/app/.outbox/")] + assert len(remaining) == 0 + + +async def test_outbox_invalid_json_skipped(coder_reviewer_scene: Scene) -> None: + """Invalid JSON in outbox files is skipped without crashing.""" + trial = 
_make_trial(coder_reviewer_scene) + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env._files["/app/.outbox/reviewer.json"] = "not valid json{{" + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + # Should not raise + await trial._run_scene(coder_reviewer_scene) + assert call_count == 3 + + +async def test_role_switching_connects_and_disconnects(coder_reviewer_scene: Scene) -> None: + """Verify connect/disconnect happens on role switches.""" + trial = _make_trial(coder_reviewer_scene) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + # 3 turns: coder, reviewer, coder → 2 connect_as calls for role switches + 1 initial + # Initial connect for coder, then disconnect+connect for reviewer, then disconnect+connect for coder + assert trial.connect_as.call_count == 3 + # disconnect after coder->reviewer, after reviewer->coder, and final disconnect + assert trial.disconnect.call_count == 3 + + +async def test_empty_outbox_no_injection() -> None: + """When no outbox files exist, prompt is used as-is.""" + scene = Scene( + name="quiet", + roles=[ + Role("a", "gemini", "flash"), + Role("b", "gemini", "flash"), + ], + turns=[Turn("a", "do stuff"), Turn("b", "also do stuff")], + ) + trial = _make_trial(scene) + prompts_received: list[str] = [] + + async def fake_execute(prompts=None): + prompts_received.extend(prompts or []) + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(scene) + + assert prompts_received[0] == "do stuff" + assert prompts_received[1] == "also do stuff" + assert all("Messages from other agents" not in p for p in prompts_received) + + +async def 
test_scene_messages_persisted( + coder_reviewer_scene: Scene, tmp_path: Path +) -> None: + """Inter-role messages are saved to scene_messages.jsonl in trial_dir.""" + trial = _make_trial(coder_reviewer_scene) + trial._trial_dir = tmp_path + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env.stage_outbox("reviewer", "Please review my code") + elif call_count == 1: + trial._env.stage_outbox("coder", "Found a bug on line 5") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + msg_path = tmp_path / "scene_messages.jsonl" + assert msg_path.exists() + lines = [json.loads(ln) for ln in msg_path.read_text().strip().splitlines()] + assert len(lines) == 2 + assert lines[0]["sender"] == "coder" + assert lines[0]["recipient"] == "reviewer" + assert lines[0]["content"] == "Please review my code" + assert lines[1]["sender"] == "reviewer" + assert lines[1]["recipient"] == "coder" + assert lines[1]["content"] == "Found a bug on line 5" + assert lines[0]["scene"] == "code-review" + + +async def test_heterogeneous_agent_install(coder_reviewer_scene: Scene) -> None: + """connect_as installs non-primary agents.""" + scene = Scene( + name="hetero", + roles=[ + Role("coder", "gemini", "flash"), + Role("reviewer", "claude-agent-acp", "haiku"), + ], + turns=[Turn("coder"), Turn("reviewer", "Review.")], + ) + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[scene], + environment="docker", + agent="gemini", + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task"] + + installed_agents: list[str] = [] + + async def tracking_connect_as(self_inner, role): + if role.agent != config.primary_agent: + installed_agents.append(role.agent) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = lambda role: 
tracking_connect_as(trial, role) + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(scene) + + assert "claude-agent-acp" in installed_agents diff --git a/tests/test_sdk_internals.py b/tests/test_sdk_internals.py index fe000cfc..6eff44aa 100644 --- a/tests/test_sdk_internals.py +++ b/tests/test_sdk_internals.py @@ -7,6 +7,7 @@ import json from datetime import datetime from pathlib import Path +from unittest.mock import AsyncMock import pytest @@ -21,8 +22,26 @@ def _resolve(self, agent="claude-agent-acp", model=None, agent_env=None): return resolve_agent_env(agent, model, agent_env) - def test_env_mapping_applied_after_provider(self): + def _patch_expanduser(self, monkeypatch, tmp_path): + orig_expanduser = Path.expanduser + + def fake_expanduser(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig_expanduser(self) + + monkeypatch.setattr(Path, "expanduser", fake_expanduser) + + def test_env_mapping_applied_after_provider(self, monkeypatch): """env_mapping translates BENCHFLOW_PROVIDER_* → agent-native vars.""" + for key in ( + "ZAI_API_KEY", + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + ): + monkeypatch.delenv(key, raising=False) result = self._resolve( agent="claude-agent-acp", model="zai/glm-5", @@ -33,21 +52,136 @@ def test_env_mapping_applied_after_provider(self): assert "ANTHROPIC_AUTH_TOKEN" in result assert result["ANTHROPIC_AUTH_TOKEN"] == "zk-test" + def test_agent_native_api_key_satisfies_model_check(self, monkeypatch): + """Agent-native mapped key (LLM_API_KEY) can satisfy provider auth check.""" + for key in ("OPENAI_API_KEY", "LLM_API_KEY"): + monkeypatch.delenv(key, raising=False) + result = self._resolve( + agent="openhands", + model="openai/gpt-4.1-mini", + agent_env={"LLM_API_KEY": "test-llm-key"}, + ) + assert result["LLM_API_KEY"] == "test-llm-key" + assert result["BENCHFLOW_PROVIDER_API_KEY"] == "test-llm-key" + + def 
test_same_provider_native_alias_satisfies_model_check(self, monkeypatch): + """Provider-native aliases remain valid for the same auth context.""" + for key in ("GEMINI_API_KEY", "GOOGLE_API_KEY"): + monkeypatch.delenv(key, raising=False) + result = self._resolve( + agent="gemini", + model="gemini-2.5-flash", + agent_env={"GOOGLE_API_KEY": "test-google-key"}, + ) + assert result["GOOGLE_API_KEY"] == "test-google-key" + assert result["BENCHFLOW_PROVIDER_API_KEY"] == "test-google-key" + + @pytest.mark.parametrize( + ("agent", "host_key"), + [ + pytest.param("codex-acp", "OPENAI_API_KEY", id="codex-openai-key"), + pytest.param( + "claude-agent-acp", + "ANTHROPIC_AUTH_TOKEN", + id="claude-auth-token", + ), + pytest.param("gemini", "GOOGLE_API_KEY", id="gemini-google-key"), + ], + ) + def test_cross_provider_host_native_key_does_not_bypass_required_key( + self, monkeypatch, tmp_path, agent, host_key + ): + """Host-native keys for another provider must not satisfy zai auth.""" + for key in ( + "ZAI_API_KEY", + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(key, raising=False) + monkeypatch.setenv(host_key, "host-native-key") + self._patch_expanduser(monkeypatch, tmp_path) + + with pytest.raises(ValueError, match="ZAI_API_KEY required"): + self._resolve(agent=agent, model="zai/glm-5", agent_env={}) + + def test_auto_inherited_generic_bridge_key_does_not_bypass_required_key( + self, monkeypatch, tmp_path + ): + """Generic agent-native keys must be passed explicitly to bridge auth.""" + for key in ("ZAI_API_KEY", "LLM_API_KEY"): + monkeypatch.delenv(key, raising=False) + monkeypatch.setenv("LLM_API_KEY", "host-llm-key") + self._patch_expanduser(monkeypatch, tmp_path) + + with pytest.raises(ValueError, match="ZAI_API_KEY required"): + self._resolve(agent="openhands", model="zai/glm-5", agent_env={}) + + def 
test_openhands_gemini_model_is_prefixed_for_google_ai_studio(self, monkeypatch): + """OpenHands expects Gemini models in gemini/ format.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) + result = self._resolve( + agent="openhands", + model="gemini-3.1-flash-lite-preview", + agent_env={"GEMINI_API_KEY": "test-gemini-key"}, + ) + assert result["LLM_MODEL"] == "gemini/gemini-3.1-flash-lite-preview" + assert result["LLM_API_KEY"] == "test-gemini-key" + + def test_openhands_explicit_llm_model_is_preserved(self, monkeypatch): + """User-provided LLM_MODEL must win over derived normalization.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) + result = self._resolve( + agent="openhands", + model="gemini-3.1-flash-lite-preview", + agent_env={ + "GEMINI_API_KEY": "test-gemini-key", + "LLM_MODEL": "litellm/custom-format", + }, + ) + assert result["LLM_MODEL"] == "litellm/custom-format" + + def test_openhands_vertex_model_is_prefixed_for_vertex(self, monkeypatch, tmp_path): + """OpenHands expects Vertex Gemini models in vertex_ai/ format.""" + adc_dir = tmp_path / ".config" / "gcloud" + adc_dir.mkdir(parents=True) + (adc_dir / "application_default_credentials.json").write_text("{}") + monkeypatch.setattr("pathlib.Path.home", staticmethod(lambda: tmp_path)) + result = self._resolve( + agent="openhands", + model="google-vertex/gemini-2.5-flash", + agent_env={"GOOGLE_CLOUD_PROJECT": "my-proj"}, + ) + assert result["LLM_MODEL"] == "vertex_ai/gemini-2.5-flash" + + def test_provider_bridge_key_alone_does_not_bypass_required_model_key( + self, monkeypatch + ): + """Only mapped agent-native keys can bypass provider-specific key checks.""" + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + with pytest.raises(ValueError, match="OPENAI_API_KEY required"): + self._resolve( + agent="openclaw", + model="openai/gpt-4.1-mini", + agent_env={"BENCHFLOW_PROVIDER_API_KEY": "x"}, + ) + def test_required_key_missing_raises(self, monkeypatch, tmp_path): """Missing required API 
key raises ValueError when no subscription auth.""" # Clear any auto-inherited keys from the environment - for key in ("ANTHROPIC_API_KEY", "ZAI_API_KEY", "OPENAI_API_KEY"): + for key in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "ZAI_API_KEY", + "OPENAI_API_KEY", + ): monkeypatch.delenv(key, raising=False) # Ensure no host subscription auth files are found - orig_expanduser = Path.expanduser - - def fake_expanduser(self): - s = str(self) - if s.startswith("~"): - return tmp_path / s[2:] - return orig_expanduser(self) - - monkeypatch.setattr(Path, "expanduser", fake_expanduser) + self._patch_expanduser(monkeypatch, tmp_path) # Anthropic model with pytest.raises(ValueError, match="ANTHROPIC_API_KEY required"): self._resolve( @@ -208,6 +342,7 @@ def test_config_json_written(self, tmp_path): skills_dir=None, sandbox_user=None, context_root=None, + sandbox_setup_timeout=33, timeout=300, started_at=datetime(2026, 4, 8, 12, 0), agent_env={"ANTHROPIC_API_KEY": "sk-secret", "SOME_VAR": "visible"}, @@ -221,6 +356,7 @@ def test_config_json_written(self, tmp_path): "skills_dir", "sandbox_user", "sandbox_locked_paths", + "sandbox_setup_timeout", "context_root", "timeout_sec", "started_at", @@ -232,6 +368,7 @@ def test_config_json_written(self, tmp_path): assert data["agent"] == "claude-agent-acp" assert data["model"] == "claude-haiku-4-5-20251001" assert data["environment"] == "docker" + assert data["sandbox_setup_timeout"] == 33 assert data["timeout_sec"] == 300 def test_secrets_filtered(self, tmp_path): @@ -266,6 +403,41 @@ def test_secrets_filtered(self, tmp_path): assert recorded["SAFE_VAR"] == "visible" +# ── run wiring ── + + +class TestRunWiring: + """Tests for SDK.run() argument forwarding into TrialConfig.""" + + @pytest.mark.asyncio + async def test_run_forwards_sandbox_setup_timeout_to_trial_config( + self, monkeypatch, tmp_path + ): + from benchflow.models import RunResult + from benchflow.sdk import SDK + + seen = {} + + 
async def fake_create(config): + seen["config"] = config + trial = AsyncMock() + trial.run = AsyncMock( + return_value=RunResult(task_name="task-1", rewards={"reward": 1.0}) + ) + return trial + + monkeypatch.setattr("benchflow.trial.Trial.create", fake_create) + + result = await SDK().run( + task_path=tmp_path, + sandbox_setup_timeout=77, + ) + + assert result.rewards == {"reward": 1.0} + assert seen["config"].sandbox_setup_timeout == 77 + assert seen["config"].task_path == tmp_path + + # ── _build_result ── diff --git a/tests/test_sdk_lockdown.py b/tests/test_sdk_lockdown.py index 91cb7d41..0d4f92cc 100644 --- a/tests/test_sdk_lockdown.py +++ b/tests/test_sdk_lockdown.py @@ -261,7 +261,7 @@ def test_inner_command_is_shlex_quoted(self): import shlex cmd = build_priv_drop_cmd("agent --flag value", "agent") - inner = "export HOME=/home/agent && cd /home/agent && agent --flag value" + inner = "export HOME=/home/agent && agent --flag value" assert shlex.quote(inner) in cmd def test_single_quotes_in_launch(self): diff --git a/tests/test_skill_eval.py b/tests/test_skill_eval.py index c1f2b3e6..94518d15 100644 --- a/tests/test_skill_eval.py +++ b/tests/test_skill_eval.py @@ -111,7 +111,7 @@ def test_loads_valid_dataset(self, skill_dir): ds = load_eval_dataset(skill_dir) assert ds.skill_name == "calculator" assert len(ds.cases) == 2 - assert ds.judge_model == "gemini-3.1-flash-lite" + assert ds.judge_model == "claude-haiku-4-5-20251001" assert ds.timeout_sec == 120 def test_parses_cases(self, skill_dir): @@ -133,7 +133,7 @@ def test_minimal_dataset(self, minimal_skill_dir): def test_missing_evals_json(self, tmp_path): skill = tmp_path / "no-evals" skill.mkdir() - with pytest.raises(FileNotFoundError, match="evals.json"): + with pytest.raises(FileNotFoundError, match=r"evals\.json"): load_eval_dataset(skill) def test_empty_cases(self, tmp_path): diff --git a/tests/test_skill_eval_dryrun.py b/tests/test_skill_eval_dryrun.py index 39b6a57e..931a8e96 100644 --- 
a/tests/test_skill_eval_dryrun.py +++ b/tests/test_skill_eval_dryrun.py @@ -218,7 +218,7 @@ def test_gepa_export_roundtrip(self, mock_skill): # Verify trace files traces = list((gepa_dir / "traces").iterdir()) - assert len(traces) == 4 # 2 cases × 2 modes + assert len(traces) == 4 # 2 cases x 2 modes # Verify trace content trace = json.loads(traces[0].read_text()) @@ -228,6 +228,7 @@ def test_gepa_export_roundtrip(self, mock_skill): def test_cli_dryrun_loads_dataset(self, mock_skill): from typer.testing import CliRunner + from benchflow.cli.main import app runner = CliRunner() diff --git a/tests/test_subscription_auth.py b/tests/test_subscription_auth.py index 7f322b99..39ff0565 100644 --- a/tests/test_subscription_auth.py +++ b/tests/test_subscription_auth.py @@ -98,6 +98,8 @@ def test_subscription_auth_detected(self, monkeypatch, tmp_path): """When host auth file exists and no API key, subscription auth is used.""" for k in ( "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", @@ -118,6 +120,8 @@ def test_no_auth_file_raises(self, monkeypatch, tmp_path): """When no API key and no host auth file, raises ValueError.""" for k in ( "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", diff --git a/tests/test_trial_install_agent_timeout.py b/tests/test_trial_install_agent_timeout.py new file mode 100644 index 00000000..40db22c9 --- /dev/null +++ b/tests/test_trial_install_agent_timeout.py @@ -0,0 +1,91 @@ +"""Tests for Trial.install_agent timeout wiring.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow.trial import Trial, TrialConfig + + +def _make_trial(tmp_path, *, agent: str, sandbox_setup_timeout: int) -> Trial: + config = TrialConfig.from_legacy( + task_path=tmp_path / "task", + agent=agent, + prompts=[None], + sandbox_user="agent", + 
sandbox_setup_timeout=sandbox_setup_timeout, + ) + trial = Trial(config) + trial._env = MagicMock() + trial._env.exec = AsyncMock(return_value=MagicMock(stdout="/workspace\n")) + trial._trial_dir = tmp_path / "trial" + trial._trial_dir.mkdir() + trial._trial_paths = MagicMock() + trial._task = MagicMock() + trial._effective_locked = [] + return trial + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("agent", "expected_setup_return"), + [ + ("claude-agent-acp", "/home/agent"), + ("oracle", None), + ], +) +async def test_install_agent_forwards_sandbox_setup_timeout( + tmp_path, monkeypatch, agent, expected_setup_return +): + trial = _make_trial(tmp_path, agent=agent, sandbox_setup_timeout=41) + + install_agent_mock = AsyncMock(return_value=MagicMock()) + write_credential_files_mock = AsyncMock() + upload_subscription_auth_mock = AsyncMock() + snapshot_build_config_mock = AsyncMock() + seed_verifier_workspace_mock = AsyncMock() + deploy_skills_mock = AsyncMock() + lockdown_paths_mock = AsyncMock() + setup_sandbox_user_mock = AsyncMock(return_value=expected_setup_return) + + monkeypatch.setattr("benchflow.trial.install_agent", install_agent_mock) + monkeypatch.setattr( + "benchflow.trial.write_credential_files", write_credential_files_mock + ) + monkeypatch.setattr( + "benchflow.trial.upload_subscription_auth", upload_subscription_auth_mock + ) + monkeypatch.setattr( + "benchflow.trial._snapshot_build_config", snapshot_build_config_mock + ) + monkeypatch.setattr( + "benchflow.trial._seed_verifier_workspace", seed_verifier_workspace_mock + ) + monkeypatch.setattr("benchflow.trial.deploy_skills", deploy_skills_mock) + monkeypatch.setattr("benchflow.trial.lockdown_paths", lockdown_paths_mock) + monkeypatch.setattr( + "benchflow.trial.setup_sandbox_user", setup_sandbox_user_mock + ) + + await trial.install_agent() + + setup_sandbox_user_mock.assert_awaited_once() + args, kwargs = setup_sandbox_user_mock.await_args + assert args[1] == "agent" + assert 
kwargs["timeout_sec"] == 41 + assert kwargs["workspace"] == "/workspace" + + if agent == "oracle": + install_agent_mock.assert_not_awaited() + write_credential_files_mock.assert_not_awaited() + deploy_skills_mock.assert_not_awaited() + assert trial._agent_cwd == "/workspace" + else: + install_agent_mock.assert_awaited_once() + write_credential_files_mock.assert_awaited_once() + deploy_skills_mock.assert_awaited_once() + assert trial._agent_cwd == "/home/agent" + + snapshot_build_config_mock.assert_awaited_once() + seed_verifier_workspace_mock.assert_awaited_once() + lockdown_paths_mock.assert_awaited_once() diff --git a/tests/test_user.py b/tests/test_user.py new file mode 100644 index 00000000..8ebf1c4c --- /dev/null +++ b/tests/test_user.py @@ -0,0 +1,336 @@ +"""Tests for the User abstraction — BaseUser, PassthroughUser, FunctionUser, RoundResult.""" + +from __future__ import annotations + +import tempfile +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, Trial, TrialConfig, Turn +from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult + +# ── Unit tests for User types ── + + +class TestPassthroughUser: + @pytest.mark.asyncio + async def test_sends_instruction_once(self): + user = PassthroughUser() + result = await user.run(0, "Fix the bug") + assert result == "Fix the bug" + + @pytest.mark.asyncio + async def test_stops_after_first_round(self): + user = PassthroughUser() + await user.run(0, "Fix the bug") + result = await user.run(1, "Fix the bug", RoundResult(round=0)) + assert result is None + + + +class TestFunctionUser: + @pytest.mark.asyncio + async def test_sync_function(self): + def my_fn(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + return "terse: " + instruction[:10] + return None + + user = FunctionUser(my_fn) + assert await user.run(0, "Fix the authentication bug") == 
"terse: Fix the au" + assert await user.run(1, "Fix the authentication bug", RoundResult(round=0)) is None + + @pytest.mark.asyncio + async def test_async_function(self): + async def my_fn(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + return instruction + if rr and rr.rewards and rr.rewards.get("exact_match", 0) < 1.0: + return "Try again, tests failed" + return None + + user = FunctionUser(my_fn) + assert await user.run(0, "task") == "task" + + failing = RoundResult(round=0, rewards={"exact_match": 0.0}) + assert await user.run(1, "task", failing) == "Try again, tests failed" + + passing = RoundResult(round=1, rewards={"exact_match": 1.0}) + assert await user.run(2, "task", passing) is None + + +class TestBaseUser: + @pytest.mark.asyncio + async def test_not_implemented(self): + user = BaseUser() + with pytest.raises(NotImplementedError): + await user.run(0, "task") + + +# ── Integration tests for user loop in Trial ── + + +@dataclass +class FakeExecResult: + stdout: str = "" + stderr: str = "" + return_code: int = 0 + + +class FakeEnv: + """Minimal sandbox mock for user loop tests.""" + + def __init__(self) -> None: + self._exec_log: list[str] = [] + + async def exec(self, cmd: str, **kwargs) -> FakeExecResult: + self._exec_log.append(cmd) + if "cat /solution" in cmd: + return FakeExecResult(stdout="gold answer here") + if "rm -rf /solution" in cmd: + return FakeExecResult() + if "/logs/verifier" in cmd: + return FakeExecResult() + if "cat /logs/verifier" in cmd: + return FakeExecResult(stdout="") + return FakeExecResult() + + async def stop(self, **kwargs): + pass + + +def _make_user_trial( + user: BaseUser, max_rounds: int = 5, oracle: bool = False, tmp_path: Path | None = None, +) -> Trial: + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[Scene.single(agent="gemini", model="flash")], + environment="docker", + user=user, + max_user_rounds=max_rounds, + oracle_access=oracle, + ) + trial = Trial(config) + 
trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task described in /app/instruction.md"] + trial_dir = tmp_path or Path(tempfile.mkdtemp(prefix="benchflow-test-")) + trial._trial_dir = trial_dir + verifier_dir = trial_dir / "verifier" + verifier_dir.mkdir(parents=True, exist_ok=True) + trial._trial_paths = type("P", (), {"verifier_dir": verifier_dir})() + trial._task = type("T", (), { + "config": type("C", (), { + "verifier": type("V", (), { + "timeout_sec": 30, + "env": {}, + })(), + "agent": type("A", (), {"timeout_sec": 60})(), + })(), + })() + trial._agent_cwd = "/app" + return trial + + +class RecordingUser(BaseUser): + """User that records calls and stops after a fixed number of rounds.""" + + def __init__(self, max_rounds: int = 2, prompts: list[str] | None = None): + self.setup_calls: list[tuple[str, str | None]] = [] + self.run_calls: list[tuple[int, str, RoundResult | None]] = [] + self._max = max_rounds + self._prompts = prompts or ["Do the thing", "Try harder"] + + async def setup(self, instruction: str, solution: str | None = None) -> None: + self.setup_calls.append((instruction, solution)) + + async def run( + self, round: int, instruction: str, round_result: RoundResult | None = None + ) -> str | None: + self.run_calls.append((round, instruction, round_result)) + if round >= self._max: + return None + return self._prompts[min(round, len(self._prompts) - 1)] + + +class TestUserLoop: + @pytest.mark.asyncio + async def test_user_loop_calls_setup_and_run(self): + user = RecordingUser(max_rounds=1) + trial = _make_user_trial(user, max_rounds=3) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=({"exact_match": 1.0}, None, None)): + + await trial._run_user_loop() + + assert len(user.setup_calls) == 1 + 
assert user.setup_calls[0][0] == "Solve the task described in /app/instruction.md" + assert len(user.run_calls) == 2 # round 0 → prompt, round 1 → None + assert user.run_calls[0][0] == 0 # round number + assert user.run_calls[1][0] == 1 + + @pytest.mark.asyncio + async def test_user_loop_passes_round_result(self): + user = RecordingUser(max_rounds=2) + trial = _make_user_trial(user, max_rounds=5) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=({"exact_match": 0.5}, "1 failed", None)): + + await trial._run_user_loop() + + # First call has no round_result + assert user.run_calls[0][2] is None + # Second call has round_result from round 0 + rr = user.run_calls[1][2] + assert isinstance(rr, RoundResult) + assert rr.round == 0 + assert rr.rewards == {"exact_match": 0.5} + assert rr.verifier_output == "1 failed" + + @pytest.mark.asyncio + async def test_user_loop_respects_max_rounds(self): + """User that never stops is capped by max_user_rounds.""" + def never_stop(r, instr, rr): + return "keep going" + + user = FunctionUser(never_stop) + trial = _make_user_trial(user, max_rounds=3) + + call_count = 0 + async def mock_execute(prompts=None): + nonlocal call_count + call_count += 1 + return [], 0 + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", side_effect=mock_execute), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert call_count == 3 + + @pytest.mark.asyncio + async def test_oracle_access(self): + user = RecordingUser(max_rounds=0) + trial = _make_user_trial(user, oracle=True) + + with patch.object(trial, "connect_as", 
new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert user.setup_calls[0][1] == "gold answer here" + + @pytest.mark.asyncio + async def test_multi_role_raises(self): + user = RecordingUser() + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[Scene( + name="multi", + roles=[Role("a", "gemini"), Role("b", "gemini")], + turns=[Turn("a"), Turn("b")], + )], + user=user, + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["task"] + + with pytest.raises(ValueError, match="single-role"): + await trial._run_user_loop() + + @pytest.mark.asyncio + async def test_user_run_exception_stops_loop(self): + class FailingUser(BaseUser): + async def run(self, round, instruction, rr=None): + raise RuntimeError("oops") + + trial = _make_user_trial(FailingUser()) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert "user.run() failed" in trial._error + + +class TestSoftVerify: + @pytest.mark.asyncio + async def test_soft_verify_timeout(self): + trial = _make_user_trial(PassthroughUser()) + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(side_effect=TimeoutError()) + + rewards, output, error = await trial.soft_verify() + + assert rewards is None + assert output is None + assert "timed out" in error + + @pytest.mark.asyncio + async def test_soft_verify_crash(self): + 
trial = _make_user_trial(PassthroughUser()) + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(side_effect=RuntimeError("boom")) + + rewards, _output, error = await trial.soft_verify() + + assert rewards is None + assert "crashed" in error + assert "boom" in error + + @pytest.mark.asyncio + async def test_soft_verify_success(self): + trial = _make_user_trial(PassthroughUser()) + + mock_result = type("VR", (), {"rewards": {"exact_match": 1.0}})() + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(return_value=mock_result) + + rewards, _output, error = await trial.soft_verify() + + assert rewards == {"exact_match": 1.0} + assert error is None + + @pytest.mark.asyncio + async def test_soft_verify_runs_cleanup_cmd(self): + trial = _make_user_trial(PassthroughUser()) + + mock_result = type("VR", (), {"rewards": {}})() + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox._build_cleanup_cmd", + return_value="echo cleanup_sentinel"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(return_value=mock_result) + + await trial.soft_verify() + + # Verify cleanup command was executed + exec_log = trial._env._exec_log + assert any("cleanup_sentinel" in cmd for cmd in exec_log) diff --git a/tests/test_verify.py b/tests/test_verify.py index c3a790fa..93f4a51f 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -476,9 +476,9 @@ def sdk_run_mocks(self, tmp_path): def _patch_sdk_run(self, sdk, mock_env, extra_patches): """Apply shared + extra patches for SDK.run() internals.""" patches = [ - patch("benchflow.sdk._create_environment", return_value=mock_env), + patch("benchflow.trial._create_environment", return_value=mock_env), patch( - 
"benchflow.sdk.install_agent", + "benchflow.trial.install_agent", new_callable=AsyncMock, return_value=MagicMock( credential_files={}, @@ -487,8 +487,8 @@ def _patch_sdk_run(self, sdk, mock_env, extra_patches): env_mapping={}, ), ), - patch("benchflow.sdk.write_credential_files", new_callable=AsyncMock), - patch("benchflow.sdk.deploy_skills", new_callable=AsyncMock), + patch("benchflow.trial.write_credential_files", new_callable=AsyncMock), + patch("benchflow.trial.deploy_skills", new_callable=AsyncMock), *extra_patches, ] with contextlib.ExitStack() as stack: @@ -516,23 +516,22 @@ async def test_scraped_trajectory_preserves_n_tool_calls( mock_env, [ patch( - "benchflow.sdk.connect_acp", + "benchflow.trial.connect_acp", new_callable=AsyncMock, return_value=(mock_acp, mock_session, "test-agent"), ), patch( - "benchflow.sdk.execute_prompts", + "benchflow.trial.execute_prompts", new_callable=AsyncMock, return_value=([], 5), ), patch( - "benchflow.sdk._scrape_agent_trajectory", + "benchflow.trial._scrape_agent_trajectory", new_callable=AsyncMock, return_value=forged, ), - patch.object( - sdk, - "_verify", + patch( + "benchflow.sdk.SDK._verify", new_callable=AsyncMock, return_value=({"reward": 1.0}, None), ), @@ -568,21 +567,21 @@ async def test_partial_acp_uses_session_tool_calls(self, sdk_run_mocks): mock_env, [ patch( - "benchflow.sdk.connect_acp", + "benchflow.trial.connect_acp", new_callable=AsyncMock, return_value=(mock_acp, mock_session, "test-agent"), ), patch( - "benchflow.sdk.execute_prompts", + "benchflow.trial.execute_prompts", new_callable=AsyncMock, side_effect=ConnectionError("lost"), ), patch( - "benchflow.sdk._capture_session_trajectory", + "benchflow.trial._capture_session_trajectory", return_value=partial_events, ), patch( - "benchflow.sdk._scrape_agent_trajectory", + "benchflow.trial._scrape_agent_trajectory", new_callable=AsyncMock, return_value=[], ), diff --git a/tests/test_yaml_config.py b/tests/test_yaml_config.py index d362296e..0de12aac 100644 
--- a/tests/test_yaml_config.py +++ b/tests/test_yaml_config.py @@ -24,6 +24,7 @@ def native_yaml(tmp_path): environment: daytona concurrency: 32 max_retries: 1 +sandbox_setup_timeout: 45 prompts: - null - "Review your solution." @@ -46,6 +47,7 @@ def harbor_yaml(tmp_path): orchestrator: type: local n_concurrent_trials: 8 +sandbox_setup_timeout: 75 environment: type: daytona env: @@ -69,6 +71,7 @@ def test_from_native_yaml(native_yaml): assert cfg.environment == "daytona" assert cfg.concurrency == 32 assert cfg.retry.max_retries == 1 + assert cfg.sandbox_setup_timeout == 45 assert cfg.prompts == [None, "Review your solution."] assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -85,6 +88,7 @@ def test_from_harbor_yaml(harbor_yaml): assert cfg.concurrency == 8 assert cfg.retry.max_retries == 1 # n_attempts=2 → max_retries=1 assert cfg.agent_env.get("ANTHROPIC_API_KEY") == "test-key" + assert cfg.sandbox_setup_timeout == 75 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -127,6 +131,7 @@ def test_from_harbor_yaml_defaults(tmp_path): assert cfg.agent == "pi-acp" assert cfg.environment == "docker" assert cfg.concurrency == 4 + assert cfg.sandbox_setup_timeout == 120 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("jobs") @@ -253,3 +258,4 @@ def test_defaults_when_omitted(self, tmp_path): assert job._config.exclude_tasks == set() assert job._config.agent_env == {} assert job._config.sandbox_user == "agent" + assert job._config.sandbox_setup_timeout == 120