diff --git a/.claude/skills/benchflow/SKILL.md b/.claude/skills/benchflow/SKILL.md index 1f21b5fd..c740791b 100644 --- a/.claude/skills/benchflow/SKILL.md +++ b/.claude/skills/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -199,7 +199,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md b/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md index 7d3817bf..57af0d81 100644 --- a/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md +++ b/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -182,7 +182,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . 
(from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md b/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md index 7d3817bf..57af0d81 100644 --- a/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md +++ b/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -182,7 +182,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/CLAUDE.md b/CLAUDE.md index 7695ddc1..b089ffe3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ Multi-turn agent benchmarking with ACP. -Docs: `docs/quickstart.md`, `docs/cli-reference.md`, `docs/api-reference.md`, `docs/task-authoring.md`, `docs/use-cases.md`. +Docs: `docs/quickstart.md`, `docs/cli-reference.md`, `docs/api-reference.md`, `docs/task-authoring.md`, `docs/use-cases.md`, `docs/progressive-disclosure.md`. ## Setup diff --git a/README.md b/README.md index b750a519..e56d4553 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It s ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` -Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`. +Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). For cloud sandboxes, set `DAYTONA_API_KEY`. 
## Quick Start @@ -129,6 +129,21 @@ bench environment create Spin up sandbox from task dir bench environment list List active sandboxes ``` +## Terminology + +| Term | Definition | Example | +|------|-----------|---------| +| **Turn** | One prompt in one ACP session — one role acts | Coder writes a regex | +| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent | +| **Round** | One A→B exchange between different roles | Coder → Reviewer | +| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder | +| **Scene** | Interaction region with roles + turns | A code-review scene | +| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve | + +**Inter-role messaging:** In multi-role scenes, agents communicate via outbox files. +An agent writes `/app/.outbox/{recipient}.json` with `{"to": "role", "content": "..."}`. +The scheduler reads these after each turn and injects the message into the next role's prompt. + ## Architecture ``` @@ -143,6 +158,10 @@ bf.run(config) → trial.start() # spin up sandbox, upload task files → for scene in config.scenes: → trial._run_scene(scene) # connect/execute/disconnect per role + → setup /app/.outbox/ # (multi-role scenes only) + → for turn in scene.turns: + → read outbox → inject messages into prompt + → connect as role → execute → disconnect → trial.verify() # run verifier, score → trial.cleanup() # stop sandbox ``` diff --git a/benchmarks/followup-bench/runner.py b/benchmarks/followup-bench/runner.py index e3558d51..d942ea6c 100644 --- a/benchmarks/followup-bench/runner.py +++ b/benchmarks/followup-bench/runner.py @@ -28,7 +28,7 @@ from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_setup import install_agent from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.runtime import Environment logger = logging.getLogger(__name__) diff 
--git a/docs/api-reference.md b/docs/api-reference.md index 4e871fa8..00325e6c 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -5,7 +5,7 @@ The Trial/Scene API is the primary way to run agent benchmarks programmatically. ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Quick Start @@ -34,6 +34,7 @@ config = TrialConfig( task_path=Path("tasks/my-task"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=120, ) # Multi-scene BYOS (skill-gen → solve) @@ -46,9 +47,13 @@ config = TrialConfig( turns=[Turn("solver")]), ], environment="daytona", + sandbox_setup_timeout=120, ) ``` +Set `sandbox_setup_timeout` when sandbox user setup needs more than the default 120 seconds. +The same field is also available on `JobConfig` and `RuntimeConfig`. + ### Scene One interaction region — roles take turns executing prompts. @@ -57,7 +62,9 @@ One interaction region — roles take turns executing prompts. # Single-role shortcut scene = Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview") -# Multi-role with turn order +# Multi-role with turn order (coder-reviewer pattern) +# Agents communicate via outbox: write /app/.outbox/{recipient}.json +# Scheduler reads outbox after each turn, injects into next role's prompt scene = Scene( name="coder-reviewer", roles=[ @@ -66,8 +73,9 @@ scene = Scene( ], turns=[ Turn("coder"), # None prompt = instruction.md - Turn("reviewer", "Review..."), - Turn("coder", "Fix issues..."), + Turn("reviewer", "Review the code. Write feedback to " + '/app/.outbox/coder.json as {"to":"coder","content":"..."}'), + Turn("coder", "Fix the issues."), # reviewer's feedback auto-injected ], ) ``` @@ -95,6 +103,20 @@ await trial.verify() await trial.cleanup() ``` +### RuntimeConfig + +Runtime-level configuration for the `Agent + Environment` execution path. 
+ +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +config = RuntimeConfig(sandbox_setup_timeout=300) +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/X", backend="daytona") +runtime = Runtime(env, agent, config=config) +result = await runtime.execute() +``` + ### bf.run() Convenience function — multiple calling conventions: @@ -108,10 +130,16 @@ result = await bf.run(config) # 2. Agent + Environment (0.3 style) agent = bf.Agent("gemini", model="gemini-3.1-flash-lite-preview") env = bf.Environment.from_task("tasks/X", backend="daytona") -result = await bf.run(agent, env) +runtime_config = bf.RuntimeConfig(sandbox_setup_timeout=300) +result = await bf.run(agent, env, runtime_config) # 3. String shortcut (simplest) -result = await bf.run("gemini", task_path="tasks/X", model="gemini-3.1-flash-lite-preview") +result = await bf.run( + "gemini", + task_path="tasks/X", + model="gemini-3.1-flash-lite-preview", + config=bf.RuntimeConfig(sandbox_setup_timeout=300), +) ``` ## Trial Lifecycle @@ -122,17 +150,38 @@ Trial.run() ├─ setup() — resolve config, create env object ├─ start() — spin up sandbox, upload task files, start services ├─ install_agent() — install agent binary, credentials, sandbox user + │ (sandbox user setup: create non-root user, prepare + │ small config/auth dirs, chown the workspace — no + │ recursive copy of /root tool trees; agent binaries + │ must live on shared prefixes like /usr/local/bin) ├─ for scene in scenes: │ └─ _run_scene(scene) - │ ├─ connect_as(role) — open ACP session for this role - │ ├─ execute(prompts) — send prompts, collect trajectory - │ └─ disconnect() — kill agent process, clean up + │ ├─ setup /app/.outbox/ — (multi-role scenes only) + │ └─ for turn in scene.turns: + │ ├─ read outbox — inject messages into prompt + │ ├─ connect_as(role) — open ACP session for this role + │ ├─ execute(prompts) — send prompts, collect trajectory + │ └─ 
disconnect() — kill agent process, clean up ├─ verify() — run verifier, collect rewards └─ cleanup() — stop sandbox ``` Key: `disconnect()` kills the agent process between scenes to prevent context bleed. Each scene gets a fresh agent session. +## Multi-Turn vs Multi-Round + +| Pattern | Roles | Turns | Communication | Example | +|---------|-------|-------|---------------|---------| +| **Single-turn** | 1 | 1 | — | Baseline benchmark | +| **Multi-turn** | 1 | 2+ | Same session, sequential prompts | Self-review | +| **Multi-round** | 2+ | 2+ | Outbox files between roles | Coder + Reviewer | + +**Multi-turn** = same agent gets multiple prompts. Use when a second pass catches errors (self-review, iterative refinement). The agent keeps its context across turns. + +**Multi-round** = different agents exchange turns. Use when tasks need multiple perspectives (code review, client-advisor). The scheduler reads outbox files and injects messages. + +Both use the same API — `TrialConfig` with different `Scene` configurations. + ## Multi-Agent Patterns ### Coder + Reviewer (followup-bench) @@ -169,6 +218,17 @@ config = TrialConfig( ) ``` +## 0.3 Limitations + +The Scene API in 0.3 covers coder-reviewer and multi-turn patterns. It does **not** yet support: + +- **Dynamic termination** — turn count is fixed at config time. A "user" role cannot decide to stop early based on agent output. Workaround: use `max_rounds` in the standalone `_scene.py` scheduler. +- **Oracle access** — no mechanism for a "user" role to read `/solution` during setup. +- **Per-round verification** — `verify()` runs once after all scenes complete, not between rounds. +- **Inter-round trajectory inspection** — a "user" role cannot read the agent's trajectory between turns. + +These are tracked for 0.4. See the [Harbor PR #1462 mapping](notebooks/scene-patterns.ipynb) for details.
+ ## YAML Trial Configs ```python diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 86c702e6..4b387889 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -40,7 +40,8 @@ bench eval create \ -a gemini \ -m gemini-3.1-flash-lite-preview \ -e daytona \ - -c 64 + -c 64 \ + --sandbox-setup-timeout 300 ``` | Flag | Default | Description | @@ -53,6 +54,7 @@ bench eval create \ | `--concurrency`, `-c` | `4` | Max concurrent tasks (batch mode only) | | `--jobs-dir`, `-o` | `jobs` | Output directory | | `--sandbox-user` | `agent` | Sandbox user (null for root) | +| `--sandbox-setup-timeout` | `120` | Timeout in seconds for sandbox user setup | ### bench eval list @@ -145,6 +147,7 @@ bench environment list task_dir: .ref/terminal-bench-2 environment: daytona concurrency: 64 +sandbox_setup_timeout: 300 scenes: - name: solve @@ -165,6 +168,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ### Multi-scene (BYOS skill generation) @@ -173,6 +177,7 @@ max_retries: 2 task_dir: tasks/ environment: daytona concurrency: 10 +sandbox_setup_timeout: 300 scenes: - name: skill-gen diff --git a/docs/notebooks/coder-reviewer-demo.py b/docs/notebooks/coder-reviewer-demo.py new file mode 100644 index 00000000..010b492a --- /dev/null +++ b/docs/notebooks/coder-reviewer-demo.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Coder-Reviewer demo — a simple two-agent Scene run via bf.run(). + +Demonstrates: + - Multi-role Scene (coder + reviewer) in a shared sandbox + - Outbox-based message passing between roles + - Standard bf.run(TrialConfig) API — same path for single or multi-agent + +Requirements: + - uv tool install benchflow + - GEMINI_API_KEY or DAYTONA_API_KEY set + - A Harbor-format task directory (e.g.
.ref/terminal-bench-2/regex-log) + +Usage: + python docs/notebooks/coder-reviewer-demo.py --task .ref/terminal-bench-2/regex-log + python docs/notebooks/coder-reviewer-demo.py --task .ref/terminal-bench-2/regex-log --env docker + +Terminology: + - Turn: One prompt → one ACP session (one role acts) + - Multi-turn: Same role, multiple turns (e.g. self-review: agent → agent) + - Round: One A→B exchange between different roles + - Multi-round: Different roles exchanging turns (e.g. coder → reviewer → coder) +""" + +from __future__ import annotations + +import argparse +import asyncio +import logging +import sys +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Role, Scene, TrialConfig, Turn + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", +) +logger = logging.getLogger("coder-reviewer-demo") + + +# --------------------------------------------------------------------------- +# Scene definitions +# --------------------------------------------------------------------------- + +def baseline_config(task_path: Path, env: str, agent: str, model: str) -> TrialConfig: + """Pattern 1: Single agent, single turn — the baseline.""" + return TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=agent, model=model)], + environment=env, + ) + + +def coder_reviewer_config( + task_path: Path, + env: str, + coder_agent: str = "gemini", + coder_model: str = "gemini-3.1-flash-lite-preview", + reviewer_agent: str = "gemini", + reviewer_model: str = "gemini-3.1-flash-lite-preview", +) -> TrialConfig: + """Pattern 3: Coder + Reviewer — multi-round with outbox messaging. + + Flow: + 1. Coder attempts the task + 2. Reviewer reads coder's work, writes feedback to /app/.outbox/coder.json + 3. 
Coder receives feedback (injected by scheduler), revises solution + + The outbox convention: + - Agent writes: /app/.outbox/{recipient}.json + - Format: {"to": "role_name", "content": "your message"} + - Scheduler reads, clears, and injects into next role's prompt + """ + return TrialConfig( + task_path=task_path, + scenes=[Scene( + name="code-review", + roles=[ + Role("coder", coder_agent, coder_model), + Role("reviewer", reviewer_agent, reviewer_model), + ], + turns=[ + Turn("coder"), + Turn("reviewer", + "Review the code in /app/. Check for correctness, edge cases, " + "and adherence to the task requirements in /app/instruction.md. " + "Write your feedback to /app/.outbox/coder.json as: " + '{"to": "coder", "content": "Your specific feedback here."}'), + Turn("coder", + "Read the reviewer's feedback and fix the issues. " + "Focus only on what was flagged — don't start over."), + ], + )], + environment=env, + ) + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- + +async def run_comparison( + task_path: Path, + env: str, + agent: str, + model: str, +) -> None: + """Run baseline vs coder-reviewer and compare.""" + logger.info("=== Baseline (single agent, single turn) ===") + baseline = baseline_config(task_path, env, agent, model) + baseline_result = await bf.run(baseline) + baseline_reward = (baseline_result.rewards or {}).get("reward") + logger.info(f"Baseline: reward={baseline_reward}, tools={baseline_result.n_tool_calls}") + + logger.info("=== Coder + Reviewer (multi-round) ===") + review = coder_reviewer_config(task_path, env, agent, model, agent, model) + review_result = await bf.run(review) + review_reward = (review_result.rewards or {}).get("reward") + logger.info(f"Reviewed: reward={review_reward}, tools={review_result.n_tool_calls}") + + print("\n" + "=" * 60) + print("Results") + print("=" * 60) + print(f" Baseline: 
reward={baseline_reward} tool_calls={baseline_result.n_tool_calls}") + print(f" Reviewed: reward={review_reward} tool_calls={review_result.n_tool_calls}") + if baseline_reward is not None and review_reward is not None: + lift = review_reward - baseline_reward + print(f" Lift: {lift:+.2f}") + print("=" * 60) + + +async def run_single(task_path: Path, env: str, agent: str, model: str) -> None: + """Run coder-reviewer only.""" + config = coder_reviewer_config(task_path, env, agent, model, agent, model) + result = await bf.run(config) + reward = (result.rewards or {}).get("reward") + print(f"reward={reward} tool_calls={result.n_tool_calls} error={result.error}") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Coder-Reviewer demo") + parser.add_argument("--task", type=Path, required=True, help="Path to task directory") + parser.add_argument("--env", default="daytona", choices=["daytona", "docker"]) + parser.add_argument("--agent", default="gemini") + parser.add_argument("--model", default="gemini-3.1-flash-lite-preview") + parser.add_argument("--compare", action="store_true", + help="Run baseline and coder-reviewer side by side") + args = parser.parse_args() + + if not args.task.exists(): + print(f"Task directory not found: {args.task}", file=sys.stderr) + sys.exit(1) + + if args.compare: + asyncio.run(run_comparison(args.task, args.env, args.agent, args.model)) + else: + asyncio.run(run_single(args.task, args.env, args.agent, args.model)) + + +if __name__ == "__main__": + main() diff --git a/docs/notebooks/nanofirm-task/tests/evaluate.py b/docs/notebooks/nanofirm-task/tests/evaluate.py index 2f4fd75f..254f70d3 100644 --- a/docs/notebooks/nanofirm-task/tests/evaluate.py +++ b/docs/notebooks/nanofirm-task/tests/evaluate.py @@ -1,6 +1,5 @@ """Evaluate contract risk analysis quality.""" import json -import sys ANALYSIS_PATH = "/app/analysis.json" REWARD_PATH = "/logs/verifier/reward.txt" diff --git a/docs/notebooks/scene-patterns.ipynb 
b/docs/notebooks/scene-patterns.ipynb index 3f9277cf..488869f2 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -6,299 +6,270 @@ "source": [ "# BenchFlow Scene Patterns\n", "\n", - "Six evaluation patterns — multi-turn, multi-round, multi-scene — expressed as Scene configs.\n", + "Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n", "\n", - "| Term | Definition |\n", - "|------|------------|\n", - "| **Turn** | One prompt in one ACP session |\n", - "| **Round** | One A→B exchange between roles |\n", - "| **Scene** | Interaction region with roles + turns |\n", - "| **Trial** | Sequence of scenes in shared sandbox |" + "| Term | Definition | Example |\n", + "|------|-----------|--------|\n", + "| **Turn** | One prompt in one ACP session | Coder writes code |\n", + "| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n", + "| **Round** | One A→B exchange between roles | Coder → Reviewer |\n", + "| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n", + "| **Scene** | Interaction region with roles + turns | A code-review scene |\n", + "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n", + "\n", + "**Prerequisites:**\n", + "- `uv tool install benchflow`\n", + "- `GEMINI_API_KEY` or `GOOGLE_API_KEY` set\n", + "- `DAYTONA_API_KEY` set (or Docker daemon running for `--env docker`)\n", + "- A task directory (this notebook uses `.ref/terminal-bench-2/regex-log`)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "from benchflow.trial import TrialConfig, Scene, Role, Turn\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pattern 1: Single-Agent Baseline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, "outputs": [ { "name": "stdout",
"output_type": "stream", "text": [ - "1 scene, 1 role, 1 turn\n" + "Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: gemini-3.1-flash-lite-preview\n" ] } ], "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[Scene.single(agent=\"gemini\", model=\"gemini-3.1-flash-lite-preview\")],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} role, {len(s.turns)} turn\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 3 tool calls. Agent wrote a regex but missed edge cases." + "import logging\n", + "from pathlib import Path\n", + "\n", + "import benchflow as bf\n", + "from benchflow.trial import Role, Scene, TrialConfig, Turn\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(name)s %(message)s\")\n", + "\n", + "TASK = Path(\".ref/terminal-bench-2/regex-log\")\n", + "ENV = \"daytona\"\n", + "AGENT = \"gemini\"\n", + "MODEL = \"gemini-3.1-flash-lite-preview\"\n", + "\n", + "results = {} # pattern_name -> RunResult\n", + "print(f\"Task: {TASK} Env: {ENV} Agent: {AGENT} Model: {MODEL}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 2: Single-Agent Multi-Turn\n", - "Same agent, two prompts. Self-review variant." + "## Pattern 1: Single-Agent Baseline\n", + "\n", + "One agent, one turn. The simplest configuration." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 1 role, 2 turns\n" + "Reward: 1.0\n", + "Tool calls: 3\n", + "Error: None\n" ] } ], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[Scene(\n", - " name=\"self-review\",\n", - " roles=[Role(\"agent\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[\n", - " Turn(\"agent\"),\n", - " Turn(\"agent\", \"Review your solution. Check edge cases and fix issues.\"),\n", - " ],\n", - " )],\n", - " environment=\"daytona\",\n", + " task_path=TASK,\n", + " scenes=[Scene.single(agent=AGENT, model=MODEL)],\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} role, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=1.0, 12 tool calls. Self-review caught the regex bug." + "\n", + "result = await bf.run(config)\n", + "results[\"baseline\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 3: Multi-Round Code Review\n", - "Two roles — coder and reviewer — take turns." + "## Pattern 2: Multi-Turn Self-Review\n", + "\n", + "Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n", + "\n", + "Use when: a second pass catches what the first missed. Cost: 2x baseline." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 2 roles, 3 turns\n" + "Reward: 1.0\n", + "Tool calls: 7\n", + "Error: None\n" ] } ], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", + " task_path=TASK,\n", " scenes=[Scene(\n", - " name=\"code-review\",\n", - " roles=[\n", - " Role(\"coder\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"reviewer\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " ],\n", + " name=\"self-review\",\n", + " roles=[Role(\"agent\", AGENT, MODEL)],\n", " turns=[\n", - " Turn(\"coder\"),\n", - " Turn(\"reviewer\", \"Review the code. Write feedback to /app/.outbox/coder.json\"),\n", - " Turn(\"coder\", \"Read feedback and fix issues.\"),\n", + " Turn(\"agent\"),\n", + " Turn(\"agent\", \"Review your solution. Check edge cases, \"\n", + " \"boundary conditions, and adherence to the requirements. Fix issues.\"),\n", " ],\n", " )],\n", - " environment=\"daytona\",\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 16 tool calls.\n", "\n", - "**At scale (267 trials on TB2):** baseline 9.0% → reviewer 19.4% win rate." + "result = await bf.run(config)\n", + "results[\"self_review\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 4: Interactive User Simulation\n", - "A \"user\" role reveals info gradually. Agent responds." 
+ "## Pattern 3: Coder + Reviewer (Multi-Round)\n", + "\n", + "Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n", + "\n", + "Communication uses the **outbox convention**:\n", + "1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n", + "2. Scheduler reads the outbox after the reviewer's turn\n", + "3. Feedback is automatically injected into the coder's next prompt\n", + "\n", + "Use when: tasks benefit from independent review. Cost: 3x baseline." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 2 roles, 2 turns\n" + "Reward: 0.0\n", + "Tool calls: 13\n", + "Error: None\n" ] } ], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", + " task_path=TASK,\n", " scenes=[Scene(\n", - " name=\"interactive\",\n", + " name=\"code-review\",\n", " roles=[\n", - " Role(\"user\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"agent\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", + " Role(\"coder\", AGENT, MODEL),\n", + " Role(\"reviewer\", AGENT, MODEL),\n", " ],\n", " turns=[\n", - " Turn(\"user\", \"Give the agent a vague version of the task...\"),\n", - " Turn(\"agent\"),\n", + " Turn(\"coder\"),\n", + " Turn(\"reviewer\",\n", + " \"Review the code in /app/. Read /app/instruction.md for requirements. \"\n", + " \"Check for correctness, edge cases, and missed requirements. \"\n", + " \"Write your feedback to /app/.outbox/coder.json as: \"\n", + " '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n", + " Turn(\"coder\",\n", + " \"Read the reviewer's feedback and fix the issues. 
\"\n", + " \"Focus only on what was flagged — don't start over.\"),\n", " ],\n", " )],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 2 tool calls. Vague instruction wasn't enough." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pattern 5: Multi-Scene BYOS\n", - "Two scenes — skill generation (unscored) then solve (scored)." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 scenes: skill-gen (1 turn), solve (1 turn)\n" - ] - } - ], - "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[\n", - " Scene(name=\"skill-gen\",\n", - " roles=[Role(\"gen\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[Turn(\"gen\", \"Write a skill document to /app/generated-skill.md\")]),\n", - " Scene(name=\"solve\",\n", - " roles=[Role(\"solver\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[Turn(\"solver\")]),\n", - " ],\n", - " environment=\"daytona\",\n", + " environment=ENV,\n", ")\n", - "print(f\"{len(config.effective_scenes)} scenes: {', '.join(f'{s.name} ({len(s.turns)} turn)' for s in config.effective_scenes)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 5 tool calls. Self-generated skills = 0pp lift (consistent with paper)." 
+ "\n", + "result = await bf.run(config)\n", + "results[\"coder_reviewer\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 6: Contract Consultation\n", - "Realistic multi-round: client CTO + advisor reviewing a vendor contract." + "## Comparison" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1 scene, 2 roles, 4 turns (2 rounds)\n" + "Pattern Reward Tools Error\n", + "------------------------------------------------------------\n", + "baseline 1.0 3 —\n", + "self_review 1.0 7 —\n", + "coder_reviewer 0.0 13 —\n" ] } ], "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/contract-review\"),\n", - " scenes=[Scene(\n", - " name=\"consultation\",\n", - " roles=[\n", - " Role(\"client\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"advisor\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " ],\n", - " turns=[\n", - " Turn(\"client\"), # shares situation\n", - " Turn(\"advisor\"), # analyzes contract\n", - " Turn(\"client\", \"Clarify your top 2 priorities.\"),\n", - " Turn(\"advisor\", \"Give final redline recommendations.\"),\n", - " ],\n", - " )],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns ({len(s.turns)//2} rounds)\")" + "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", + "print(\"-\" * 60)\n", + "for name, r in results.items():\n", + " reward = (r.rewards or {}).get(\"reward\", \"—\")\n", + " err = r.error or \"—\"\n", + " if len(err) > 30:\n", + " err = err[:27] + \"...\"\n", + " print(f\"{name:<20} {reward!s:>8} {r.n_tool_calls:>6} 
{err}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Each pattern is a TrialConfig — same API, same lifecycle, same verifier. The only difference is the Scene configuration." + "## Mapping to Harbor PR #1462 Concepts\n", + "\n", + "For users coming from Harbor's multi-turn proposal:\n", + "\n", + "| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n", + "|-------------------|---------------|--------|\n", + "| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent | **Shipped** |\n", + "| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role | **Shipped** |\n", + "| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n", + "| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n", + "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n", + "| `User.run() → None` (stop) | Fixed turn count only — no dynamic termination | **Gap** |\n", + "| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n", + "| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n", + "| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n", + "\n", + "**What BenchFlow 0.3 delivers:** Multi-role scenes with outbox messaging, message persistence,\n", + "and the same API for single-agent and multi-agent runs. N roles per scene, sequential multi-scene trials.\n", + "\n", + "**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n", + "for user roles, per-round verification, or inter-round trajectory inspection. These require\n", + "extending the Scene scheduler with callbacks — tracked for 0.4." 
] } ], @@ -315,4 +286,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/notebooks/scene-patterns.py b/docs/notebooks/scene-patterns.py index a95369b8..961bd09a 100644 --- a/docs/notebooks/scene-patterns.py +++ b/docs/notebooks/scene-patterns.py @@ -18,9 +18,6 @@ """ import os -import sys -import json -from pathlib import Path # ── The contract (constructed here, no external files) ────────────── diff --git a/docs/progressive-disclosure.md b/docs/progressive-disclosure.md new file mode 100644 index 00000000..13e3c781 --- /dev/null +++ b/docs/progressive-disclosure.md @@ -0,0 +1,214 @@ +# Progressive Disclosure with `BaseUser` + +A pattern for multi-round agent runs where a Python callback drives the loop, deciding what to tell the agent next based on what happened in the previous round. + +This is BenchFlow's lightweight alternative to multi-agent "user simulation" Scenes (see [use-cases.md](./use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)). Use a `BaseUser` callback when: + +- You need programmatic control over the loop (e.g. terse prompt → hints on test failure → stop on pass). +- You don't want to spin up a second LLM just to play the "user" role. +- Your "user" logic is rule-based or oracle-guided rather than open-ended. + +For comparison: a Scene-based simulated user is another LLM with its own tool access, useful for nuanced feedback. A `BaseUser` is a sync/async Python function, useful for deterministic, scriptable progressive disclosure. + +--- + +## Why this exists + +This was built for [Josh's SWE-bench Pro use case](https://github.com/swe-bench-pro/swe-bench-pro): the dataset's instructions are long structured specs that overwhelm agents in a single turn. A `BaseUser` lets you compress the spec to a terse prompt for round 0, watch which tests fail, then disclose hints from the spec on subsequent rounds. 
+ +It is also benchflow's parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case — Harbor's proposal required a FastMCP sidecar; BenchFlow's `BaseUser` is in-process Python. + +--- + +## Quick start + +```python +import asyncio +from pathlib import Path +import benchflow as bf +from benchflow import FunctionUser, RoundResult +from benchflow.trial import TrialConfig, Scene + + +def my_user(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + # Round 0: terse prompt, no hints + return instruction.split("\n")[0] + if rr and rr.rewards and rr.rewards.get("reward", 0) >= 1.0: + return None # passed, stop + if round >= 3: + return None # cap at 3 rounds + # Otherwise: show the failing tests as a hint for next round + return ( + f"The previous attempt failed these tests:\n{rr.verifier_output}\n" + f"Here is the full spec for context:\n{instruction}" + ) + + +config = TrialConfig( + task_path=Path(".ref/swebenchpro/instance_flipt-io__flipt-..."), + scenes=[Scene.single(agent="opencode", model="anthropic/claude-sonnet-4-6")], + user=FunctionUser(my_user), + max_user_rounds=3, + environment="daytona", +) +result = asyncio.run(bf.run(config)) +``` + +--- + +## API + +### `BaseUser` + +Subclass and override `run()`. Optionally override `setup()` for one-time initialization. + +```python +from benchflow import BaseUser, RoundResult + + +class MyUser(BaseUser): + async def setup(self, instruction: str, solution: str | None = None) -> None: + """Called once before the first round. + + instruction — the original task instruction (from instruction.md) + solution — the gold answer if oracle_access=True, else None + """ + self.spec_lines = instruction.split("\n") + self.gold = solution # only set if oracle_access=True + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + """Produce the next prompt, or None to stop the loop. 
+ + round — 0-indexed round number + instruction — the original task instruction + round_result — None on round 0; previous round's outcome on subsequent rounds + """ + ... # return prompt str or None +``` + +### `RoundResult` + +Dataclass passed to `run()` from round 1 onward. + +```python +@dataclass +class RoundResult: + round: int # 0-indexed + trajectory: list[dict] # ACP events from this round only + rewards: dict[str, Any] | None # verifier rewards (None if verifier crashed) + verifier_output: str | None # raw verifier stdout/log content + verifier_error: str | None # exception message if verifier failed + n_tool_calls: int # tool calls in this round +``` + +### `PassthroughUser` + +Sends the instruction unchanged on round 0, stops on round 1. Backward-compatible single-round behavior. + +### `FunctionUser` + +Wraps a plain function as a `BaseUser`. Sync and async both supported (via `inspect.isawaitable`). + +```python +def fn(round, instruction, rr): return None if round > 0 else instruction +user = FunctionUser(fn) + +async def afn(round, instruction, rr): ... +user = FunctionUser(afn) +``` + +### `TrialConfig` fields + +```python +user: BaseUser | None = None # the callback +max_user_rounds: int = 5 # hard cap on rounds (loop stops earlier if user returns None) +oracle_access: bool = False # expose gold solution to user.setup() +``` + +A `User` requires a single-scene, single-role config. Multi-scene or multi-role configs raise `ValueError`. + +--- + +## Oracle access + +When `oracle_access=True`, the trial: + +1. Reads `/solution/solve.sh` before the agent starts and passes its content to `user.setup(instruction, solution=...)`. +2. Moves `/solution` → `/solution_oracle_backup` so the agent cannot read it during its rounds. +3. Temporarily restores `/solution` for `soft_verify()` between rounds (and re-hides it). +4. Restores `/solution` permanently before the final `verify()`. 
+ +Step 4 is wrapped in a `try/finally`, so if a round throws, the restore still runs. + +> ⚠️ Setting `oracle_access=True` without a `User` is a misconfiguration — the solution stays exposed to the agent for the entire trial. BenchFlow logs a `WARNING` at setup time when this happens. + +Use cases for oracle access: +- Dataset generation: have the user generate optimal prompts based on knowing the answer. +- Curriculum learning: progressively reveal hints from the gold solution. +- Research: study how much oracle information is needed for an agent to succeed. + +--- + +## Per-task hardening opt-outs + +The verifier's pre-run cleanup deletes `conftest.py` files outside `/tests/` to prevent agent reward-hacking. Some tasks (e.g. qutebrowser) ship legitimate `conftest.py` that sets up Python's import order to break a real circular dependency. The default cleanup deletes them, breaking pytest collection. + +Tasks declare opt-outs in `task.toml`: + +```toml +[verifier] +timeout_sec = 3000 + +[verifier.hardening] +cleanup_conftests = false +``` + +Available flags (all default `true` — secure-by-default): + +| Flag | Effect when `false` | +|------|---------------------| +| `cleanup_conftests` | Don't delete `conftest.py` outside `/tests/` before verify | + +Other cleanup steps (`sitecustomize.py`, `.pth` files, `*.py` in `/tmp`) always run — they have no legitimate use case in repo source trees and broaden the attack surface if disabled. + +Unknown keys in `[verifier.hardening]` are logged as warnings and ignored. String values for boolean flags are rejected (must be TOML `true` / `false`). + +--- + +## Failure modes + +The user loop catches exceptions from `user.run()` and logs them as the trial error, breaking out of the loop: + +```python +[User] round 2: prompt='Try again, focusing on...' 
+ERROR: user.run() failed at round 2: KeyError: 'spec_section' +``` + +`soft_verify()` between rounds catches its own timeouts and crashes — they surface as `RoundResult.verifier_error`, not as trial-level failures. The next round still runs; the user sees the error and decides whether to continue. + +Trajectory and tool counts are sliced per round from `Trial._trajectory`. The session counters reset on `disconnect()` between rounds, so each round's `RoundResult.trajectory` and `n_tool_calls` reflect only that round's events. + +--- + +## Worked example + +See [`examples/swebench_pro_progressive_disclosure.ipynb`](../examples/swebench_pro_progressive_disclosure.ipynb) for a 5-task SWE-bench Pro comparison: oracle vs single-round baseline vs 3-round progressive disclosure on flipt and openlibrary. + +For a minimal end-to-end script, see [`examples/user_dogfood.py`](../examples/user_dogfood.py). + +--- + +## Comparison with multi-agent Scene-based user simulation + +| Pattern | When to use | +|---------|-------------| +| `BaseUser` callback (this doc) | Programmatic, rule-based, deterministic. No second LLM. Cheap. | +| Multi-role Scene with simulated-user role ([use-cases.md §1](./use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)) | Open-ended, conversational. The "user" is another LLM with full tool access. Better for nuanced human-like interaction. | + +Both patterns coexist. Choose `BaseUser` for the lighter-weight case; choose Scenes when you actually want a second agent in the loop. diff --git a/docs/quickstart.md b/docs/quickstart.md index e087ea74..502fc94b 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -4,14 +4,14 @@ Get a benchmark result in under 5 minutes. ## Prerequisites -- Python 3.12+ +- Python 3.12+ and [uv](https://docs.astral.sh/uv/) - A Daytona API key (`DAYTONA_API_KEY`) for cloud sandboxes - An agent API key (e.g. 
`GEMINI_API_KEY` for Gemini) ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Run your first evaluation @@ -26,7 +26,8 @@ bench eval create \ -t .ref/terminal-bench-2/regex-log \ -a gemini \ -m gemini-3.1-flash-lite-preview \ - -e daytona + -e daytona \ + --sandbox-setup-timeout 300 ``` BenchFlow will: @@ -52,6 +53,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ## Python API @@ -70,11 +72,23 @@ config = TrialConfig( task_path=Path("tasks/regex-log"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=300, ) trial = await Trial.create(config) result = await trial.run() ``` +If you are using the `Agent + Environment` path directly, pass the timeout through `RuntimeConfig`: + +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/regex-log", backend="daytona") +runtime = Runtime(env, agent, config=RuntimeConfig(sandbox_setup_timeout=300)) +result = await runtime.execute() +``` + ## Multi-agent (reviewer pattern) ```python @@ -95,6 +109,7 @@ config = TrialConfig( ]), ], environment="daytona", + sandbox_setup_timeout=300, ) result = await bf.run(config) ``` diff --git a/docs/skill-eval-guide.md b/docs/skill-eval-guide.md index f0a04c68..1268222e 100644 --- a/docs/skill-eval-guide.md +++ b/docs/skill-eval-guide.md @@ -5,7 +5,7 @@ Test whether your agent skill actually helps agents perform better. ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Overview @@ -382,7 +382,7 @@ BenchFlow generates everything ephemeral — only results persist. **CI integration:** ```bash # In your skill's CI pipeline -pip install benchflow==0.3.0a3 +uv tool install benchflow bench skills eval . 
-a claude-agent-acp --no-baseline # Exit code 1 if any case scores < 0.5 ``` diff --git a/docs/task-authoring.md b/docs/task-authoring.md index adfb6415..9081987a 100644 --- a/docs/task-authoring.md +++ b/docs/task-authoring.md @@ -50,6 +50,8 @@ env = { OPENAI_API_KEY = "${OPENAI_API_KEY}" } # host vars to injec **Built-in mock services** — if the Dockerfile references a service binary (`claw-gmail`, `claw-slack`, `claw-gcal`, `claw-gdoc`, `claw-gdrive`), BenchFlow starts it automatically. No `[services]` section needed. +**Install tooling to shared prefixes, not `/root`** — when a task image ships Node.js, Python tools, or agent binaries that the sandbox user must execute, install them to `/usr/local/bin`, `/usr/local/lib`, or `/opt`, not `/root/.nvm` or `/root/.local/bin`. `setup_sandbox_user()` creates the non-root user, prepares small config/auth dirs, and chowns the workspace — it does not clone `/root` into the sandbox home. Legacy images that already install tools under `/root` still work via a narrow symlink fallback, but shared prefixes are the supported path. Pre-creating the sandbox user in the Dockerfile is an optional speedup, not a requirement. + --- ## instruction.md diff --git a/docs/use-cases.md b/docs/use-cases.md index 63fa2432..fbdf108c 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -84,6 +84,10 @@ result = await bf.run(config) - The user agent is a real LLM with full tool access -- it can read files, check outputs, and give nuanced feedback, not just templated responses. - Same task folder works for single-turn (baseline) and interactive (with user) via different YAML configs. +### Lighter-weight alternative: `BaseUser` callback + +When you don't need a second LLM and your "user" logic is rule-based or oracle-guided (e.g. compress instruction → show test failures as hints → stop on pass), use a `BaseUser` Python callback instead of a multi-role Scene. See [progressive-disclosure.md](./progressive-disclosure.md). 
This is benchflow's no-second-LLM parity answer to Harbor #1316 and was built for the SWE-bench Pro progressive-disclosure use case. + --- ## 2. Code Review Loop (followup-bench) diff --git a/examples/swebench_pro_progressive_disclosure.ipynb b/examples/swebench_pro_progressive_disclosure.ipynb new file mode 100644 index 00000000..3b6fe2cd --- /dev/null +++ b/examples/swebench_pro_progressive_disclosure.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Progressive Disclosure on SWE-bench Pro\n", + "\n", + "Demo of the `BaseUser` abstraction for multi-round agent runs on SWE-bench Pro.\n", + "\n", + "**Tasks:** ansible, flipt, openlibrary, navidrome, qutebrowser (5 oracle-passing tasks).\n", + "**Baseline agent:** Gemini 3.1 Pro Preview, single-round.\n", + "**Progressive:** `BaseUser` callback, up to 3 rounds, hints disclosed on test failure.\n", + "\n", + "Built for [Josh's GitHub/Microsoft SWE-bench Pro use case](https://github.com/swe-bench-pro/swe-bench-pro). Parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case — see [`docs/progressive-disclosure.md`](../docs/progressive-disclosure.md).\n", + "\n", + "## Setup history (2026-04-24)\n", + "\n", + "| Issue | Fix |\n", + "|-------|-----|\n", + "| `--rootdir` removed → test IDs came out as `../dev/::test_name`, verifier saw 0 passes despite all tests passing | Restored `--rootdir=/app` in `PYTEST_ADDOPTS` |\n", + "| `conftest.py` cleanup deleted qutebrowser's legitimate import-order conftests, breaking pytest collection (circular import) | Per-task `[verifier.hardening]` opt-out in `task.toml`: `cleanup_conftests = false` |\n", + "\n", + "Result: oracle 2/4 → 5/5." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
The `BaseUser` API" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:06.729067Z", + "iopub.status.busy": "2026-04-25T01:10:06.728964Z", + "iopub.status.idle": "2026-04-25T01:10:10.646349Z", + "shell.execute_reply": "2026-04-25T01:10:10.645629Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BaseUser: (self, round: 'int', instruction: 'str', round_result: 'RoundResult | None' = None) -> 'str | None'\n", + "RoundResult fields: ['round', 'trajectory', 'rewards', 'verifier_output', 'verifier_error', 'n_tool_calls']\n", + "TrialConfig user fields: ['user', 'max_user_rounds', 'oracle_access']\n" + ] + } + ], + "source": [ + "import inspect\n", + "import os\n", + "\n", + "os.chdir('/workspace/repos/benchflow')\n", + "\n", + "from benchflow import BaseUser, FunctionUser, RoundResult\n", + "from benchflow.trial import TrialConfig\n", + "\n", + "print('BaseUser:', inspect.signature(BaseUser.run))\n", + "print('RoundResult fields:', list(RoundResult.__dataclass_fields__))\n", + "print('TrialConfig user fields:', [f for f in TrialConfig.__dataclass_fields__\n", + " if f in ('user', 'max_user_rounds', 'oracle_access')])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. A minimal progressive-disclosure user\n", + "\n", + "Three-round progressive disclosure: terse prompt → hints with failing tests → full spec." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.667007Z", + "iopub.status.busy": "2026-04-25T01:10:10.666654Z", + "iopub.status.idle": "2026-04-25T01:10:10.670952Z", + "shell.execute_reply": "2026-04-25T01:10:10.670403Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User type: FunctionUser\n" + ] + } + ], + "source": [ + "def progressive(round, instruction, rr):\n", + " if round == 0:\n", + " first_line = instruction.split('\\n', 1)[0].strip()\n", + " return f'{first_line}\\n\\nRead /app/ to understand the codebase, then implement the fix. Run tests when you think you are done.'\n", + "\n", + " if rr and rr.rewards:\n", + " score = rr.rewards.get('reward', rr.rewards.get('exact_match', 0))\n", + " if score >= 1.0:\n", + " return None # passed, stop\n", + "\n", + " if round == 1:\n", + " half = len(instruction) // 2\n", + " return (\n", + " f'Tests failed:\\n{(rr.verifier_output or \"\")[:1500]}\\n\\n'\n", + " f'First half of spec:\\n{instruction[:half]}'\n", + " )\n", + "\n", + " if round == 2:\n", + " return f'Full spec:\\n{instruction}'\n", + "\n", + " return None\n", + "\n", + "user = FunctionUser(progressive)\n", + "print(f'User type: {type(user).__name__}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Oracle validation: 5/5 SWE-bench Pro tasks pass\n", + "\n", + "Verifies the gold solution (`solve.sh`) reaches reward=1.0 on every task. Required before trusting baseline/progressive numbers — without a working oracle we can't tell if low rewards mean agent failure or verifier failure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.672135Z", + "iopub.status.busy": "2026-04-25T01:10:10.672040Z", + "iopub.status.idle": "2026-04-25T01:10:10.676265Z", + "shell.execute_reply": "2026-04-25T01:10:10.675540Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Reward Time(s) Notes\n", + "------------------------------------------------------------\n", + "ansible 1.0 54.3 \n", + "flipt 1.0 480.5 \n", + "openlibrary 1.0 147.7 \n", + "qutebrowser 1.0 24.5 after [verifier.hardening] cleanup_conftests=false\n", + "navidrome 1.0 ~120 verified separately\n" + ] + } + ], + "source": [ + "import csv\n", + "from pathlib import Path\n", + "\n", + "results = list(csv.DictReader(open('experiments/swebench-pro-results.csv')))\n", + "\n", + "# qutebrowser oracle in CSV is from BEFORE the hardening opt-out fix.\n", + "# Post-fix oracle: 1.0 (verified separately on 2026-04-24).\n", + "qutebrowser_post_fix = {'task': 'qutebrowser', 'experiment': 'oracle',\n", + " 'reward': '1.0', 'elapsed_s': '24.5',\n", + " 'note': 'after [verifier.hardening] cleanup_conftests=false'}\n", + "navidrome_oracle = {'task': 'navidrome', 'experiment': 'oracle',\n", + " 'reward': '1.0', 'elapsed_s': '~120',\n", + " 'note': 'verified separately'}\n", + "\n", + "oracle = [r for r in results if r['experiment'] == 'oracle' and r['task'] != 'qutebrowser']\n", + "for r in oracle:\n", + " r['note'] = ''\n", + "oracle.append(qutebrowser_post_fix)\n", + "oracle.append(navidrome_oracle)\n", + "\n", + "print(f'{\"Task\":<14} {\"Reward\":<8} {\"Time(s)\":<8} Notes')\n", + "print('-' * 60)\n", + "for r in oracle:\n", + " print(f'{r[\"task\"]:<14} {r[\"reward\"]:<8} {r[\"elapsed_s\"]:<8} {r[\"note\"]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Baseline: Gemini 3.1 Pro single-round\n", + "\n", + "Each task gets the full SWE-bench Pro instruction in one shot. No user, no rounds." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.677398Z", + "iopub.status.busy": "2026-04-25T01:10:10.677287Z", + "iopub.status.idle": "2026-04-25T01:10:10.681221Z", + "shell.execute_reply": "2026-04-25T01:10:10.680559Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Reward Tools Time(s) \n", + "------------------------------------------\n", + "ansible 1.0 23 206.8 \n", + "flipt 0.0 61 1443.7 \n", + "openlibrary 1.0 32 340.0 \n", + "qutebrowser 0.0 72 543.6 \n", + "\n", + "Baseline (Gemini 3.1 Pro, 1 round): 2/4 passed\n" + ] + } + ], + "source": [ + "baseline = [r for r in results if r['experiment'] == 'baseline']\n", + "\n", + "print(f'{\"Task\":<14} {\"Reward\":<8} {\"Tools\":<6} {\"Time(s)\":<8}')\n", + "print('-' * 42)\n", + "for r in baseline:\n", + " print(f'{r[\"task\"]:<14} {r[\"reward\"]:<8} {r[\"n_tool_calls\"]:<6} {r[\"elapsed_s\"]:<8}')\n", + "\n", + "passed = sum(1 for r in baseline if float(r['reward']) >= 1.0)\n", + "print(f'\\nBaseline (Gemini 3.1 Pro, 1 round): {passed}/{len(baseline)} passed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Progressive disclosure: where it should help\n", + "\n", + "**flipt** is the interesting case: oracle passes (gold `solve.sh` works) but Gemini's single-round baseline fails after 61 tool calls and 24 minutes. The agent had the full spec but didn't solve it. Hypothesis: progressive disclosure with failing-test feedback could let the agent course-correct.\n", + "\n", + "**openlibrary** is the regression test: baseline already passes, so progressive disclosure should also pass. 
If progressive breaks something the baseline solves, that's a bug in the user loop.\n", + "\n", + "Run the script:\n", + "```bash\n", + "GEMINI_API_KEY=... python examples/swebench_pro_user_dogfood.py --task flipt --max-rounds 3\n", + "```\n", + "\n", + "Results land in `/tmp/swebench-pro-jobs/progressive///`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.682639Z", + "iopub.status.busy": "2026-04-25T01:10:10.682546Z", + "iopub.status.idle": "2026-04-25T01:10:10.686008Z", + "shell.execute_reply": "2026-04-25T01:10:10.685533Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latest trial: /tmp/swebench-pro-jobs/progressive/2026-04-24__18-07-33/instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1__0684d916\n", + " (no user_rounds.jsonl yet — run is in progress or failed before any round)\n" + ] + } + ], + "source": [ + "# Once you have a progressive run, point at its trial dir and load round results.\n", + "# Each round's outcome is logged to user_rounds.jsonl.\n", + "\n", + "import json\n", + "\n", + "PROGRESSIVE_ROOT = Path('/tmp/swebench-pro-jobs/progressive')\n", + "\n", + "if PROGRESSIVE_ROOT.exists():\n", + " runs = sorted(PROGRESSIVE_ROOT.iterdir())\n", + " if runs:\n", + " latest_job = runs[-1]\n", + " instances = list(latest_job.iterdir())\n", + " if instances:\n", + " trial = instances[0]\n", + " print(f'Latest trial: {trial}')\n", + " rounds_log = trial / 'user_rounds.jsonl'\n", + " if rounds_log.exists():\n", + " rounds = [json.loads(l) for l in rounds_log.read_text().splitlines()]\n", + " print(f'\\n{\"Round\":<7} {\"Reward\":<10} {\"Tools\":<7} Verifier error')\n", + " print('-' * 60)\n", + " for r in rounds:\n", + " score = (r.get('rewards') or {}).get('reward', '?')\n", + " err = (r.get('verifier_error') or '')[:30]\n", + " print(f'{r[\"round\"]:<7} {score!s:<10} {r[\"n_tool_calls\"]:<7} {err}')\n", + " 
else:\n", + " print(' (no user_rounds.jsonl yet — run is in progress or failed before any round)')\n", + "else:\n", + " print(f'No progressive runs at {PROGRESSIVE_ROOT} yet.')\n", + " print('Run examples/swebench_pro_user_dogfood.py to generate one.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Per-task hardening opt-outs\n", + "\n", + "qutebrowser ships legitimate `conftest.py` files that fix a circular import in its own source. The default verifier hardening deletes them, breaking pytest collection. The task opts out:\n", + "\n", + "```toml\n", + "# .ref/swebenchpro/instance_qutebrowser__.../task.toml\n", + "[verifier.hardening]\n", + "cleanup_conftests = false\n", + "```\n", + "\n", + "Available flags (all default `true` — secure-by-default):\n", + "\n", + "| Flag | Effect when `false` |\n", + "|------|---------------------|\n", + "| `cleanup_conftests` | Don't delete `conftest.py` outside `/tests/` before verify |\n", + "\n", + "See [`docs/progressive-disclosure.md`](../docs/progressive-disclosure.md) for the full design rationale." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Comparison with multi-agent simulated user\n", + "\n", + "BenchFlow has two patterns for multi-round agent runs:\n", + "\n", + "| Pattern | When to use |\n", + "|---------|-------------|\n", + "| `BaseUser` callback (this notebook) | Programmatic, rule-based, deterministic. No second LLM. Cheap. |\n", + "| Multi-role Scene with simulated-user role ([use-cases.md §1](../docs/use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)) | Open-ended, conversational. The 'user' is another LLM with full tool access. |\n", + "\n", + "Both are functionally at parity with [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316), avoiding the FastMCP sidecar requirement. Choose based on whether you want a deterministic callback or another LLM agent." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/swebench_pro_user_dogfood.py b/examples/swebench_pro_user_dogfood.py new file mode 100644 index 00000000..997ccc8b --- /dev/null +++ b/examples/swebench_pro_user_dogfood.py @@ -0,0 +1,135 @@ +"""Dogfood: SWE-bench Pro progressive disclosure with BaseUser. + +Demonstrates the BaseUser abstraction on a SWE-bench Pro task — the original +motivation for this feature (Josh's GitHub/Microsoft use case). + +The user: + Round 0: terse problem description (one sentence from the spec). + Round 1+: if tests failed, append the failing test names and a section of + the original spec as a hint. + Stops when reward >= 1.0 or max_user_rounds hit. + +Usage: + GEMINI_API_KEY=... python examples/swebench_pro_user_dogfood.py + GEMINI_API_KEY=... 
python examples/swebench_pro_user_dogfood.py --task openlibrary + +Tasks available (oracle-validated 5/5 on 2026-04-24): + ansible, flipt, openlibrary, navidrome, qutebrowser +""" + +import argparse +import asyncio +import logging +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig +from benchflow.user import FunctionUser, RoundResult + +logging.basicConfig(level=logging.INFO, format="%(name)s %(message)s") + + +SWEBENCH_PRO_TASKS = { + "ansible": "instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2", + "flipt": "instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1", + "openlibrary": "instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26", + "navidrome": "instance_navidrome__navidrome-0130c6dc13438b48cf0fdfab08a89e357b5517c9", + "qutebrowser": "instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171", +} + + +def make_progressive_user() -> FunctionUser: + """User that compresses the instruction on round 0 and discloses hints on failure.""" + + def progressive( + round: int, instruction: str, rr: RoundResult | None + ) -> str | None: + # Round 0: terse — first line of the spec. + if round == 0: + first_line = instruction.split("\n", 1)[0].strip() + return ( + f"{first_line}\n\n" + "Read /app/ to understand the codebase, then implement the fix. " + "Run tests when you think you're done." + ) + + # Stop on success. + if rr and rr.rewards: + score = rr.rewards.get("reward", rr.rewards.get("exact_match", 0)) + if score >= 1.0: + print(f" [User] reward={score} at round {round}, stopping.") + return None + + # Round 1: show failing tests + first half of spec. 
+ if round == 1: + half = len(instruction) // 2 + return ( + "The verifier reported these issues:\n\n" + f"{(rr.verifier_output or '')[:1500]}\n\n" + "Here is the first half of the spec for context:\n\n" + f"{instruction[:half]}\n\n" + "Continue working in /app/." + ) + + # Round 2: full spec. + if round == 2: + return ( + "Still failing. Full specification:\n\n" + f"{instruction}\n\n" + "Address every requirement listed above." + ) + + return None + + return FunctionUser(progressive) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="flipt", + choices=list(SWEBENCH_PRO_TASKS), + help="Which SWE-bench Pro task to run", + ) + parser.add_argument("--agent", default="gemini") + parser.add_argument("--model", default="gemini-3.1-pro-preview") + parser.add_argument("--backend", default="daytona") + parser.add_argument("--max-rounds", type=int, default=3) + args = parser.parse_args() + + task_dir = SWEBENCH_PRO_TASKS[args.task] + task_path = Path(".ref/swebenchpro") / task_dir + if not task_path.exists(): + print(f"Task not found: {task_path}") + return + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=args.agent, model=args.model)], + environment=args.backend, + sandbox_user="agent", + user=make_progressive_user(), + max_user_rounds=args.max_rounds, + jobs_dir="/tmp/swebench-pro-jobs/progressive", + ) + + print(f"Progressive disclosure on {args.task}") + print(f" Agent: {args.agent} / {args.model}") + print(f" Backend: {args.backend}") + print(f" Rounds: up to {args.max_rounds}") + print() + + result = await bf.run(config) + + print("\n=== RESULT ===") + print(f" Task: {result.task_name}") + print(f" Rewards: {result.rewards}") + print(f" Tool calls: {result.n_tool_calls}") + print(f" Error: {result.error}") + if result.trial_dir: + print(f" Trial dir: {result.trial_dir}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/user_dogfood.py 
b/examples/user_dogfood.py new file mode 100644 index 00000000..517afc0c --- /dev/null +++ b/examples/user_dogfood.py @@ -0,0 +1,93 @@ +"""Dogfood: run regex-log task with progressive-disclosure User. + +Demonstrates the BaseUser abstraction — a FunctionUser that: + Round 0: gives a terse version of the instruction + Round 1+: if tests failed, gives a hint based on the full instruction + Stops when tests pass or max_rounds hit. + +Usage: + GEMINI_API_KEY=... python examples/user_dogfood.py +""" + +import asyncio +import logging +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig +from benchflow.user import FunctionUser, RoundResult + +logging.basicConfig(level=logging.INFO, format="%(name)s %(message)s") + + +def progressive_user( + round: int, instruction: str, rr: RoundResult | None +) -> str | None: + if round == 0: + return ( + "Write a regex that matches dates (YYYY-MM-DD) in log lines " + "containing an IPv4 address. Save it to /app/regex.txt" + ) + + if rr and rr.rewards: + score = rr.rewards.get("exact_match", rr.rewards.get("reward", 0)) + if score >= 1.0: + print(f" [User] Tests passed at round {round}! Stopping.") + return None + + if round == 1: + return ( + "The tests failed. Important details you may have missed:\n" + "- Match only the LAST date in each line\n" + "- Feb can have up to 29 days\n" + "- Dates/IPs must not be preceded/followed by alphanumeric chars\n" + "- Use re.findall with re.MULTILINE\n" + "Fix /app/regex.txt" + ) + + if round == 2: + return ( + "Still failing. 
Here's the full instruction:\n\n" + instruction + ) + + return None + + +async def main(): + task_path = Path(".ref/terminal-bench-2/regex-log") + if not task_path.exists(): + print(f"Task not found at {task_path}") + return + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent="gemini", model="gemini-2.5-flash")], + environment="daytona", + user=FunctionUser(progressive_user), + max_user_rounds=4, + ) + + print("Running progressive-disclosure trial on regex-log...") + print(" Agent: gemini/flash") + print(f" Max rounds: {config.max_user_rounds}") + print(f" Environment: {config.environment}") + print() + + result = await bf.run(config) + + print("\n=== RESULT ===") + print(f" Task: {result.task_name}") + print(f" Rewards: {result.rewards}") + print(f" Error: {result.error}") + print(f" Tool calls: {result.n_tool_calls}") + + if result.rewards: + score = result.rewards.get("exact_match", result.rewards.get("reward", 0)) + if score >= 1.0: + print(" PASSED (final verify)") + else: + print(f" FAILED (score={score})") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/user_dogfood_results/result.json b/examples/user_dogfood_results/result.json new file mode 100644 index 00000000..0f360595 --- /dev/null +++ b/examples/user_dogfood_results/result.json @@ -0,0 +1,25 @@ +{ + "task_name": "regex-log", + "trial_name": "regex-log__bab1cbf6", + "rewards": { + "reward": 0.0 + }, + "agent": "gemini", + "agent_name": "gemini-cli", + "model": "gemini-2.5-flash", + "n_tool_calls": 8, + "n_prompts": 1, + "error": null, + "verifier_error": null, + "partial_trajectory": false, + "trajectory_source": "acp", + "started_at": "2026-04-22 22:37:42.034339", + "finished_at": "2026-04-22 22:40:58.748485", + "timing": { + "environment_setup": 1.3, + "agent_setup": 3.1, + "agent_execution": 15.6, + "verifier": 6.4, + "total": 196.7 + } +} \ No newline at end of file diff --git a/examples/user_dogfood_results/user_rounds.jsonl 
b/examples/user_dogfood_results/user_rounds.jsonl new file mode 100644 index 00000000..258dfaff --- /dev/null +++ b/examples/user_dogfood_results/user_rounds.jsonl @@ -0,0 +1,3 @@ +{"round": 0, "prompt": "Write a regex that matches dates (YYYY-MM-DD) in log lines containing an IPv4 address. Save it to /app/regex.txt", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 2, "n_trajectory_events": 4} +{"round": 1, "prompt": "The tests failed. Important details you may have missed:\n- Match only the LAST date in each line\n- Feb can have up to 29 days\n- Dates/IPs must not be preceded/followed by alphanumeric chars\n- Use re.findall with re.MULTILINE\nFix /app/regex.txt", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 3, "n_trajectory_events": 5} +{"round": 2, "prompt": "Still failing. Here's the full instruction:\n\nWrite a regex expression that matches dates in the format YYYY-MM-DD appearing in lines that contain an IPv4 address in a log file.\nIf multiple dates are present in a line, the regex should match only the last date in that line.\nAssume that February can have up to 29 days in all years, without distinguishing leap years from non-leap years.\nIPv4 addresses use normal decimal notation without leading zeros in each octet.\n\nNote: Be careful that there might be text in the log that looks similar to dates or IPv4 addresses but is not (e.g., user 1134-12-1234). 
\nTo avoid false matches, ensure that valid dates and IPv4 addresses are not immediately preceded or followed by alphanumeric characters.\n\nSave your regex in /app/regex.txt\nThe regex will be read from the file and applied to the log file contents using Python's re.findall with the re.MULTILINE flag.\nExample Python usage:\n```\nimport re\n\nwith open(\"/app/regex.txt\") as f:\n pattern = f.read().strip()\n\nmatches = re.findall(pattern, log_text, re.MULTILINE)\n```", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 3, "n_trajectory_events": 5} diff --git a/experiments/reviewer_ablation.py b/experiments/reviewer_ablation.py index 5acaecab..8ab0caec 100644 --- a/experiments/reviewer_ablation.py +++ b/experiments/reviewer_ablation.py @@ -17,25 +17,26 @@ import os import sys import time -from datetime import datetime from pathlib import Path logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) +import contextlib + +from harbor.models.task.task import Task +from harbor.models.trial.paths import TrialPaths +from harbor.verifier.verifier import Verifier + from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_env import resolve_agent_env from benchflow._agent_setup import install_agent from benchflow._credentials import upload_subscription_auth, write_credential_files from benchflow._env_setup import _create_environment from benchflow._sandbox import setup_sandbox_user -from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.sdk import SDK -from harbor.models.task.task import Task -from harbor.verifier.verifier import Verifier -from harbor.models.trial.paths import TrialPaths logger = logging.getLogger(__name__) @@ -141,10 +142,8 @@ async def _run_acp(env, prompt: str, trial_dir: Path, timeout: 
int = 600) -> tup try: _, n_tools = await execute_prompts(acp_client, session, [prompt], timeout=timeout) finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass return n_tools, int(time.time() - t0) @@ -204,7 +203,7 @@ async def run_reviewer(task_path: Path, task_name: str, condition: str) -> dict: total_tools += n_tools # Read coder's outbox - outbox_result = await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") + await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") await env.exec("rm -rf /app/.outbox/*") # Phase 2: Reviewer diff --git a/experiments/swebench-pro-results.csv b/experiments/swebench-pro-results.csv new file mode 100644 index 00000000..2c10f0b2 --- /dev/null +++ b/experiments/swebench-pro-results.csv @@ -0,0 +1,9 @@ +agent,elapsed_s,error,experiment,model,n_tool_calls,reward,task,task_full +,54.3,,oracle,,0,1.0,ansible,instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2 +,480.5,,oracle,,0,1.0,flipt,instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1 +,147.7,,oracle,,0,1.0,openlibrary,instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26 +,24.3,,oracle,,0,0.0,qutebrowser,instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171 +gemini,206.8,,baseline,gemini-3.1-pro-preview,23,1.0,ansible,instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2 +gemini,1443.7,,baseline,gemini-3.1-pro-preview,61,0.0,flipt,instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1 +gemini,340.0,,baseline,gemini-3.1-pro-preview,32,1.0,openlibrary,instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26 
+gemini,543.6,,baseline,gemini-3.1-pro-preview,72,0.0,qutebrowser,instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171 diff --git a/experiments/swebench_pro_oracle_and_baseline.py b/experiments/swebench_pro_oracle_and_baseline.py new file mode 100644 index 00000000..00b865db --- /dev/null +++ b/experiments/swebench_pro_oracle_and_baseline.py @@ -0,0 +1,263 @@ +"""SWE-bench Pro: oracle validation + Harbor baseline. + +Two experiments on the 4 testable SWE-bench Pro tasks: + + 1) Oracle validation — runs gold solution (solve.sh), verifies reward=1.0 + for all 4 tasks. Confirms the --rootdir=/tests fix resolved qutebrowser + and openlibrary failures. + + 2) Harbor baseline — single-round agent evaluation (no progressive + disclosure). Runs the same 4 tasks with a real agent to establish + baseline pass rates for comparison with progressive disclosure. + +Usage: + # Oracle only (no API key needed): + python experiments/swebench_pro_oracle_and_baseline.py --oracle-only + + # Full run (oracle + baseline): + GEMINI_API_KEY=... python experiments/swebench_pro_oracle_and_baseline.py + + # Customize agent/model/backend: + GEMINI_API_KEY=... 
python experiments/swebench_pro_oracle_and_baseline.py \ + --agent gemini --model gemini-3.1-pro-preview --backend daytona + +Results → experiments/swebench-pro-results.csv +""" + +import argparse +import asyncio +import csv +import logging +import os +import sys +import time +from pathlib import Path + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s %(message)s", +) +logger = logging.getLogger(__name__) + +sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig + +SWEBENCH_PRO_ROOT = Path(__file__).resolve().parents[1] / ".ref" / "swebenchpro" + +TASKS = [ + "instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2", + "instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1", + "instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26", + "instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171", +] + +TASK_LABELS = { + "instance_ansible": "ansible", + "instance_flipt": "flipt", + "instance_internet": "openlibrary", + "instance_qutebrowser": "qutebrowser", +} + +RESULTS_FILE = Path(__file__).parent / "swebench-pro-results.csv" +JOBS_DIR = Path("/tmp/swebench-pro-jobs") + + +def task_label(task_name: str) -> str: + for prefix, label in TASK_LABELS.items(): + if task_name.startswith(prefix): + return label + return task_name[:30] + + +async def run_oracle(task_path: Path, backend: str) -> dict: + """Run oracle (gold solution) on a single task. 
Returns result dict.""" + label = task_label(task_path.name) + logger.info(f"[oracle] {label}: starting") + t0 = time.time() + + config = TrialConfig( + task_path=task_path, + agent="oracle", + environment=backend, + sandbox_user="agent", + jobs_dir=str(JOBS_DIR / "oracle"), + ) + + try: + result = await bf.run(config) + elapsed = time.time() - t0 + reward = None + if result.rewards: + reward = result.rewards.get("reward", result.rewards.get("exact_match")) + logger.info( + f"[oracle] {label}: reward={reward} " + f"error={result.error!r} ({elapsed:.0f}s)" + ) + return { + "experiment": "oracle", + "task": label, + "task_full": task_path.name, + "reward": reward, + "error": result.error or "", + "elapsed_s": round(elapsed, 1), + "n_tool_calls": result.n_tool_calls, + } + except Exception as e: + elapsed = time.time() - t0 + logger.error(f"[oracle] {label}: crashed: {e}", exc_info=True) + return { + "experiment": "oracle", + "task": label, + "task_full": task_path.name, + "reward": None, + "error": str(e), + "elapsed_s": round(elapsed, 1), + "n_tool_calls": 0, + } + + +async def run_baseline( + task_path: Path, agent: str, model: str, backend: str +) -> dict: + """Run single-round baseline (no progressive disclosure) on a single task.""" + label = task_label(task_path.name) + logger.info(f"[baseline] {label}: starting ({agent}/{model})") + t0 = time.time() + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=agent, model=model)], + environment=backend, + sandbox_user="agent", + jobs_dir=str(JOBS_DIR / "baseline"), + ) + + try: + result = await bf.run(config) + elapsed = time.time() - t0 + reward = None + if result.rewards: + reward = result.rewards.get("reward", result.rewards.get("exact_match")) + logger.info( + f"[baseline] {label}: reward={reward} " + f"tools={result.n_tool_calls} error={result.error!r} ({elapsed:.0f}s)" + ) + return { + "experiment": "baseline", + "task": label, + "task_full": task_path.name, + "agent": agent, + "model": 
model, + "reward": reward, + "error": result.error or "", + "elapsed_s": round(elapsed, 1), + "n_tool_calls": result.n_tool_calls, + } + except Exception as e: + elapsed = time.time() - t0 + logger.error(f"[baseline] {label}: crashed: {e}", exc_info=True) + return { + "experiment": "baseline", + "task": label, + "task_full": task_path.name, + "agent": agent, + "model": model, + "reward": None, + "error": str(e), + "elapsed_s": round(elapsed, 1), + "n_tool_calls": 0, + } + + +async def main(): + parser = argparse.ArgumentParser(description="SWE-bench Pro oracle + baseline") + parser.add_argument("--oracle-only", action="store_true", help="Skip baseline") + parser.add_argument("--agent", default=os.environ.get("AGENT", "gemini")) + parser.add_argument("--model", default=os.environ.get("MODEL", "gemini-3.1-pro-preview")) + parser.add_argument("--backend", default=os.environ.get("BACKEND", "docker")) + parser.add_argument("--concurrency", type=int, default=2) + args = parser.parse_args() + + task_paths = [SWEBENCH_PRO_ROOT / t for t in TASKS] + missing = [p for p in task_paths if not p.exists()] + if missing: + logger.error(f"Missing tasks: {[p.name for p in missing]}") + sys.exit(1) + + JOBS_DIR.mkdir(parents=True, exist_ok=True) + results: list[dict] = [] + + # Phase 1: Oracle validation (all 4 tasks in parallel) + logger.info("=" * 60) + logger.info("Phase 1: Oracle validation (gold solution)") + logger.info("=" * 60) + + sem = asyncio.Semaphore(args.concurrency) + + async def bounded_oracle(tp): + async with sem: + return await run_oracle(tp, args.backend) + + oracle_results = await asyncio.gather( + *[bounded_oracle(tp) for tp in task_paths] + ) + results.extend(oracle_results) + + oracle_pass = sum(1 for r in oracle_results if r["reward"] == 1.0) + logger.info(f"Oracle: {oracle_pass}/{len(oracle_results)} passed") + + if oracle_pass < len(oracle_results): + failed = [r["task"] for r in oracle_results if r["reward"] != 1.0] + logger.warning(f"Oracle failures: 
{failed}") + + # Phase 2: Baseline (single-round agent) + if not args.oracle_only: + logger.info("") + logger.info("=" * 60) + logger.info(f"Phase 2: Baseline ({args.agent}/{args.model})") + logger.info("=" * 60) + + async def bounded_baseline(tp): + async with sem: + return await run_baseline(tp, args.agent, args.model, args.backend) + + baseline_results = await asyncio.gather( + *[bounded_baseline(tp) for tp in task_paths] + ) + results.extend(baseline_results) + + baseline_pass = sum(1 for r in baseline_results if r.get("reward", 0) == 1.0) + logger.info(f"Baseline: {baseline_pass}/{len(baseline_results)} passed") + + # Write CSV + if results: + fieldnames = list(results[0].keys()) + all_keys = set() + for r in results: + all_keys.update(r.keys()) + fieldnames = sorted(all_keys) + + with open(RESULTS_FILE, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + logger.info(f"Results written to {RESULTS_FILE}") + + # Summary table + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"{'Experiment':<12} {'Task':<15} {'Reward':<10} {'Tools':<8} {'Time':<8} {'Error'}") + print("-" * 80) + for r in results: + reward_str = str(r.get("reward", "—")) + tools_str = str(r.get("n_tool_calls", "—")) + time_str = f"{r.get('elapsed_s', 0):.0f}s" + error_str = (r.get("error", "") or "")[:40] + print(f"{r['experiment']:<12} {r['task']:<15} {reward_str:<10} {tools_str:<8} {time_str:<8} {error_str}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/experiments/validate_multi_scene_lifecycle.py b/experiments/validate_multi_scene_lifecycle.py index e55eea26..4e6bf877 100644 --- a/experiments/validate_multi_scene_lifecycle.py +++ b/experiments/validate_multi_scene_lifecycle.py @@ -162,7 +162,7 @@ async def main(): r3 = await test_followup_bench() results["followup"] = r3.rewards - logger.info(f"\n=== DOGFOOD RESULTS ===") + logger.info("\n=== DOGFOOD RESULTS ===") for name, 
rewards in results.items(): reward = (rewards or {}).get("reward", "N/A") logger.info(f" {name}: reward={reward}") diff --git a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py index 1c85417f..df887aa9 100644 --- a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py @@ -6,7 +6,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py index 6debbc31..a4742c25 100644 --- a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py @@ -3,7 +3,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/run_comparison.py b/labs/benchjack-sandbox-hardening/run_comparison.py index e01b1df8..04963bb8 100755 --- a/labs/benchjack-sandbox-hardening/run_comparison.py +++ b/labs/benchjack-sandbox-hardening/run_comparison.py @@ -136,7 +136,7 @@ def main() -> int: return 1 n = len(PATTERNS) - total_steps = 2 + 2 * n # 2 venvs + 2 versions × n patterns + total_steps = 2 + 2 * n # 2 venvs + 2 versions x n patterns print(f"[1/{total_steps}] venv: benchflow==0.2.0 (PyPI)") _create_venv(VENVS_DIR / "bf-0.2.0", ["benchflow==0.2.0"]) diff --git a/labs/reward-hack-matrix/exploits/conftest_payload.py b/labs/reward-hack-matrix/exploits/conftest_payload.py index 30fe23f7..7397066a 100644 --- a/labs/reward-hack-matrix/exploits/conftest_payload.py +++ b/labs/reward-hack-matrix/exploits/conftest_payload.py @@ -16,7 +16,7 @@ 
terminal-bench-2 if its verifier ever touches conftest.py). """ -import pytest # noqa: F401 — pytest must see this as a real plugin module +import pytest def pytest_collection_modifyitems(config, items): diff --git a/labs/reward-hack-matrix/run_matrix.py b/labs/reward-hack-matrix/run_matrix.py index d5dfc439..53a86b04 100644 --- a/labs/reward-hack-matrix/run_matrix.py +++ b/labs/reward-hack-matrix/run_matrix.py @@ -30,6 +30,7 @@ import argparse import asyncio +import contextlib import json import os import re @@ -447,13 +448,11 @@ async def shutdown(self) -> None: if self.proc is None: return if self.proc.stdin and not self.proc.stdin.is_closing(): - try: + with contextlib.suppress(OSError, BrokenPipeError): self.proc.stdin.close() - except (OSError, BrokenPipeError): - pass try: await asyncio.wait_for(self.proc.wait(), timeout=30) - except asyncio.TimeoutError: + except TimeoutError: self.proc.kill() await self.proc.wait() if self.reader_task: @@ -717,7 +716,7 @@ def main() -> int: if args.sweep: summary_path = Path(args.summary_path) if args.summary_path else (JOBS_DIR / "matrix_sweep.json") print( - f"[sweep] {len(cells)} cells × {len(VERSIONS)} versions = " + f"[sweep] {len(cells)} cells x {len(VERSIONS)} versions = " f"{len(cells) * len(VERSIONS)} trials, concurrency={args.concurrency}" ) results = asyncio.run( diff --git a/pyproject.toml b/pyproject.toml index ff627c43..62f8f468 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "benchflow" -version = "0.3.0a10" +version = "0.3.2" description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." 
readme = "README.md" requires-python = ">=3.12" @@ -96,6 +96,16 @@ ignore = [ "RUF022", # __all__ unsorted — grouped by section for agent-friendliness ] +[tool.ruff.lint.per-file-ignores] +# Standalone scripts — sys.path manipulation before imports is intentional +"experiments/*.py" = ["E402"] +"tests/conformance/*.py" = ["E402"] +# Notebooks: cell-local imports + short loop vars are notebook conventions +"examples/*.ipynb" = ["E402", "E741", "SIM115"] +# Forward references resolved via __future__ annotations — ruff flags them +# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads. +"src/benchflow/runtime.py" = ["F821"] + [tool.ty.environment] python-version = "3.12" diff --git a/src/benchflow/__init__.py b/src/benchflow/__init__.py index 809292d7..1d9129a0 100644 --- a/src/benchflow/__init__.py +++ b/src/benchflow/__init__.py @@ -19,13 +19,14 @@ ExecResult, Task, TaskConfig, - Trial, Verifier, VerifierResult, ) # benchflow's additions from benchflow._env_setup import stage_dockerfile_deps +from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene +from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.acp.client import ACPClient from benchflow.acp.session import ACPSession from benchflow.agents.registry import ( @@ -53,16 +54,16 @@ RuntimeResult, run, # bf.run(agent, env) — the primary 0.3 API ) -from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene -from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.sdk import SDK -from benchflow.trial import Trial, TrialConfig -from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn -from benchflow.trial_yaml import trial_config_from_yaml from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill from benchflow.trajectories.otel import OTelCollector from benchflow.trajectories.proxy import TrajectoryProxy from benchflow.trajectories.types import 
Trajectory +from benchflow.trial import Role as TrialRole +from benchflow.trial import Scene as TrialScene +from benchflow.trial import Trial, TrialConfig, Turn +from benchflow.trial_yaml import trial_config_from_yaml +from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult # Public API surface. Anything not in this list is implementation detail and # may change without notice. Names are grouped by source module to match the @@ -123,6 +124,12 @@ "TrialRole", "TrialScene", "Turn", + "trial_config_from_yaml", + # User abstraction (progressive disclosure) + "BaseUser", + "FunctionUser", + "PassthroughUser", + "RoundResult", # SDK (backwards compat) "SDK", # Environments / dep staging diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index f1b04c7b..58b95bed 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -18,6 +18,7 @@ """ import asyncio +import contextlib import logging from pathlib import Path @@ -26,7 +27,8 @@ from benchflow.acp.client import ACPClient from benchflow.acp.container_transport import ContainerTransport from benchflow.agents.providers import strip_provider_prefix -from benchflow.process import DaytonaProcess, DockerProcess +from benchflow.agents.registry import AGENTS +from benchflow.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess logger = logging.getLogger(__name__) @@ -34,6 +36,52 @@ _ACP_CONNECT_MAX_RETRIES = 3 _ACP_CONNECT_BASE_DELAY = 2.0 +# models.dev provider inference — used when acp_model_format="provider/model" +# to reconstruct "provider/model" from a bare model name. 
+_MODELSDEV_PROVIDER_HEURISTICS: list[tuple[str, str]] = [ + # (substring in model name, models.dev provider ID) + ("gemini", "google"), + ("gemma", "google"), + ("gpt", "openai"), + ("o1", "openai"), + ("o3", "openai"), + ("o4", "openai"), + ("claude", "anthropic"), + ("haiku", "anthropic"), + ("sonnet", "anthropic"), + ("opus", "anthropic"), + ("mistral", "mistral"), + ("codestral", "mistral"), +] + + +def _format_acp_model(model: str, agent: str) -> str: + """Format a model ID for ACP session/set_model based on agent requirements. + + Most agents expect a bare model name (e.g. "claude-sonnet-4-6"). + Agents with acp_model_format="provider/model" (e.g. opencode) need the + models.dev provider prefix (e.g. "google/gemini-3.1-pro-preview"). + + Strips benchflow's custom provider prefixes first, then re-adds the + models.dev provider prefix when the agent requires it. + """ + bare = strip_provider_prefix(model) + agent_cfg = AGENTS.get(agent) + if not agent_cfg or agent_cfg.acp_model_format != "provider/model": + return bare + # Already has a slash — assume it's provider/model already + if "/" in bare: + return bare + # Infer the models.dev provider from the bare model name + m = bare.lower() + for substring, provider in _MODELSDEV_PROVIDER_HEURISTICS: + if substring in m: + return f"{provider}/{bare}" + logger.warning( + "Cannot infer models.dev provider for %r — defaulting to anthropic/", bare + ) + return f"anthropic/{bare}" + async def connect_acp( env, @@ -66,7 +114,6 @@ async def connect_acp( agent_launch = build_priv_drop_cmd(agent_launch, sandbox_user) logger.info(f"Agent sandboxed as: {sandbox_user}") - last_err: Exception | None = None acp_client: ACPClient | None = None for attempt in range(_ACP_CONNECT_MAX_RETRIES + 1): if attempt > 0: @@ -78,7 +125,12 @@ async def connect_acp( if environment == "docker": live_proc = DockerProcess.from_harbor_env(env) else: - live_proc = await DaytonaProcess.from_harbor_env(env) + is_dind = hasattr(env, "_strategy") and 
hasattr(env._strategy, "_compose_cmd") + if is_dind: + live_proc = await DaytonaPtyProcess.from_harbor_env(env) + logger.info("Using PTY transport for DinD compose task") + else: + live_proc = await DaytonaProcess.from_harbor_env(env) agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt" transport = ContainerTransport( @@ -101,12 +153,9 @@ async def connect_acp( except ConnectionError as e: # Close the failed client before retrying if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass acp_client = None - last_err = e if attempt == _ACP_CONNECT_MAX_RETRIES: raise logger.warning(f"ACP connect failed (attempt {attempt + 1}): {e}") @@ -114,19 +163,20 @@ async def connect_acp( except Exception: # Non-retryable error — close client to prevent leak if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass raise - if model: - acp_model_id = strip_provider_prefix(model) + agent_cfg = AGENTS.get(agent) + if model and (agent_cfg is None or agent_cfg.supports_acp_set_model): + acp_model_id = _format_acp_model(model, agent) try: await asyncio.wait_for(acp_client.set_model(acp_model_id), timeout=60) logger.info(f"Model set to: {acp_model_id} (from {model})") except Exception as e: logger.warning(f"Failed to set model via ACP: {e}") + elif model: + logger.info(f"Skipping ACP set_model for {agent} — launch/env config owns model selection") return acp_client, session, agent_name diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 72d64986..0229b173 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -26,6 +26,32 @@ logger = logging.getLogger(__name__) +_AUTH_CONTEXT_GROUPS = ( + frozenset({"ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN"}), + frozenset({"GEMINI_API_KEY", "GOOGLE_API_KEY"}), +) +_EXPLICIT_AGENT_NATIVE_BRIDGE_KEYS = frozenset({"LLM_API_KEY"}) + + +def 
_normalize_openhands_model(model: str) -> str: + """Translate benchflow model IDs to OpenHands/LiteLLM model IDs. + + OpenHands expects provider-qualified model names for some providers even + when benchflow uses bare model IDs or its own provider prefixes. + """ + from benchflow.agents.providers import strip_provider_prefix + from benchflow.agents.registry import is_vertex_model + + if model.startswith(("gemini/", "vertex_ai/", "openhands/")): + return model + stripped = strip_provider_prefix(model) + lower = model.lower() + if is_vertex_model(model) and "gemini" in lower: + return f"vertex_ai/{stripped}" + if "gemini" in lower: + return f"gemini/{stripped}" + return stripped + def auto_inherit_env(agent_env: dict[str, str]) -> None: """Copy well-known API keys from host os.environ into agent_env.""" @@ -38,6 +64,7 @@ def auto_inherit_env(agent_env: dict[str, str]) -> None: "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", "GOOGLE_CLOUD_PROJECT", "GOOGLE_CLOUD_LOCATION", "LLM_API_KEY", @@ -54,6 +81,9 @@ def auto_inherit_env(agent_env: dict[str, str]) -> None: # Mirror GEMINI_API_KEY as GOOGLE_API_KEY (some agents expect one or the other) if "GEMINI_API_KEY" in agent_env and "GOOGLE_API_KEY" not in agent_env: agent_env["GOOGLE_API_KEY"] = agent_env["GEMINI_API_KEY"] + # Mirror GEMINI_API_KEY as GOOGLE_GENERATIVE_AI_API_KEY (opencode/models.dev convention) + if "GEMINI_API_KEY" in agent_env and "GOOGLE_GENERATIVE_AI_API_KEY" not in agent_env: + agent_env["GOOGLE_GENERATIVE_AI_API_KEY"] = agent_env["GEMINI_API_KEY"] # CLAUDE_CODE_OAUTH_TOKEN is a separate auth path — Claude CLI reads it # directly. Don't map to ANTHROPIC_API_KEY (different auth mechanism). 
@@ -118,11 +148,22 @@ def resolve_provider_env( _key = agent_env.get(_prov_cfg.auth_env, "") if _key: agent_env.setdefault("BENCHFLOW_PROVIDER_API_KEY", _key) + else: + # No registered provider prefix — bridge the model's well-known API key + # to BENCHFLOW_PROVIDER_API_KEY so env_mapping can translate it to + # agent-native vars (e.g. GEMINI_API_KEY → LLM_API_KEY for openhands). + from benchflow.agents.registry import infer_env_key_for_model + + _inferred = infer_env_key_for_model(model) + if _inferred and _inferred in agent_env: + agent_env.setdefault("BENCHFLOW_PROVIDER_API_KEY", agent_env[_inferred]) # Apply agent env_mapping: translate BENCHFLOW_PROVIDER_* → agent-native vars if agent_cfg and agent_cfg.env_mapping: for src, dst in agent_cfg.env_mapping.items(): if src in agent_env: agent_env.setdefault(dst, agent_env[src]) + if agent == "openhands": + agent_env.setdefault("LLM_MODEL", _normalize_openhands_model(model)) def check_subscription_auth(agent: str, required_key: str) -> bool: @@ -136,6 +177,18 @@ def check_subscription_auth(agent: str, required_key: str) -> bool: return Path(sa.detect_file).expanduser().is_file() +def _shares_auth_context(required_key: str | None, candidate_key: str | None) -> bool: + """True when both keys represent the same provider auth context.""" + if not required_key or not candidate_key: + return False + if required_key == candidate_key: + return True + return any( + required_key in group and candidate_key in group + for group in _AUTH_CONTEXT_GROUPS + ) + + def resolve_agent_env( agent: str, model: str | None, @@ -143,19 +196,61 @@ def resolve_agent_env( ) -> dict[str, str]: """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) + explicit_agent_env_keys = set(agent_env) auto_inherit_env(agent_env) + pre_provider_env = dict(agent_env) + agent_cfg = AGENTS.get(agent) # Oracle runs solve.sh and never calls an LLM — model env vars and # API-key validation are skipped 
even if a caller forwards a model. if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) + if agent_cfg and agent_cfg.env_mapping: + for src, dst in agent_cfg.env_mapping.items(): + if src in agent_env and dst not in explicit_agent_env_keys: + # Provider resolution must override unrelated host-native + # vars auto-inherited from the environment, but preserve + # explicit agent_env overrides supplied by the caller. + agent_env[dst] = agent_env[src] # Validate required API key for the chosen model from benchflow.agents.registry import infer_env_key_for_model required_key = infer_env_key_for_model(model) - # CLAUDE_CODE_OAUTH_TOKEN is an alternative auth path for Claude agents - has_oauth = "CLAUDE_CODE_OAUTH_TOKEN" in agent_env or "ANTHROPIC_AUTH_TOKEN" in agent_env - if required_key and required_key not in agent_env and not has_oauth: + mapped_provider_key = ( + agent_cfg.env_mapping.get("BENCHFLOW_PROVIDER_API_KEY") + if agent_cfg + else None + ) + has_agent_native_bridge_key = bool( + mapped_provider_key + and pre_provider_env.get(mapped_provider_key) + and ( + _shares_auth_context(required_key, mapped_provider_key) + or ( + mapped_provider_key in _EXPLICIT_AGENT_NATIVE_BRIDGE_KEYS + and mapped_provider_key in explicit_agent_env_keys + ) + ) + ) + if has_agent_native_bridge_key: + # Only pre-existing same-provider aliases or explicit generic bridge + # keys can satisfy provider auth. Values synthesized by env_mapping + # or inherited from another provider context must not bypass the + # model's required credential. 
+ agent_env.setdefault( + "BENCHFLOW_PROVIDER_API_KEY", + pre_provider_env[mapped_provider_key], + ) + has_oauth = any( + key in agent_env and _shares_auth_context(required_key, key) + for key in ("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_AUTH_TOKEN") + ) + if ( + required_key + and required_key not in agent_env + and not has_oauth + and not has_agent_native_bridge_key + ): if check_subscription_auth(agent, required_key): agent_env["_BENCHFLOW_SUBSCRIPTION_AUTH"] = "1" logger.info( @@ -183,7 +278,6 @@ def resolve_agent_env( ) else: # No model specified — still check subscription auth for required env vars - agent_cfg = AGENTS.get(agent) if agent_cfg: for req_key in agent_cfg.requires_env: if req_key not in agent_env and check_subscription_auth(agent, req_key): diff --git a/src/benchflow/_agent_setup.py b/src/benchflow/_agent_setup.py index 39b1720c..0c04a5a8 100644 --- a/src/benchflow/_agent_setup.py +++ b/src/benchflow/_agent_setup.py @@ -30,21 +30,76 @@ logger = logging.getLogger(__name__) +def _skill_link_cmd(source: str, dest: str) -> str: + """Link a shared skills tree into an agent discovery path.""" + if source == dest: + return f"mkdir -p {shlex.quote(dest)}" + + parent = shlex.quote(str(Path(dest).parent)) + q_source = shlex.quote(source) + q_dest = shlex.quote(dest) + return ( + f"mkdir -p {parent} && " + f"rm -rf {q_dest} && " + f"ln -sfn {q_source} {q_dest}" + ) + + +async def _link_skill_paths(env, source: str, skill_paths: list[str], home: str, cwd: str) -> int: + """Link one shared skills tree into each configured discovery path.""" + parts = [] + for sp in skill_paths: + expanded = sp.replace("$HOME", home).replace("$WORKSPACE", cwd) + parts.append(_skill_link_cmd(source, expanded)) + if parts: + cmd = " && ".join(parts) + result = await env.exec(cmd, timeout_sec=15) + if result.return_code != 0: + stdout = (getattr(result, "stdout", "") or "").strip() + stderr = (getattr(result, "stderr", "") or "").strip() + details = [ + f"exit code 
{result.return_code}", + f"command: {cmd}", + ] + if stdout: + details.append(f"stdout: {stdout}") + if stderr: + details.append(f"stderr: {stderr}") + raise RuntimeError( + f"Failed to link skills from {source}: {'; '.join(details)}" + ) + return len(parts) + + async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: """Install agent in sandbox and return its config.""" agent_base = agent.split()[0] agent_cfg = AGENTS.get(agent_base) if agent_base not in AGENT_INSTALLERS: return agent_cfg + install_cmd = AGENT_INSTALLERS[agent_base] install_timeout = agent_cfg.install_timeout if agent_cfg else 900 logger.info(f"Installing {agent_base} in sandbox (timeout={install_timeout}s)...") install_result = await env.exec( - AGENT_INSTALLERS[agent_base], + install_cmd, timeout_sec=install_timeout, ) install_log = trial_dir / "agent" / "install-stdout.txt" install_log.parent.mkdir(parents=True, exist_ok=True) - install_log.write_text(install_result.stdout or "") + stdout = install_result.stdout or "" + stderr = install_result.stderr or "" + parts = [f"$ {install_cmd}\n"] + if stdout: + parts.append("=== stdout ===\n") + parts.append(stdout) + if not stdout.endswith("\n"): + parts.append("\n") + if stderr: + parts.append("=== stderr ===\n") + parts.append(stderr) + if not stderr.endswith("\n"): + parts.append("\n") + install_log.write_text("".join(parts)) if install_result.return_code != 0: diag = await env.exec( "echo 'OS:' && cat /etc/os-release 2>/dev/null | head -2; " @@ -55,7 +110,7 @@ async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: raise AgentInstallError( agent=agent_base, return_code=install_result.return_code, - stdout=install_result.stdout or "", + stdout="".join(parts), diagnostics=diag.stdout or "", log_path=str(install_log), ) @@ -72,6 +127,9 @@ async def deploy_skills( task: "Task", ) -> None: """Deploy and distribute skills into sandbox.""" + task_skills_dir = task.config.environment.skills_dir + 
effective_skills = task_skills_dir + # Runtime upload (fallback if not baked into Dockerfile) if skills_dir: dockerfile = task_path / "environment" / "Dockerfile" @@ -84,38 +142,24 @@ async def deploy_skills( if skills_path.is_dir(): logger.info(f"Deploying skills via runtime upload from {skills_path}") await env.upload_dir(skills_path, "/skills") - if agent_cfg and agent_cfg.skill_paths: - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", "/root").replace( - "$WORKSPACE", "/app" - ) - parent = str(Path(expanded).parent) - parts.append( - f"mkdir -p '{parent}' && ln -sf /skills '{expanded}'" - ) - await env.exec(" && ".join(parts), timeout_sec=10) - logger.info("Skills deployed to /skills and symlinked") + logger.info("Skills deployed to /skills") + effective_skills = "/skills" else: logger.warning(f"Skills dir not found: {skills_path}") else: logger.info("Skills already injected via Dockerfile") # Distribute to agent-specific discovery paths - task_skills_dir = task.config.environment.skills_dir - effective_skills = "/skills" if skills_dir else task_skills_dir if effective_skills and agent_cfg and agent_cfg.skill_paths: home = f"/home/{sandbox_user}" if sandbox_user else "/root" - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", home).replace("$WORKSPACE", agent_cwd) - q_expanded = shlex.quote(expanded) - q_skills = shlex.quote(effective_skills) - parts.append( - f"mkdir -p {q_expanded} && cp -r {q_skills}/. 
{q_expanded}/ 2>/dev/null" - ) - if parts: - await env.exec("; ".join(parts), timeout_sec=15) + count = await _link_skill_paths( + env, + effective_skills, + agent_cfg.skill_paths, + home, + agent_cwd, + ) + if count: logger.info( - f"Skills distributed to {len(parts)} paths for {agent_cfg.name}" + f"Skills distributed to {count} paths for {agent_cfg.name}" ) diff --git a/src/benchflow/_daytona_patches.py b/src/benchflow/_daytona_patches.py new file mode 100644 index 00000000..56abe783 --- /dev/null +++ b/src/benchflow/_daytona_patches.py @@ -0,0 +1,102 @@ +"""Idempotent patches to the Daytona SDK to work around upstream bugs. + +Imported (and applied) lazily from ``_env_setup._create_environment`` so the +SDK is only touched when a Daytona environment is actually being built. +""" + +import asyncio +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +_PATCHED = False + + +def apply() -> None: + """Install workarounds for known Daytona SDK bugs. + + Currently: + * ``AsyncProcess.get_session_command_logs`` occasionally raises + ``pydantic.ValidationError`` because the server returns an empty + string instead of a JSON object for ``SessionCommandLogsResponse``. + Reproduces in SDK 0.168.x and 0.169.x. Wrap with a small bounded + retry that returns an empty-but-valid response if every attempt + fails — callers can still observe the command's exit_code via + ``get_session_command``, so a missing logs payload is recoverable. 
+ """ + global _PATCHED + if _PATCHED: + return + + try: + from daytona._async.process import AsyncProcess + from daytona.common.errors import DaytonaError + from daytona.common.process import SessionCommandLogsResponse + except Exception: # pragma: no cover — SDK not installed / layout changed + logger.debug("daytona SDK not importable; skipping patches", exc_info=True) + return + + try: + from pydantic import ValidationError + except Exception: # pragma: no cover + return + + # AsyncProcess.get_session_command_logs is decorated by intercept_errors + # at class definition, which converts every inner exception (including + # the pydantic ValidationError we care about) into a DaytonaError. The + # decorated bound method is what we capture here, so we have to match + # on the wrapped DaytonaError shape too — not just ValidationError. + original = AsyncProcess.get_session_command_logs + + _MALFORMED_MARKER = "SessionCommandLogsResponse" + + def _is_malformed_logs_error(exc: BaseException) -> bool: + if isinstance(exc, ValidationError): + return True + return isinstance(exc, DaytonaError) and _MALFORMED_MARKER in str(exc) + + async def _patched_get_session_command_logs( + self: Any, session_id: str, command_id: str + ) -> SessionCommandLogsResponse: + # Harbor already wraps this call in tenacity (3 attempts), so + # additional retries here are usually wasted on a deterministic + # malformed payload. Try once more with a small delay in case it + # IS transient, then return an empty-but-valid response so the + # caller can still observe the command's exit_code via + # get_session_command. Original error is logged for triage. 
+ attempts = 2 + delay = 0.5 + last_exc: BaseException | None = None + for attempt in range(1, attempts + 1): + try: + return await original(self, session_id, command_id) + except (ValidationError, DaytonaError) as exc: + if not _is_malformed_logs_error(exc): + raise + last_exc = exc + logger.warning( + "daytona get_session_command_logs malformed payload " + "(attempt %d/%d) for session=%s command=%s: %s", + attempt, + attempts, + session_id, + command_id, + exc, + ) + if attempt < attempts: + await asyncio.sleep(delay) + + logger.error( + "daytona get_session_command_logs malformed %d times for " + "session=%s command=%s; falling back to empty logs (%s)", + attempts, + session_id, + command_id, + last_exc, + ) + return SessionCommandLogsResponse(output="", stdout="", stderr="") + + AsyncProcess.get_session_command_logs = _patched_get_session_command_logs # type: ignore[method-assign] + _PATCHED = True + logger.debug("daytona SDK patches applied") diff --git a/src/benchflow/_env_setup.py b/src/benchflow/_env_setup.py index f3aa7cd8..9800ffc7 100644 --- a/src/benchflow/_env_setup.py +++ b/src/benchflow/_env_setup.py @@ -20,6 +20,7 @@ # build) instead of erroring out. Override via env if running on a paid tier. 
_DAYTONA_MAX_CPUS = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_CPUS", "4")) _DAYTONA_MAX_MEMORY_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_MEMORY_MB", "8192")) +_DAYTONA_MAX_STORAGE_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_STORAGE_MB", "10240")) # Directories to ignore when copying deps _IGNORE_DIRS = { @@ -260,6 +261,10 @@ def _create_environment( elif environment_type == "daytona": from harbor.environments.daytona import DaytonaEnvironment + from benchflow._daytona_patches import apply as _apply_daytona_patches + + _apply_daytona_patches() + env_config = task.config.environment if env_config.cpus > _DAYTONA_MAX_CPUS: logger.warning( @@ -275,6 +280,13 @@ def _create_environment( _DAYTONA_MAX_MEMORY_MB, ) env_config.memory_mb = _DAYTONA_MAX_MEMORY_MB + if env_config.storage_mb > _DAYTONA_MAX_STORAGE_MB: + logger.warning( + "Clamping storage_mb %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_STORAGE_MB)", + env_config.storage_mb, + _DAYTONA_MAX_STORAGE_MB, + ) + env_config.storage_mb = _DAYTONA_MAX_STORAGE_MB return DaytonaEnvironment( environment_dir=task.paths.environment_dir, diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 41ba87b3..7294a3d5 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -1,7 +1,7 @@ """Sandbox user setup, path lockdown, and verifier hardening. 
Owns the "agent runs as non-root" lifecycle: - - Creating the sandbox user and copying root's tooling into its home + - Creating the sandbox user and preparing minimal home state it needs - Building the privilege-drop wrapper (setpriv / su) for agent launch - Locking down solution/test paths so the sandbox user cannot read them - Hardening the environment before the verifier runs @@ -16,7 +16,6 @@ import os import re import shlex -import tomllib from pathlib import Path from typing import TYPE_CHECKING @@ -87,9 +86,7 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: setpriv (util-linux) execs directly; su -l is the fallback for Alpine/BusyBox. No outer sh -c wrapper — DockerProcess wraps in bash -c already. """ - inner = ( - f"export HOME=/home/{sandbox_user} && cd /home/{sandbox_user} && {agent_launch}" - ) + inner = f"export HOME=/home/{sandbox_user} && {agent_launch}" quoted = shlex.quote(inner) return ( f"if setpriv --help 2>&1 | grep -q reuid; then" @@ -100,6 +97,20 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: ) +def _legacy_root_tool_link_cmd(source: str, dest: str) -> str: + """Link legacy root-only tool dirs into the sandbox home when needed.""" + src = shlex.quote(source) + dst = shlex.quote(dest) + parent = shlex.quote(str(Path(dest).parent)) + return ( + f"if [ -e {src} ] && [ ! 
-L {dst} ]; then " + f"mkdir -p {parent} && " + f"rmdir {dst} 2>/dev/null || true; " + f"[ -e {dst} ] || ln -s {src} {dst}; " + "fi" + ) + + async def setup_sandbox_user( env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120 ) -> str: @@ -109,18 +120,17 @@ async def setup_sandbox_user( f"Invalid sandbox_user: {sandbox_user!r} (must be alphanumeric)" ) logger.info(f"Setting up sandbox user: {sandbox_user}") + home = f"/home/{sandbox_user}" + home_dirs = sorted(d for d in get_sandbox_home_dirs() if d != ".local") await env.exec( f"id -u {sandbox_user} >/dev/null 2>&1 || " f"useradd -m -s /bin/bash {sandbox_user} && " - f"mkdir -p /home/{sandbox_user}/.local/bin && " - "if [ -d /root/.local/bin ]; then " - f"cp -aL /root/.local/bin/. /home/{sandbox_user}/.local/bin/ 2>/dev/null || true; fi && " - "if [ -d /root/.nvm ]; then " - f"cp -a /root/.nvm/. /home/{sandbox_user}/.nvm/ 2>/dev/null || true; fi && " - f"for d in {' '.join(sorted(get_sandbox_home_dirs()))}; do " - f"if [ -d /root/$d ]; then mkdir -p /home/{sandbox_user}/$d && " - f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && " - f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && " + f"{_legacy_root_tool_link_cmd('/root/.local/bin', f'{home}/.local/bin')} && " + f"{_legacy_root_tool_link_cmd('/root/.nvm', f'{home}/.nvm')} && " + f"for d in {' '.join(home_dirs)}; do " + f"if [ -d /root/$d ]; then mkdir -p {home}/$d && " + f"cp -a /root/$d/. 
{home}/$d/ 2>/dev/null || true; fi; done && " + f"chown -R {sandbox_user}:{sandbox_user} {home} && " f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}", timeout_sec=timeout_sec, ) @@ -294,7 +304,7 @@ async def _refresh_verifier_workspace(env, workspace: str) -> None: "PYTEST_ADDOPTS": ( "-c /dev/null " # block pyproject.toml/pytest.ini/tox.ini/setup.cfg discovery "--confcutdir=/tests " # block conftest.py walk-up beyond /tests - "--rootdir=/tests " + "--rootdir=/app " # anchor test node IDs to repo root (not /dev from -c) "-p no:cacheprovider" ), # Block pytest11 entry-point plugins. An agent can modify a pre-installed @@ -336,34 +346,24 @@ async def _refresh_verifier_workspace(env, workspace: str) -> None: _SAFE_VERIFIER_PATH_PARTS = tuple(_SAFE_VERIFIER_PATH.split(":")) _RUNTIME_PATH_PREFIXES = ("/tmp", "/var/tmp", "/logs", "/testbed") -# pytest plugin names are not always the same as the PyPI distribution name -# or the option they register. These aliases cover the common benchmark -# verifier plugins while preserving PYTEST_DISABLE_PLUGIN_AUTOLOAD=1. -_PYTEST_PLUGIN_ALIASES = { - "ctrf": "ctrf", - "pytest-json-ctrf": "ctrf", - "pytest_json_ctrf": "ctrf", - "pytest_json_ctrf.plugin": "ctrf", - "pytest-json-report": "pytest_jsonreport", - "pytest_json_report": "pytest_jsonreport", - "pytest_jsonreport": "pytest_jsonreport", - "pytest_jsonreport.plugin": "pytest_jsonreport", -} -_PYTEST_OPTION_PLUGINS = { - "--ctrf": "ctrf", - "--json-report": "pytest_jsonreport", - "--json-report-file": "pytest_jsonreport", -} - -# Pytest plugins worth auto-loading when test.sh pip-installs them but the -# task author forgot to declare pytest_plugins in task.toml. Map distribution -# name (as it appears in `pip install pytest-foo`) to importable plugin name. 
-_PYTEST_INSTALLED_PLUGINS = { - "pytest-asyncio": "pytest_asyncio", - "pytest-anyio": "anyio.pytest_plugin", - "pytest-trio": "pytest_trio", -} -_PIP_INSTALL_RE = re.compile(r"\bpip3?\s+install\b[^\n;|&]*", re.IGNORECASE) +# Container-side script to enumerate pre-installed pytest11 entry points. +# Runs after sandbox_user processes are killed, so the agent cannot install +# new packages between enumeration and verification. The sandbox_user cannot +# pip install (not root), so all discovered plugins are image-authored. +_DISCOVER_PYTEST_PLUGINS_SCRIPT = r""" +import json, sys +try: + from importlib.metadata import entry_points + try: + eps = list(entry_points(group='pytest11')) + except TypeError: + eps = list(entry_points().get('pytest11', [])) + names = sorted(set(ep.name for ep in eps)) + print(json.dumps(names)) +except Exception as e: + print(json.dumps({"error": str(e)}), file=sys.stderr) + print("[]") +""".strip() def _under_path(path: str, prefix: str) -> bool: @@ -383,6 +383,21 @@ def _blocked_verifier_path_prefixes( return tuple(dict.fromkeys(prefixes)) +def _blocked_verifier_pythonpath_prefixes( + sandbox_user: str | None, +) -> tuple[str, ...]: + """Paths blocked from verifier PYTHONPATH. + + Unlike PATH, the workspace is NOT blocked: PYTHONPATH entries like /app + are set by the Dockerfile for project imports, and the workspace is + already importable via CWD/pytest sys.path insertion regardless. 
+ """ + prefixes = list(_RUNTIME_PATH_PREFIXES) + if sandbox_user: + prefixes.append(f"/home/{sandbox_user}") + return tuple(dict.fromkeys(prefixes)) + + def _merge_trusted_verifier_path(extras: list[str]) -> str: """Prepend validated image PATH entries to the verifier allowlist.""" kept: list[str] = [] @@ -451,76 +466,38 @@ def _trusted_path_extras_cmd(raw_path: str, blocked_prefixes: tuple[str, ...]) - ) -def _normalize_pytest_plugin(name: object) -> str | None: - """Return the importable pytest plugin name for a task declaration.""" - if not isinstance(name, str): - return None - clean = name.strip() - if not clean: - return None - return _PYTEST_PLUGIN_ALIASES.get(clean, clean) - +async def _discover_pytest_plugin_flags(env, task: "Task") -> str: + """Auto-discover pytest plugins from root-owned system packages. -def _plugins_from_verifier_script(task: "Task") -> list[str]: - """Infer known pytest plugins needed by legacy verifier scripts. - - Older SkillsBench/TB2 tasks predate task-level pytest plugin metadata and - call options such as --ctrf directly from tests/test.sh. With pytest entry - point autoload disabled, those options must be backed by explicit -p flags. + Runs a container-side script that enumerates pytest11 entry points and + filters to only those whose dist-info is in a root-owned directory. + Falls back to task.toml pytest_plugins declarations if discovery fails. + Replaces the previous hand-curated whitelist mechanism. """ - task_dir = getattr(task, "task_dir", None) - if not isinstance(task_dir, (str, os.PathLike)): - return [] - test_sh = Path(task_dir) / "tests" / "test.sh" - try: - content = test_sh.read_text() - except OSError: - return [] - plugins: list[str] = [] - for option, plugin in _PYTEST_OPTION_PLUGINS.items(): - if option in content and plugin not in plugins: - plugins.append(plugin) - # Detect pip-installed pytest plugins so PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 - # doesn't silently drop them. 
Only matches the exact installer line so - # arbitrary text mentioning the plugin name is ignored. - for match in _PIP_INSTALL_RE.findall(content): - for dist, plugin in _PYTEST_INSTALLED_PLUGINS.items(): - if dist in match and plugin not in plugins: - plugins.append(plugin) - return plugins - - -def _declared_pytest_plugins(task: "Task") -> list[object]: - """Return pytest_plugins from the model, falling back to raw task.toml.""" - declared = getattr(task.config.verifier, "pytest_plugins", None) - if declared: - return list(declared) - task_dir = getattr(task, "task_dir", None) - if not isinstance(task_dir, (str, os.PathLike)): - return [] - config_path = Path(task_dir) / "task.toml" + # Container-side auto-discovery try: - data = tomllib.loads(config_path.read_text()) - except (OSError, tomllib.TOMLDecodeError): - return [] - plugins = data.get("verifier", {}).get("pytest_plugins", []) - if isinstance(plugins, list): - return plugins - return [] + result = await env.exec( + f"python3 -c {shlex.quote(_DISCOVER_PYTEST_PLUGINS_SCRIPT)}", + user="root", timeout_sec=15, + ) + if result.stderr: + logger.debug(f"Plugin discovery stderr: {result.stderr.strip()}") + discovered = _json.loads(result.stdout or "[]") + if isinstance(discovered, list): + plugins.extend(p for p in discovered if isinstance(p, str) and p.strip()) + logger.info(f"Discovered {len(plugins)} pytest plugins from container") + except Exception as e: + logger.warning(f"Pytest plugin discovery failed, using task.toml fallback: {e}") + # Merge task.toml declarations as fallback + declared = getattr(task.config.verifier, "pytest_plugins", None) + if declared: + for name in declared: + if isinstance(name, str) and name.strip() and name.strip() not in plugins: + plugins.append(name.strip()) -def _pytest_plugin_flags(task: "Task") -> str: - """Build deterministic -p flags for inferred and declared pytest plugins.""" - plugins: list[str] = [] - for plugin in _plugins_from_verifier_script(task): - if plugin not 
in plugins: - plugins.append(plugin) - for plugin in _declared_pytest_plugins(task): - normalized = _normalize_pytest_plugin(plugin) - if normalized and normalized not in plugins: - plugins.append(normalized) return " ".join(f"-p {shlex.quote(p)}" for p in plugins) @@ -585,6 +562,31 @@ async def _trusted_verifier_path( return _merge_trusted_verifier_path([e for e in extras if isinstance(e, str)]) +async def _trusted_verifier_pythonpath( + env, sandbox_user: str | None, +) -> str: + """Return filtered PYTHONPATH preserving only trusted image entries. + + Same root-owned, non-world-writable validation as PATH, but does not + block the workspace — it is already importable via CWD/pytest and + is chowned to root before verification. + """ + pp_result = await env.exec("printenv PYTHONPATH 2>/dev/null || true", user="root", timeout_sec=10) + raw_pp = (pp_result.stdout or "").strip() + if not raw_pp: + return "" + blocked = _blocked_verifier_pythonpath_prefixes(sandbox_user) + cmd = _trusted_path_extras_cmd(raw_pp, blocked) + result = await env.exec(cmd, user="root", timeout_sec=10) + try: + extras = _json.loads(result.stdout or "[]") + except _json.JSONDecodeError: + return "" + if not isinstance(extras, list): + return "" + return ":".join(e for e in extras if isinstance(e, str)) + + # Wipe and recreate /logs/verifier/ before the verifier runs. # rm -rf severs hardlinks, removes symlink replacements, and eliminates # variant filenames/subdirs the agent may have pre-staged. @@ -592,26 +594,87 @@ async def _trusted_verifier_path( "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier" ) -# Remove injected conftest.py, sitecustomize.py/usercustomize.py, and .pth -# files from writable sys.path entries (preserves /usr/lib, /usr/local/lib). -# Also purge *.py from temp dirs: covers module-shadow via non-workspace cwd. 
-CLEANUP_CMD = ( - "find / -name conftest.py -not -path '/tests/*' -delete 2>/dev/null; " - "find /tmp /var/tmp -name '*.py' -delete 2>/dev/null; " - 'python3 -c "' - "import sys,os;" - "[os.remove(os.path.join(d,f)) " - " for d in sys.path " - " for f in ('sitecustomize.py','usercustomize.py') " - " if d and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " - " and os.path.isfile(os.path.join(d,f))];" - "[os.remove(os.path.join(d,f)) " - " for d in sys.path if d and os.path.isdir(d) " - " for f in os.listdir(d) if f.endswith('.pth') " - " and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " - " and os.path.isfile(os.path.join(d,f))]" - '" 2>/dev/null || true' -) +# Per-task hardening opt-outs. Tasks declare these in task.toml under +# [verifier.hardening] when their legitimate test setup conflicts with the +# default cleanup (e.g. qutebrowser ships a real conftest.py that the cleanup +# would otherwise delete, breaking pytest collection). +# +# Defaults are secure (all True). Tasks opt out individually: +# +# [verifier.hardening] +# cleanup_conftests = false # don't delete conftest.py before verify +HARDENING_DEFAULTS: dict[str, bool] = { + "cleanup_conftests": True, +} + + +def _read_hardening_config(task_dir: "Path | str | None") -> dict[str, bool]: + """Read [verifier.hardening] section from task.toml. 
Returns merged defaults.""" + import tomllib + from pathlib import Path as _Path + + result = dict(HARDENING_DEFAULTS) + if task_dir is None: + return result + toml_path = _Path(task_dir) / "task.toml" + if not toml_path.exists(): + return result + try: + with open(toml_path, "rb") as f: + data = tomllib.load(f) + except Exception as e: + logger.warning(f"task.toml parse error in {task_dir}: {e}") + return result + overrides = data.get("verifier", {}).get("hardening", {}) + for k, v in overrides.items(): + if k in result and isinstance(v, bool): + result[k] = v + else: + logger.warning( + f"task.toml [verifier.hardening] unknown/invalid: {k}={v!r}" + ) + return result + + +def _build_cleanup_cmd(hardening: dict[str, bool] | None = None) -> str: + """Build the cleanup shell command, honoring per-task hardening opt-outs. + + Steps: + - conftest.py removal outside /tests (skippable via cleanup_conftests=false) + - *.py purge from /tmp /var/tmp (always — covers module-shadow via cwd) + - sitecustomize.py/usercustomize.py removal from writable sys.path + - .pth removal from writable sys.path + + sitecustomize/usercustomize/.pth always run — opt-outs there would broaden + the attack surface beyond what real-world tasks need. 
+ """ + h = hardening or HARDENING_DEFAULTS + parts: list[str] = [] + if h.get("cleanup_conftests", True): + parts.append( + "find / -name conftest.py -not -path '/tests/*' -delete 2>/dev/null" + ) + parts.append("find /tmp /var/tmp -name '*.py' -delete 2>/dev/null") + parts.append( + 'python3 -c "' + "import sys,os;" + "[os.remove(os.path.join(d,f)) " + " for d in sys.path " + " for f in ('sitecustomize.py','usercustomize.py') " + " if d and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " + " and os.path.isfile(os.path.join(d,f))];" + "[os.remove(os.path.join(d,f)) " + " for d in sys.path if d and os.path.isdir(d) " + " for f in os.listdir(d) if f.endswith('.pth') " + " and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " + " and os.path.isfile(os.path.join(d,f))]" + '" 2>/dev/null || true' + ) + return "; ".join(parts) + + +# Backward-compat: the all-defaults cleanup command. +CLEANUP_CMD = _build_cleanup_cmd() async def harden_before_verify( @@ -711,9 +774,11 @@ async def harden_before_verify( f"chown -R root:root {shlex.quote(workspace)}", user="root", ) - await env.exec(CLEANUP_CMD, user="root", timeout_sec=10) + hardening = _read_hardening_config(getattr(task, "task_dir", None)) + await env.exec(_build_cleanup_cmd(hardening), user="root", timeout_sec=10) hardened_path = await _trusted_verifier_path(env, sandbox_user, workspace) + hardened_pythonpath = await _trusted_verifier_pythonpath(env, sandbox_user) distro_env = await _distro_pip_env(env) verifier_env = dict(VERIFIER_ENV) @@ -725,16 +790,16 @@ async def harden_before_verify( # plugin loading, or inject code via breakpoint()/coverage/Django/Celery # startup hooks. 
verifier_env["PATH"] = hardened_path + verifier_env["PYTHONPATH"] = hardened_pythonpath verifier_env["PYTEST_DISABLE_PLUGIN_AUTOLOAD"] = "1" verifier_env["PYTHONBREAKPOINT"] = "0" verifier_env["COVERAGE_PROCESS_START"] = "" verifier_env["DJANGO_SETTINGS_MODULE"] = "" verifier_env["CELERY_CONFIG_MODULE"] = "" - # Re-enable known verifier plugins by appending -p flags to the hardened - # base — never to a task-supplied PYTEST_ADDOPTS. Legacy task sets are - # inferred from tests/test.sh; newer tasks may declare pytest_plugins. + # Auto-discover pytest plugins from root-owned system packages and + # task.toml declarations. Appends -p flags to the hardened base. base_addopts = VERIFIER_ENV["PYTEST_ADDOPTS"] - flags = _pytest_plugin_flags(task) + flags = await _discover_pytest_plugin_flags(env, task) if flags: verifier_env["PYTEST_ADDOPTS"] = base_addopts + f" {flags}" else: diff --git a/src/benchflow/acp/client.py b/src/benchflow/acp/client.py index 5fa3a658..57a55eb9 100644 --- a/src/benchflow/acp/client.py +++ b/src/benchflow/acp/client.py @@ -88,8 +88,9 @@ async def _read_until_response(self, request_id: int) -> dict[str, Any]: f"has_result={'result' in msg} has_error={'error' in msg}" ) - # It's a response to our request - if "id" in msg and msg["id"] == request_id: + # It's a response to our request (has id, no method — distinguishes + # from echoed requests when running through a PTY) + if "id" in msg and msg["id"] == request_id and "method" not in msg: if msg.get("error"): raise ACPError( msg["error"].get("code", -1), @@ -97,6 +98,7 @@ async def _read_until_response(self, request_id: int) -> dict[str, Any]: ) return msg.get("result", {}) + # It's a notification (no id) if "method" in msg and "id" not in msg: try: diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index f1d26bb7..83bf2c13 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -157,9 +157,19 @@ class AgentConfig: home_dirs: list[str] = 
field(default_factory=list) # Extra dot-dirs under $HOME to copy to sandbox user (for dirs not # derivable from skill_paths or credential_files, e.g. ".openclaw"). + acp_model_format: str = "bare" + # How the agent expects the modelId in session/set_model: + # "bare" — just the model name (e.g. "claude-sonnet-4-6"). + # Default; works for claude-agent-acp, codex-acp. + # "provider/model" — models.dev convention (e.g. "google/gemini-3.1-pro-preview"). + # Required by opencode, which uses Provider.parseModel() + # to split on "/" and treats the first segment as provider ID. subscription_auth: SubscriptionAuth | None = None # Host CLI login that can substitute for an API key (e.g. OAuth tokens # from `claude login`). Detected automatically; API keys take precedence. + supports_acp_set_model: bool = True + # Some ACP agents configure the model through env/config at launch time and + # do not implement session/set_model (e.g. OpenHands CLI ACP). # Agent registry — all supported agents @@ -308,16 +318,72 @@ class AgentConfig: ], ), ), + "opencode": AgentConfig( + name="opencode", + description="OpenCode via ACP — open-source coding agent (TypeScript)", + skill_paths=["$HOME/.opencode/skills"], + home_dirs=[".opencode"], + install_cmd=( + f"{_NODE_INSTALL} && " + "( command -v opencode >/dev/null 2>&1 || " + "npm install -g opencode-ai@latest >/dev/null 2>&1 ) && " + "command -v opencode >/dev/null 2>&1" + ), + launch_cmd="opencode acp", + protocol="acp", + requires_env=[], # inferred from --model at runtime + acp_model_format="provider/model", + # OpenCode uses models.dev provider IDs — its parseModel() splits + # modelId on "/" so set_model must send "google/gemini-3.1-pro-preview", + # not just "gemini-3.1-pro-preview". 
+ env_mapping={ + "BENCHFLOW_PROVIDER_BASE_URL": "OPENAI_BASE_URL", + }, + ), "openhands": AgentConfig( name="openhands", description="OpenHands agent via ACP (multi-model, Python-based)", - skill_paths=[], + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/.agents/skills"], + home_dirs=[".openhands"], install_cmd=( - "( command -v openhands >/dev/null 2>&1 || " - "pip install openhands >/dev/null 2>&1 ) && " + "export DEBIAN_FRONTEND=noninteractive && " + "export PATH=\"$HOME/.local/bin:$PATH\" && " + "( command -v curl >/dev/null 2>&1 || " + " ( apt-get update -qq && " + " apt-get install -y -qq curl ca-certificates >/dev/null 2>&1 ) ) && " + "( command -v openhands >/dev/null 2>&1 || ( " + " UV_OK=0; " + " if command -v uv >/dev/null 2>&1; then " + " UV_VER=$(uv --version 2>/dev/null | awk '{print $2}'); " + " if [ -n \"$UV_VER\" ] && " + " [ \"$(printf '%s\\n' 0.11.6 \"$UV_VER\" | sort -V | head -n1)\" = \"0.11.6\" ]; then " + " UV_OK=1; " + " fi; " + " fi; " + " if [ \"$UV_OK\" = 0 ]; then " + " curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && " + " export PATH=\"$HOME/.local/bin:$PATH\"; " + " fi && " + " ( uv tool list 2>/dev/null | grep -q '^openhands\\b' || " + " uv tool install openhands --python 3.12 >/dev/null 2>&1 || " + " curl -fsSL https://install.openhands.dev/install.sh | sh >/dev/null 2>&1 ) " + ") ) && " + # Let sandbox user traverse to uv-managed Python interpreter path. + "chmod o+x /root /root/.local /root/.local/share " + "/root/.local/share/uv /root/.local/share/uv/tools 2>/dev/null; " + # Seed config so OpenHands ACP auth check passes before env override. 
+ "mkdir -p ~/.openhands && " + "echo '{\"llm\":{\"model\":\"placeholder\",\"api_key\":\"placeholder\"}}' " + "> ~/.openhands/agent_settings.json && " "command -v openhands >/dev/null 2>&1" ), - launch_cmd="openhands acp --always-approve --override-with-envs", + launch_cmd=( + "export PATH=\"$HOME/.local/bin:$PATH\" && " + "mkdir -p ~/.openhands && " + "printf '{\"llm\":{\"model\":\"%s\",\"api_key\":\"%s\"}}' " + "\"$LLM_MODEL\" \"$LLM_API_KEY\" > ~/.openhands/agent_settings.json && " + "openhands acp --always-approve --override-with-envs" + ), protocol="acp", requires_env=["LLM_API_KEY"], api_protocol="", @@ -325,6 +391,7 @@ class AgentConfig: "BENCHFLOW_PROVIDER_BASE_URL": "LLM_BASE_URL", "BENCHFLOW_PROVIDER_API_KEY": "LLM_API_KEY", }, + supports_acp_set_model=False, ), } @@ -336,21 +403,18 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: - """Collect all dot-dirs under $HOME that sandbox user setup should copy. + """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. Derives from three sources across all registered agents: - - skill_paths: $HOME/.foo/... → ".foo" - credential_files: {home}/.foo/... → ".foo" + - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") - Always includes ".local" (pip scripts, etc.). + Skill paths are excluded: deploy_skills() now links those paths directly to a + shared skills tree instead of relying on sandbox-home copies. """ - dirs: set[str] = {".local"} + dirs: set[str] = set() for cfg in AGENTS.values(): - for sp in cfg.skill_paths: - if sp.startswith("$HOME/."): - dirname = sp.removeprefix("$HOME/").split("/")[0] - dirs.add(dirname) for cf in cfg.credential_files: # path uses {home}/.foo/... 
placeholder path = cf.path @@ -496,6 +560,8 @@ def register_agent( credential_files: list[CredentialFile] | None = None, home_dirs: list[str] | None = None, subscription_auth: SubscriptionAuth | None = None, + acp_model_format: str = "bare", + supports_acp_set_model: bool = True, ) -> AgentConfig: """Register a custom agent at runtime. @@ -525,6 +591,8 @@ def register_agent( credential_files=credential_files or [], home_dirs=home_dirs or [], subscription_auth=subscription_auth, + acp_model_format=acp_model_format, + supports_acp_set_model=supports_acp_set_model, ) AGENTS[name] = config AGENT_INSTALLERS[name] = install_cmd diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 58b410fc..17cea1dd 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -31,7 +31,8 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model +from benchflow.job import DEFAULT_AGENT +from benchflow.job import effective_model as _effective_model console = Console() diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 96611bbd..9c9b7f23 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -433,9 +433,9 @@ def skills_eval( typer.Argument(help="Path to skill directory containing evals/evals.json"), ], agent: Annotated[ - list[str], + list[str] | None, typer.Option("--agent", "-a", help="Agent(s) to evaluate (repeatable)"), - ] = ["claude-agent-acp"], + ] = None, model: Annotated[ list[str] | None, typer.Option("--model", "-m", help="Model(s) (matched 1:1 with agents)"), @@ -473,6 +473,8 @@ def skills_eval( """ from benchflow.skill_eval import SkillEvaluator, export_gepa_traces + if agent is None: + agent = ["claude-agent-acp"] if not (skill_dir / "evals" / "evals.json").exists(): console.print( f"[red]No evals/evals.json found in {skill_dir}[/red]\n" @@ -738,6 +740,13 @@ def eval_create( str | None, typer.Option("--sandbox-user", 
help="Sandbox user (null for root)"), ] = "agent", + sandbox_setup_timeout: Annotated[ + int, + typer.Option( + "--sandbox-setup-timeout", + help="Timeout (seconds) for sandbox user setup inside the environment.", + ), + ] = 120, skills_dir: Annotated[ Path | None, typer.Option("--skills-dir", "-s", help="Skills directory to deploy"), @@ -757,7 +766,7 @@ def eval_create( eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): - from benchflow.trial import Trial, TrialConfig, Scene + from benchflow.trial import Scene, Trial, TrialConfig config = TrialConfig( task_path=tasks_dir, @@ -765,6 +774,7 @@ def eval_create( skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, jobs_dir=jobs_dir, agent=agent, model=eff_model, @@ -794,6 +804,7 @@ async def _run(): environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, skills_dir=str(skills_dir) if skills_dir else None, ), ) diff --git a/src/benchflow/job.py b/src/benchflow/job.py index ad99ad49..bd53a0b0 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -126,9 +126,7 @@ def should_retry(self, error: str | None) -> bool: return True if self.retry_on_pipe and category == PIPE_CLOSED: return True - if self.retry_on_acp and category == ACP_ERROR: - return True - return False + return bool(self.retry_on_acp and category == ACP_ERROR) def backoff_delay(self, attempt: int) -> float: """Exponential backoff delay for retry attempt.""" @@ -168,6 +166,7 @@ class JobConfig: skills_dir: str | None = None sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + sandbox_setup_timeout: int = 120 context_root: str | None = None exclude_tasks: set[str] = field(default_factory=set) @@ -293,6 +292,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> 
"Job": exclude = set(raw.get("exclude", [])) sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( @@ -306,6 +306,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, exclude_tasks=exclude, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -352,6 +353,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) config = JobConfig( agent=agent_name, @@ -363,6 +365,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -407,6 +410,13 @@ def _prune_docker(self): except Exception as e: logger.warning(f"Docker prune failed: {e}") + def _resolve_skills_dir(self, task_dir: Path, skills_dir: str | None) -> str | None: + """Resolve skills_dir — 'auto' means per-task environment/skills/.""" + if skills_dir == "auto": + candidate = task_dir / "environment" / "skills" + return str(candidate) if candidate.is_dir() else None + return skills_dir + async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: """Execute one trial via Trial.""" from benchflow.trial import Trial, TrialConfig @@ -420,9 +430,10 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: 
job_name=self._job_name, jobs_dir=str(self._jobs_dir), environment=cfg.environment, - skills_dir=cfg.skills_dir, + skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) trial = await Trial.create(trial_config) @@ -439,9 +450,10 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe job_name=self._job_name, jobs_dir=str(self._jobs_dir), environment=cfg.environment, - skills_dir=cfg.skills_dir, + skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) diff --git a/src/benchflow/mcp/reviewer_server.py b/src/benchflow/mcp/reviewer_server.py index 9c2dedcc..210746b9 100644 --- a/src/benchflow/mcp/reviewer_server.py +++ b/src/benchflow/mcp/reviewer_server.py @@ -24,8 +24,6 @@ import json import logging import os -import subprocess -import sys from pathlib import Path logger = logging.getLogger(__name__) @@ -52,11 +50,11 @@ def create_reviewer_server( """ try: from fastmcp import FastMCP - except ImportError: + except ImportError as e: raise ImportError( "fastmcp required for MCP reviewer server. 
" "Install with: pip install fastmcp" - ) + ) from e mcp = FastMCP("benchflow-reviewer") prompt = review_prompt or os.environ.get("REVIEWER_PROMPT", DEFAULT_REVIEW_PROMPT) diff --git a/src/benchflow/metrics.py b/src/benchflow/metrics.py index 43bd4450..413eb7fe 100644 --- a/src/benchflow/metrics.py +++ b/src/benchflow/metrics.py @@ -180,7 +180,7 @@ def collect_metrics( results_dir = Path(results_dir) best: dict[str, dict] = {} - for rfile in results_dir.rglob("result.json"): + for rfile in sorted(results_dir.rglob("result.json")): try: r = json.loads(rfile.read_text()) task = r["task_name"] diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 4c86f3b6..b9d2c212 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -12,6 +12,7 @@ import logging import os import shlex +import uuid from abc import ABC, abstractmethod from typing import Any @@ -256,11 +257,16 @@ class DaytonaProcess(LiveProcess): """ def __init__( - self, sandbox: Any, is_dind: bool = False, compose_cmd_prefix: str = "" + self, + sandbox: Any, + is_dind: bool = False, + compose_cmd_prefix: str = "", + compose_cmd_base: str = "", ): self._sandbox = sandbox self._is_dind = is_dind self._compose_cmd_prefix = compose_cmd_prefix + self._compose_cmd_base = compose_cmd_base @classmethod async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": @@ -273,6 +279,7 @@ async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": is_dind = hasattr(env, "_strategy") and hasattr(env._strategy, "_compose_cmd") compose_cmd_prefix = "" + compose_cmd_base = "" if is_dind: # Build compose env vars and command prefix for DinD strategy = env._strategy @@ -280,9 +287,16 @@ async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": f"{k}={shlex.quote(v)}" for k, v in strategy._compose_env_vars().items() ) compose_cmd_prefix = compose_env + # Extract the full compose base command with project/file flags + # (e.g. 
"docker compose -p NAME --project-directory DIR -f F1 -f F2") + # so that `docker compose exec` can find the running project. + compose_cmd_base = strategy._compose_cmd([]) return cls( - sandbox=sandbox, is_dind=is_dind, compose_cmd_prefix=compose_cmd_prefix + sandbox=sandbox, + is_dind=is_dind, + compose_cmd_prefix=compose_cmd_prefix, + compose_cmd_base=compose_cmd_base, ) async def start( @@ -296,19 +310,30 @@ async def start( ssh_target = f"{ssh_access.token}@ssh.app.daytona.io" if self._is_dind: - # Build the docker compose exec command to run inside the DinD VM - inner_parts = ["docker", "compose", "exec", "-i", "-T"] + # Build the docker compose exec command to run inside the DinD VM. + # Use the full compose base command (with -p, --project-directory, + # and -f flags) so that exec can find the running project. + if self._compose_cmd_base: + inner_parts = [*shlex.split(self._compose_cmd_base), "exec", "-i", "-T"] + else: + inner_parts = ["docker", "compose", "exec", "-i", "-T"] if cwd: inner_parts.extend(["-w", cwd]) # Write env vars to a temp file on the remote VM instead of passing # as -e K=V args (which are visible in ps aux on the remote host). + # Use a Python-generated unique suffix instead of a shell `$$` + # expansion: shlex.join() (below) single-quotes the --env-file arg, + # so `$$` would survive as a literal in the docker compose call + # while the cat > ... heredoc would expand it — the file would be + # written to one path and read from another. uuid.uuid4 sidesteps + # the entire shell-expansion-vs-quoting problem. 
remote_env_path = None if env: - remote_env_path = "/tmp/benchflow_env_$$.env" + remote_env_path = f"/tmp/benchflow_env_{uuid.uuid4().hex[:16]}.env" env_lines = "\n".join(f"{k}={v}" for k, v in env.items()) inner_parts.extend(["--env-file", remote_env_path]) - inner_parts.extend(["main", "bash", "-c", shlex.quote(command)]) - inner_cmd = " ".join(inner_parts) + inner_parts.extend(["main", "bash", "-c", command]) + inner_cmd = shlex.join(inner_parts) if self._compose_cmd_prefix: remote_cmd = f"{self._compose_cmd_prefix} {inner_cmd}" @@ -331,7 +356,9 @@ async def start( env_prefix = "" remote_env_path = None if env: - remote_env_path = "/tmp/benchflow_env_$$.env" + # Python-generated unique suffix; see DinD branch above for why + # $$ shell expansion is fragile across quoting boundaries. + remote_env_path = f"/tmp/benchflow_env_{uuid.uuid4().hex[:16]}.env" env_lines = "\n".join( f"export {k}={shlex.quote(v)}" for k, v in env.items() ) @@ -371,3 +398,131 @@ async def start( limit=_BUFFER_LIMIT, ) logger.info(f"Daytona process started (pid={self._process.pid})") + + +class DaytonaPtyProcess(LiveProcess): + """Live stdin/stdout via Daytona PTY WebSocket API. + + Uses the Daytona SDK's PTY session (WebSocket) instead of SSH, which + maintains long-lived interactive pipes through DinD compose layers. + Falls back to this for DinD sandboxes where SSH pipes break. 
+ """ + + _process = None # Not used — override readline/writeline/close + + def __init__(self, sandbox: Any, compose_cmd_prefix: str, compose_cmd_base: str): + self._sandbox = sandbox + self._compose_cmd_prefix = compose_cmd_prefix + self._compose_cmd_base = compose_cmd_base + self._pty = None + self._line_buffer = asyncio.Queue() + self._partial = b"" + self._closed = False + + @classmethod + async def from_harbor_env(cls, env: Any) -> "DaytonaPtyProcess": + sandbox = env._sandbox + if not sandbox: + raise RuntimeError("Daytona sandbox not started") + strategy = env._strategy + compose_env = " ".join( + f"{k}={shlex.quote(v)}" for k, v in strategy._compose_env_vars().items() + ) + compose_cmd_base = strategy._compose_cmd([]) + return cls(sandbox=sandbox, compose_cmd_prefix=compose_env, compose_cmd_base=compose_cmd_base) + + async def _on_pty_data(self, data: bytes) -> None: + self._partial += data + while b"\n" in self._partial: + line, self._partial = self._partial.split(b"\n", 1) + line = line.replace(b"\r", b"") + await self._line_buffer.put(line + b"\n") + + async def start( + self, + command: str, + env: dict[str, str] | None = None, + cwd: str | None = None, + ) -> None: + import uuid + session_id = f"acp-{uuid.uuid4().hex[:8]}" + pty_env = {} + if self._compose_cmd_prefix: + for part in shlex.split(self._compose_cmd_prefix): + if "=" in part: + k, v = part.split("=", 1) + pty_env[k] = v + + self._pty = await self._sandbox.process.create_pty_session( + id=session_id, + on_data=self._on_pty_data, + envs=pty_env if pty_env else None, + ) + await self._pty.wait_for_connection() + logger.info(f"DaytonaPtyProcess: PTY connected (session={session_id})") + + compose_parts = shlex.split(self._compose_cmd_base) if self._compose_cmd_base else ["docker", "compose"] + exec_parts = [*compose_parts, "exec", "-i", "-T"] + if cwd: + exec_parts.extend(["-w", cwd]) + # Write env vars to a file inside the container (not visible in ps aux), + # matching the approach in 
DaytonaProcess.start(). + env_file_cmd = "" + if env: + env_lines = "\n".join(f"export {k}={shlex.quote(v)}" for k, v in env.items()) + env_file_cmd = ( + f"cat > /tmp/.benchflow_env <<'__EOF__'\n{env_lines}\n__EOF__\n" + f". /tmp/.benchflow_env && rm -f /tmp/.benchflow_env && " + ) + exec_parts.extend(["main", "bash", "-lc", f"{env_file_cmd}{command}"]) + exec_cmd = shlex.join(exec_parts) + + # Use a marker + stty to cleanly hand over the PTY to the agent. + # 1. Disable echo so typed commands don't appear in output + # 2. Print marker so we know when to start reading ACP output + # 3. exec into compose exec so the agent owns the PTY + marker = f"__BENCHFLOW_ACP_{session_id}__" + setup = f"stty -echo 2>/dev/null; echo '{marker}'; exec {exec_cmd}\n" + await self._pty.send_input(setup) + logger.info("DaytonaPtyProcess: sent setup, waiting for marker...") + + while True: + try: + line = await asyncio.wait_for(self._line_buffer.get(), timeout=120) + decoded = line.decode(errors="replace").strip() + logger.debug(f"DaytonaPtyProcess drain: {decoded[:120]}") + if marker in decoded: + break + except TimeoutError as e: + raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") from e + + logger.info("DaytonaPtyProcess: marker seen, agent starting") + + async def readline(self) -> bytes: + if self._closed: + raise ConnectionError("PTY closed") + try: + line = await asyncio.wait_for(self._line_buffer.get(), timeout=900) + return line + except TimeoutError as e: + raise ConnectionError("PTY readline timeout (900s)") from e + except Exception as e: + raise ConnectionError(f"PTY readline error: {e}") from e + + async def writeline(self, data: str) -> None: + if not self._pty or self._closed: + raise RuntimeError("PTY not started") + await self._pty.send_input(data + "\n") + + async def close(self) -> None: + self._closed = True + if self._pty: + with contextlib.suppress(Exception): + await self._pty.kill() + with contextlib.suppress(Exception): + await 
self._pty.disconnect() + logger.info("DaytonaPtyProcess terminated") + + @property + def is_running(self) -> bool: + return self._pty is not None and not self._closed diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py index 1e55f0c7..69be399c 100644 --- a/src/benchflow/runtime.py +++ b/src/benchflow/runtime.py @@ -142,6 +142,7 @@ class RuntimeConfig: """Configuration for a Runtime execution.""" sandbox_user: str | None = "agent" + sandbox_setup_timeout: int = 120 max_rounds: int = 10 snapshot_policy: str = "none" reward_stream: bool = True @@ -263,6 +264,7 @@ async def execute(self) -> RuntimeResult: environment=self.env.backend, sandbox_user=config.sandbox_user, sandbox_locked_paths=config.sandbox_locked_paths, + sandbox_setup_timeout=config.sandbox_setup_timeout, jobs_dir=config.jobs_dir, context_root=config.context_root, pre_agent_hooks=config.pre_agent_hooks, @@ -291,13 +293,13 @@ async def execute(self) -> RuntimeResult: async def run( - subject: "Agent | TrialConfig | str", - env: "Environment | str | None" = None, + subject: Agent | TrialConfig | str, + env: Environment | str | None = None, config: RuntimeConfig | None = None, *, - task_path: "str | Path | None" = None, + task_path: str | Path | None = None, model: str | None = None, -) -> "RuntimeResult | RunResult": +) -> RuntimeResult | RunResult: """Primary user-facing API — multiple calling conventions. Usage:: @@ -313,7 +315,6 @@ async def run( # 3. 
Agent name string (simplest) result = await bf.run("gemini", task_path="tasks/X") """ - from benchflow.models import RunResult from benchflow.trial import Scene, Trial, TrialConfig if isinstance(subject, TrialConfig): @@ -339,6 +340,7 @@ async def run( environment=env if isinstance(env, str) else "docker", sandbox_user=rc.sandbox_user, sandbox_locked_paths=rc.sandbox_locked_paths, + sandbox_setup_timeout=rc.sandbox_setup_timeout, jobs_dir=rc.jobs_dir, context_root=rc.context_root, pre_agent_hooks=rc.pre_agent_hooks, diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py index bd10a2d0..b55a72f7 100644 --- a/src/benchflow/sdk.py +++ b/src/benchflow/sdk.py @@ -98,33 +98,12 @@ from harbor.models.trial.paths import TrialPaths from harbor.verifier.verifier import Verifier -from benchflow._acp_run import connect_acp, execute_prompts -from benchflow._agent_env import resolve_agent_env -from benchflow._agent_setup import deploy_skills, install_agent -from benchflow._credentials import ( - upload_subscription_auth, - write_credential_files, -) from benchflow._env_setup import ( - _create_environment, - _inject_skills_into_dockerfile, _patch_harbor_dind, - stage_dockerfile_deps, ) from benchflow._sandbox import ( - _resolve_locked_paths, - _seed_verifier_workspace, - _snapshot_build_config, harden_before_verify, - lockdown_paths, - setup_sandbox_user, -) -from benchflow._trajectory import ( - _capture_session_trajectory, - _scrape_agent_trajectory, ) -from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource logger = logging.getLogger(__name__) @@ -243,6 +222,7 @@ def _write_config( sandbox_user: str | None, context_root: str | Path | None, sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, timeout: int, started_at: datetime, agent_env: dict[str, str], @@ -262,6 +242,7 @@ def _write_config( "skills_dir": str(skills_dir) if skills_dir else 
None, "sandbox_user": sandbox_user, "sandbox_locked_paths": sandbox_locked_paths, + "sandbox_setup_timeout": sandbox_setup_timeout, "context_root": str(context_root) if context_root else None, "timeout_sec": timeout, "started_at": str(started_at), @@ -463,6 +444,7 @@ async def run( skills_dir: str | Path | None = None, sandbox_user: str | None = "agent", sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, pre_agent_hooks: list | None = None, context_root: str | Path | None = None, ) -> RunResult: @@ -511,6 +493,7 @@ async def run( skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, pre_agent_hooks=pre_agent_hooks, context_root=context_root, ) diff --git a/src/benchflow/skill_eval.py b/src/benchflow/skill_eval.py index b848135e..82e96869 100644 --- a/src/benchflow/skill_eval.py +++ b/src/benchflow/skill_eval.py @@ -6,6 +6,7 @@ result = await evaluator.run(agents=["claude-agent-acp"], environment="docker") """ +import contextlib import json import logging import shutil @@ -431,7 +432,7 @@ async def run( all_results: list[CaseResult] = [] # Run each agent - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): agent_label = agent.split("/")[-1] if "/" in agent else agent # With-skill run @@ -486,9 +487,9 @@ async def _run_job( with_skill: bool, ) -> list[CaseResult]: """Run a batch of tasks using Job for concurrency and retries.""" - from benchflow.job import Job, JobConfig, RetryConfig - import os + + from benchflow.job import Job, JobConfig, RetryConfig judge_env = {} for key in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY"): if os.environ.get(key): @@ -506,7 +507,7 @@ async def _run_job( agent_env=judge_env, ), ) - job_result = await j.run() + await j.run() results = [] # Walk the jobs directory to collect per-case results @@ -539,10 +540,8 @@ async def _run_job( # Read judge 
rubric details if available judge_file = trial_dir / "verifier" / "judge_result.json" if judge_file.exists(): - try: + with contextlib.suppress(json.JSONDecodeError, KeyError): rubric_results = json.loads(judge_file.read_text()).get("items") - except (json.JSONDecodeError, KeyError): - pass results.append( CaseResult( @@ -579,7 +578,7 @@ def _compute_lifts( ) -> list[AgentLift]: """Compute per-agent lift from case results.""" lifts = [] - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): with_results = [r for r in all_results if r.agent == agent and r.with_skill] baseline_results = [ r for r in all_results if r.agent == agent and not r.with_skill diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 88034199..b2551054 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -36,7 +36,10 @@ from __future__ import annotations import asyncio +import contextlib +import json import logging +import shlex from dataclasses import dataclass, field from datetime import datetime from pathlib import Path @@ -55,7 +58,6 @@ _resolve_locked_paths, _seed_verifier_workspace, _snapshot_build_config, - harden_before_verify, lockdown_paths, setup_sandbox_user, ) @@ -64,8 +66,9 @@ _scrape_agent_trajectory, ) from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH, AGENTS +from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource +from benchflow.user import BaseUser, RoundResult logger = logging.getLogger(__name__) @@ -108,7 +111,7 @@ def single( prompts: list[str | None] | None = None, role_name: str = "agent", skills_dir: str | Path | None = None, - ) -> "Scene": + ) -> Scene: """Shortcut for single-agent, single-role scene.""" prompts = prompts or [None] return cls( @@ -131,6 +134,7 @@ class TrialConfig: environment: str = "docker" sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + 
sandbox_setup_timeout: int = 120 services: list[str] | None = None job_name: str | None = None trial_name: str | None = None @@ -138,6 +142,11 @@ class TrialConfig: context_root: str | Path | None = None pre_agent_hooks: list | None = None + # User-driven progressive-disclosure loop + user: BaseUser | None = None + max_user_rounds: int = 5 + oracle_access: bool = False + # Legacy compat fields — used by SDK.run() shim. Ignored when scenes is set. agent: str = "claude-agent-acp" prompts: list[str | None] | None = None @@ -155,7 +164,7 @@ def from_legacy( prompts: list[str | None] | None = None, skills_dir: str | Path | None = None, **kwargs, - ) -> "TrialConfig": + ) -> TrialConfig: """Construct from flat SDK.run()-style args.""" return cls( task_path=task_path, @@ -273,6 +282,11 @@ async def setup(self) -> None: logger.warning( "sandbox_user=None — agent runs as root with no path lockdown." ) + if cfg.oracle_access and cfg.user is None: + logger.warning( + "oracle_access=True without a User — /solution stays visible " + "to the agent for the entire trial." 
+ ) self._effective_locked = _resolve_locked_paths( cfg.sandbox_user, cfg.sandbox_locked_paths @@ -328,6 +342,7 @@ async def setup(self) -> None: sandbox_user=cfg.sandbox_user, context_root=cfg.context_root, sandbox_locked_paths=self._effective_locked, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, timeout=self._timeout, started_at=self._started_at, agent_env=self._agent_env, @@ -367,7 +382,10 @@ async def install_agent(self) -> None: if cfg.primary_agent == "oracle": if cfg.sandbox_user: await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) @@ -389,7 +407,10 @@ async def install_agent(self) -> None: if cfg.sandbox_user: self._agent_cwd = await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) @@ -438,10 +459,8 @@ async def disconnect(self) -> None: # Kill any lingering agent processes to prevent context bleed between scenes if self._env and self._agent_launch.strip(): agent_cmd = self._agent_launch.split()[0].split("/")[-1] - try: + with contextlib.suppress(Exception): await self._env.exec(f"pkill -f '{agent_cmd}' || true", timeout_sec=10) - except Exception: - pass self._session_tool_count = 0 self._session_traj_count = 0 self._phase = "installed" @@ -510,6 +529,69 @@ async def verify(self) -> dict | None: self._phase = "verified" return self._rewards + async def soft_verify(self) -> tuple[dict | None, str | None, str | None]: + """Run the verifier without full hardening 
— for intermediate feedback. + + Skips process kill and workspace restore/chown (so the sandbox + stays usable for the next round), but DOES purge agent-injected + conftest.py / sitecustomize.py / .pth files to prevent the agent + from gaming intermediate test results. + + Returns (rewards, verifier_output, verifier_error). The final + verify() still does full hardening. + """ + from harbor import Verifier + + from benchflow._sandbox import _build_cleanup_cmd, _read_hardening_config + + self._trial_paths.verifier_dir.mkdir(parents=True, exist_ok=True) + # Clean verifier output dir — chmod 777 so non-root verifier processes can write + await self._env.exec( + "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier", + user="root", timeout_sec=10, + ) + # Purge agent-injected conftest/sitecustomize/.pth without + # killing processes or restoring workspace. + # Honor per-task [verifier.hardening] opt-outs from task.toml. + hardening = _read_hardening_config(getattr(self._task, "task_dir", None)) + await self._env.exec( + _build_cleanup_cmd(hardening), user="root", timeout_sec=10 + ) + + rewards = None + verifier_output = None + verifier_error = None + try: + verifier = Verifier( + task=self._task, + trial_paths=self._trial_paths, + environment=self._env, + ) + verifier_result = await asyncio.wait_for( + verifier.verify(), + timeout=self._task.config.verifier.timeout_sec, + ) + rewards = verifier_result.rewards + # Capture raw verifier output for the user + cat = await self._env.exec( + "cat /logs/verifier/*.log 2>/dev/null || " + "cat /logs/verifier/output.txt 2>/dev/null || true", + timeout_sec=10, + ) + verifier_output = (cat.stdout or "").strip() or None + logger.info(f"[soft_verify] rewards={rewards}") + except TimeoutError: + verifier_error = ( + f"soft verifier timed out after " + f"{self._task.config.verifier.timeout_sec}s" + ) + logger.warning(verifier_error) + except Exception as e: + verifier_error = f"soft verifier crashed: {e}" + 
logger.warning(verifier_error) + + return rewards, verifier_output, verifier_error + # ── Phase 5: CLEANUP ── async def cleanup(self) -> None: @@ -568,8 +650,18 @@ async def run(self) -> RunResult: ) else: await self.install_agent() - for scene in cfg.effective_scenes: - await self._run_scene(scene) + try: + if cfg.user is not None: + await self._run_user_loop() + else: + for scene in cfg.effective_scenes: + await self._run_scene(scene) + finally: + if cfg.oracle_access: + await self._env.exec( + "mv /solution_oracle_backup /solution 2>/dev/null || true", + user="root", timeout_sec=10, + ) await self.verify() @@ -598,20 +690,41 @@ async def run(self) -> RunResult: # ── Scene execution ── + _OUTBOX_DIR = "/app/.outbox" + async def _run_scene(self, scene: Scene) -> None: - """Execute one scene: for each turn, connect as the turn's role, execute, disconnect.""" + """Execute one scene: for each turn, connect as the turn's role, execute, disconnect. + + For multi-role scenes, agents communicate via outbox files: + an agent writes ``/app/.outbox/{recipient}.json`` with + ``{"to": "role_name", "content": "..."}`` and the scheduler + injects received messages into the next turn's prompt. + + Inter-role messages are persisted to ``trial_dir/scene_messages.jsonl``. 
+ """ cfg = self._config logger.info(f"[Scene] {scene.name} — {len(scene.turns)} turns, {len(scene.roles)} roles") role_map = {r.name: r for r in scene.roles} current_role: str | None = None + multi_role = len(scene.roles) > 1 + scene_messages: list[dict] = [] + + if multi_role: + setup_cmd = f"rm -rf {self._OUTBOX_DIR} && mkdir -p {self._OUTBOX_DIR}" + if cfg.sandbox_user: + user = shlex.quote(cfg.sandbox_user) + setup_cmd += f" && chown {user}:{user} {self._OUTBOX_DIR}" + await self._env.exec(setup_cmd, timeout_sec=10) - for i, turn in enumerate(scene.turns): + inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} + turn_counter = 0 + + for _i, turn in enumerate(scene.turns): role = role_map.get(turn.role) if not role: raise ValueError(f"Turn references unknown role {turn.role!r}") - # Reconnect if role changed or first turn if current_role != turn.role: if current_role is not None: await self.disconnect() @@ -619,26 +732,236 @@ async def _run_scene(self, scene: Scene) -> None: current_role = turn.role if turn.prompt: - prompts = [turn.prompt] + base_prompt = turn.prompt elif self._resolved_prompts: - prompts = [self._resolved_prompts[0]] + base_prompt = self._resolved_prompts[0] + else: + base_prompt = "Solve the task described in /app/instruction.md" + + pending = inbox.get(turn.role, []) + if pending: + parts = [base_prompt, "\n---\nMessages from other agents:\n"] + parts.extend(pending) + prompts = ["\n".join(parts)] + inbox[turn.role] = [] else: - prompts = ["Solve the task described in /app/instruction.md"] + prompts = [base_prompt] + await self.execute(prompts=prompts) + if multi_role: + for recipient, content in await self._read_scene_outbox(current_role): + turn_counter += 1 + inbox.setdefault(recipient, []).append( + f"**From {current_role}:** {content}" + ) + scene_messages.append({ + "scene": scene.name, + "turn": turn_counter, + "sender": current_role, + "recipient": recipient, + "content": content, + }) + if current_role is not None: await 
self.disconnect() + if scene_messages and self._trial_dir: + msg_path = self._trial_dir / "scene_messages.jsonl" + with msg_path.open("a") as f: + for m in scene_messages: + f.write(json.dumps(m) + "\n") + logger.info( + f"[Scene] {scene.name}: {len(scene_messages)} messages → {msg_path}" + ) + + async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: + """Read and clear outbox files left by *sender*. Returns [(recipient, content), ...].""" + result = await self._env.exec( + f"ls {self._OUTBOX_DIR}/*.json 2>/dev/null || true", timeout_sec=10, + ) + files = [f.strip() for f in (result.stdout or "").strip().splitlines() if f.strip()] + messages: list[tuple[str, str]] = [] + for fpath in files: + quoted = shlex.quote(fpath) + cat = await self._env.exec(f"cat {quoted}", timeout_sec=10) + try: + data = json.loads(cat.stdout or "{}") + recipient = data.get("to", "") + content = data.get("content", "") + if recipient and content: + messages.append((recipient, content)) + logger.info(f"[Scene] outbox: {sender} → {recipient}: {content[:80]}") + except json.JSONDecodeError: + logger.warning(f"[Scene] invalid JSON in outbox: {fpath}") + await self._env.exec(f"rm -f {quoted}", timeout_sec=10) + return messages + + async def _run_user_loop(self) -> None: + """Execute a user-driven progressive-disclosure loop. + + Each round: user.run() → connect → agent.execute() → disconnect → + soft_verify() → build RoundResult → repeat. Stops when user.run() + returns None or max_user_rounds is reached. + """ + cfg = self._config + user = cfg.user + assert user is not None + + if len(cfg.effective_scenes) > 1: + raise ValueError( + "User-driven loops operate on a single scene. " + f"Got {len(cfg.effective_scenes)} scenes." + ) + scene = cfg.effective_scenes[0] + if len(scene.roles) != 1: + raise ValueError( + "User-driven loops require a single-role scene. " + f"Got {len(scene.roles)} roles." 
+ ) + role = scene.roles[0] + + instruction = self._resolved_prompts[0] if self._resolved_prompts else ( + "Solve the task described in /app/instruction.md" + ) + + # Oracle access: read /solution before the agent runs, then remove it + solution: str | None = None + if cfg.oracle_access: + cat = await self._env.exec( + "cat /solution/solve.sh 2>/dev/null || true", + user="root", timeout_sec=10, + ) + solution = (cat.stdout or "").strip() or None + + await user.setup(instruction, solution) + + # Hide oracle files from agent — move rather than delete so the + # final verify() can still access /solution if the verifier needs it. + if cfg.oracle_access: + await self._env.exec( + "mv /solution /solution_oracle_backup 2>/dev/null || true", + user="root", timeout_sec=10, + ) + + round_result: RoundResult | None = None + rounds_log: list[dict] = [] + + for round_num in range(cfg.max_user_rounds): + try: + prompt = await user.run(round_num, instruction, round_result) + except Exception as e: + self._error = f"user.run() failed at round {round_num}: {e}" + logger.error(self._error, exc_info=True) + break + + if prompt is None: + logger.info(f"[User] stopped at round {round_num}") + break + + logger.info( + f"[User] round {round_num}: prompt={prompt[:80]!r}..." + if len(prompt) > 80 else f"[User] round {round_num}: prompt={prompt!r}" + ) + + # Fresh ACP session each round — agent starts clean but sees + # its previous workspace changes in the shared sandbox. + traj_before = len(self._trajectory) + try: + await self.connect_as(role) + await self.execute(prompts=[prompt]) + finally: + await self.disconnect() + + round_trajectory = self._trajectory[traj_before:] + round_tools = sum( + 1 for e in round_trajectory + if isinstance(e, dict) and e.get("type") == "tool_call" + ) + + # Soft verify: run tests after agent disconnected but before + # next round. Temporarily restore /solution so the verifier can + # access it, then re-hide before the next agent round. 
+ if cfg.oracle_access: + await self._env.exec( + "mv /solution_oracle_backup /solution 2>/dev/null || true", + user="root", timeout_sec=10, + ) + try: + rewards, verifier_output, verifier_error = await self.soft_verify() + finally: + if cfg.oracle_access: + await self._env.exec( + "mv /solution /solution_oracle_backup 2>/dev/null || true", + user="root", timeout_sec=10, + ) + + round_result = RoundResult( + round=round_num, + trajectory=round_trajectory, + rewards=rewards, + verifier_output=verifier_output, + verifier_error=verifier_error, + n_tool_calls=round_tools, + ) + + rounds_log.append({ + "round": round_num, + "prompt": prompt, + "rewards": rewards, + "verifier_error": verifier_error, + "n_tool_calls": round_tools, + "n_trajectory_events": len(round_trajectory), + }) + + logger.info( + f"[User] round {round_num} done: " + f"rewards={rewards}, tools={round_tools}" + ) + + # Persist round log + if rounds_log and self._trial_dir: + log_path = self._trial_dir / "user_rounds.jsonl" + with log_path.open("w") as f: + for entry in rounds_log: + f.write(json.dumps(entry) + "\n") + logger.info(f"[User] {len(rounds_log)} rounds → {log_path}") + async def connect_as(self, role: Role) -> None: - """Open an ACP connection for a specific role.""" + """Open an ACP connection for a specific role. + + Installs the role's agent binary and credentials if it differs + from the primary agent (which was set up in install_agent()). + Updates _agent_launch so disconnect() kills the correct process. + """ cfg = self._config t0 = datetime.now() + agent_launch = AGENT_LAUNCH.get(role.agent, role.agent) + # Merge cfg.agent_env (config-level) with role.env (role-specific) so + # provider creds from YAML reach the agent. role.env wins on overlap. 
+ agent_env = resolve_agent_env( + role.agent, role.model, + {**(cfg.agent_env or {}), **(role.env or {})}, + ) + + if role.agent != cfg.primary_agent: + agent_cfg = await install_agent(self._env, role.agent, self._trial_dir) + cred_home = f"/home/{cfg.sandbox_user}" if cfg.sandbox_user else "/root" + await write_credential_files( + self._env, role.agent, agent_env, + agent_cfg, role.model, cred_home, + ) + if agent_env.get("_BENCHFLOW_SUBSCRIPTION_AUTH"): + await upload_subscription_auth(self._env, role.agent, cred_home) + + self._agent_launch = agent_launch + self._acp_client, self._session, self._agent_name = await connect_acp( env=self._env, agent=role.agent, - agent_launch=AGENT_LAUNCH.get(role.agent, role.agent), - agent_env=resolve_agent_env(role.agent, role.model, role.env or None), + agent_launch=agent_launch, + agent_env=agent_env, sandbox_user=cfg.sandbox_user, model=role.model, trial_dir=self._trial_dir, diff --git a/src/benchflow/trial_yaml.py b/src/benchflow/trial_yaml.py index 5b8ca4ae..707978b1 100644 --- a/src/benchflow/trial_yaml.py +++ b/src/benchflow/trial_yaml.py @@ -103,6 +103,7 @@ def trial_config_from_dict( environment=raw.get("environment", "docker"), sandbox_user=raw.get("sandbox_user", "agent"), sandbox_locked_paths=raw.get("sandbox_locked_paths"), + sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120), job_name=raw.get("job_name"), trial_name=raw.get("trial_name"), jobs_dir=raw.get("jobs_dir", "jobs"), diff --git a/src/benchflow/user.py b/src/benchflow/user.py new file mode 100644 index 00000000..2c8fde47 --- /dev/null +++ b/src/benchflow/user.py @@ -0,0 +1,101 @@ +"""User abstraction for progressive-disclosure trial loops. + +A User is a Python callback that participates in the trial loop alongside the +agent. Each round: user.run() produces a prompt → agent executes → verifier +checks → user sees the result and decides what to say next (or stops). 
+ +This is distinct from multi-role scenes (PR #179) where multiple ACP agents +collaborate via outbox files. The User runs in the scheduler process, not in +the sandbox. +""" + +from __future__ import annotations + +import inspect +from collections.abc import Awaitable, Callable +from dataclasses import dataclass, field +from typing import Any, cast + + +@dataclass +class RoundResult: + """Outcome of one agent round, passed to User.run() for the next decision.""" + + round: int + trajectory: list[dict] = field(default_factory=list) + rewards: dict[str, Any] | None = None + verifier_output: str | None = None + verifier_error: str | None = None + n_tool_calls: int = 0 + + +class BaseUser: + """Abstract user that drives a progressive-disclosure trial loop. + + Subclass and implement ``run()`` to control what prompt the agent sees + each round. Return ``None`` from ``run()`` to stop the loop early. + """ + + async def setup(self, instruction: str, solution: str | None = None) -> None: + """Called once before the first round. + + ``instruction`` is the original task instruction (from instruction.md). + ``solution`` is the gold answer if oracle access is enabled, else None. + """ + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + """Produce the next prompt for the agent, or None to stop. + + ``round`` starts at 0. ``round_result`` is None on the first call + and contains the previous round's outcome on subsequent calls. + """ + raise NotImplementedError + + +class PassthroughUser(BaseUser): + """Sends the original instruction unchanged. Single-round, backward compatible.""" + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + if round == 0: + return instruction + return None + + +class FunctionUser(BaseUser): + """Wraps a plain function as a User — for lightweight one-off use. 
+ + The function signature matches ``BaseUser.run()``: + fn(round, instruction, round_result) -> str | None + + Both sync and async functions are supported. + """ + + def __init__( + self, + fn: Callable[ + [int, str, RoundResult | None], + str | None | Awaitable[str | None], + ], + ) -> None: + self._fn = fn + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + result = self._fn(round, instruction, round_result) + if inspect.isawaitable(result): + return cast(str | None, await result) + return cast(str | None, result) diff --git a/tests/conformance/proof_multi_agent.py b/tests/conformance/proof_multi_agent.py index 902d2ca3..9aa2729c 100644 --- a/tests/conformance/proof_multi_agent.py +++ b/tests/conformance/proof_multi_agent.py @@ -18,6 +18,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src")) +import contextlib + from harbor.models.task.task import Task from benchflow._acp_run import connect_acp, execute_prompts @@ -73,7 +75,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: trial_dir = Path(f"/tmp/multi-agent-proof/{role.name}") trial_dir.mkdir(parents=True, exist_ok=True) launch_cmd = AGENT_LAUNCH.get(role.agent, role.agent) - acp_client, session, agent_name = await connect_acp( + acp_client, session, _agent_name = await connect_acp( env=env, agent=role.agent, agent_launch=launch_cmd, @@ -85,7 +87,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: agent_cwd="/app", ) try: - trajectory, n_tools = await execute_prompts( + _trajectory, n_tools = await execute_prompts( acp_client, session, [prompt], @@ -93,10 +95,8 @@ async def role_runner(env, role: Role, prompt: str) -> None: ) logging.info(f"[{role.name}] finished: {n_tools} tool calls") finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass async def main() -> None: diff --git a/tests/conformance/run_conformance.py 
b/tests/conformance/run_conformance.py index b8aa5038..1f758a13 100644 --- a/tests/conformance/run_conformance.py +++ b/tests/conformance/run_conformance.py @@ -53,9 +53,7 @@ def has_creds(agent_name: str) -> bool: if any(os.environ.get(k) for k in keys): return True sub_file = SUBSCRIPTION_AUTH_FILES.get(agent_name) - if sub_file and Path(sub_file).expanduser().exists(): - return True - return False + return bool(sub_file and Path(sub_file).expanduser().exists()) async def run_one(agent_name: str) -> dict: diff --git a/tests/conftest.py b/tests/conftest.py index a3246522..6f5ee12f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,17 @@ """Test fixtures.""" import json +import sys from datetime import datetime from pathlib import Path import pytest REPO_ROOT = Path(__file__).parent.parent +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + REF_TASKS = REPO_ROOT / ".ref" / "harbor" / "examples" / "tasks" diff --git a/tests/test_acp.py b/tests/test_acp.py index e0f4729f..cea79c57 100644 --- a/tests/test_acp.py +++ b/tests/test_acp.py @@ -347,3 +347,31 @@ async def test_model_id_selection(self, model_in, expected_model, tmp_path): ) mock_acp.set_model.assert_awaited_once_with(expected_model) + + @pytest.mark.asyncio + async def test_openhands_skips_set_model(self, tmp_path): + from benchflow._acp_run import connect_acp + + mock_acp = self._make_mocks() + mock_env = AsyncMock() + with ( + patch( + "benchflow._acp_run.DockerProcess.from_harbor_env", + return_value=MagicMock(), + ), + patch("benchflow._acp_run.ContainerTransport", return_value=MagicMock()), + patch("benchflow._acp_run.ACPClient", return_value=mock_acp), + ): + await connect_acp( + env=mock_env, + agent="openhands", + agent_launch="openhands acp --always-approve --override-with-envs", + agent_env={}, + sandbox_user=None, + model="gemini-3.1-flash-lite-preview", + trial_dir=tmp_path, + environment="docker", + agent_cwd="/app", + ) + + 
mock_acp.set_model.assert_not_awaited() diff --git a/tests/test_agent_registry.py b/tests/test_agent_registry.py index 4fe59047..ab06cf15 100644 --- a/tests/test_agent_registry.py +++ b/tests/test_agent_registry.py @@ -5,6 +5,7 @@ test_registry_invariants.py — search there for the consolidated tripwire. """ +from benchflow._agent_env import resolve_provider_env from benchflow.agents.providers import PROVIDERS from benchflow.agents.registry import AGENTS @@ -33,6 +34,39 @@ def test_gemini_has_mapping(self): assert cfg.env_mapping["BENCHFLOW_PROVIDER_BASE_URL"] == "GEMINI_API_BASE_URL" assert cfg.env_mapping["BENCHFLOW_PROVIDER_API_KEY"] == "GOOGLE_API_KEY" + def test_openhands_has_mapping(self): + cfg = AGENTS["openhands"] + assert cfg.env_mapping["BENCHFLOW_PROVIDER_BASE_URL"] == "LLM_BASE_URL" + assert cfg.env_mapping["BENCHFLOW_PROVIDER_API_KEY"] == "LLM_API_KEY" + # OpenHands model is normalized in _normalize_openhands_model(). + assert "BENCHFLOW_PROVIDER_MODEL" not in cfg.env_mapping + + def test_openhands_normalizes_model(self): + env = {} + resolve_provider_env( + agent="openhands", + model="zai/glm-5", + agent_env=env, + ) + + assert env["LLM_MODEL"] == "glm-5" + +class TestOpenHandsConfig: + def test_openhands_uses_agentskills_paths(self): + cfg = AGENTS["openhands"] + assert "$HOME/.agents/skills" in cfg.skill_paths + assert "$WORKSPACE/.agents/skills" in cfg.skill_paths + + def test_openhands_install_cmd_has_uv_and_binary_fallbacks(self): + cfg = AGENTS["openhands"] + assert "apt-get install -y -qq curl ca-certificates" in cfg.install_cmd + assert "uv tool install openhands --python 3.12" in cfg.install_cmd + assert "install.openhands.dev/install.sh" in cfg.install_cmd + + def test_openhands_skips_acp_set_model(self): + cfg = AGENTS["openhands"] + assert cfg.supports_acp_set_model is False + class TestAgentCredentialFiles: def test_codex_has_auth_json(self): diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py new file mode 100644 index 
00000000..3b99ce64 --- /dev/null +++ b/tests/test_agent_setup.py @@ -0,0 +1,176 @@ +"""Tests for agent install and skill deployment setup helpers.""" + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._agent_setup import deploy_skills, install_agent +from benchflow.agents.registry import AgentConfig +from benchflow.models import AgentInstallError + + +def _make_task(skills_dir: str | None): + return SimpleNamespace( + config=SimpleNamespace( + environment=SimpleNamespace( + skills_dir=skills_dir, + ) + ) + ) + + +@pytest.mark.asyncio +async def test_deploy_skills_symlinks_agent_skill_paths_instead_of_copying(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + cmd = env.exec.await_args.args[0] + assert "cp -r" not in cmd + assert " && " in cmd + assert ";" not in cmd + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in cmd + assert "ln -sfn /opt/benchflow/skills /app/skills" in cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_uploads_runtime_skills_and_links_shared_tree(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + skills_dir = tmp_path / "skills" + skills_dir.mkdir() + + await deploy_skills( + env=env, + 
task_path=tmp_path, + skills_dir=skills_dir, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_awaited_once_with(skills_dir, "/skills") + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert " && " in distributed_link_cmd + assert ";" not in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" in distributed_link_cmd + assert "/root/.agents/skills" not in distributed_link_cmd + assert "/app/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_falls_back_when_local_skills_dir_is_missing(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=tmp_path / "missing-skills", + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /opt/benchflow/skills /workspace/skills" in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" not in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_raises_when_skill_linking_fails(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=17, stdout="link failed")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + 
name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills"], + ) + + with pytest.raises(RuntimeError, match="Failed to link skills"): + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + +@pytest.mark.asyncio +async def test_install_agent_writes_command_stdout_and_stderr_on_failure(tmp_path: Path): + env = SimpleNamespace() + env.exec = AsyncMock( + side_effect=[ + SimpleNamespace(return_code=1, stdout="", stderr="uv: command not found\n"), + SimpleNamespace( + stdout="OS:\nID=ubuntu\nNode:\nv22.0.0\nAgent:\nnot found\n", + stderr="", + return_code=0, + ), + ] + ) + + with pytest.raises(AgentInstallError) as exc_info: + await install_agent(env, "openhands", tmp_path) + + err = exc_info.value + log_path = Path(err.log_path) + assert log_path == tmp_path / "agent" / "install-stdout.txt" + assert log_path.exists() + log_text = log_path.read_text() + assert log_text.startswith("$ ") + assert "uv tool install openhands --python 3.12" in log_text + assert "=== stderr ===" in log_text + assert "uv: command not found" in log_text + assert err.stdout == log_text + assert "ID=ubuntu" in err.diagnostics diff --git a/tests/test_connect_as_env.py b/tests/test_connect_as_env.py new file mode 100644 index 00000000..97ab4232 --- /dev/null +++ b/tests/test_connect_as_env.py @@ -0,0 +1,140 @@ +"""Regression tests for connect_as() agent_env merging (issue #2). + +connect_as() must merge cfg.agent_env (config-level) with role.env +(role-level), with role-level keys winning on overlap. Before the fix, +role.env={} was truthy so resolve_agent_env received an empty dict, +discarding all config-level env vars. 
+""" + +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, TrialConfig + + +def _make_config(agent_env=None, role_env=None): + """Build a minimal TrialConfig with one scene.""" + role = Role(name="agent", agent="claude-agent-acp", model="test-model") + if role_env is not None: + role = Role( + name="agent", agent="claude-agent-acp", model="test-model", env=role_env + ) + scene = Scene(roles=[role]) + return TrialConfig( + task_path=Path("/fake/task"), + scenes=[scene], + agent_env=agent_env, + ) + + +class TestConnectAsEnvMerge: + """Verify connect_as() merges cfg.agent_env with role.env correctly.""" + + @pytest.fixture() + def _mock_trial(self, tmp_path): + """Return a Trial stub wired to capture the agent_env passed to connect_acp.""" + from benchflow.trial import Trial + + cfg = _make_config( + agent_env={"BENCHFLOW_PROVIDER_BASE_URL": "http://localhost:8080/v1"}, + ) + trial = Trial.__new__(Trial) + trial._config = cfg + trial._env = {} + trial._trial_dir = tmp_path + trial._timing = {} + trial._agent_cwd = None + trial._phase = "idle" + return trial + + @pytest.mark.asyncio + async def test_config_env_propagated_through_empty_role_env(self, _mock_trial): + """cfg.agent_env vars reach resolve_agent_env when role.env is {}.""" + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + assert "BENCHFLOW_PROVIDER_BASE_URL" in captured["env"] + assert captured["env"]["BENCHFLOW_PROVIDER_BASE_URL"] == "http://localhost:8080/v1" + + @pytest.mark.asyncio + async def test_role_env_overrides_config_env(self, _mock_trial): + """Role-level 
env wins over config-level on key overlap.""" + _mock_trial._config = _make_config( + agent_env={"KEY": "from-config", "SHARED": "config-val"}, + role_env={"SHARED": "role-val", "ROLE_ONLY": "yes"}, + ) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + env = captured["env"] + assert env["KEY"] == "from-config" + assert env["SHARED"] == "role-val" + assert env["ROLE_ONLY"] == "yes" + + @pytest.mark.asyncio + async def test_all_keys_present_in_merge(self, _mock_trial): + """Non-overlapping keys from both dicts are all present.""" + _mock_trial._config = _make_config( + agent_env={"A": "1", "B": "2"}, + role_env={"C": "3", "D": "4"}, + ) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + env = captured["env"] + assert env == {"A": "1", "B": "2", "C": "3", "D": "4"} + + @pytest.mark.asyncio + async def test_none_config_env_with_empty_role_env(self, _mock_trial): + """cfg.agent_env=None + empty role.env does not crash.""" + _mock_trial._config = _make_config(agent_env=None, role_env={}) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + 
mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + assert captured["env"] == {} diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 142c0634..a61333da 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -21,11 +21,9 @@ from __future__ import annotations -import importlib from pathlib import Path from unittest.mock import AsyncMock, patch -import pytest from typer.testing import CliRunner diff --git a/tests/test_process.py b/tests/test_process.py index c604a580..68ea0a0c 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -202,3 +202,99 @@ async def capture_communicate(data=None): assert ( f"export SINGLE_QUOTE={shlex.quote('it' + chr(39) + 's a test')}" in content ) + + +class TestDaytonaProcessEnvFilePath: + """Regression: env-file path must be unique without relying on shell `$$` expansion. + + Guards the fix from PR #198 against the regression introduced by PR #193 + (DinD compose ACP via Daytona PTY WebSocket, commit cdccac7). + + The DinD branch builds an inner `docker compose exec --env-file PATH ...` + command and runs it through `shlex.join()`, which single-quotes any `$$` + (preventing remote shell expansion). The `cat > PATH` heredoc that writes + the file uses raw f-string interpolation where `$$` IS expanded. If the + path contains `$$`, the file is written to one path and read from another + — env vars silently disappear. + + The direct (non-DinD) branch uses raw f-string in both write and read, so + `$$` would expand consistently — but uuid is robust against future quoting + changes. Pin both branches. 
+ """ + + @pytest.mark.asyncio + async def test_dind_env_file_path_does_not_use_shell_pid_expansion(self): + """DinD path must not use $$ — shlex.join would quote it literally.""" + from unittest.mock import MagicMock + + from benchflow.process import DaytonaProcess + + sandbox = MagicMock() + sandbox.create_ssh_access = AsyncMock( + return_value=MagicMock(token="abc") + ) + proc = DaytonaProcess( + sandbox=sandbox, + is_dind=True, + compose_cmd_prefix="", + compose_cmd_base="docker compose -p test", + ) + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.append(list(args)) + mock_proc = AsyncMock() + mock_proc.pid = 12345 + mock_proc.returncode = 0 + mock_proc.stdin = AsyncMock() + mock_proc.stdout = AsyncMock() + mock_proc.stderr = AsyncMock() + mock_proc.communicate = AsyncMock(return_value=(b"", b"")) + return mock_proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + await proc.start(command="echo hi", env={"FOO": "bar"}) + + # Last arg of ssh is the remote command. Search it for $$ + remote_cmd = captured[0][-1] + assert "$$" not in remote_cmd, ( + "$$ in remote command — shlex.join() will quote it, mismatching " + f"the cat heredoc that does expand it. 
Got: {remote_cmd[:200]!r}" + ) + # And: a real path was used (literal hex suffix, no shell variable) + assert "/tmp/benchflow_env_" in remote_cmd + assert "--env-file" in remote_cmd + + @pytest.mark.asyncio + async def test_direct_sandbox_env_file_path_does_not_use_shell_pid_expansion(self): + """Direct (non-DinD) path is currently safe with $$, but pin the uuid form for robustness.""" + from unittest.mock import MagicMock + + from benchflow.process import DaytonaProcess + + sandbox = MagicMock() + sandbox.create_ssh_access = AsyncMock( + return_value=MagicMock(token="abc") + ) + proc = DaytonaProcess(sandbox=sandbox, is_dind=False) + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.append(list(args)) + mock_proc = AsyncMock() + mock_proc.pid = 12345 + mock_proc.returncode = 0 + mock_proc.stdin = AsyncMock() + mock_proc.stdout = AsyncMock() + mock_proc.stderr = AsyncMock() + mock_proc.communicate = AsyncMock(return_value=(b"", b"")) + return mock_proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + await proc.start(command="echo hi", env={"FOO": "bar"}) + + remote_cmd = captured[0][-1] + assert "$$" not in remote_cmd + assert "/tmp/benchflow_env_" in remote_cmd diff --git a/tests/test_registry_invariants.py b/tests/test_registry_invariants.py index 023945bc..868ffb3c 100644 --- a/tests/test_registry_invariants.py +++ b/tests/test_registry_invariants.py @@ -49,6 +49,7 @@ def test_agent_field_shapes(name, cfg): f"api_protocol={cfg.api_protocol!r} not in {VALID_API_PROTOCOLS}" ) assert isinstance(cfg.install_timeout, int) and cfg.install_timeout > 0 + assert isinstance(cfg.supports_acp_set_model, bool) @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) @@ -75,6 +76,22 @@ def test_agent_collection_invariants(name, cfg): assert d.startswith("."), f"home_dirs entry {d!r} must start with '.'" +@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) +def 
test_agent_install_cmd_targets_shared_paths(name, cfg): + """Installed binaries must land in shared prefixes, not a root-only home. + + setup_sandbox_user() no longer recursively copies /root/.nvm or + /root/.local/bin into the sandbox home. If an install_cmd placed its + binary there, the sandbox user would silently lose access to the agent. + """ + forbidden_binary_prefixes = ("/root/.nvm/", "/root/.local/bin/", "$HOME/.nvm/") + for prefix in forbidden_binary_prefixes: + assert prefix not in cfg.install_cmd, ( + f"{name!r} install_cmd writes under {prefix!r}; use /usr/local/bin " + f"or another shared prefix so the sandbox user inherits the tool" + ) + + @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) def test_agent_credential_and_subscription_auth(name, cfg): """Optional credential_files and subscription_auth structures.""" diff --git a/tests/test_rewards_jsonl.py b/tests/test_rewards_jsonl.py index e81f3164..bbc029df 100644 --- a/tests/test_rewards_jsonl.py +++ b/tests/test_rewards_jsonl.py @@ -93,7 +93,7 @@ def test_rubric_plus_terminal_no_rubric_in_meta(tmp_path: Path) -> None: json.loads(ln) for ln in (tmp_path / "rewards.jsonl").read_text().strip().splitlines() ] - terminal = [ln for ln in lines if ln["type"] == "terminal"][0] + terminal = next(ln for ln in lines if ln["type"] == "terminal") assert "rubric" not in terminal["meta"] diff --git a/tests/test_runtime.py b/tests/test_runtime.py index 16046fed..37819f71 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -36,6 +36,7 @@ def test_agent_env_default_empty() -> None: def test_runtime_config_defaults() -> None: c = RuntimeConfig() assert c.sandbox_user == "agent" + assert c.sandbox_setup_timeout == 120 assert c.max_rounds == 10 assert c.snapshot_policy == "none" assert c.reward_stream is True @@ -143,9 +144,10 @@ def test_runtime_custom_config() -> None: if not (task_path / "task.toml").exists(): return env = Environment.from_task(task_path, backend="daytona") 
- config = RuntimeConfig(sandbox_user=None, timeout=1800) + config = RuntimeConfig(sandbox_user=None, timeout=1800, sandbox_setup_timeout=45) runtime = Runtime(env, agent, config) assert runtime.config.sandbox_user is None + assert runtime.config.sandbox_setup_timeout == 45 assert runtime.config.timeout == 1800 diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 4c7f9f2f..36a8c805 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -1,23 +1,21 @@ -"""Tests for sandbox user directory derivation from agent registry.""" +"""Tests for sandbox user config/auth directory derivation from agent registry.""" from benchflow.agents.registry import ( AGENTS, AgentConfig, + HostAuthFile, + SubscriptionAuth, get_sandbox_home_dirs, ) class TestSandboxDirs: - def test_dirs_derived_from_skill_paths(self): - """All $HOME skill_paths dirs from AGENTS registry are included.""" + def test_skill_only_dirs_not_included(self): + """Skill-only home dirs should not be copied during sandbox user setup.""" dirs = get_sandbox_home_dirs() - # claude-agent-acp has $HOME/.claude/skills - assert ".claude" in dirs - # gemini has $HOME/.gemini/skills - assert ".gemini" in dirs - # pi-acp has $HOME/.pi/agent/skills and $HOME/.agents/skills - assert ".pi" in dirs - assert ".agents" in dirs + + assert ".pi" not in dirs + assert ".agents" not in dirs def test_credential_file_dirs_included(self): """Dirs from credential_files paths are included.""" @@ -31,13 +29,25 @@ def test_home_dirs_included(self): # openclaw has home_dirs=[".openclaw"] assert ".openclaw" in dirs - def test_always_includes_local(self): - """.local is always in the dir list.""" + def test_does_not_include_legacy_local_tool_dir(self): + """.local is not included unless an agent registry path derives it.""" + dirs = get_sandbox_home_dirs() + assert ".local" not in dirs + + def test_only_includes_top_level_home_dirs(self): + """Derived entries stay at $HOME top-level, not nested tool subpaths.""" dirs = 
get_sandbox_home_dirs() - assert ".local" in dirs + assert ".local/bin" not in dirs - def test_new_agent_auto_included(self): - """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" + def test_dirs_represent_registry_backed_home_config_or_auth(self): + """Returned dirs are registry-derived user home config/auth roots.""" + dirs = get_sandbox_home_dirs() + assert {".claude", ".codex", ".gemini", ".openclaw"}.issubset(dirs) + assert ".agents" not in dirs + assert ".pi" not in dirs + + def test_new_agent_skill_path_not_auto_included(self): + """Skill-only home dirs should not become sandbox copy targets.""" AGENTS["_test_agent"] = AgentConfig( name="_test_agent", install_cmd="true", @@ -46,10 +56,33 @@ def test_new_agent_auto_included(self): ) try: dirs = get_sandbox_home_dirs() - assert ".newagent" in dirs + assert ".newagent" not in dirs finally: del AGENTS["_test_agent"] + def test_subscription_auth_file_dirs_included(self): + """Dirs from subscription_auth.files container paths are included.""" + AGENTS["_test_agent_subscription_auth"] = AgentConfig( + name="_test_agent_subscription_auth", + install_cmd="true", + launch_cmd="true", + subscription_auth=SubscriptionAuth( + replaces_env="TEST_API_KEY", + detect_file="~/.subauth/login.json", + files=[ + HostAuthFile( + "~/.subauth/login.json", + "{home}/.subauth/login.json", + ) + ], + ), + ) + try: + dirs = get_sandbox_home_dirs() + assert ".subauth" in dirs + finally: + del AGENTS["_test_agent_subscription_auth"] + def test_workspace_paths_excluded(self): """$WORKSPACE paths are not included (only $HOME paths).""" dirs = get_sandbox_home_dirs() diff --git a/tests/test_sandbox_hardening.py b/tests/test_sandbox_hardening.py index c12b9574..6396c4ee 100644 --- a/tests/test_sandbox_hardening.py +++ b/tests/test_sandbox_hardening.py @@ -152,7 +152,7 @@ async def test_with_sandbox_user(self, harness): assert "sitecustomize.py" in cleanup_cmd and ".pth" in cleanup_cmd assert "-not -path 
'/tests/*'" in cleanup_cmd injected = task.config.verifier.env - assert "--rootdir=/tests" in injected["PYTEST_ADDOPTS"] + assert "--rootdir=/app" in injected["PYTEST_ADDOPTS"] assert "-p no:cacheprovider" in injected["PYTEST_ADDOPTS"] assert injected["PYTHONPATH"] == "" assert "PYTHONHOME" not in injected # breaks Py_Initialize if set to "" @@ -171,7 +171,7 @@ async def test_without_sandbox_user(self, harness): assert all("pkill" not in c for c in cmds) assert any("conftest.py" in c for c in cmds) addopts = task.config.verifier.env["PYTEST_ADDOPTS"] - assert "--rootdir=/tests" in addopts + assert "--rootdir=/app" in addopts assert "-p no:cacheprovider" in addopts @pytest.mark.asyncio @@ -629,7 +629,7 @@ def test_env_contract(self): assert "-c /dev/null" in addopts assert "--confcutdir=/tests" in addopts - assert "--rootdir=/tests" in addopts + assert "--rootdir=/app" in addopts assert "-p no:cacheprovider" in addopts assert ( "PYTHONSAFEPATH" not in VERIFIER_ENV @@ -662,34 +662,41 @@ async def test_distro_pip_env_fedora(self): ) assert await _distro_pip_env(env) == {"PIP_PREFIX": "/usr/local"} - def test_pip_installed_pytest_plugin_inferred(self, tmp_path): - """`pip install pytest-asyncio` in test.sh auto-loads pytest_asyncio.""" - from benchflow._sandbox import _plugins_from_verifier_script + @pytest.mark.asyncio + async def test_container_plugin_discovery_merged_into_addopts(self): + """Plugins discovered from root-owned container packages appear as -p flags.""" + from benchflow._sandbox import harden_before_verify - task_dir = tmp_path / "task" - (task_dir / "tests").mkdir(parents=True) - (task_dir / "tests" / "test.sh").write_text( - "pip3 install --break-system-packages pytest==8.3.4 pytest-asyncio==0.24.0\n" - "pytest /tests/test_perf.py\n" - ) - task = MagicMock() - task.task_dir = str(task_dir) - plugins = _plugins_from_verifier_script(task) - assert "pytest_asyncio" in plugins + def side_effect(cmd, **kwargs): + if "_DISCOVER_PYTEST" in str(cmd) or 
"importlib.metadata" in str(cmd): + return MagicMock( + stdout='["benchmark", "xdist"]', stderr="", exit_code=0 + ) + return MagicMock(stdout="", stderr="", exit_code=0) - def test_pip_install_unrelated_mention_not_inferred(self, tmp_path): - """A bare mention of 'pytest-asyncio' in a comment must not auto-load it.""" - from benchflow._sandbox import _plugins_from_verifier_script + env = _make_env(side_effect=side_effect) + task = _make_task() + await harden_before_verify(env, task, sandbox_user=None) - task_dir = tmp_path / "task" - (task_dir / "tests").mkdir(parents=True) - (task_dir / "tests" / "test.sh").write_text( - "# tests using pytest-asyncio decorators (already installed in image)\n" - "pytest /tests/test_perf.py\n" - ) - task = MagicMock() - task.task_dir = str(task_dir) - assert _plugins_from_verifier_script(task) == [] + addopts = task.config.verifier.env["PYTEST_ADDOPTS"] + assert "-p benchmark" in addopts + assert "-p xdist" in addopts + + @pytest.mark.asyncio + async def test_plugin_discovery_failure_graceful(self): + """If container-side discovery fails, hardening proceeds without extra plugins.""" + from benchflow._sandbox import VERIFIER_ENV, harden_before_verify + + def side_effect(cmd, **kwargs): + if "importlib.metadata" in str(cmd): + raise RuntimeError("no python3") + return MagicMock(stdout="", stderr="", exit_code=0) + + env = _make_env(side_effect=side_effect) + task = _make_task() + await harden_before_verify(env, task, sandbox_user=None) + + assert task.config.verifier.env["PYTEST_ADDOPTS"] == VERIFIER_ENV["PYTEST_ADDOPTS"] @pytest.mark.asyncio async def test_distro_pip_env_ubuntu(self): @@ -831,77 +838,20 @@ async def test_plugin_autoload_disabled_survives_task_env_override(self): @pytest.mark.asyncio async def test_per_task_plugins_appended_to_addopts(self): - """pytest_plugins are translated to -p flags; PYTEST_DISABLE_PLUGIN_AUTOLOAD must survive.""" + """pytest_plugins from task.toml are translated to -p flags.""" from 
benchflow._sandbox import harden_before_verify env = _make_env(side_effect=_manifest_env(_blank_manifest())) task = _make_task() - task.config.verifier.pytest_plugins = ["pytest-json-ctrf", "myplug"] + task.config.verifier.pytest_plugins = ["ctrf", "myplug"] await harden_before_verify(env, task, sandbox_user=None, workspace=None) final_env = task.config.verifier.env addopts = final_env.get("PYTEST_ADDOPTS", "") assert "-p ctrf" in addopts assert "-p myplug" in addopts - # The security flag must still be present after the plugin flags are appended. assert final_env.get("PYTEST_DISABLE_PLUGIN_AUTOLOAD") == "1" - @pytest.mark.asyncio - async def test_legacy_ctrf_script_gets_allowlisted_plugin(self, tmp_path): - """Legacy verifier scripts using --ctrf get -p ctrf without pytest autoload.""" - from benchflow._sandbox import harden_before_verify - - tests_dir = tmp_path / "tests" - tests_dir.mkdir() - (tests_dir / "test.sh").write_text( - "uvx --with pytest-json-ctrf==0.3.5 " - "pytest --ctrf /logs/verifier/ctrf.json /tests/test_outputs.py\n" - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - final_env = task.config.verifier.env - assert "-p ctrf" in final_env["PYTEST_ADDOPTS"] - assert final_env.get("PYTEST_DISABLE_PLUGIN_AUTOLOAD") == "1" - - @pytest.mark.asyncio - async def test_legacy_json_report_script_gets_allowlisted_plugin(self, tmp_path): - """Legacy verifier scripts using --json-report get the right import name.""" - from benchflow._sandbox import harden_before_verify - - tests_dir = tmp_path / "tests" - tests_dir.mkdir() - (tests_dir / "test.sh").write_text( - "pytest --json-report " - "--json-report-file=/logs/verifier/report.json /tests/test_outputs.py\n" - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - await harden_before_verify(env, task, 
sandbox_user=None, workspace=None) - - assert "-p pytest_jsonreport" in task.config.verifier.env["PYTEST_ADDOPTS"] - - @pytest.mark.asyncio - async def test_task_toml_pytest_plugins_fallback(self, tmp_path): - """Raw task.toml pytest_plugins work even if Harbor ignores the extra field.""" - from benchflow._sandbox import harden_before_verify - - (tmp_path / "task.toml").write_text( - '[verifier]\npytest_plugins = ["pytest_json_ctrf", "pytest-json-report"]\n' - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - task.config.verifier.pytest_plugins = None - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - addopts = task.config.verifier.env["PYTEST_ADDOPTS"] - assert "-p ctrf" in addopts - assert "-p pytest_jsonreport" in addopts - @pytest.mark.asyncio @pytest.mark.parametrize("plugins", [None, []]) async def test_no_extra_addopts_when_no_plugins(self, plugins): @@ -964,7 +914,7 @@ async def test_pytest_addopts_task_override_with_plugins(self): env = _make_env() task = _make_task() task.config.verifier.env = {"PYTEST_ADDOPTS": "--rootdir=/evil"} - task.config.verifier.pytest_plugins = ["pytest-json-ctrf"] + task.config.verifier.pytest_plugins = ["ctrf"] await harden_before_verify(env, task, sandbox_user=None) addopts = task.config.verifier.env["PYTEST_ADDOPTS"] @@ -1163,22 +1113,83 @@ def test_devnull_blocks_hostile_pyproject(self, tmp_path): ) -class TestSandboxFailureModes: - """Recovery paths when untrusted inputs (task.toml, PATH extras) are malformed.""" +class TestHardeningOptOuts: + """Per-task [verifier.hardening] opt-outs from task.toml.""" + + def test_defaults_when_no_task_dir(self): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config - def test_harden_before_verify_survives_manifest_read_failure(self, tmp_path): - """Truncated task.toml must not propagate TOMLDecodeError; treat as no plugins.""" - from benchflow._sandbox import 
_declared_pytest_plugins + assert _read_hardening_config(None) == HARDENING_DEFAULTS + + def test_defaults_when_no_hardening_section(self, tmp_path): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config - # Truncated inline-table → tomllib.TOMLDecodeError (tmp_path / "task.toml").write_text( - "[verifier]\npytest_plugins = [\n", + "[verifier]\ntimeout_sec = 60\n" ) - task = _make_task() - task.task_dir = str(tmp_path) - task.config.verifier.pytest_plugins = None + assert _read_hardening_config(tmp_path) == HARDENING_DEFAULTS + + def test_opt_out_cleanup_conftests(self, tmp_path): + from benchflow._sandbox import _read_hardening_config - assert _declared_pytest_plugins(task) == [] + (tmp_path / "task.toml").write_text( + "[verifier.hardening]\ncleanup_conftests = false\n" + ) + cfg = _read_hardening_config(tmp_path) + assert cfg["cleanup_conftests"] is False + + def test_unknown_key_logged_not_applied(self, tmp_path, caplog): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config + + (tmp_path / "task.toml").write_text( + "[verifier.hardening]\nbogus_flag = true\n" + ) + cfg = _read_hardening_config(tmp_path) + assert cfg == HARDENING_DEFAULTS # bogus key ignored + assert any("bogus_flag" in r.message for r in caplog.records) + + def test_invalid_value_type_rejected(self, tmp_path, caplog): + from benchflow._sandbox import _read_hardening_config + + (tmp_path / "task.toml").write_text( + '[verifier.hardening]\ncleanup_conftests = "false"\n' + ) + cfg = _read_hardening_config(tmp_path) + # String "false" is not bool — rejected, default applied + assert cfg["cleanup_conftests"] is True + + def test_build_cleanup_includes_conftest_by_default(self): + from benchflow._sandbox import _build_cleanup_cmd + + cmd = _build_cleanup_cmd() + assert "find / -name conftest.py" in cmd + assert "find /tmp /var/tmp" in cmd + assert "sitecustomize.py" in cmd + + def test_build_cleanup_skips_conftest_when_disabled(self): + from 
benchflow._sandbox import _build_cleanup_cmd + + cmd = _build_cleanup_cmd({"cleanup_conftests": False}) + assert "find / -name conftest.py" not in cmd + # Other steps still run + assert "find /tmp /var/tmp" in cmd + assert "sitecustomize.py" in cmd + + +class TestSandboxFailureModes: + """Recovery paths when untrusted inputs (task.toml, PATH extras) are malformed.""" + + @pytest.mark.asyncio + async def test_plugin_discovery_bad_json_graceful(self): + """Malformed JSON from container plugin discovery falls back gracefully.""" + from benchflow._sandbox import _discover_pytest_plugin_flags + + env = _make_env(side_effect=lambda cmd, **kw: MagicMock( + stdout="not valid json", stderr="", exit_code=0 + )) + task = _make_task() + flags = await _discover_pytest_plugin_flags(env, task) + assert flags == "" @pytest.mark.asyncio async def test_trusted_path_extras_malformed_json_falls_back(self): diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py new file mode 100644 index 00000000..f81b392e --- /dev/null +++ b/tests/test_sandbox_setup.py @@ -0,0 +1,106 @@ +"""Focused contract tests for setup_sandbox_user() shell command generation.""" + +import re +import shlex +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._sandbox import setup_sandbox_user +from benchflow.agents.registry import get_sandbox_home_dirs + + +async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(stdout="", stderr="", exit_code=0)) + + await setup_sandbox_user(env, sandbox_user, workspace) + + env.exec.assert_awaited_once() + return env.exec.call_args.args[0], env.exec.call_args.kwargs + + +def _assert_conditional_legacy_symlink(cmd: str, *, source: str, dest: str) -> None: + """Legacy tool dirs should link only when a root-only install exists.""" + assert re.search( + rf"if \[ -e [\"']?{re.escape(source)}[\"']? 
\].*ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?.*fi", + cmd, + ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" + + +def _get_copy_loop_dirs(cmd: str) -> list[str]: + """Extract the general home-dir copy loop payload from the shell command.""" + match = re.search(r"for d in (?P.*?); do", cmd) + assert match, f"expected general home-dir copy loop in setup command: {cmd}" + return match.group("dirs").split() + + +class TestSetupSandboxUser: + @pytest.mark.asyncio + async def test_setup_command_avoids_recursive_root_tool_copies(self): + """Heavy root-owned tool dirs should no longer be recursively copied.""" + cmd, kwargs = await _run_setup_sandbox_user() + + assert "cp -aL /root/.local/bin/." not in cmd + assert "cp -a /root/.nvm/." not in cmd + assert kwargs["timeout_sec"] == 120 + + @pytest.mark.asyncio + async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspace(self): + """The non-copy setup contract still creates the user and grants access.""" + cmd, _ = await _run_setup_sandbox_user() + + assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd + assert "mkdir -p /home/agent/.local/bin" not in cmd + assert "chown -R agent:agent /home/agent" in cmd + assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd + + @pytest.mark.asyncio + async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): + """Legacy root-only tool dirs should use conditional symlinks, not duplication.""" + cmd, _ = await _run_setup_sandbox_user() + + _assert_conditional_legacy_symlink( + cmd, + source="/root/.local/bin", + dest="/home/agent/.local/bin", + ) + _assert_conditional_legacy_symlink( + cmd, source="/root/.nvm", dest="/home/agent/.nvm" + ) + assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd + assert "cp -a /root/.nvm/. 
/home/agent/.nvm/" not in cmd + + @pytest.mark.asyncio + async def test_setup_command_copy_loop_excludes_local_dir(self): + """General home-dir copying should narrow to small config/auth dirs only.""" + cmd, _ = await _run_setup_sandbox_user() + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + + assert copy_loop_dirs == sorted( + d for d in get_sandbox_home_dirs() if d != ".local" + ) + assert ".local" not in copy_loop_dirs + assert "mkdir -p /home/agent/$d" in cmd + assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd + + @pytest.mark.asyncio + async def test_setup_command_does_not_copy_heavy_tool_trees_into_home(self): + """BenchFlow-installed agents must not rely on sandbox-home tool copies. + + Agent binaries are placed in /usr/local/bin by the registered install_cmd + values, so setup_sandbox_user() must not bulk-copy heavyweight tool trees + (e.g. /root/.nvm, /root/.local/bin) to make them executable for the user. + """ + cmd, _ = await _run_setup_sandbox_user() + + for heavy_source in ("/root/.nvm", "/root/.local/bin"): + assert f"cp -a {heavy_source}/." not in cmd + assert f"cp -aL {heavy_source}/." not in cmd + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + for heavy_dir in (".nvm", ".local"): + assert heavy_dir not in copy_loop_dirs, ( + f"sandbox copy loop must not include heavy tool dir {heavy_dir!r}" + ) diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py new file mode 100644 index 00000000..5b6f07ad --- /dev/null +++ b/tests/test_scene_outbox_trial.py @@ -0,0 +1,341 @@ +"""Tests for outbox-based inter-role messaging in Trial._run_scene(). + +Verifies that when bf.run(TrialConfig) executes a multi-role Scene, +outbox files written by one role are read and injected into the next +role's prompt — bridging the _scene.py outbox convention with the +Trial lifecycle. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import AsyncMock + +import pytest + +from benchflow.trial import Role, Scene, Trial, TrialConfig, Turn + + +@dataclass +class FakeExecResult: + stdout: str = "" + stderr: str = "" + return_code: int = 0 + + +class FakeEnv: + """Minimal sandbox mock that tracks outbox files.""" + + def __init__(self) -> None: + self._files: dict[str, str] = {} + self._exec_log: list[str] = [] + + async def exec(self, cmd: str, **kwargs) -> FakeExecResult: + self._exec_log.append(cmd) + if "rm -rf /app/.outbox" in cmd: + self._files = {k: v for k, v in self._files.items() + if not k.startswith("/app/.outbox/")} + return FakeExecResult() + if "ls /app/.outbox/" in cmd: + files = [f for f in self._files if f.startswith("/app/.outbox/")] + return FakeExecResult(stdout="\n".join(files)) + if cmd.startswith("cat "): + path = cmd.split(" ", 1)[1] + return FakeExecResult(stdout=self._files.get(path, "{}")) + if cmd.startswith("rm -f "): + path = cmd.split()[-1] + self._files.pop(path, None) + return FakeExecResult() + return FakeExecResult() + + def stage_outbox(self, recipient: str, content: str) -> None: + self._files[f"/app/.outbox/{recipient}.json"] = json.dumps( + {"to": recipient, "content": content} + ) + + +def _make_trial(scene: Scene) -> Trial: + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[scene], + environment="docker", + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task"] + return trial + + +@pytest.fixture +def coder_reviewer_scene() -> Scene: + return Scene( + name="code-review", + roles=[ + Role("coder", "gemini", "gemini-3.1-flash-lite-preview"), + Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"), + ], + turns=[ + Turn("coder"), + Turn("reviewer", "Review the code. 
Write feedback to /app/.outbox/coder.json"), + Turn("coder", "Read feedback and fix issues."), + ], + ) + + +@pytest.fixture +def self_review_scene() -> Scene: + return Scene( + name="self-review", + roles=[Role("agent", "gemini", "gemini-3.1-flash-lite-preview")], + turns=[ + Turn("agent"), + Turn("agent", "Review your solution and fix edge cases."), + ], + ) + + +async def test_outbox_setup_for_multi_role(coder_reviewer_scene: Scene) -> None: + """Multi-role scenes set up /app/.outbox with correct ownership.""" + trial = _make_trial(coder_reviewer_scene) + prompts_received: list[str] = [] + + async def fake_execute(prompts=None): + prompts_received.extend(prompts or []) + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + outbox_setup = [c for c in trial._env._exec_log if "mkdir -p /app/.outbox" in c] + assert len(outbox_setup) == 1 + assert "chown agent:agent /app/.outbox" in outbox_setup[0] + + +async def test_no_outbox_setup_for_single_role(self_review_scene: Scene) -> None: + """Single-role scenes skip outbox setup (no inter-role messaging needed).""" + trial = _make_trial(self_review_scene) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(self_review_scene) + + outbox_cmds = [c for c in trial._env._exec_log if "outbox" in c] + assert len(outbox_cmds) == 0 + + +async def test_outbox_messages_injected_into_prompt(coder_reviewer_scene: Scene) -> None: + """Outbox messages from coder are injected into reviewer's prompt.""" + trial = _make_trial(coder_reviewer_scene) + prompts_received: list[tuple[str, list[str]]] = [] + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + # Track which role got which prompt + role = coder_reviewer_scene.turns[call_count].role + prompts_received.append((role, 
prompts or [])) + # Coder writes to reviewer outbox on first turn + if call_count == 0: + trial._env.stage_outbox("reviewer", "Please review my regex implementation") + # Reviewer writes feedback to coder outbox on second turn + elif call_count == 1: + trial._env.stage_outbox("coder", "Edge case: empty string input not handled") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + assert len(prompts_received) == 3 + + # Turn 0: coder gets base prompt (no messages yet) + assert prompts_received[0][0] == "coder" + assert "Messages from other agents" not in prompts_received[0][1][0] + + # Turn 1: reviewer gets its prompt + coder's outbox message + assert prompts_received[1][0] == "reviewer" + assert "Please review my regex implementation" in prompts_received[1][1][0] + assert "From coder" in prompts_received[1][1][0] + + # Turn 2: coder gets its prompt + reviewer's feedback + assert prompts_received[2][0] == "coder" + assert "Edge case: empty string input not handled" in prompts_received[2][1][0] + assert "From reviewer" in prompts_received[2][1][0] + + +async def test_outbox_files_cleared_after_read(coder_reviewer_scene: Scene) -> None: + """Outbox files are removed after reading so they don't repeat.""" + trial = _make_trial(coder_reviewer_scene) + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env.stage_outbox("reviewer", "msg1") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + remaining = [f for f in trial._env._files if f.startswith("/app/.outbox/")] + assert len(remaining) == 0 + + +async def test_outbox_invalid_json_skipped(coder_reviewer_scene: Scene) -> None: + """Invalid JSON in outbox files is skipped without crashing.""" + trial = 
_make_trial(coder_reviewer_scene) + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env._files["/app/.outbox/reviewer.json"] = "not valid json{{" + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + # Should not raise + await trial._run_scene(coder_reviewer_scene) + assert call_count == 3 + + +async def test_role_switching_connects_and_disconnects(coder_reviewer_scene: Scene) -> None: + """Verify connect/disconnect happens on role switches.""" + trial = _make_trial(coder_reviewer_scene) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + # 3 turns: coder, reviewer, coder → 2 connect_as calls for role switches + 1 initial + # Initial connect for coder, then disconnect+connect for reviewer, then disconnect+connect for coder + assert trial.connect_as.call_count == 3 + # disconnect after coder->reviewer, after reviewer->coder, and final disconnect + assert trial.disconnect.call_count == 3 + + +async def test_empty_outbox_no_injection() -> None: + """When no outbox files exist, prompt is used as-is.""" + scene = Scene( + name="quiet", + roles=[ + Role("a", "gemini", "flash"), + Role("b", "gemini", "flash"), + ], + turns=[Turn("a", "do stuff"), Turn("b", "also do stuff")], + ) + trial = _make_trial(scene) + prompts_received: list[str] = [] + + async def fake_execute(prompts=None): + prompts_received.extend(prompts or []) + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(scene) + + assert prompts_received[0] == "do stuff" + assert prompts_received[1] == "also do stuff" + assert all("Messages from other agents" not in p for p in prompts_received) + + +async def 
test_scene_messages_persisted( + coder_reviewer_scene: Scene, tmp_path: Path +) -> None: + """Inter-role messages are saved to scene_messages.jsonl in trial_dir.""" + trial = _make_trial(coder_reviewer_scene) + trial._trial_dir = tmp_path + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env.stage_outbox("reviewer", "Please review my code") + elif call_count == 1: + trial._env.stage_outbox("coder", "Found a bug on line 5") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + msg_path = tmp_path / "scene_messages.jsonl" + assert msg_path.exists() + lines = [json.loads(ln) for ln in msg_path.read_text().strip().splitlines()] + assert len(lines) == 2 + assert lines[0]["sender"] == "coder" + assert lines[0]["recipient"] == "reviewer" + assert lines[0]["content"] == "Please review my code" + assert lines[1]["sender"] == "reviewer" + assert lines[1]["recipient"] == "coder" + assert lines[1]["content"] == "Found a bug on line 5" + assert lines[0]["scene"] == "code-review" + + +async def test_heterogeneous_agent_install(coder_reviewer_scene: Scene) -> None: + """connect_as installs non-primary agents.""" + scene = Scene( + name="hetero", + roles=[ + Role("coder", "gemini", "flash"), + Role("reviewer", "claude-agent-acp", "haiku"), + ], + turns=[Turn("coder"), Turn("reviewer", "Review.")], + ) + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[scene], + environment="docker", + agent="gemini", + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task"] + + installed_agents: list[str] = [] + + async def tracking_connect_as(self_inner, role): + if role.agent != config.primary_agent: + installed_agents.append(role.agent) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = lambda role: 
tracking_connect_as(trial, role) + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(scene) + + assert "claude-agent-acp" in installed_agents diff --git a/tests/test_sdk_internals.py b/tests/test_sdk_internals.py index fe000cfc..6eff44aa 100644 --- a/tests/test_sdk_internals.py +++ b/tests/test_sdk_internals.py @@ -7,6 +7,7 @@ import json from datetime import datetime from pathlib import Path +from unittest.mock import AsyncMock import pytest @@ -21,8 +22,26 @@ def _resolve(self, agent="claude-agent-acp", model=None, agent_env=None): return resolve_agent_env(agent, model, agent_env) - def test_env_mapping_applied_after_provider(self): + def _patch_expanduser(self, monkeypatch, tmp_path): + orig_expanduser = Path.expanduser + + def fake_expanduser(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig_expanduser(self) + + monkeypatch.setattr(Path, "expanduser", fake_expanduser) + + def test_env_mapping_applied_after_provider(self, monkeypatch): """env_mapping translates BENCHFLOW_PROVIDER_* → agent-native vars.""" + for key in ( + "ZAI_API_KEY", + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + ): + monkeypatch.delenv(key, raising=False) result = self._resolve( agent="claude-agent-acp", model="zai/glm-5", @@ -33,21 +52,136 @@ def test_env_mapping_applied_after_provider(self): assert "ANTHROPIC_AUTH_TOKEN" in result assert result["ANTHROPIC_AUTH_TOKEN"] == "zk-test" + def test_agent_native_api_key_satisfies_model_check(self, monkeypatch): + """Agent-native mapped key (LLM_API_KEY) can satisfy provider auth check.""" + for key in ("OPENAI_API_KEY", "LLM_API_KEY"): + monkeypatch.delenv(key, raising=False) + result = self._resolve( + agent="openhands", + model="openai/gpt-4.1-mini", + agent_env={"LLM_API_KEY": "test-llm-key"}, + ) + assert result["LLM_API_KEY"] == "test-llm-key" + assert result["BENCHFLOW_PROVIDER_API_KEY"] == "test-llm-key" + + def 
test_same_provider_native_alias_satisfies_model_check(self, monkeypatch): + """Provider-native aliases remain valid for the same auth context.""" + for key in ("GEMINI_API_KEY", "GOOGLE_API_KEY"): + monkeypatch.delenv(key, raising=False) + result = self._resolve( + agent="gemini", + model="gemini-2.5-flash", + agent_env={"GOOGLE_API_KEY": "test-google-key"}, + ) + assert result["GOOGLE_API_KEY"] == "test-google-key" + assert result["BENCHFLOW_PROVIDER_API_KEY"] == "test-google-key" + + @pytest.mark.parametrize( + ("agent", "host_key"), + [ + pytest.param("codex-acp", "OPENAI_API_KEY", id="codex-openai-key"), + pytest.param( + "claude-agent-acp", + "ANTHROPIC_AUTH_TOKEN", + id="claude-auth-token", + ), + pytest.param("gemini", "GOOGLE_API_KEY", id="gemini-google-key"), + ], + ) + def test_cross_provider_host_native_key_does_not_bypass_required_key( + self, monkeypatch, tmp_path, agent, host_key + ): + """Host-native keys for another provider must not satisfy zai auth.""" + for key in ( + "ZAI_API_KEY", + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(key, raising=False) + monkeypatch.setenv(host_key, "host-native-key") + self._patch_expanduser(monkeypatch, tmp_path) + + with pytest.raises(ValueError, match="ZAI_API_KEY required"): + self._resolve(agent=agent, model="zai/glm-5", agent_env={}) + + def test_auto_inherited_generic_bridge_key_does_not_bypass_required_key( + self, monkeypatch, tmp_path + ): + """Generic agent-native keys must be passed explicitly to bridge auth.""" + for key in ("ZAI_API_KEY", "LLM_API_KEY"): + monkeypatch.delenv(key, raising=False) + monkeypatch.setenv("LLM_API_KEY", "host-llm-key") + self._patch_expanduser(monkeypatch, tmp_path) + + with pytest.raises(ValueError, match="ZAI_API_KEY required"): + self._resolve(agent="openhands", model="zai/glm-5", agent_env={}) + + def 
test_openhands_gemini_model_is_prefixed_for_google_ai_studio(self, monkeypatch): + """OpenHands expects Gemini models in gemini/ format.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) + result = self._resolve( + agent="openhands", + model="gemini-3.1-flash-lite-preview", + agent_env={"GEMINI_API_KEY": "test-gemini-key"}, + ) + assert result["LLM_MODEL"] == "gemini/gemini-3.1-flash-lite-preview" + assert result["LLM_API_KEY"] == "test-gemini-key" + + def test_openhands_explicit_llm_model_is_preserved(self, monkeypatch): + """User-provided LLM_MODEL must win over derived normalization.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) + result = self._resolve( + agent="openhands", + model="gemini-3.1-flash-lite-preview", + agent_env={ + "GEMINI_API_KEY": "test-gemini-key", + "LLM_MODEL": "litellm/custom-format", + }, + ) + assert result["LLM_MODEL"] == "litellm/custom-format" + + def test_openhands_vertex_model_is_prefixed_for_vertex(self, monkeypatch, tmp_path): + """OpenHands expects Vertex Gemini models in vertex_ai/ format.""" + adc_dir = tmp_path / ".config" / "gcloud" + adc_dir.mkdir(parents=True) + (adc_dir / "application_default_credentials.json").write_text("{}") + monkeypatch.setattr("pathlib.Path.home", staticmethod(lambda: tmp_path)) + result = self._resolve( + agent="openhands", + model="google-vertex/gemini-2.5-flash", + agent_env={"GOOGLE_CLOUD_PROJECT": "my-proj"}, + ) + assert result["LLM_MODEL"] == "vertex_ai/gemini-2.5-flash" + + def test_provider_bridge_key_alone_does_not_bypass_required_model_key( + self, monkeypatch + ): + """Only mapped agent-native keys can bypass provider-specific key checks.""" + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + with pytest.raises(ValueError, match="OPENAI_API_KEY required"): + self._resolve( + agent="openclaw", + model="openai/gpt-4.1-mini", + agent_env={"BENCHFLOW_PROVIDER_API_KEY": "x"}, + ) + def test_required_key_missing_raises(self, monkeypatch, tmp_path): """Missing required API 
key raises ValueError when no subscription auth.""" # Clear any auto-inherited keys from the environment - for key in ("ANTHROPIC_API_KEY", "ZAI_API_KEY", "OPENAI_API_KEY"): + for key in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "ZAI_API_KEY", + "OPENAI_API_KEY", + ): monkeypatch.delenv(key, raising=False) # Ensure no host subscription auth files are found - orig_expanduser = Path.expanduser - - def fake_expanduser(self): - s = str(self) - if s.startswith("~"): - return tmp_path / s[2:] - return orig_expanduser(self) - - monkeypatch.setattr(Path, "expanduser", fake_expanduser) + self._patch_expanduser(monkeypatch, tmp_path) # Anthropic model with pytest.raises(ValueError, match="ANTHROPIC_API_KEY required"): self._resolve( @@ -208,6 +342,7 @@ def test_config_json_written(self, tmp_path): skills_dir=None, sandbox_user=None, context_root=None, + sandbox_setup_timeout=33, timeout=300, started_at=datetime(2026, 4, 8, 12, 0), agent_env={"ANTHROPIC_API_KEY": "sk-secret", "SOME_VAR": "visible"}, @@ -221,6 +356,7 @@ def test_config_json_written(self, tmp_path): "skills_dir", "sandbox_user", "sandbox_locked_paths", + "sandbox_setup_timeout", "context_root", "timeout_sec", "started_at", @@ -232,6 +368,7 @@ def test_config_json_written(self, tmp_path): assert data["agent"] == "claude-agent-acp" assert data["model"] == "claude-haiku-4-5-20251001" assert data["environment"] == "docker" + assert data["sandbox_setup_timeout"] == 33 assert data["timeout_sec"] == 300 def test_secrets_filtered(self, tmp_path): @@ -266,6 +403,41 @@ def test_secrets_filtered(self, tmp_path): assert recorded["SAFE_VAR"] == "visible" +# ── run wiring ── + + +class TestRunWiring: + """Tests for SDK.run() argument forwarding into TrialConfig.""" + + @pytest.mark.asyncio + async def test_run_forwards_sandbox_setup_timeout_to_trial_config( + self, monkeypatch, tmp_path + ): + from benchflow.models import RunResult + from benchflow.sdk import SDK + + seen = {} + + 
async def fake_create(config): + seen["config"] = config + trial = AsyncMock() + trial.run = AsyncMock( + return_value=RunResult(task_name="task-1", rewards={"reward": 1.0}) + ) + return trial + + monkeypatch.setattr("benchflow.trial.Trial.create", fake_create) + + result = await SDK().run( + task_path=tmp_path, + sandbox_setup_timeout=77, + ) + + assert result.rewards == {"reward": 1.0} + assert seen["config"].sandbox_setup_timeout == 77 + assert seen["config"].task_path == tmp_path + + # ── _build_result ── diff --git a/tests/test_sdk_lockdown.py b/tests/test_sdk_lockdown.py index 91cb7d41..0d4f92cc 100644 --- a/tests/test_sdk_lockdown.py +++ b/tests/test_sdk_lockdown.py @@ -261,7 +261,7 @@ def test_inner_command_is_shlex_quoted(self): import shlex cmd = build_priv_drop_cmd("agent --flag value", "agent") - inner = "export HOME=/home/agent && cd /home/agent && agent --flag value" + inner = "export HOME=/home/agent && agent --flag value" assert shlex.quote(inner) in cmd def test_single_quotes_in_launch(self): diff --git a/tests/test_skill_eval.py b/tests/test_skill_eval.py index c1f2b3e6..94518d15 100644 --- a/tests/test_skill_eval.py +++ b/tests/test_skill_eval.py @@ -111,7 +111,7 @@ def test_loads_valid_dataset(self, skill_dir): ds = load_eval_dataset(skill_dir) assert ds.skill_name == "calculator" assert len(ds.cases) == 2 - assert ds.judge_model == "gemini-3.1-flash-lite" + assert ds.judge_model == "claude-haiku-4-5-20251001" assert ds.timeout_sec == 120 def test_parses_cases(self, skill_dir): @@ -133,7 +133,7 @@ def test_minimal_dataset(self, minimal_skill_dir): def test_missing_evals_json(self, tmp_path): skill = tmp_path / "no-evals" skill.mkdir() - with pytest.raises(FileNotFoundError, match="evals.json"): + with pytest.raises(FileNotFoundError, match=r"evals\.json"): load_eval_dataset(skill) def test_empty_cases(self, tmp_path): diff --git a/tests/test_skill_eval_dryrun.py b/tests/test_skill_eval_dryrun.py index 39b6a57e..931a8e96 100644 --- 
a/tests/test_skill_eval_dryrun.py +++ b/tests/test_skill_eval_dryrun.py @@ -218,7 +218,7 @@ def test_gepa_export_roundtrip(self, mock_skill): # Verify trace files traces = list((gepa_dir / "traces").iterdir()) - assert len(traces) == 4 # 2 cases × 2 modes + assert len(traces) == 4 # 2 cases x 2 modes # Verify trace content trace = json.loads(traces[0].read_text()) @@ -228,6 +228,7 @@ def test_gepa_export_roundtrip(self, mock_skill): def test_cli_dryrun_loads_dataset(self, mock_skill): from typer.testing import CliRunner + from benchflow.cli.main import app runner = CliRunner() diff --git a/tests/test_subscription_auth.py b/tests/test_subscription_auth.py index 7f322b99..39ff0565 100644 --- a/tests/test_subscription_auth.py +++ b/tests/test_subscription_auth.py @@ -98,6 +98,8 @@ def test_subscription_auth_detected(self, monkeypatch, tmp_path): """When host auth file exists and no API key, subscription auth is used.""" for k in ( "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", @@ -118,6 +120,8 @@ def test_no_auth_file_raises(self, monkeypatch, tmp_path): """When no API key and no host auth file, raises ValueError.""" for k in ( "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", diff --git a/tests/test_trial_install_agent_timeout.py b/tests/test_trial_install_agent_timeout.py new file mode 100644 index 00000000..40db22c9 --- /dev/null +++ b/tests/test_trial_install_agent_timeout.py @@ -0,0 +1,91 @@ +"""Tests for Trial.install_agent timeout wiring.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow.trial import Trial, TrialConfig + + +def _make_trial(tmp_path, *, agent: str, sandbox_setup_timeout: int) -> Trial: + config = TrialConfig.from_legacy( + task_path=tmp_path / "task", + agent=agent, + prompts=[None], + sandbox_user="agent", + 
sandbox_setup_timeout=sandbox_setup_timeout, + ) + trial = Trial(config) + trial._env = MagicMock() + trial._env.exec = AsyncMock(return_value=MagicMock(stdout="/workspace\n")) + trial._trial_dir = tmp_path / "trial" + trial._trial_dir.mkdir() + trial._trial_paths = MagicMock() + trial._task = MagicMock() + trial._effective_locked = [] + return trial + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("agent", "expected_setup_return"), + [ + ("claude-agent-acp", "/home/agent"), + ("oracle", None), + ], +) +async def test_install_agent_forwards_sandbox_setup_timeout( + tmp_path, monkeypatch, agent, expected_setup_return +): + trial = _make_trial(tmp_path, agent=agent, sandbox_setup_timeout=41) + + install_agent_mock = AsyncMock(return_value=MagicMock()) + write_credential_files_mock = AsyncMock() + upload_subscription_auth_mock = AsyncMock() + snapshot_build_config_mock = AsyncMock() + seed_verifier_workspace_mock = AsyncMock() + deploy_skills_mock = AsyncMock() + lockdown_paths_mock = AsyncMock() + setup_sandbox_user_mock = AsyncMock(return_value=expected_setup_return) + + monkeypatch.setattr("benchflow.trial.install_agent", install_agent_mock) + monkeypatch.setattr( + "benchflow.trial.write_credential_files", write_credential_files_mock + ) + monkeypatch.setattr( + "benchflow.trial.upload_subscription_auth", upload_subscription_auth_mock + ) + monkeypatch.setattr( + "benchflow.trial._snapshot_build_config", snapshot_build_config_mock + ) + monkeypatch.setattr( + "benchflow.trial._seed_verifier_workspace", seed_verifier_workspace_mock + ) + monkeypatch.setattr("benchflow.trial.deploy_skills", deploy_skills_mock) + monkeypatch.setattr("benchflow.trial.lockdown_paths", lockdown_paths_mock) + monkeypatch.setattr( + "benchflow.trial.setup_sandbox_user", setup_sandbox_user_mock + ) + + await trial.install_agent() + + setup_sandbox_user_mock.assert_awaited_once() + args, kwargs = setup_sandbox_user_mock.await_args + assert args[1] == "agent" + assert 
kwargs["timeout_sec"] == 41 + assert kwargs["workspace"] == "/workspace" + + if agent == "oracle": + install_agent_mock.assert_not_awaited() + write_credential_files_mock.assert_not_awaited() + deploy_skills_mock.assert_not_awaited() + assert trial._agent_cwd == "/workspace" + else: + install_agent_mock.assert_awaited_once() + write_credential_files_mock.assert_awaited_once() + deploy_skills_mock.assert_awaited_once() + assert trial._agent_cwd == "/home/agent" + + snapshot_build_config_mock.assert_awaited_once() + seed_verifier_workspace_mock.assert_awaited_once() + lockdown_paths_mock.assert_awaited_once() diff --git a/tests/test_user.py b/tests/test_user.py new file mode 100644 index 00000000..8ebf1c4c --- /dev/null +++ b/tests/test_user.py @@ -0,0 +1,336 @@ +"""Tests for the User abstraction — BaseUser, PassthroughUser, FunctionUser, RoundResult.""" + +from __future__ import annotations + +import tempfile +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, Trial, TrialConfig, Turn +from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult + +# ── Unit tests for User types ── + + +class TestPassthroughUser: + @pytest.mark.asyncio + async def test_sends_instruction_once(self): + user = PassthroughUser() + result = await user.run(0, "Fix the bug") + assert result == "Fix the bug" + + @pytest.mark.asyncio + async def test_stops_after_first_round(self): + user = PassthroughUser() + await user.run(0, "Fix the bug") + result = await user.run(1, "Fix the bug", RoundResult(round=0)) + assert result is None + + + +class TestFunctionUser: + @pytest.mark.asyncio + async def test_sync_function(self): + def my_fn(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + return "terse: " + instruction[:10] + return None + + user = FunctionUser(my_fn) + assert await user.run(0, "Fix the authentication bug") == 
"terse: Fix the au" + assert await user.run(1, "Fix the authentication bug", RoundResult(round=0)) is None + + @pytest.mark.asyncio + async def test_async_function(self): + async def my_fn(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + return instruction + if rr and rr.rewards and rr.rewards.get("exact_match", 0) < 1.0: + return "Try again, tests failed" + return None + + user = FunctionUser(my_fn) + assert await user.run(0, "task") == "task" + + failing = RoundResult(round=0, rewards={"exact_match": 0.0}) + assert await user.run(1, "task", failing) == "Try again, tests failed" + + passing = RoundResult(round=1, rewards={"exact_match": 1.0}) + assert await user.run(2, "task", passing) is None + + +class TestBaseUser: + @pytest.mark.asyncio + async def test_not_implemented(self): + user = BaseUser() + with pytest.raises(NotImplementedError): + await user.run(0, "task") + + +# ── Integration tests for user loop in Trial ── + + +@dataclass +class FakeExecResult: + stdout: str = "" + stderr: str = "" + return_code: int = 0 + + +class FakeEnv: + """Minimal sandbox mock for user loop tests.""" + + def __init__(self) -> None: + self._exec_log: list[str] = [] + + async def exec(self, cmd: str, **kwargs) -> FakeExecResult: + self._exec_log.append(cmd) + if "cat /solution" in cmd: + return FakeExecResult(stdout="gold answer here") + if "rm -rf /solution" in cmd: + return FakeExecResult() + if "/logs/verifier" in cmd: + return FakeExecResult() + if "cat /logs/verifier" in cmd: + return FakeExecResult(stdout="") + return FakeExecResult() + + async def stop(self, **kwargs): + pass + + +def _make_user_trial( + user: BaseUser, max_rounds: int = 5, oracle: bool = False, tmp_path: Path | None = None, +) -> Trial: + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[Scene.single(agent="gemini", model="flash")], + environment="docker", + user=user, + max_user_rounds=max_rounds, + oracle_access=oracle, + ) + trial = Trial(config) + 
trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task described in /app/instruction.md"] + trial_dir = tmp_path or Path(tempfile.mkdtemp(prefix="benchflow-test-")) + trial._trial_dir = trial_dir + verifier_dir = trial_dir / "verifier" + verifier_dir.mkdir(parents=True, exist_ok=True) + trial._trial_paths = type("P", (), {"verifier_dir": verifier_dir})() + trial._task = type("T", (), { + "config": type("C", (), { + "verifier": type("V", (), { + "timeout_sec": 30, + "env": {}, + })(), + "agent": type("A", (), {"timeout_sec": 60})(), + })(), + })() + trial._agent_cwd = "/app" + return trial + + +class RecordingUser(BaseUser): + """User that records calls and stops after a fixed number of rounds.""" + + def __init__(self, max_rounds: int = 2, prompts: list[str] | None = None): + self.setup_calls: list[tuple[str, str | None]] = [] + self.run_calls: list[tuple[int, str, RoundResult | None]] = [] + self._max = max_rounds + self._prompts = prompts or ["Do the thing", "Try harder"] + + async def setup(self, instruction: str, solution: str | None = None) -> None: + self.setup_calls.append((instruction, solution)) + + async def run( + self, round: int, instruction: str, round_result: RoundResult | None = None + ) -> str | None: + self.run_calls.append((round, instruction, round_result)) + if round >= self._max: + return None + return self._prompts[min(round, len(self._prompts) - 1)] + + +class TestUserLoop: + @pytest.mark.asyncio + async def test_user_loop_calls_setup_and_run(self): + user = RecordingUser(max_rounds=1) + trial = _make_user_trial(user, max_rounds=3) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=({"exact_match": 1.0}, None, None)): + + await trial._run_user_loop() + + assert len(user.setup_calls) == 1 + 
assert user.setup_calls[0][0] == "Solve the task described in /app/instruction.md" + assert len(user.run_calls) == 2 # round 0 → prompt, round 1 → None + assert user.run_calls[0][0] == 0 # round number + assert user.run_calls[1][0] == 1 + + @pytest.mark.asyncio + async def test_user_loop_passes_round_result(self): + user = RecordingUser(max_rounds=2) + trial = _make_user_trial(user, max_rounds=5) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=({"exact_match": 0.5}, "1 failed", None)): + + await trial._run_user_loop() + + # First call has no round_result + assert user.run_calls[0][2] is None + # Second call has round_result from round 0 + rr = user.run_calls[1][2] + assert isinstance(rr, RoundResult) + assert rr.round == 0 + assert rr.rewards == {"exact_match": 0.5} + assert rr.verifier_output == "1 failed" + + @pytest.mark.asyncio + async def test_user_loop_respects_max_rounds(self): + """User that never stops is capped by max_user_rounds.""" + def never_stop(r, instr, rr): + return "keep going" + + user = FunctionUser(never_stop) + trial = _make_user_trial(user, max_rounds=3) + + call_count = 0 + async def mock_execute(prompts=None): + nonlocal call_count + call_count += 1 + return [], 0 + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", side_effect=mock_execute), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert call_count == 3 + + @pytest.mark.asyncio + async def test_oracle_access(self): + user = RecordingUser(max_rounds=0) + trial = _make_user_trial(user, oracle=True) + + with patch.object(trial, "connect_as", 
new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert user.setup_calls[0][1] == "gold answer here" + + @pytest.mark.asyncio + async def test_multi_role_raises(self): + user = RecordingUser() + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[Scene( + name="multi", + roles=[Role("a", "gemini"), Role("b", "gemini")], + turns=[Turn("a"), Turn("b")], + )], + user=user, + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["task"] + + with pytest.raises(ValueError, match="single-role"): + await trial._run_user_loop() + + @pytest.mark.asyncio + async def test_user_run_exception_stops_loop(self): + class FailingUser(BaseUser): + async def run(self, round, instruction, rr=None): + raise RuntimeError("oops") + + trial = _make_user_trial(FailingUser()) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert "user.run() failed" in trial._error + + +class TestSoftVerify: + @pytest.mark.asyncio + async def test_soft_verify_timeout(self): + trial = _make_user_trial(PassthroughUser()) + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(side_effect=TimeoutError()) + + rewards, output, error = await trial.soft_verify() + + assert rewards is None + assert output is None + assert "timed out" in error + + @pytest.mark.asyncio + async def test_soft_verify_crash(self): + 
trial = _make_user_trial(PassthroughUser()) + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(side_effect=RuntimeError("boom")) + + rewards, _output, error = await trial.soft_verify() + + assert rewards is None + assert "crashed" in error + assert "boom" in error + + @pytest.mark.asyncio + async def test_soft_verify_success(self): + trial = _make_user_trial(PassthroughUser()) + + mock_result = type("VR", (), {"rewards": {"exact_match": 1.0}})() + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(return_value=mock_result) + + rewards, _output, error = await trial.soft_verify() + + assert rewards == {"exact_match": 1.0} + assert error is None + + @pytest.mark.asyncio + async def test_soft_verify_runs_cleanup_cmd(self): + trial = _make_user_trial(PassthroughUser()) + + mock_result = type("VR", (), {"rewards": {}})() + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox._build_cleanup_cmd", + return_value="echo cleanup_sentinel"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(return_value=mock_result) + + await trial.soft_verify() + + # Verify cleanup command was executed + exec_log = trial._env._exec_log + assert any("cleanup_sentinel" in cmd for cmd in exec_log) diff --git a/tests/test_verify.py b/tests/test_verify.py index c3a790fa..93f4a51f 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -476,9 +476,9 @@ def sdk_run_mocks(self, tmp_path): def _patch_sdk_run(self, sdk, mock_env, extra_patches): """Apply shared + extra patches for SDK.run() internals.""" patches = [ - patch("benchflow.sdk._create_environment", return_value=mock_env), + patch("benchflow.trial._create_environment", return_value=mock_env), patch( - 
"benchflow.sdk.install_agent", + "benchflow.trial.install_agent", new_callable=AsyncMock, return_value=MagicMock( credential_files={}, @@ -487,8 +487,8 @@ def _patch_sdk_run(self, sdk, mock_env, extra_patches): env_mapping={}, ), ), - patch("benchflow.sdk.write_credential_files", new_callable=AsyncMock), - patch("benchflow.sdk.deploy_skills", new_callable=AsyncMock), + patch("benchflow.trial.write_credential_files", new_callable=AsyncMock), + patch("benchflow.trial.deploy_skills", new_callable=AsyncMock), *extra_patches, ] with contextlib.ExitStack() as stack: @@ -516,23 +516,22 @@ async def test_scraped_trajectory_preserves_n_tool_calls( mock_env, [ patch( - "benchflow.sdk.connect_acp", + "benchflow.trial.connect_acp", new_callable=AsyncMock, return_value=(mock_acp, mock_session, "test-agent"), ), patch( - "benchflow.sdk.execute_prompts", + "benchflow.trial.execute_prompts", new_callable=AsyncMock, return_value=([], 5), ), patch( - "benchflow.sdk._scrape_agent_trajectory", + "benchflow.trial._scrape_agent_trajectory", new_callable=AsyncMock, return_value=forged, ), - patch.object( - sdk, - "_verify", + patch( + "benchflow.sdk.SDK._verify", new_callable=AsyncMock, return_value=({"reward": 1.0}, None), ), @@ -568,21 +567,21 @@ async def test_partial_acp_uses_session_tool_calls(self, sdk_run_mocks): mock_env, [ patch( - "benchflow.sdk.connect_acp", + "benchflow.trial.connect_acp", new_callable=AsyncMock, return_value=(mock_acp, mock_session, "test-agent"), ), patch( - "benchflow.sdk.execute_prompts", + "benchflow.trial.execute_prompts", new_callable=AsyncMock, side_effect=ConnectionError("lost"), ), patch( - "benchflow.sdk._capture_session_trajectory", + "benchflow.trial._capture_session_trajectory", return_value=partial_events, ), patch( - "benchflow.sdk._scrape_agent_trajectory", + "benchflow.trial._scrape_agent_trajectory", new_callable=AsyncMock, return_value=[], ), diff --git a/tests/test_yaml_config.py b/tests/test_yaml_config.py index d362296e..0de12aac 100644 
--- a/tests/test_yaml_config.py +++ b/tests/test_yaml_config.py @@ -24,6 +24,7 @@ def native_yaml(tmp_path): environment: daytona concurrency: 32 max_retries: 1 +sandbox_setup_timeout: 45 prompts: - null - "Review your solution." @@ -46,6 +47,7 @@ def harbor_yaml(tmp_path): orchestrator: type: local n_concurrent_trials: 8 +sandbox_setup_timeout: 75 environment: type: daytona env: @@ -69,6 +71,7 @@ def test_from_native_yaml(native_yaml): assert cfg.environment == "daytona" assert cfg.concurrency == 32 assert cfg.retry.max_retries == 1 + assert cfg.sandbox_setup_timeout == 45 assert cfg.prompts == [None, "Review your solution."] assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -85,6 +88,7 @@ def test_from_harbor_yaml(harbor_yaml): assert cfg.concurrency == 8 assert cfg.retry.max_retries == 1 # n_attempts=2 → max_retries=1 assert cfg.agent_env.get("ANTHROPIC_API_KEY") == "test-key" + assert cfg.sandbox_setup_timeout == 75 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -127,6 +131,7 @@ def test_from_harbor_yaml_defaults(tmp_path): assert cfg.agent == "pi-acp" assert cfg.environment == "docker" assert cfg.concurrency == 4 + assert cfg.sandbox_setup_timeout == 120 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("jobs") @@ -253,3 +258,4 @@ def test_defaults_when_omitted(self, tmp_path): assert job._config.exclude_tasks == set() assert job._config.agent_env == {} assert job._config.sandbox_user == "agent" + assert job._config.sandbox_setup_timeout == 120