From 8540fda66297a0070b7cc20f949b50ab23aded3f Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Tue, 21 Apr 2026 13:39:24 -0700 Subject: [PATCH 01/25] release: benchflow 0.3.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ff627c4..6889398 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "benchflow" -version = "0.3.0a10" +version = "0.3.0" description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." readme = "README.md" requires-python = ">=3.12" From d6497f341156037956c414c2ff634fd5ce1e3457 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 Apr 2026 00:11:52 -0400 Subject: [PATCH 02/25] =?UTF-8?q?fix:=20openhands=20install=20=E2=80=94=20?= =?UTF-8?q?uv=20tool=20install=20or=20pip=20install=20openhands-ai=20(#169?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. * fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. * fix: openhands install — use uv tool install or pip install openhands-ai The PyPI package 'openhands' (0.0.0) is a placeholder, not the CLI. The real install is 'uv tool install openhands' (preferred) or 'pip install openhands-ai'. Tries uv first, falls back to pip. 
Fixes #169 runtime error: 'openhands: command not found' --------- Co-authored-by: Yifeng He --- src/benchflow/agents/registry.py | 4 +++- src/benchflow/cli/eval.py | 6 ++++-- src/benchflow/job.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index f1d26bb..456c406 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -314,7 +314,9 @@ class AgentConfig: skill_paths=[], install_cmd=( "( command -v openhands >/dev/null 2>&1 || " - "pip install openhands >/dev/null 2>&1 ) && " + "( command -v uv >/dev/null 2>&1 && " + "uv tool install openhands --python 3.12 >/dev/null 2>&1 ) || " + "( pip install openhands-ai >/dev/null 2>&1 ) ) && " "command -v openhands >/dev/null 2>&1" ), launch_cmd="openhands acp --always-approve --override-with-envs", diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 0b13fd8..a92c0be 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -228,11 +228,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -268,9 +269,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) config = JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 6d8fe9b..95414fe 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -146,7 +146,7 @@ class JobConfig: """Configuration for a benchmark job.""" agent: str 
= DEFAULT_AGENT - model: str = DEFAULT_MODEL + model: str | None = DEFAULT_MODEL environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None From d3345dd90db8563f0f7b6e3c09ed9e0ac879f5a4 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Tue, 21 Apr 2026 21:12:48 -0700 Subject: [PATCH 03/25] =?UTF-8?q?release:=20benchflow=200.3.1=20=E2=80=94?= =?UTF-8?q?=20fix=20openhands=20install?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6889398..dfd8fec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "benchflow" -version = "0.3.0" +version = "0.3.1" description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." readme = "README.md" requires-python = ">=3.12" From 3ee1adecd60adc7e801a2bbeefc01f08ddc5bc10 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Tue, 21 Apr 2026 21:15:53 -0700 Subject: [PATCH 04/25] =?UTF-8?q?fix:=20openhands=20install=20=E2=80=94=20?= =?UTF-8?q?bootstrap=20curl=20+=20uv=20in=20bare=20Ubuntu=20sandboxes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/benchflow/agents/registry.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index 456c406..ce26efd 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -313,10 +313,17 @@ class AgentConfig: description="OpenHands agent via ACP (multi-model, Python-based)", skill_paths=[], install_cmd=( - "( command -v openhands >/dev/null 2>&1 || " - "( command -v uv >/dev/null 2>&1 && " - "uv tool install openhands --python 3.12 >/dev/null 2>&1 ) || " - "( pip install openhands-ai >/dev/null 2>&1 ) ) && " + "export DEBIAN_FRONTEND=noninteractive; " + "( command -v openhands 
>/dev/null 2>&1 || ( " + " ( command -v uv >/dev/null 2>&1 || " + " ( command -v curl >/dev/null 2>&1 || " + " (apt-get update -qq && apt-get install -y -qq curl ca-certificates) ) && " + " curl -LsSf https://astral.sh/uv/install.sh | sh && " + " export PATH=\"$HOME/.local/bin:$PATH\" " + " ) && " + " uv tool install openhands --python 3.12 " + ") ) >/dev/null 2>&1 && " + "export PATH=\"$HOME/.local/bin:$PATH\" && " "command -v openhands >/dev/null 2>&1" ), launch_cmd="openhands acp --always-approve --override-with-envs", From 21b356da0db9ea34dcf20e852486a2e8fdc25447 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Tue, 21 Apr 2026 21:21:11 -0700 Subject: [PATCH 05/25] =?UTF-8?q?fix:=20openhands=20install=20=E2=80=94=20?= =?UTF-8?q?set=20PATH=20before=20command=20-v=20check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/benchflow/agents/registry.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index ce26efd..583138b 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -313,17 +313,16 @@ class AgentConfig: description="OpenHands agent via ACP (multi-model, Python-based)", skill_paths=[], install_cmd=( - "export DEBIAN_FRONTEND=noninteractive; " + "export DEBIAN_FRONTEND=noninteractive && " + "export PATH=\"$HOME/.local/bin:$PATH\" && " "( command -v openhands >/dev/null 2>&1 || ( " - " ( command -v uv >/dev/null 2>&1 || " + " ( command -v uv >/dev/null 2>&1 || ( " " ( command -v curl >/dev/null 2>&1 || " - " (apt-get update -qq && apt-get install -y -qq curl ca-certificates) ) && " - " curl -LsSf https://astral.sh/uv/install.sh | sh && " - " export PATH=\"$HOME/.local/bin:$PATH\" " - " ) && " - " uv tool install openhands --python 3.12 " - ") ) >/dev/null 2>&1 && " - "export PATH=\"$HOME/.local/bin:$PATH\" && " + " (apt-get update -qq && apt-get install -y -qq curl 
ca-certificates >/dev/null 2>&1) ) && " + " curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 " + " ) ) && " + " uv tool install openhands --python 3.12 >/dev/null 2>&1 " + ") ) && " "command -v openhands >/dev/null 2>&1" ), launch_cmd="openhands acp --always-approve --override-with-envs", From 6fbd320861b43470a6eacc566e7808aeeeef3939 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Tue, 21 Apr 2026 22:18:23 -0700 Subject: [PATCH 06/25] =?UTF-8?q?fix:=20OpenHands=20agent=20=E2=80=94=20sa?= =?UTF-8?q?ndbox=20launch,=20auth,=20model,=20and=20cwd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five fixes for issue #169 (openhands: command not found): 1. PATH: add $HOME/.local/bin to launch_cmd so uv-installed binary is found 2. Interpreter access: chmod o+x on /root path chain so sandbox user can reach the uv-managed Python shebang at /root/.local/share/uv/tools/ 3. ACP auth: seed ~/.openhands/agent_settings.json at install (OpenHands _is_authenticated() requires it) and overwrite with real LLM_MODEL/KEY at launch (workaround for OpenHands ACP not applying --override-with-envs in _create_conversation) 4. Model env: add BENCHFLOW_PROVIDER_MODEL → LLM_MODEL to env_mapping 5. CWD: remove hardcoded cd /home/{user} from build_priv_drop_cmd — it overrode the docker -w /app workspace, causing agents to write files in the wrong directory Also adds home_dirs=[".openhands"] so setup_sandbox_user copies the settings dir to the agent user. Tested: bench eval create + bench run, both sandbox_user=agent and root, gemini agent regression-verified, 45/45 registry+sandbox tests pass. 
--- src/benchflow/_sandbox.py | 2 +- src/benchflow/agents/registry.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 41ba87b..cef2f42 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -88,7 +88,7 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: No outer sh -c wrapper — DockerProcess wraps in bash -c already. """ inner = ( - f"export HOME=/home/{sandbox_user} && cd /home/{sandbox_user} && {agent_launch}" + f"export HOME=/home/{sandbox_user} && {agent_launch}" ) quoted = shlex.quote(inner) return ( diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index 583138b..1326726 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -312,6 +312,7 @@ class AgentConfig: name="openhands", description="OpenHands agent via ACP (multi-model, Python-based)", skill_paths=[], + home_dirs=[".openhands"], install_cmd=( "export DEBIAN_FRONTEND=noninteractive && " "export PATH=\"$HOME/.local/bin:$PATH\" && " @@ -323,15 +324,29 @@ class AgentConfig: " ) ) && " " uv tool install openhands --python 3.12 >/dev/null 2>&1 " ") ) && " + # Let sandbox user traverse to the uv-managed Python interpreter + "chmod o+x /root /root/.local /root/.local/share " + "/root/.local/share/uv /root/.local/share/uv/tools 2>/dev/null; " + # Seed agent_settings.json so _is_authenticated() passes + "mkdir -p ~/.openhands && " + "echo '{\"llm\":{\"model\":\"placeholder\",\"api_key\":\"placeholder\"}}' " + "> ~/.openhands/agent_settings.json && " "command -v openhands >/dev/null 2>&1" ), - launch_cmd="openhands acp --always-approve --override-with-envs", + launch_cmd=( + "export PATH=\"$HOME/.local/bin:$PATH\" && " + "mkdir -p ~/.openhands && " + "printf '{\"llm\":{\"model\":\"%s\",\"api_key\":\"%s\"}}' " + "\"$LLM_MODEL\" \"$LLM_API_KEY\" > ~/.openhands/agent_settings.json && " + "openhands acp 
--always-approve --override-with-envs" + ), protocol="acp", requires_env=["LLM_API_KEY"], api_protocol="", env_mapping={ "BENCHFLOW_PROVIDER_BASE_URL": "LLM_BASE_URL", "BENCHFLOW_PROVIDER_API_KEY": "LLM_API_KEY", + "BENCHFLOW_PROVIDER_MODEL": "LLM_MODEL", }, ), } From 046567cfc22d01b215a3c34e315d45bae94e24ba Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 Apr 2026 01:19:47 -0700 Subject: [PATCH 07/25] feat: wire outbox messaging into Trial._run_scene() for multi-role scenes Multi-role scenes (coder + reviewer) now communicate via outbox files through the main bf.run(TrialConfig) path. Previously, outbox-based message passing only worked through the standalone _scene.py scheduler (used by followup-bench). Now the same convention works end-to-end: 1. Scheduler sets up /app/.outbox/ before the first turn 2. After each turn, reads outbox files written by the active role 3. Injects received messages into the next role's prompt Also includes: - Coder-reviewer demo script (docs/notebooks/coder-reviewer-demo.py) - Real runnable notebook replacing config-only cells with bf.run() calls - Multi-turn vs multi-round terminology in README and api-reference - 7 new tests covering outbox setup, injection, cleanup, and edge cases --- README.md | 19 ++ docs/api-reference.md | 32 ++- docs/notebooks/coder-reviewer-demo.py | 163 +++++++++++++ docs/notebooks/scene-patterns.ipynb | 337 ++++++++++---------------- src/benchflow/trial.py | 64 ++++- tests/test_scene_outbox_trial.py | 267 ++++++++++++++++++++ 6 files changed, 661 insertions(+), 221 deletions(-) create mode 100644 docs/notebooks/coder-reviewer-demo.py create mode 100644 tests/test_scene_outbox_trial.py diff --git a/README.md b/README.md index b750a51..4c38823 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,21 @@ bench environment create Spin up sandbox from task dir bench environment list List active sandboxes ``` +## Terminology + +| Term | Definition | Example | +|------|-----------|---------| +| **Turn** | One 
prompt in one ACP session — one role acts | Coder writes a regex | +| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent | +| **Round** | One A→B exchange between different roles | Coder → Reviewer | +| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder | +| **Scene** | Interaction region with roles + turns | A code-review scene | +| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve | + +**Inter-role messaging:** In multi-role scenes, agents communicate via outbox files. +An agent writes `/app/.outbox/{recipient}.json` with `{"to": "role", "content": "..."}`. +The scheduler reads these after each turn and injects the message into the next role's prompt. + ## Architecture ``` @@ -143,6 +158,10 @@ bf.run(config) → trial.start() # spin up sandbox, upload task files → for scene in config.scenes: → trial._run_scene(scene) # connect/execute/disconnect per role + → setup /app/.outbox/ # (multi-role scenes only) + → for turn in scene.turns: + → read outbox → inject messages into prompt + → connect as role → execute → disconnect → trial.verify() # run verifier, score → trial.cleanup() # stop sandbox ``` diff --git a/docs/api-reference.md b/docs/api-reference.md index 4e871fa..8e599de 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -57,7 +57,9 @@ One interaction region — roles take turns executing prompts. # Single-role shortcut scene = Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview") -# Multi-role with turn order +# Multi-role with turn order (coder-reviewer pattern) +# Agents communicate via outbox: write /app/.outbox/{recipient}.json +# Scheduler reads outbox after each turn, injects into next role's prompt scene = Scene( name="coder-reviewer", roles=[ @@ -66,8 +68,9 @@ scene = Scene( ], turns=[ Turn("coder"), # None prompt = instruction.md - Turn("reviewer", "Review..."), - Turn("coder", "Fix issues..."), + Turn("reviewer", "Review the code. 
Write feedback to " + '/app/.outbox/coder.json as {"to":"coder","content":"..."}'), + Turn("coder", "Fix the issues."), # reviewer's feedback auto-injected ], ) ``` @@ -124,15 +127,32 @@ Trial.run() ├─ install_agent() — install agent binary, credentials, sandbox user ├─ for scene in scenes: │ └─ _run_scene(scene) - │ ├─ connect_as(role) — open ACP session for this role - │ ├─ execute(prompts) — send prompts, collect trajectory - │ └─ disconnect() — kill agent process, clean up + │ ├─ setup /app/.outbox/ — (multi-role scenes only) + │ └─ for turn in scene.turns: + │ ├─ read outbox — inject messages into prompt + │ ├─ connect_as(role) — open ACP session for this role + │ ├─ execute(prompts) — send prompts, collect trajectory + │ └─ disconnect() — kill agent process, clean up ├─ verify() — run verifier, collect rewards └─ cleanup() — stop sandbox ``` Key: `disconnect()` kills the agent process between scenes to prevent context bleed. Each scene gets a fresh agent session. +## Multi-Turn vs Multi-Round + +| Pattern | Roles | Turns | Communication | Example | +|---------|-------|-------|---------------|---------| +| **Single-turn** | 1 | 1 | — | Baseline benchmark | +| **Multi-turn** | 1 | 2+ | Same session, sequential prompts | Self-review | +| **Multi-round** | 2+ | 2+ | Outbox files between roles | Coder + Reviewer | + +**Multi-turn** = same agent gets multiple prompts. Use when a second pass catches errors (self-review, iterative refinement). The agent keeps its context across turns. + +**Multi-round** = different agents exchange turns. Use when tasks need multiple perspectives (code review, client-advisor). The scheduler reads outbox files and injects messages. + +Both use the same API — `TrialConfig` with different `Scene` configurations. 
+ ## Multi-Agent Patterns ### Coder + Reviewer (followup-bench) diff --git a/docs/notebooks/coder-reviewer-demo.py b/docs/notebooks/coder-reviewer-demo.py new file mode 100644 index 0000000..010b492 --- /dev/null +++ b/docs/notebooks/coder-reviewer-demo.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Coder-Reviewer demo — a simple two-agent Scene run via bf.run(). + +Demonstrates: + - Multi-role Scene (coder + reviewer) in a shared sandbox + - Outbox-based message passing between roles + - Standard bf.run(TrialConfig) API — same path for single or multi-agent + +Requirements: + - pip install benchflow + - GEMINI_API_KEY or DAYTONA_API_KEY set + - A Harbor-format task directory (e.g. .ref/terminal-bench-2/regex-log) + +Usage: + python docs/notebooks/coder-reviewer-demo.py --task .ref/terminal-bench-2/regex-log + python docs/notebooks/coder-reviewer-demo.py --task .ref/terminal-bench-2/regex-log --env docker + +Terminology: + - Turn: One prompt → one ACP session (one role acts) + - Multi-turn: Same role, multiple turns (e.g. self-review: agent → agent) + - Round: One A→B exchange between different roles + - Multi-round: Different roles exchanging turns (e.g. 
coder → reviewer → coder) +""" + +from __future__ import annotations + +import argparse +import asyncio +import logging +import sys +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Role, Scene, TrialConfig, Turn + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", +) +logger = logging.getLogger("coder-reviewer-demo") + + +# --------------------------------------------------------------------------- +# Scene definitions +# --------------------------------------------------------------------------- + +def baseline_config(task_path: Path, env: str, agent: str, model: str) -> TrialConfig: + """Pattern 1: Single agent, single turn — the baseline.""" + return TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=agent, model=model)], + environment=env, + ) + + +def coder_reviewer_config( + task_path: Path, + env: str, + coder_agent: str = "gemini", + coder_model: str = "gemini-3.1-flash-lite-preview", + reviewer_agent: str = "gemini", + reviewer_model: str = "gemini-3.1-flash-lite-preview", +) -> TrialConfig: + """Pattern 3: Coder + Reviewer — multi-round with outbox messaging. + + Flow: + 1. Coder attempts the task + 2. Reviewer reads coder's work, writes feedback to /app/.outbox/coder.json + 3. Coder receives feedback (injected by scheduler), revises solution + + The outbox convention: + - Agent writes: /app/.outbox/{recipient}.json + - Format: {"to": "role_name", "content": "your message"} + - Scheduler reads, clears, and injects into next role's prompt + """ + return TrialConfig( + task_path=task_path, + scenes=[Scene( + name="code-review", + roles=[ + Role("coder", coder_agent, coder_model), + Role("reviewer", reviewer_agent, reviewer_model), + ], + turns=[ + Turn("coder"), + Turn("reviewer", + "Review the code in /app/. Check for correctness, edge cases, " + "and adherence to the task requirements in /app/instruction.md. 
" + "Write your feedback to /app/.outbox/coder.json as: " + '{"to": "coder", "content": "Your specific feedback here."}'), + Turn("coder", + "Read the reviewer's feedback and fix the issues. " + "Focus only on what was flagged — don't start over."), + ], + )], + environment=env, + ) + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- + +async def run_comparison( + task_path: Path, + env: str, + agent: str, + model: str, +) -> None: + """Run baseline vs coder-reviewer and compare.""" + logger.info("=== Baseline (single agent, single turn) ===") + baseline = baseline_config(task_path, env, agent, model) + baseline_result = await bf.run(baseline) + baseline_reward = (baseline_result.rewards or {}).get("reward") + logger.info(f"Baseline: reward={baseline_reward}, tools={baseline_result.n_tool_calls}") + + logger.info("=== Coder + Reviewer (multi-round) ===") + review = coder_reviewer_config(task_path, env, agent, model, agent, model) + review_result = await bf.run(review) + review_reward = (review_result.rewards or {}).get("reward") + logger.info(f"Reviewed: reward={review_reward}, tools={review_result.n_tool_calls}") + + print("\n" + "=" * 60) + print("Results") + print("=" * 60) + print(f" Baseline: reward={baseline_reward} tool_calls={baseline_result.n_tool_calls}") + print(f" Reviewed: reward={review_reward} tool_calls={review_result.n_tool_calls}") + if baseline_reward is not None and review_reward is not None: + lift = review_reward - baseline_reward + print(f" Lift: {lift:+.2f}") + print("=" * 60) + + +async def run_single(task_path: Path, env: str, agent: str, model: str) -> None: + """Run coder-reviewer only.""" + config = coder_reviewer_config(task_path, env, agent, model, agent, model) + result = await bf.run(config) + reward = (result.rewards or {}).get("reward") + print(f"reward={reward} tool_calls={result.n_tool_calls} 
error={result.error}") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Coder-Reviewer demo") + parser.add_argument("--task", type=Path, required=True, help="Path to task directory") + parser.add_argument("--env", default="daytona", choices=["daytona", "docker"]) + parser.add_argument("--agent", default="gemini") + parser.add_argument("--model", default="gemini-3.1-flash-lite-preview") + parser.add_argument("--compare", action="store_true", + help="Run baseline and coder-reviewer side by side") + args = parser.parse_args() + + if not args.task.exists(): + print(f"Task directory not found: {args.task}", file=sys.stderr) + sys.exit(1) + + if args.compare: + asyncio.run(run_comparison(args.task, args.env, args.agent, args.model)) + else: + asyncio.run(run_single(args.task, args.env, args.agent, args.model)) + + +if __name__ == "__main__": + main() diff --git a/docs/notebooks/scene-patterns.ipynb b/docs/notebooks/scene-patterns.ipynb index 3f9277c..da5f994 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -6,299 +6,216 @@ "source": [ "# BenchFlow Scene Patterns\n", "\n", - "Six evaluation patterns — multi-turn, multi-round, multi-scene — expressed as Scene configs.\n", + "Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n", "\n", - "| Term | Definition |\n", - "|------|------------|\n", - "| **Turn** | One prompt in one ACP session |\n", - "| **Round** | One A→B exchange between roles |\n", - "| **Scene** | Interaction region with roles + turns |\n", - "| **Trial** | Sequence of scenes in shared sandbox |" + "| Term | Definition | Example |\n", + "|------|-----------|--------|\n", + "| **Turn** | One prompt in one ACP session | Coder writes code |\n", + "| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n", + "| **Round** | One A→B exchange between roles | Coder → Reviewer |\n", + "| **Multi-round** 
| Different roles exchanging turns | Coder → Reviewer → Coder |\n", + "| **Scene** | Interaction region with roles + turns | A code-review scene |\n", + "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n", + "\n", + "**Prerequisites:**\n", + "- `pip install benchflow`\n", + "- `GEMINI_API_KEY` or `GOOGLE_API_KEY` set\n", + "- `DAYTONA_API_KEY` set (or Docker daemon running for `--env docker`)\n", + "- A task directory (this notebook uses `.ref/terminal-bench-2/regex-log`)" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from benchflow.trial import TrialConfig, Scene, Role, Turn\n", - "from pathlib import Path" + "import asyncio\n", + "import logging\n", + "from pathlib import Path\n", + "\n", + "import benchflow as bf\n", + "from benchflow.trial import Role, Scene, TrialConfig, Turn\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(name)s %(message)s\")\n", + "\n", + "TASK = Path(\".ref/terminal-bench-2/regex-log\")\n", + "ENV = \"daytona\"\n", + "AGENT = \"gemini\"\n", + "MODEL = \"gemini-3.1-flash-lite-preview\"\n", + "\n", + "results = {} # pattern_name -> RunResult\n", + "print(f\"Task: {TASK} Env: {ENV} Agent: {AGENT} Model: {MODEL}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 1: Single-Agent Baseline" + "## Pattern 1: Single-Agent Baseline\n", + "\n", + "One agent, one turn. The simplest configuration." 
] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 scene, 1 role, 1 turn\n" - ] - } - ], + "outputs": [], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[Scene.single(agent=\"gemini\", model=\"gemini-3.1-flash-lite-preview\")],\n", - " environment=\"daytona\",\n", + " task_path=TASK,\n", + " scenes=[Scene.single(agent=AGENT, model=MODEL)],\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} role, {len(s.turns)} turn\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 3 tool calls. Agent wrote a regex but missed edge cases." + "\n", + "result = await bf.run(config)\n", + "results[\"baseline\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 2: Single-Agent Multi-Turn\n", - "Same agent, two prompts. Self-review variant." + "## Pattern 2: Multi-Turn Self-Review\n", + "\n", + "Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n", + "\n", + "Use when: a second pass catches what the first missed. Cost: 2x baseline." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 scene, 1 role, 2 turns\n" - ] - } - ], + "outputs": [], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", + " task_path=TASK,\n", " scenes=[Scene(\n", " name=\"self-review\",\n", - " roles=[Role(\"agent\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", + " roles=[Role(\"agent\", AGENT, MODEL)],\n", " turns=[\n", " Turn(\"agent\"),\n", - " Turn(\"agent\", \"Review your solution. Check edge cases and fix issues.\"),\n", + " Turn(\"agent\", \"Review your solution. Check edge cases, \"\n", + " \"boundary conditions, and adherence to the requirements. Fix issues.\"),\n", " ],\n", " )],\n", - " environment=\"daytona\",\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} role, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=1.0, 12 tool calls. Self-review caught the regex bug." + "\n", + "result = await bf.run(config)\n", + "results[\"self_review\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 3: Multi-Round Code Review\n", - "Two roles — coder and reviewer — take turns." + "## Pattern 3: Coder + Reviewer (Multi-Round)\n", + "\n", + "Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n", + "\n", + "Communication uses the **outbox convention**:\n", + "1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n", + "2. 
Scheduler reads the outbox after the reviewer's turn\n", + "3. Feedback is automatically injected into the coder's next prompt\n", + "\n", + "Use when: tasks benefit from independent review. Cost: 3x baseline." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 scene, 2 roles, 3 turns\n" - ] - } - ], + "outputs": [], "source": [ "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", + " task_path=TASK,\n", " scenes=[Scene(\n", " name=\"code-review\",\n", " roles=[\n", - " Role(\"coder\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"reviewer\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", + " Role(\"coder\", AGENT, MODEL),\n", + " Role(\"reviewer\", AGENT, MODEL),\n", " ],\n", " turns=[\n", " Turn(\"coder\"),\n", - " Turn(\"reviewer\", \"Review the code. Write feedback to /app/.outbox/coder.json\"),\n", - " Turn(\"coder\", \"Read feedback and fix issues.\"),\n", + " Turn(\"reviewer\",\n", + " \"Review the code in /app/. Read /app/instruction.md for requirements. \"\n", + " \"Check for correctness, edge cases, and missed requirements. \"\n", + " \"Write your feedback to /app/.outbox/coder.json as: \"\n", + " '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n", + " Turn(\"coder\",\n", + " \"Read the reviewer's feedback and fix the issues. \"\n", + " \"Focus only on what was flagged — don't start over.\"),\n", " ],\n", " )],\n", - " environment=\"daytona\",\n", + " environment=ENV,\n", ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 16 tool calls.\n", "\n", - "**At scale (267 trials on TB2):** baseline 9.0% → reviewer 19.4% win rate." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pattern 4: Interactive User Simulation\n", - "A \"user\" role reveals info gradually. Agent responds." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 scene, 2 roles, 2 turns\n" - ] - } - ], - "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[Scene(\n", - " name=\"interactive\",\n", - " roles=[\n", - " Role(\"user\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"agent\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " ],\n", - " turns=[\n", - " Turn(\"user\", \"Give the agent a vague version of the task...\"),\n", - " Turn(\"agent\"),\n", - " ],\n", - " )],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 2 tool calls. Vague instruction wasn't enough." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pattern 5: Multi-Scene BYOS\n", - "Two scenes — skill generation (unscored) then solve (scored)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 scenes: skill-gen (1 turn), solve (1 turn)\n" - ] - } - ], - "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/regex-log\"),\n", - " scenes=[\n", - " Scene(name=\"skill-gen\",\n", - " roles=[Role(\"gen\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[Turn(\"gen\", \"Write a skill document to /app/generated-skill.md\")]),\n", - " Scene(name=\"solve\",\n", - " roles=[Role(\"solver\", \"gemini\", \"gemini-3.1-flash-lite-preview\")],\n", - " turns=[Turn(\"solver\")]),\n", - " ],\n", - " environment=\"daytona\",\n", - ")\n", - "print(f\"{len(config.effective_scenes)} scenes: {', '.join(f'{s.name} ({len(s.turns)} turn)' for s in config.effective_scenes)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Result:** reward=0.0, 5 tool calls. Self-generated skills = 0pp lift (consistent with paper)." + "result = await bf.run(config)\n", + "results[\"coder_reviewer\"] = result\n", + "\n", + "reward = (result.rewards or {}).get(\"reward\")\n", + "print(f\"Reward: {reward}\")\n", + "print(f\"Tool calls: {result.n_tool_calls}\")\n", + "print(f\"Error: {result.error}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Pattern 6: Contract Consultation\n", - "Realistic multi-round: client CTO + advisor reviewing a vendor contract." 
+ "## Comparison" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 scene, 2 roles, 4 turns (2 rounds)\n" - ] - } - ], + "outputs": [], "source": [ - "config = TrialConfig(\n", - " task_path=Path(\"tasks/contract-review\"),\n", - " scenes=[Scene(\n", - " name=\"consultation\",\n", - " roles=[\n", - " Role(\"client\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " Role(\"advisor\", \"gemini\", \"gemini-3.1-flash-lite-preview\"),\n", - " ],\n", - " turns=[\n", - " Turn(\"client\"), # shares situation\n", - " Turn(\"advisor\"), # analyzes contract\n", - " Turn(\"client\", \"Clarify your top 2 priorities.\"),\n", - " Turn(\"advisor\", \"Give final redline recommendations.\"),\n", - " ],\n", - " )],\n", - " environment=\"daytona\",\n", - ")\n", - "s = config.effective_scenes[0]\n", - "print(f\"{len(config.effective_scenes)} scene, {len(s.roles)} roles, {len(s.turns)} turns ({len(s.turns)//2} rounds)\")" + "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", + "print(\"-\" * 60)\n", + "for name, r in results.items():\n", + " reward = (r.rewards or {}).get(\"reward\", \"—\")\n", + " err = r.error or \"—\"\n", + " if len(err) > 30:\n", + " err = err[:27] + \"...\"\n", + " print(f\"{name:<20} {str(reward):>8} {r.n_tool_calls:>6} {err}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Each pattern is a TrialConfig — same API, same lifecycle, same verifier. The only difference is the Scene configuration." 
+ "## Mapping to Harbor PR #1462 Concepts\n", + "\n", + "For users coming from Harbor's multi-turn proposal:\n", + "\n", + "| Harbor (PR #1462) | BenchFlow 0.3 |\n", + "|-------------------|---------------|\n", + "| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent |\n", + "| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role |\n", + "| `User.run() → None` (stop) | Fixed turn count in Scene (or `max_rounds` in `_scene.py`) |\n", + "| Per-round archiving | Scene trajectory saved as JSONL |\n", + "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) |\n", + "| `--max-user-rounds` | Number of turns in the Scene definition |\n", + "\n", + "BenchFlow additionally supports:\n", + "- N roles per scene (not just user + agent)\n", + "- Sequential multi-scene trials (BYOS: skill-gen → solve)\n", + "- Same API for single-agent and multi-agent runs" ] } ], diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 8803419..9d79448 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -36,6 +36,7 @@ from __future__ import annotations import asyncio +import json import logging from dataclasses import dataclass, field from datetime import datetime @@ -598,20 +599,36 @@ async def run(self) -> RunResult: # ── Scene execution ── + _OUTBOX_DIR = "/app/.outbox" + async def _run_scene(self, scene: Scene) -> None: - """Execute one scene: for each turn, connect as the turn's role, execute, disconnect.""" + """Execute one scene: for each turn, connect as the turn's role, execute, disconnect. + + For multi-role scenes, agents communicate via outbox files: + an agent writes ``/app/.outbox/{recipient}.json`` with + ``{"to": "role_name", "content": "..."}`` and the scheduler + injects received messages into the next turn's prompt. 
+ """ cfg = self._config logger.info(f"[Scene] {scene.name} — {len(scene.turns)} turns, {len(scene.roles)} roles") role_map = {r.name: r for r in scene.roles} current_role: str | None = None + multi_role = len(scene.roles) > 1 + + if multi_role: + await self._env.exec( + f"rm -rf {self._OUTBOX_DIR} && mkdir -p {self._OUTBOX_DIR}", + timeout_sec=10, + ) + + inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} for i, turn in enumerate(scene.turns): role = role_map.get(turn.role) if not role: raise ValueError(f"Turn references unknown role {turn.role!r}") - # Reconnect if role changed or first turn if current_role != turn.role: if current_role is not None: await self.disconnect() @@ -619,16 +636,53 @@ async def _run_scene(self, scene: Scene) -> None: current_role = turn.role if turn.prompt: - prompts = [turn.prompt] + base_prompt = turn.prompt elif self._resolved_prompts: - prompts = [self._resolved_prompts[0]] + base_prompt = self._resolved_prompts[0] else: - prompts = ["Solve the task described in /app/instruction.md"] + base_prompt = "Solve the task described in /app/instruction.md" + + pending = inbox.get(turn.role, []) + if pending: + parts = [base_prompt, "\n---\nMessages from other agents:\n"] + parts.extend(pending) + prompts = ["\n".join(parts)] + inbox[turn.role] = [] + else: + prompts = [base_prompt] + await self.execute(prompts=prompts) + if multi_role: + for recipient, content in await self._read_scene_outbox(current_role): + inbox.setdefault(recipient, []).append( + f"**From {current_role}:** {content}" + ) + if current_role is not None: await self.disconnect() + async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: + """Read and clear outbox files left by *sender*. 
Returns [(recipient, content), ...].""" + result = await self._env.exec( + f"ls {self._OUTBOX_DIR}/*.json 2>/dev/null || true", timeout_sec=10, + ) + files = [f.strip() for f in (result.stdout or "").strip().splitlines() if f.strip()] + messages: list[tuple[str, str]] = [] + for fpath in files: + cat = await self._env.exec(f"cat {fpath}", timeout_sec=10) + try: + data = json.loads(cat.stdout or "{}") + recipient = data.get("to", "") + content = data.get("content", "") + if recipient and content: + messages.append((recipient, content)) + logger.info(f"[Scene] outbox: {sender} → {recipient}: {content[:80]}") + except json.JSONDecodeError: + logger.warning(f"[Scene] invalid JSON in outbox: {fpath}") + await self._env.exec(f"rm -f {fpath}", timeout_sec=10) + return messages + async def connect_as(self, role: Role) -> None: """Open an ACP connection for a specific role.""" cfg = self._config diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py new file mode 100644 index 0000000..f3177f1 --- /dev/null +++ b/tests/test_scene_outbox_trial.py @@ -0,0 +1,267 @@ +"""Tests for outbox-based inter-role messaging in Trial._run_scene(). + +Verifies that when bf.run(TrialConfig) executes a multi-role Scene, +outbox files written by one role are read and injected into the next +role's prompt — bridging the _scene.py outbox convention with the +Trial lifecycle. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, Trial, TrialConfig, Turn + + +@dataclass +class FakeExecResult: + stdout: str = "" + stderr: str = "" + return_code: int = 0 + + +class FakeEnv: + """Minimal sandbox mock that tracks outbox files.""" + + def __init__(self) -> None: + self._files: dict[str, str] = {} + self._exec_log: list[str] = [] + + async def exec(self, cmd: str, **kwargs) -> FakeExecResult: + self._exec_log.append(cmd) + if "rm -rf /app/.outbox" in cmd: + self._files = {k: v for k, v in self._files.items() + if not k.startswith("/app/.outbox/")} + return FakeExecResult() + if "ls /app/.outbox/" in cmd: + files = [f for f in self._files if f.startswith("/app/.outbox/")] + return FakeExecResult(stdout="\n".join(files)) + if cmd.startswith("cat "): + path = cmd.split(" ", 1)[1] + return FakeExecResult(stdout=self._files.get(path, "{}")) + if cmd.startswith("rm -f "): + path = cmd.split()[-1] + self._files.pop(path, None) + return FakeExecResult() + return FakeExecResult() + + def stage_outbox(self, recipient: str, content: str) -> None: + self._files[f"/app/.outbox/{recipient}.json"] = json.dumps( + {"to": recipient, "content": content} + ) + + +def _make_trial(scene: Scene) -> Trial: + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[scene], + environment="docker", + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task"] + return trial + + +@pytest.fixture +def coder_reviewer_scene() -> Scene: + return Scene( + name="code-review", + roles=[ + Role("coder", "gemini", "gemini-3.1-flash-lite-preview"), + Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"), + ], + turns=[ + Turn("coder"), + Turn("reviewer", "Review the code. 
Write feedback to /app/.outbox/coder.json"), + Turn("coder", "Read feedback and fix issues."), + ], + ) + + +@pytest.fixture +def self_review_scene() -> Scene: + return Scene( + name="self-review", + roles=[Role("agent", "gemini", "gemini-3.1-flash-lite-preview")], + turns=[ + Turn("agent"), + Turn("agent", "Review your solution and fix edge cases."), + ], + ) + + +async def test_outbox_setup_for_multi_role(coder_reviewer_scene: Scene) -> None: + """Multi-role scenes set up /app/.outbox before running turns.""" + trial = _make_trial(coder_reviewer_scene) + prompts_received: list[str] = [] + + async def fake_execute(prompts=None): + prompts_received.extend(prompts or []) + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + outbox_setup = [c for c in trial._env._exec_log if "mkdir -p /app/.outbox" in c] + assert len(outbox_setup) == 1 + + +async def test_no_outbox_setup_for_single_role(self_review_scene: Scene) -> None: + """Single-role scenes skip outbox setup (no inter-role messaging needed).""" + trial = _make_trial(self_review_scene) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(self_review_scene) + + outbox_cmds = [c for c in trial._env._exec_log if "outbox" in c] + assert len(outbox_cmds) == 0 + + +async def test_outbox_messages_injected_into_prompt(coder_reviewer_scene: Scene) -> None: + """Outbox messages from coder are injected into reviewer's prompt.""" + trial = _make_trial(coder_reviewer_scene) + prompts_received: list[tuple[str, list[str]]] = [] + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + # Track which role got which prompt + role = coder_reviewer_scene.turns[call_count].role + prompts_received.append((role, prompts or [])) + # Coder writes to reviewer outbox on first 
turn + if call_count == 0: + trial._env.stage_outbox("reviewer", "Please review my regex implementation") + # Reviewer writes feedback to coder outbox on second turn + elif call_count == 1: + trial._env.stage_outbox("coder", "Edge case: empty string input not handled") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + assert len(prompts_received) == 3 + + # Turn 0: coder gets base prompt (no messages yet) + assert prompts_received[0][0] == "coder" + assert "Messages from other agents" not in prompts_received[0][1][0] + + # Turn 1: reviewer gets its prompt + coder's outbox message + assert prompts_received[1][0] == "reviewer" + assert "Please review my regex implementation" in prompts_received[1][1][0] + assert "From coder" in prompts_received[1][1][0] + + # Turn 2: coder gets its prompt + reviewer's feedback + assert prompts_received[2][0] == "coder" + assert "Edge case: empty string input not handled" in prompts_received[2][1][0] + assert "From reviewer" in prompts_received[2][1][0] + + +async def test_outbox_files_cleared_after_read(coder_reviewer_scene: Scene) -> None: + """Outbox files are removed after reading so they don't repeat.""" + trial = _make_trial(coder_reviewer_scene) + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env.stage_outbox("reviewer", "msg1") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + remaining = [f for f in trial._env._files if f.startswith("/app/.outbox/")] + assert len(remaining) == 0 + + +async def test_outbox_invalid_json_skipped(coder_reviewer_scene: Scene) -> None: + """Invalid JSON in outbox files is skipped without crashing.""" + trial = _make_trial(coder_reviewer_scene) + call_count = 0 + + async 
def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env._files["/app/.outbox/reviewer.json"] = "not valid json{{" + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + # Should not raise + await trial._run_scene(coder_reviewer_scene) + assert call_count == 3 + + +async def test_role_switching_connects_and_disconnects(coder_reviewer_scene: Scene) -> None: + """Verify connect/disconnect happens on role switches.""" + trial = _make_trial(coder_reviewer_scene) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + # 3 turns: coder, reviewer, coder → 2 connect_as calls for role switches + 1 initial + # Initial connect for coder, then disconnect+connect for reviewer, then disconnect+connect for coder + assert trial.connect_as.call_count == 3 + # disconnect after coder->reviewer, after reviewer->coder, and final disconnect + assert trial.disconnect.call_count == 3 + + +async def test_empty_outbox_no_injection() -> None: + """When no outbox files exist, prompt is used as-is.""" + scene = Scene( + name="quiet", + roles=[ + Role("a", "gemini", "flash"), + Role("b", "gemini", "flash"), + ], + turns=[Turn("a", "do stuff"), Turn("b", "also do stuff")], + ) + trial = _make_trial(scene) + prompts_received: list[str] = [] + + async def fake_execute(prompts=None): + prompts_received.extend(prompts or []) + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(scene) + + assert prompts_received[0] == "do stuff" + assert prompts_received[1] == "also do stuff" + assert all("Messages from other agents" not in p for p in prompts_received) From a0dbe8ecc907faf8eb7abb7ff010a5221b70e15f Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 
Apr 2026 08:01:15 -0700 Subject: [PATCH 08/25] =?UTF-8?q?fix:=20address=20Devin=20review=20?= =?UTF-8?q?=E2=80=94=20shell=20injection=20+=20outbox=20dir=20ownership?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Quote file paths with shlex.quote() in _read_scene_outbox() to prevent shell command injection via crafted outbox filenames 2. chown /app/.outbox to sandbox_user so agents can actually write outbox files (was root:root 755 → agent couldn't write) --- src/benchflow/trial.py | 15 +++++++++------ tests/test_scene_outbox_trial.py | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 9d79448..405d8a0 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -38,6 +38,7 @@ import asyncio import json import logging +import shlex from dataclasses import dataclass, field from datetime import datetime from pathlib import Path @@ -617,10 +618,11 @@ async def _run_scene(self, scene: Scene) -> None: multi_role = len(scene.roles) > 1 if multi_role: - await self._env.exec( - f"rm -rf {self._OUTBOX_DIR} && mkdir -p {self._OUTBOX_DIR}", - timeout_sec=10, - ) + setup_cmd = f"rm -rf {self._OUTBOX_DIR} && mkdir -p {self._OUTBOX_DIR}" + if cfg.sandbox_user: + user = shlex.quote(cfg.sandbox_user) + setup_cmd += f" && chown {user}:{user} {self._OUTBOX_DIR}" + await self._env.exec(setup_cmd, timeout_sec=10) inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} @@ -670,7 +672,8 @@ async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: files = [f.strip() for f in (result.stdout or "").strip().splitlines() if f.strip()] messages: list[tuple[str, str]] = [] for fpath in files: - cat = await self._env.exec(f"cat {fpath}", timeout_sec=10) + quoted = shlex.quote(fpath) + cat = await self._env.exec(f"cat {quoted}", timeout_sec=10) try: data = json.loads(cat.stdout or "{}") recipient = data.get("to", "") @@ -680,7 +683,7 @@ async def 
_read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: logger.info(f"[Scene] outbox: {sender} → {recipient}: {content[:80]}") except json.JSONDecodeError: logger.warning(f"[Scene] invalid JSON in outbox: {fpath}") - await self._env.exec(f"rm -f {fpath}", timeout_sec=10) + await self._env.exec(f"rm -f {quoted}", timeout_sec=10) return messages async def connect_as(self, role: Role) -> None: diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py index f3177f1..8378653 100644 --- a/tests/test_scene_outbox_trial.py +++ b/tests/test_scene_outbox_trial.py @@ -98,7 +98,7 @@ def self_review_scene() -> Scene: async def test_outbox_setup_for_multi_role(coder_reviewer_scene: Scene) -> None: - """Multi-role scenes set up /app/.outbox before running turns.""" + """Multi-role scenes set up /app/.outbox with correct ownership.""" trial = _make_trial(coder_reviewer_scene) prompts_received: list[str] = [] @@ -114,6 +114,7 @@ async def fake_execute(prompts=None): outbox_setup = [c for c in trial._env._exec_log if "mkdir -p /app/.outbox" in c] assert len(outbox_setup) == 1 + assert "chown agent:agent /app/.outbox" in outbox_setup[0] async def test_no_outbox_setup_for_single_role(self_review_scene: Scene) -> None: From 53b910ce8d8b4eab224f89728eb942f23bb3a2f4 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 Apr 2026 08:10:53 -0700 Subject: [PATCH 09/25] =?UTF-8?q?fix:=20address=20Codex=20critical=20revie?= =?UTF-8?q?w=20=E2=80=94=20persistence,=20agent=20install,=20honest=20gaps?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Persist inter-role messages to trial_dir/scene_messages.jsonl (was ephemeral — injected into prompts then discarded) 2. Install non-primary agents in connect_as() for heterogeneous scenes (was broken: only primary agent was installed) 3. 
Honest Harbor mapping — document what 0.3 delivers vs what's a gap: - Shipped: roles, turns, outbox messaging, message persistence - Gap: dynamic termination, oracle access, per-round verification, inter-round trajectory inspection 4. Add 0.3 Limitations section to api-reference 5. Two new tests: message persistence + heterogeneous agent install --- docs/api-reference.md | 11 +++++ docs/notebooks/scene-patterns.ipynb | 53 +++++++++++--------- src/benchflow/trial.py | 30 +++++++++++- tests/test_scene_outbox_trial.py | 75 +++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+), 25 deletions(-) diff --git a/docs/api-reference.md b/docs/api-reference.md index 8e599de..5248459 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -189,6 +189,17 @@ config = TrialConfig( ) ``` +## 0.3 Limitations + +The Scene API in 0.3 covers coder-reviewer and multi-turn patterns. It does **not** yet support: + +- **Dynamic termination** — turn count is fixed at config time. A "user" role cannot decide to stop early based on agent output. Workaround: use `max_rounds` in the standalone `_scene.py` scheduler. +- **Oracle access** — no mechanism for a "user" role to read `/solution` during setup. +- **Per-round verification** — `verify()` runs once after all scenes complete, not between rounds. +- **Inter-round trajectory inspection** — a "user" role cannot read the agent's trajectory between turns. + +These are tracked for 0.4. See the [Harbor PR #1462 mapping](docs/notebooks/scene-patterns.ipynb) for details. 
+ ## YAML Trial Configs ```python diff --git a/docs/notebooks/scene-patterns.ipynb b/docs/notebooks/scene-patterns.ipynb index da5f994..101a67e 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -6,16 +6,16 @@ "source": [ "# BenchFlow Scene Patterns\n", "\n", - "Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n", + "Three evaluation patterns \u2014 baseline, self-review (multi-turn), coder-reviewer (multi-round) \u2014 run end-to-end with `bf.run()`.\n", "\n", "| Term | Definition | Example |\n", "|------|-----------|--------|\n", "| **Turn** | One prompt in one ACP session | Coder writes code |\n", - "| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n", - "| **Round** | One A→B exchange between roles | Coder → Reviewer |\n", - "| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n", + "| **Multi-turn** | Same role, multiple turns | Self-review: agent \u2192 agent |\n", + "| **Round** | One A\u2192B exchange between roles | Coder \u2192 Reviewer |\n", + "| **Multi-round** | Different roles exchanging turns | Coder \u2192 Reviewer \u2192 Coder |\n", "| **Scene** | Interaction region with roles + turns | A code-review scene |\n", - "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n", + "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen \u2192 Solve |\n", "\n", "**Prerequisites:**\n", "- `pip install benchflow`\n", @@ -84,7 +84,7 @@ "source": [ "## Pattern 2: Multi-Turn Self-Review\n", "\n", - "Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n", + "Same agent gets a second turn to re-examine its own work. This is **multi-turn** \u2014 one role, multiple prompts.\n", "\n", "Use when: a second pass catches what the first missed. Cost: 2x baseline." 
] @@ -124,7 +124,7 @@ "source": [ "## Pattern 3: Coder + Reviewer (Multi-Round)\n", "\n", - "Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n", + "Two roles \u2014 coder and reviewer \u2014 in a shared sandbox. This is **multi-round** \u2014 different roles exchanging turns.\n", "\n", "Communication uses the **outbox convention**:\n", "1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n", @@ -157,7 +157,7 @@ " '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n", " Turn(\"coder\",\n", " \"Read the reviewer's feedback and fix the issues. \"\n", - " \"Focus only on what was flagged — don't start over.\"),\n", + " \"Focus only on what was flagged \u2014 don't start over.\"),\n", " ],\n", " )],\n", " environment=ENV,\n", @@ -188,8 +188,8 @@ "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", "print(\"-\" * 60)\n", "for name, r in results.items():\n", - " reward = (r.rewards or {}).get(\"reward\", \"—\")\n", - " err = r.error or \"—\"\n", + " reward = (r.rewards or {}).get(\"reward\", \"\u2014\")\n", + " err = r.error or \"\u2014\"\n", " if len(err) > 30:\n", " err = err[:27] + \"...\"\n", " print(f\"{name:<20} {str(reward):>8} {r.n_tool_calls:>6} {err}\")" @@ -203,19 +203,24 @@ "\n", "For users coming from Harbor's multi-turn proposal:\n", "\n", - "| Harbor (PR #1462) | BenchFlow 0.3 |\n", - "|-------------------|---------------|\n", - "| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent |\n", - "| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role |\n", - "| `User.run() → None` (stop) | Fixed turn count in Scene (or `max_rounds` in `_scene.py`) |\n", - "| Per-round archiving | Scene trajectory saved as JSONL |\n", - "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) |\n", - "| `--max-user-rounds` | Number of turns in the Scene 
definition |\n", - "\n", - "BenchFlow additionally supports:\n", - "- N roles per scene (not just user + agent)\n", - "- Sequential multi-scene trials (BYOS: skill-gen → solve)\n", - "- Same API for single-agent and multi-agent runs" + "| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n", + "|-------------------|---------------|--------|\n", + "| `BaseUser` | `Role` \u2014 any role can be a user, reviewer, or custom agent | **Shipped** |\n", + "| `User.run() \u2192 str` | `Turn` with a prompt \u2014 each turn sends one prompt to one role | **Shipped** |\n", + "| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n", + "| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n", + "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n", + "| `User.run() \u2192 None` (stop) | Fixed turn count only \u2014 no dynamic termination | **Gap** |\n", + "| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n", + "| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n", + "| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n", + "\n", + "**What BenchFlow 0.3 delivers:** Multi-role scenes with outbox messaging, message persistence,\n", + "and the same API for single-agent and multi-agent runs. N roles per scene, sequential multi-scene trials.\n", + "\n", + "**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n", + "for user roles, per-round verification, or inter-round trajectory inspection. These require\n", + "extending the Scene scheduler with callbacks \u2014 tracked for 0.4." 
] } ], @@ -232,4 +237,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 405d8a0..3e3206b 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -609,6 +609,8 @@ async def _run_scene(self, scene: Scene) -> None: an agent writes ``/app/.outbox/{recipient}.json`` with ``{"to": "role_name", "content": "..."}`` and the scheduler injects received messages into the next turn's prompt. + + Inter-role messages are persisted to ``trial_dir/scene_messages.jsonl``. """ cfg = self._config logger.info(f"[Scene] {scene.name} — {len(scene.turns)} turns, {len(scene.roles)} roles") @@ -616,6 +618,7 @@ async def _run_scene(self, scene: Scene) -> None: role_map = {r.name: r for r in scene.roles} current_role: str | None = None multi_role = len(scene.roles) > 1 + scene_messages: list[dict] = [] if multi_role: setup_cmd = f"rm -rf {self._OUTBOX_DIR} && mkdir -p {self._OUTBOX_DIR}" @@ -625,6 +628,7 @@ async def _run_scene(self, scene: Scene) -> None: await self._env.exec(setup_cmd, timeout_sec=10) inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} + turn_counter = 0 for i, turn in enumerate(scene.turns): role = role_map.get(turn.role) @@ -657,13 +661,30 @@ async def _run_scene(self, scene: Scene) -> None: if multi_role: for recipient, content in await self._read_scene_outbox(current_role): + turn_counter += 1 inbox.setdefault(recipient, []).append( f"**From {current_role}:** {content}" ) + scene_messages.append({ + "scene": scene.name, + "turn": turn_counter, + "sender": current_role, + "recipient": recipient, + "content": content, + }) if current_role is not None: await self.disconnect() + if scene_messages and self._trial_dir: + msg_path = self._trial_dir / "scene_messages.jsonl" + with msg_path.open("a") as f: + for m in scene_messages: + f.write(json.dumps(m) + "\n") + logger.info( + f"[Scene] {scene.name}: {len(scene_messages)} messages → {msg_path}" + ) + async 
def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: """Read and clear outbox files left by *sender*. Returns [(recipient, content), ...].""" result = await self._env.exec( @@ -687,10 +708,17 @@ async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: return messages async def connect_as(self, role: Role) -> None: - """Open an ACP connection for a specific role.""" + """Open an ACP connection for a specific role. + + Installs the role's agent binary if it differs from the primary + agent (which was installed in install_agent()). + """ cfg = self._config t0 = datetime.now() + if role.agent != cfg.primary_agent: + await install_agent(self._env, role.agent, self._trial_dir) + self._acp_client, self._session, self._agent_name = await connect_acp( env=self._env, agent=role.agent, diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py index 8378653..4e6d478 100644 --- a/tests/test_scene_outbox_trial.py +++ b/tests/test_scene_outbox_trial.py @@ -266,3 +266,78 @@ async def fake_execute(prompts=None): assert prompts_received[0] == "do stuff" assert prompts_received[1] == "also do stuff" assert all("Messages from other agents" not in p for p in prompts_received) + + +async def test_scene_messages_persisted( + coder_reviewer_scene: Scene, tmp_path: Path +) -> None: + """Inter-role messages are saved to scene_messages.jsonl in trial_dir.""" + trial = _make_trial(coder_reviewer_scene) + trial._trial_dir = tmp_path + call_count = 0 + + async def fake_execute(prompts=None): + nonlocal call_count + if call_count == 0: + trial._env.stage_outbox("reviewer", "Please review my code") + elif call_count == 1: + trial._env.stage_outbox("coder", "Found a bug on line 5") + call_count += 1 + return [], 0 + + trial.connect_as = AsyncMock() + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(coder_reviewer_scene) + + msg_path = tmp_path / "scene_messages.jsonl" + assert msg_path.exists() + lines = 
[json.loads(ln) for ln in msg_path.read_text().strip().splitlines()] + assert len(lines) == 2 + assert lines[0]["sender"] == "coder" + assert lines[0]["recipient"] == "reviewer" + assert lines[0]["content"] == "Please review my code" + assert lines[1]["sender"] == "reviewer" + assert lines[1]["recipient"] == "coder" + assert lines[1]["content"] == "Found a bug on line 5" + assert lines[0]["scene"] == "code-review" + + +async def test_heterogeneous_agent_install(coder_reviewer_scene: Scene) -> None: + """connect_as installs non-primary agents.""" + scene = Scene( + name="hetero", + roles=[ + Role("coder", "gemini", "flash"), + Role("reviewer", "claude-agent-acp", "haiku"), + ], + turns=[Turn("coder"), Turn("reviewer", "Review.")], + ) + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[scene], + environment="docker", + agent="gemini", + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task"] + + installed_agents: list[str] = [] + original_connect_as = Trial.connect_as + + async def tracking_connect_as(self_inner, role): + if role.agent != config.primary_agent: + installed_agents.append(role.agent) + + async def fake_execute(prompts=None): + return [], 0 + + trial.connect_as = lambda role: tracking_connect_as(trial, role) + trial.disconnect = AsyncMock() + trial.execute = fake_execute + + await trial._run_scene(scene) + + assert "claude-agent-acp" in installed_agents From 0fbfe6de5ae29f13a63269ef67becc70a4ce28c0 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 Apr 2026 16:31:03 -0700 Subject: [PATCH 10/25] feat: notebook with real execution outputs from Daytona runs All 3 patterns executed end-to-end on regex-log task via Daytona: - Baseline: reward=1.0, 3 tool calls - Self-review (multi-turn): reward=1.0, 7 tool calls - Coder-reviewer (multi-round): reward=0.0, 13 tool calls Outbox messaging confirmed working: reviewer wrote feedback to /app/.outbox/coder.json, scheduler read and injected into coder's 
prompt. Messages persisted to scene_messages.jsonl. --- docs/notebooks/scene-patterns.ipynb | 70 ++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/docs/notebooks/scene-patterns.ipynb b/docs/notebooks/scene-patterns.ipynb index 101a67e..9f1400c 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -26,9 +26,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: gemini-3.1-flash-lite-preview\n" + ] + } + ], "source": [ "import asyncio\n", "import logging\n", @@ -59,9 +67,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reward: 1.0\n", + "Tool calls: 3\n", + "Error: None\n" + ] + } + ], "source": [ "config = TrialConfig(\n", " task_path=TASK,\n", @@ -91,9 +109,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reward: 1.0\n", + "Tool calls: 7\n", + "Error: None\n" + ] + } + ], "source": [ "config = TrialConfig(\n", " task_path=TASK,\n", @@ -136,9 +164,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reward: 0.0\n", + "Tool calls: 13\n", + "Error: None\n" + ] + } + ], "source": [ "config = TrialConfig(\n", " task_path=TASK,\n", @@ -181,9 +219,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Pattern 
Reward Tools Error\n", + "------------------------------------------------------------\n", + "baseline 1.0 3 \u2014\n", + "self_review 1.0 7 \u2014\n", + "coder_reviewer 0.0 13 \u2014\n" + ] + } + ], "source": [ "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", "print(\"-\" * 60)\n", From a911c51c8dece4c5c2b0b4d9fa806c273c195017 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 Apr 2026 17:16:41 -0700 Subject: [PATCH 11/25] =?UTF-8?q?fix:=20address=20Devin=20review=20round?= =?UTF-8?q?=202=20=E2=80=94=20credentials=20+=20disconnect=20for=20non-pri?= =?UTF-8?q?mary=20agents?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. connect_as() now writes credential files and uploads subscription auth for non-primary agents, matching what install_agent() does for the primary agent. Fixes heterogeneous scenes where e.g. codex-acp needs ~/.codex/auth.json. 2. connect_as() now updates self._agent_launch so disconnect()'s pkill fallback targets the correct process (not always the primary agent's binary). 3. Note: the openhands launch_cmd pkill issue (pkill -f 'export') is pre-existing in registry.py, not introduced by this PR. --- src/benchflow/trial.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 3e3206b..d2af9ae 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -710,20 +710,33 @@ async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: async def connect_as(self, role: Role) -> None: """Open an ACP connection for a specific role. - Installs the role's agent binary if it differs from the primary - agent (which was installed in install_agent()). + Installs the role's agent binary and credentials if it differs + from the primary agent (which was set up in install_agent()). + Updates _agent_launch so disconnect() kills the correct process. 
""" cfg = self._config t0 = datetime.now() + agent_launch = AGENT_LAUNCH.get(role.agent, role.agent) + agent_env = resolve_agent_env(role.agent, role.model, role.env or None) + if role.agent != cfg.primary_agent: - await install_agent(self._env, role.agent, self._trial_dir) + agent_cfg = await install_agent(self._env, role.agent, self._trial_dir) + cred_home = f"/home/{cfg.sandbox_user}" if cfg.sandbox_user else "/root" + await write_credential_files( + self._env, role.agent, agent_env, + agent_cfg, role.model, cred_home, + ) + if agent_env.get("_BENCHFLOW_SUBSCRIPTION_AUTH"): + await upload_subscription_auth(self._env, role.agent, cred_home) + + self._agent_launch = agent_launch self._acp_client, self._session, self._agent_name = await connect_acp( env=self._env, agent=role.agent, - agent_launch=AGENT_LAUNCH.get(role.agent, role.agent), - agent_env=resolve_agent_env(role.agent, role.model, role.env or None), + agent_launch=agent_launch, + agent_env=agent_env, sandbox_user=cfg.sandbox_user, model=role.model, trial_dir=self._trial_dir, From 0b76a413f12552db34c08a0d57cc30977c60e5b5 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Wed, 22 Apr 2026 23:05:44 -0700 Subject: [PATCH 12/25] fix: clamp Daytona storage_mb to configurable max (10 GB default) Tasks requesting more storage than the Daytona tier allows fail at sandbox creation. Apply the same clamping pattern already used for cpus and memory_mb so tasks degrade gracefully. The cap is overridable via BENCHFLOW_DAYTONA_MAX_STORAGE_MB. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/benchflow/_env_setup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/benchflow/_env_setup.py b/src/benchflow/_env_setup.py index f3aa7cd..bfc5283 100644 --- a/src/benchflow/_env_setup.py +++ b/src/benchflow/_env_setup.py @@ -20,6 +20,7 @@ # build) instead of erroring out. Override via env if running on a paid tier. 
_DAYTONA_MAX_CPUS = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_CPUS", "4")) _DAYTONA_MAX_MEMORY_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_MEMORY_MB", "8192")) +_DAYTONA_MAX_STORAGE_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_STORAGE_MB", "10240")) # Directories to ignore when copying deps _IGNORE_DIRS = { @@ -275,6 +276,13 @@ def _create_environment( _DAYTONA_MAX_MEMORY_MB, ) env_config.memory_mb = _DAYTONA_MAX_MEMORY_MB + if env_config.storage_mb > _DAYTONA_MAX_STORAGE_MB: + logger.warning( + "Clamping storage_mb %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_STORAGE_MB)", + env_config.storage_mb, + _DAYTONA_MAX_STORAGE_MB, + ) + env_config.storage_mb = _DAYTONA_MAX_STORAGE_MB return DaytonaEnvironment( environment_dir=task.paths.environment_dir, From cdccac7ea17ba5826c0fe118798a59fed58c27fe Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 23 Apr 2026 18:17:41 -0400 Subject: [PATCH 13/25] Fix DinD compose exec missing project flags (#188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix DinD compose exec missing project/directory/file flags DaytonaProcess.start() hardcoded `docker compose exec` without the `-p`, `--project-directory`, and `-f` flags needed to locate the running compose project inside the DinD sandbox. This caused exec to fail silently with "Process closed stdout (rc=None)". Extract the full compose base command from Harbor's strategy via `_compose_cmd([])` during `from_harbor_env()` and use it in `start()` so the exec subcommand includes all required project identifiers. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: use shlex.join for DinD compose exec to handle paths with spaces Address Devin review feedback — shlex.split() + " ".join() loses quoting for tokens with spaces. Use shlex.join() which properly quotes each token. 
Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- src/benchflow/process.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 4c86f3b..0ca41ef 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -256,11 +256,16 @@ class DaytonaProcess(LiveProcess): """ def __init__( - self, sandbox: Any, is_dind: bool = False, compose_cmd_prefix: str = "" + self, + sandbox: Any, + is_dind: bool = False, + compose_cmd_prefix: str = "", + compose_cmd_base: str = "", ): self._sandbox = sandbox self._is_dind = is_dind self._compose_cmd_prefix = compose_cmd_prefix + self._compose_cmd_base = compose_cmd_base @classmethod async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": @@ -273,6 +278,7 @@ async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": is_dind = hasattr(env, "_strategy") and hasattr(env._strategy, "_compose_cmd") compose_cmd_prefix = "" + compose_cmd_base = "" if is_dind: # Build compose env vars and command prefix for DinD strategy = env._strategy @@ -280,9 +286,16 @@ async def from_harbor_env(cls, env: Any) -> "DaytonaProcess": f"{k}={shlex.quote(v)}" for k, v in strategy._compose_env_vars().items() ) compose_cmd_prefix = compose_env + # Extract the full compose base command with project/file flags + # (e.g. "docker compose -p NAME --project-directory DIR -f F1 -f F2") + # so that `docker compose exec` can find the running project. 
+ compose_cmd_base = strategy._compose_cmd([]) return cls( - sandbox=sandbox, is_dind=is_dind, compose_cmd_prefix=compose_cmd_prefix + sandbox=sandbox, + is_dind=is_dind, + compose_cmd_prefix=compose_cmd_prefix, + compose_cmd_base=compose_cmd_base, ) async def start( @@ -296,8 +309,13 @@ async def start( ssh_target = f"{ssh_access.token}@ssh.app.daytona.io" if self._is_dind: - # Build the docker compose exec command to run inside the DinD VM - inner_parts = ["docker", "compose", "exec", "-i", "-T"] + # Build the docker compose exec command to run inside the DinD VM. + # Use the full compose base command (with -p, --project-directory, + # and -f flags) so that exec can find the running project. + if self._compose_cmd_base: + inner_parts = shlex.split(self._compose_cmd_base) + ["exec", "-i", "-T"] + else: + inner_parts = ["docker", "compose", "exec", "-i", "-T"] if cwd: inner_parts.extend(["-w", cwd]) # Write env vars to a temp file on the remote VM instead of passing @@ -307,8 +325,8 @@ async def start( remote_env_path = "/tmp/benchflow_env_$$.env" env_lines = "\n".join(f"{k}={v}" for k, v in env.items()) inner_parts.extend(["--env-file", remote_env_path]) - inner_parts.extend(["main", "bash", "-c", shlex.quote(command)]) - inner_cmd = " ".join(inner_parts) + inner_parts.extend(["main", "bash", "-c", command]) + inner_cmd = shlex.join(inner_parts) if self._compose_cmd_prefix: remote_cmd = f"{self._compose_cmd_prefix} {inner_cmd}" From ea1c72843161e39d47995153953fc0c1cee1b3df Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 23 Apr 2026 15:19:45 -0700 Subject: [PATCH 14/25] =?UTF-8?q?release:=20benchflow=200.3.2=20=E2=80=94?= =?UTF-8?q?=20Daytona=20DinD=20fix=20+=20storage=20clamp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix: DinD compose exec now includes project/directory/file flags (#188) - fix: clamp Daytona storage_mb to configurable max (#185) Co-Authored-By: Claude Opus 4.6 (1M context) --- pyproject.toml 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dfd8fec..0b032ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "benchflow" -version = "0.3.1" +version = "0.3.2" description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." readme = "README.md" requires-python = ">=3.12" From d813dc241fd803358b5c7096fa604721538b681c Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 01:34:43 -0400 Subject: [PATCH 15/25] Fix DinD compose ACP: use Daytona PTY WebSocket for live agent pipes (#193) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SSH pipes break through the DinD→compose exec chain, causing "Process closed stdout (rc=None)" on all compose tasks. New DaytonaPtyProcess uses Daytona SDK's WebSocket PTY API for the outer connection (keeps pipe alive), then docker compose exec -i -T inside (clean stdio for the agent). Includes marker-based startup to drain shell output before ACP handshake, and echo-resistant response matching in the ACP client (filter echoed requests by checking for 'method' field absence). Also adds skills_dir: "auto" support in Job for per-task skill resolution after PR #720 removed COPY skills from Dockerfiles. 
--- src/benchflow/_acp_run.py | 9 ++- src/benchflow/acp/client.py | 7 +- src/benchflow/job.py | 11 ++- src/benchflow/process.py | 132 ++++++++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 6 deletions(-) diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index f1b04c7..8b64288 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -26,7 +26,7 @@ from benchflow.acp.client import ACPClient from benchflow.acp.container_transport import ContainerTransport from benchflow.agents.providers import strip_provider_prefix -from benchflow.process import DaytonaProcess, DockerProcess +from benchflow.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess logger = logging.getLogger(__name__) @@ -78,7 +78,12 @@ async def connect_acp( if environment == "docker": live_proc = DockerProcess.from_harbor_env(env) else: - live_proc = await DaytonaProcess.from_harbor_env(env) + is_dind = hasattr(env, "_strategy") and hasattr(env._strategy, "_compose_cmd") + if is_dind: + live_proc = await DaytonaPtyProcess.from_harbor_env(env) + logger.info("Using PTY transport for DinD compose task") + else: + live_proc = await DaytonaProcess.from_harbor_env(env) agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt" transport = ContainerTransport( diff --git a/src/benchflow/acp/client.py b/src/benchflow/acp/client.py index 5fa3a65..abd4743 100644 --- a/src/benchflow/acp/client.py +++ b/src/benchflow/acp/client.py @@ -1,5 +1,6 @@ """ACP client — benchflow acts as the client, agents are ACP servers.""" +import json import logging from typing import Any @@ -88,8 +89,9 @@ async def _read_until_response(self, request_id: int) -> dict[str, Any]: f"has_result={'result' in msg} has_error={'error' in msg}" ) - # It's a response to our request - if "id" in msg and msg["id"] == request_id: + # It's a response to our request (has id, no method — distinguishes + # from echoed requests when running through a PTY) + if "id" in msg and msg["id"] 
== request_id and "method" not in msg: if msg.get("error"): raise ACPError( msg["error"].get("code", -1), @@ -97,6 +99,7 @@ async def _read_until_response(self, request_id: int) -> dict[str, Any]: ) return msg.get("result", {}) + # It's a notification (no id) if "method" in msg and "id" not in msg: try: diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 95414fe..e0583f6 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -393,6 +393,13 @@ def _prune_docker(self): except Exception as e: logger.warning(f"Docker prune failed: {e}") + def _resolve_skills_dir(self, task_dir: Path, skills_dir: str | None) -> str | None: + """Resolve skills_dir — 'auto' means per-task environment/skills/.""" + if skills_dir == "auto": + candidate = task_dir / "environment" / "skills" + return str(candidate) if candidate.is_dir() else None + return skills_dir + async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: """Execute one trial via Trial.""" from benchflow.trial import Trial, TrialConfig @@ -406,7 +413,7 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: job_name=self._job_name, jobs_dir=str(self._jobs_dir), environment=cfg.environment, - skills_dir=cfg.skills_dir, + skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, context_root=cfg.context_root, @@ -425,7 +432,7 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe job_name=self._job_name, jobs_dir=str(self._jobs_dir), environment=cfg.environment, - skills_dir=cfg.skills_dir, + skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, context_root=cfg.context_root, diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 0ca41ef..4a0e4be 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -389,3 +389,135 @@ async def 
start( limit=_BUFFER_LIMIT, ) logger.info(f"Daytona process started (pid={self._process.pid})") + + +class DaytonaPtyProcess(LiveProcess): + """Live stdin/stdout via Daytona PTY WebSocket API. + + Uses the Daytona SDK's PTY session (WebSocket) instead of SSH, which + maintains long-lived interactive pipes through DinD compose layers. + Falls back to this for DinD sandboxes where SSH pipes break. + """ + + _process = None # Not used — override readline/writeline/close + + def __init__(self, sandbox: Any, compose_cmd_prefix: str, compose_cmd_base: str): + self._sandbox = sandbox + self._compose_cmd_prefix = compose_cmd_prefix + self._compose_cmd_base = compose_cmd_base + self._pty = None + self._line_buffer = asyncio.Queue() + self._partial = b"" + self._closed = False + + @classmethod + async def from_harbor_env(cls, env: Any) -> "DaytonaPtyProcess": + sandbox = env._sandbox + if not sandbox: + raise RuntimeError("Daytona sandbox not started") + strategy = env._strategy + compose_env = " ".join( + f"{k}={shlex.quote(v)}" for k, v in strategy._compose_env_vars().items() + ) + compose_cmd_base = strategy._compose_cmd([]) + return cls(sandbox=sandbox, compose_cmd_prefix=compose_env, compose_cmd_base=compose_cmd_base) + + async def _on_pty_data(self, data: bytes) -> None: + self._partial += data + while b"\n" in self._partial: + line, self._partial = self._partial.split(b"\n", 1) + line = line.replace(b"\r", b"") + await self._line_buffer.put(line + b"\n") + + async def start( + self, + command: str, + env: dict[str, str] | None = None, + cwd: str | None = None, + ) -> None: + import uuid + session_id = f"acp-{uuid.uuid4().hex[:8]}" + pty_env = {} + if self._compose_cmd_prefix: + for part in shlex.split(self._compose_cmd_prefix): + if "=" in part: + k, v = part.split("=", 1) + pty_env[k] = v + + self._pty = await self._sandbox.process.create_pty_session( + id=session_id, + on_data=self._on_pty_data, + envs=pty_env if pty_env else None, + ) + await 
self._pty.wait_for_connection() + logger.info(f"DaytonaPtyProcess: PTY connected (session={session_id})") + + compose_parts = shlex.split(self._compose_cmd_base) if self._compose_cmd_base else ["docker", "compose"] + exec_parts = compose_parts + ["exec", "-i", "-T"] + if cwd: + exec_parts.extend(["-w", cwd]) + # Write env vars to a file inside the container (not visible in ps aux), + # matching the approach in DaytonaProcess.start(). + env_file_cmd = "" + if env: + env_lines = "\n".join(f"export {k}={shlex.quote(v)}" for k, v in env.items()) + env_file_cmd = ( + f"cat > /tmp/.benchflow_env <<'__EOF__'\n{env_lines}\n__EOF__\n" + f". /tmp/.benchflow_env && rm -f /tmp/.benchflow_env && " + ) + exec_parts.extend(["main", "bash", "-lc", f"{env_file_cmd}{command}"]) + exec_cmd = shlex.join(exec_parts) + + # Use a marker + stty to cleanly hand over the PTY to the agent. + # 1. Disable echo so typed commands don't appear in output + # 2. Print marker so we know when to start reading ACP output + # 3. 
exec into compose exec so the agent owns the PTY + marker = f"__BENCHFLOW_ACP_{session_id}__" + setup = f"stty -echo 2>/dev/null; echo '{marker}'; exec {exec_cmd}\n" + await self._pty.send_input(setup) + logger.info(f"DaytonaPtyProcess: sent setup, waiting for marker...") + + while True: + try: + line = await asyncio.wait_for(self._line_buffer.get(), timeout=120) + decoded = line.decode(errors="replace").strip() + logger.debug(f"DaytonaPtyProcess drain: {decoded[:120]}") + if marker in decoded: + break + except TimeoutError: + raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") + + logger.info(f"DaytonaPtyProcess: marker seen, agent starting") + + async def readline(self) -> bytes: + if self._closed: + raise ConnectionError("PTY closed") + try: + line = await asyncio.wait_for(self._line_buffer.get(), timeout=300) + return line + except TimeoutError: + raise ConnectionError("PTY readline timeout (300s)") + except Exception as e: + raise ConnectionError(f"PTY readline error: {e}") + + async def writeline(self, data: str) -> None: + if not self._pty or self._closed: + raise RuntimeError("PTY not started") + await self._pty.send_input(data + "\n") + + async def close(self) -> None: + self._closed = True + if self._pty: + try: + await self._pty.kill() + except Exception: + pass + try: + await self._pty.disconnect() + except Exception: + pass + logger.info("DaytonaPtyProcess terminated") + + @property + def is_running(self) -> bool: + return self._pty is not None and not self._closed From e1cc115273d595367f69ddf3ae9754584cdd6ffe Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 04:18:01 -0400 Subject: [PATCH 16/25] =?UTF-8?q?merge:=20main=20=E2=86=92=20dev-0.3=20(re?= =?UTF-8?q?lease=20prep=20for=20v0.3.2)=20(#195)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but 
resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. * fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. * fix: oracle agent — chokepoint guard, drop orphan eval CLI, helper PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. 
* test: regression suite pinning oracle chokepoint + orphan removal Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. - TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". * fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). 
* docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI --------- Co-authored-by: Yifeng He --- src/benchflow/_agent_env.py | 4 +- src/benchflow/cli/eval.py | 22 +-- src/benchflow/cli/main.py | 17 +- src/benchflow/job.py | 22 ++- tests/test_eval_cli.py | 10 +- tests/test_oracle_chokepoint.py | 257 ++++++++++++++++++++++++++++++ tests/test_resolve_env_helpers.py | 42 +++++ 7 files changed, 346 insertions(+), 28 deletions(-) create mode 100644 tests/test_oracle_chokepoint.py diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..72d6498 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,9 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + # Oracle runs solve.sh and never calls an LLM — model env vars and + # API-key validation are skipped even if a caller forwards a model. + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index a92c0be..58b410f 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -1,7 +1,12 @@ """`bf eval` — the benchflow eval-runner command group. -The future-facing entry point for running evaluations. Anthropic-style shape: -resource creation, one command, return the result or a job-id. +NOTE: This module is **not wired into the live CLI**. The active +``bench eval create`` command dispatches to ``cli/main.py:eval_create``. +This file is kept as the future-facing design for the eval sub-command +and must not be imported by ``cli/main.py`` (see +``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``). 
+ +Design shape — Anthropic-style resource creation: bf eval create [flags] One-shot eval — creates an Agent + Environment + Trajectory under @@ -14,9 +19,6 @@ bf eval list Show recent eval runs (reads the jobs/ dir) bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. """ from __future__ import annotations @@ -29,7 +31,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model console = Console() @@ -228,12 +230,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) + eff_model = _effective_model(agent, model) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=effective_model, + model=eff_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -269,10 +271,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) + eff_model = _effective_model(agent, model) config = JobConfig( agent=agent, - model=effective_model, + model=eff_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 9cf1198..96611bb 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -10,7 +10,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model console = Console() @@ -167,7 +167,7 @@ def job( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, 
model), environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), @@ -343,7 +343,7 @@ def eval( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, skills_dir=effective_skills, @@ -754,19 +754,20 @@ def eval_create( f"({result.score:.1%})[/bold], errors={result.errored}" ) elif tasks_dir: + eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): from benchflow.trial import Trial, TrialConfig, Scene config = TrialConfig( task_path=tasks_dir, - scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL, + scenes=[Scene.single(agent=agent, model=eff_model, skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, jobs_dir=jobs_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, skills_dir=str(skills_dir) if skills_dir else None, ) @@ -777,7 +778,7 @@ async def _run(): run_result = asyncio.run(_run()) reward = (run_result.rewards or {}).get("reward") console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}") - console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})") + console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})") console.print(f"[bold]Reward:[/bold] {reward}") console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}") if run_result.error: @@ -789,7 +790,7 @@ async def _run(): jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, @@ -915,7 +916,7 @@ def train_create( jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}", config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, ), diff --git 
a/src/benchflow/job.py b/src/benchflow/job.py index e0583f6..5429add 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float: DEFAULT_MODEL = "claude-haiku-4-5-20251001" +def effective_model(agent: str, model: str | None) -> str | None: + """Resolve the model an agent should run with. + + Oracle runs solve.sh and never calls an LLM, so it never receives a model + (the chokepoint in resolve_agent_env defends, but callers should also stop + materializing DEFAULT_MODEL into oracle configs to keep the data honest — + e.g. result-summary JSON shows model=null instead of a bogus default). + """ + if agent == "oracle": + return None + return model or DEFAULT_MODEL + + @dataclass class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str | None = DEFAULT_MODEL + model: str | None = None environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( - agent=raw.get("agent", DEFAULT_AGENT), - model=raw.get("model", DEFAULT_MODEL), + agent=agent_name, + model=effective_model(agent_name, raw.get("model")), environment=raw.get("environment", "docker"), concurrency=raw.get("concurrency", 4), prompts=prompts, @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": agent_name = agent_cfg.get("name", DEFAULT_AGENT) # Model — keep provider prefix intact for downstream resolution - model = agent_cfg.get("model_name", "") or DEFAULT_MODEL + model = effective_model(agent_name, agent_cfg.get("model_name") or None) # Environment env_cfg = raw.get("environment", {}) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py index 5a009e7..dfcb7db 100644 --- a/tests/test_eval_cli.py 
+++ b/tests/test_eval_cli.py @@ -1,9 +1,9 @@ -"""Tests for `bf eval` — the new eval-runner CLI. +"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module. -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. +NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active +``bench eval create`` lives in ``cli/main.py``). These tests cover the +task-reference resolver (``_resolve_task_ref``) which will be used once +the module is promoted to the live entry point. """ from __future__ import annotations diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py new file mode 100644 index 0000000..142c063 --- /dev/null +++ b/tests/test_oracle_chokepoint.py @@ -0,0 +1,257 @@ +"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint. + +Pins the fix from this branch (post-PR #173 follow-up): + + Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the + chokepoint defends against any caller that forwards a model. + Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix + was unreachable because nothing wired it into the live CLI. + Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so + oracle gets honest model=None end-to-end. + +The classes below pin each layer at the right altitude: +- TestOrphanRemoval — proves cli/eval.py is gone and stays gone. +- TestEvalCreateRouting — proves `bench eval create` dispatches to + cli/main.py (the file that PR #173 missed). +- TestEffectiveModel — unit tests for the helper. +- TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None. +- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle` + and assert no API-key validation error. 
+""" + +from __future__ import annotations + +import importlib +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from typer.testing import CliRunner + + +class TestEvalModuleNotWiredIntoCLI: + """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI. + + The live `bench eval create` dispatches to cli/main.py:eval_create. + cli/eval.py is a standalone module (with its own resolver logic and tests) + but must not be imported by the CLI entry-point code — doing so is what + caused PR #173 to land its fix in dead code. + """ + + def test_cli_main_does_not_import_cli_eval(self): + """cli/main.py must not import from cli/eval — they are separate.""" + main_py = ( + Path(__file__).resolve().parent.parent + / "src" / "benchflow" / "cli" / "main.py" + ) + text = main_py.read_text() + assert "from benchflow.cli.eval" not in text + assert "import benchflow.cli.eval" not in text + + +class TestEvalCreateRouting: + """`bench eval create` must dispatch to cli/main.py:eval_create. + + The pyproject.toml `bench`/`benchflow` scripts both resolve to + `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in + `benchflow.cli.eval` that no entry point ever loaded — these tests + pin the routing so that mistake can't happen again. 
+ """ + + def test_entry_point_app_is_cli_main(self): + from benchflow.cli.main import app + + assert app.info.name == "benchflow" + + def test_eval_create_callback_lives_in_cli_main(self): + from benchflow.cli.main import eval_app + + create_cmds = [c for c in eval_app.registered_commands if c.name == "create"] + assert len(create_cmds) == 1 + assert create_cmds[0].callback.__module__ == "benchflow.cli.main" + + def test_bench_eval_create_help_resolves(self): + """Smoke test: `bench eval create --help` reaches a real callback.""" + from benchflow.cli.main import app + + result = CliRunner().invoke(app, ["eval", "create", "--help"]) + assert result.exit_code == 0 + assert "tasks-dir" in result.stdout or "task" in result.stdout.lower() + + +class TestEffectiveModel: + """The helper introduced in Layer 3 — single source of truth for the rule + "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL".""" + + def test_oracle_with_no_model_returns_none(self): + from benchflow.job import effective_model + + assert effective_model("oracle", None) is None + + def test_oracle_ignores_explicit_model(self): + """Even if a caller forwards a model for oracle, the helper drops it.""" + from benchflow.job import effective_model + + assert effective_model("oracle", "claude-haiku-4-5-20251001") is None + + def test_non_oracle_with_no_model_returns_default(self): + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL + + def test_non_oracle_explicit_model_passes_through(self): + from benchflow.job import effective_model + + assert effective_model("codex-acp", "gpt-5") == "gpt-5" + + def test_non_oracle_empty_model_falls_back_to_default(self): + """Empty string == "no model" — Harbor YAML can produce this shape.""" + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL + + +class TestOracleYamlLoaders: + """YAML configs for 
oracle must produce JobConfig.model is None. + + Both loader paths (_from_native_yaml, _from_harbor_yaml) previously + coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3 + routes them through effective_model() so oracle drops the default. + """ + + def _make_task(self, tmp_path: Path) -> Path: + tasks = tmp_path / "tasks" / "task-a" + tasks.mkdir(parents=True) + (tasks / "task.toml").write_text('schema_version = "1.1"\n') + return tmp_path / "tasks" + + def test_native_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: oracle\n") + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_harbor_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text( + "agents:\n" + " - name: oracle\n" + "datasets:\n" + " - path: tasks\n" + ) + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_native_yaml_non_oracle_keeps_default_when_omitted( + self, tmp_path: Path + ): + """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL.""" + from benchflow.job import DEFAULT_MODEL, Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n") + job = Job.from_yaml(config) + assert job._config.model == DEFAULT_MODEL + + +class TestEvalCreateOracleCLI: + """End-to-end: `bench eval create -a oracle` must not trip API key validation. + + This is the user-visible bug the chokepoint test guards against at the + unit level. 
Here we call the live handler (cli/main.py:eval_create) + directly — invoking via Typer's CliRunner triggers asyncio.run() + internally, which leaves no current event loop and breaks pre-existing + tests in the suite that use the deprecated asyncio.get_event_loop() + pattern. Calling the function directly still exercises the full + CLI → effective_model → TrialConfig → resolve_agent_env path the bug + originally lived in. + """ + + def _make_task(self, tmp_path: Path) -> Path: + task = tmp_path / "task" + task.mkdir() + (task / "task.toml").write_text('schema_version = "1.1"\n') + (task / "instruction.md").write_text("solve\n") + return task + + def _strip_api_keys(self, monkeypatch): + for k in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + "LLM_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + + def test_oracle_single_task_no_api_key_no_error( + self, tmp_path: Path, monkeypatch + ): + """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError.""" + import asyncio + + from benchflow.cli.main import eval_create + from benchflow.models import RunResult + + # The CLI handler internally calls asyncio.run(), which leaves no + # current event loop. Pre-existing tests in the suite use the + # deprecated asyncio.get_event_loop() and break in that state, so + # restore a fresh loop after the test (teardown via finally below). + + self._strip_api_keys(monkeypatch) + task = self._make_task(tmp_path) + captured: dict = {} + + async def fake_create(config): + captured["config"] = config + # Exercise the real chokepoint with the config the CLI built — + # that's the specific call site the bug manifested at. 
+ from benchflow._agent_env import resolve_agent_env + + captured["agent_env"] = resolve_agent_env( + config.primary_agent, config.primary_model, config.agent_env + ) + trial = type("FakeTrial", (), {})() + trial.run = AsyncMock( + return_value=RunResult( + task_name="task", + agent_name="oracle", + rewards={"reward": 1.0}, + n_tool_calls=0, + ) + ) + return trial + + try: + with patch("benchflow.trial.Trial.create", new=fake_create): + eval_create( + config_file=None, + tasks_dir=task, + agent="oracle", + model=None, + environment="docker", + concurrency=1, + jobs_dir=str(tmp_path / "jobs"), + sandbox_user="agent", + skills_dir=None, + ) + finally: + asyncio.set_event_loop(asyncio.new_event_loop()) + + cfg = captured["config"] + # Layer 3: oracle never receives a model, even when CLI defaults exist. + assert cfg.primary_agent == "oracle" + assert cfg.primary_model is None + # Layer 1: chokepoint did not inject provider env or raise. + assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"] diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py index f699334..d831fe7 100644 --- a/tests/test_resolve_env_helpers.py +++ b/tests/test_resolve_env_helpers.py @@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path): self._patch_expanduser(monkeypatch, tmp_path) result = self._resolve(agent="openclaw", agent_env={}) assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result + + +class TestResolveAgentEnvOracle: + """Oracle runs solve.sh and never calls an LLM — must skip model-related env. + + Regression for the PR #173 follow-up: commit 360c460 removed the + `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers + would pass model=None for oracle. 
But cli/main.py:eval_create (the live + `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle + reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY + validation — breaking offline oracle runs that have no API key set. + """ + + def _patch_expanduser(self, monkeypatch, tmp_path): + orig = Path.expanduser + + def fake(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig(self) + + monkeypatch.setattr(Path, "expanduser", fake) + + def test_oracle_with_default_model_does_not_validate_api_key( + self, monkeypatch, tmp_path + ): + """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise.""" + for k in ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + self._patch_expanduser(monkeypatch, tmp_path) + + result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {}) + + # Provider env never resolved — oracle never calls an LLM. + assert "BENCHFLOW_PROVIDER_MODEL" not in result + assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result From a582c0bdcacc3d21fe5f9c35801d6c256f033072 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 05:26:43 -0400 Subject: [PATCH 17/25] chore: clean up ruff lint debt across repo (#197) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings 126 ruff errors → 0 so CI's lint check goes green and unblocks the 5 PRs targeting dev-0.3 (#176, #180, #181, #182, #191) that were landing on top of pre-existing repo lint debt. What changed: 1. Auto-fixes via `ruff check --fix --unsafe-fixes`: - 40 F401 unused-imports across src/, tests/, examples/ - 8 I001 unsorted-imports - 6 UP037 quoted-annotations modernized - Other auto-fixable rules 2. Hand fixes: - src/benchflow/__init__.py: removed `Trial` from the `from harbor` re-export block (it was shadowed by `from benchflow.trial import Trial` at line 65, which is the canonical public Trial). 
Added `trial_config_from_yaml` to __all__. - src/benchflow/process.py: 3x `raise ConnectionError(...) from e` for B904 (errors raised inside except clauses). - src/benchflow/mcp/reviewer_server.py: same B904 fix for fastmcp ImportError reraise. - tests/test_skill_eval.py: raw string for `pytest.raises(match=...)` pattern (RUF043). - 3 files: replaced `×` (Unicode multiplication sign) in comments and f-strings with `x` (latin x) to clear RUF001/RUF003. 3. Per-file ignores added to pyproject.toml `[tool.ruff.lint.per-file-ignores]`: - `experiments/*.py` and `tests/conformance/*.py` ignore E402 — these are standalone scripts that legitimately set sys.path before importing. - `src/benchflow/runtime.py` ignores F821 — uses forward references resolved by `from __future__ import annotations`; explicit TYPE_CHECKING imports would force eager loads. No code behavior changes. 580 tests pass; the 8 pre-existing failures (env-leak between subscription auth tests, Docker compose env, judge model default mismatch) are unrelated to this PR. 
--- benchmarks/followup-bench/runner.py | 2 +- .../notebooks/nanofirm-task/tests/evaluate.py | 1 - docs/notebooks/scene-patterns.ipynb | 47 +++++++++---------- docs/notebooks/scene-patterns.py | 3 -- experiments/reviewer_ablation.py | 19 ++++---- experiments/validate_multi_scene_lifecycle.py | 2 +- .../tests/test_outputs.py | 2 +- .../tests/test_outputs.py | 2 +- .../run_comparison.py | 2 +- .../exploits/conftest_payload.py | 2 +- labs/reward-hack-matrix/run_matrix.py | 9 ++-- pyproject.toml | 8 ++++ src/benchflow/__init__.py | 13 ++--- src/benchflow/_acp_run.py | 11 ++--- src/benchflow/acp/client.py | 1 - src/benchflow/cli/eval.py | 3 +- src/benchflow/cli/main.py | 8 ++-- src/benchflow/job.py | 4 +- src/benchflow/mcp/reviewer_server.py | 6 +-- src/benchflow/process.py | 26 +++++----- src/benchflow/runtime.py | 9 ++-- src/benchflow/sdk.py | 21 --------- src/benchflow/skill_eval.py | 15 +++--- src/benchflow/trial.py | 15 +++--- tests/conformance/proof_multi_agent.py | 10 ++-- tests/conformance/run_conformance.py | 4 +- tests/test_oracle_chokepoint.py | 2 - tests/test_rewards_jsonl.py | 2 +- tests/test_scene_outbox_trial.py | 4 +- tests/test_skill_eval.py | 2 +- tests/test_skill_eval_dryrun.py | 3 +- 31 files changed, 109 insertions(+), 149 deletions(-) diff --git a/benchmarks/followup-bench/runner.py b/benchmarks/followup-bench/runner.py index e3558d5..d942ea6 100644 --- a/benchmarks/followup-bench/runner.py +++ b/benchmarks/followup-bench/runner.py @@ -28,7 +28,7 @@ from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_setup import install_agent from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.runtime import Environment logger = logging.getLogger(__name__) diff --git a/docs/notebooks/nanofirm-task/tests/evaluate.py b/docs/notebooks/nanofirm-task/tests/evaluate.py index 2f4fd75..254f70d 100644 --- 
a/docs/notebooks/nanofirm-task/tests/evaluate.py +++ b/docs/notebooks/nanofirm-task/tests/evaluate.py @@ -1,6 +1,5 @@ """Evaluate contract risk analysis quality.""" import json -import sys ANALYSIS_PATH = "/app/analysis.json" REWARD_PATH = "/logs/verifier/reward.txt" diff --git a/docs/notebooks/scene-patterns.ipynb b/docs/notebooks/scene-patterns.ipynb index 9f1400c..488869f 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -6,16 +6,16 @@ "source": [ "# BenchFlow Scene Patterns\n", "\n", - "Three evaluation patterns \u2014 baseline, self-review (multi-turn), coder-reviewer (multi-round) \u2014 run end-to-end with `bf.run()`.\n", + "Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n", "\n", "| Term | Definition | Example |\n", "|------|-----------|--------|\n", "| **Turn** | One prompt in one ACP session | Coder writes code |\n", - "| **Multi-turn** | Same role, multiple turns | Self-review: agent \u2192 agent |\n", - "| **Round** | One A\u2192B exchange between roles | Coder \u2192 Reviewer |\n", - "| **Multi-round** | Different roles exchanging turns | Coder \u2192 Reviewer \u2192 Coder |\n", + "| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n", + "| **Round** | One A→B exchange between roles | Coder → Reviewer |\n", + "| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n", "| **Scene** | Interaction region with roles + turns | A code-review scene |\n", - "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen \u2192 Solve |\n", + "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n", "\n", "**Prerequisites:**\n", "- `pip install benchflow`\n", @@ -30,15 +30,14 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: 
gemini-3.1-flash-lite-preview\n" ] } ], "source": [ - "import asyncio\n", "import logging\n", "from pathlib import Path\n", "\n", @@ -71,8 +70,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Reward: 1.0\n", "Tool calls: 3\n", @@ -102,7 +101,7 @@ "source": [ "## Pattern 2: Multi-Turn Self-Review\n", "\n", - "Same agent gets a second turn to re-examine its own work. This is **multi-turn** \u2014 one role, multiple prompts.\n", + "Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n", "\n", "Use when: a second pass catches what the first missed. Cost: 2x baseline." ] @@ -113,8 +112,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Reward: 1.0\n", "Tool calls: 7\n", @@ -152,7 +151,7 @@ "source": [ "## Pattern 3: Coder + Reviewer (Multi-Round)\n", "\n", - "Two roles \u2014 coder and reviewer \u2014 in a shared sandbox. This is **multi-round** \u2014 different roles exchanging turns.\n", + "Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n", "\n", "Communication uses the **outbox convention**:\n", "1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n", @@ -168,8 +167,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Reward: 0.0\n", "Tool calls: 13\n", @@ -195,7 +194,7 @@ " '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n", " Turn(\"coder\",\n", " \"Read the reviewer's feedback and fix the issues. 
\"\n", - " \"Focus only on what was flagged \u2014 don't start over.\"),\n", + " \"Focus only on what was flagged — don't start over.\"),\n", " ],\n", " )],\n", " environment=ENV,\n", @@ -223,14 +222,14 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Pattern Reward Tools Error\n", "------------------------------------------------------------\n", - "baseline 1.0 3 \u2014\n", - "self_review 1.0 7 \u2014\n", - "coder_reviewer 0.0 13 \u2014\n" + "baseline 1.0 3 —\n", + "self_review 1.0 7 —\n", + "coder_reviewer 0.0 13 —\n" ] } ], @@ -238,11 +237,11 @@ "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", "print(\"-\" * 60)\n", "for name, r in results.items():\n", - " reward = (r.rewards or {}).get(\"reward\", \"\u2014\")\n", - " err = r.error or \"\u2014\"\n", + " reward = (r.rewards or {}).get(\"reward\", \"—\")\n", + " err = r.error or \"—\"\n", " if len(err) > 30:\n", " err = err[:27] + \"...\"\n", - " print(f\"{name:<20} {str(reward):>8} {r.n_tool_calls:>6} {err}\")" + " print(f\"{name:<20} {reward!s:>8} {r.n_tool_calls:>6} {err}\")" ] }, { @@ -255,12 +254,12 @@ "\n", "| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n", "|-------------------|---------------|--------|\n", - "| `BaseUser` | `Role` \u2014 any role can be a user, reviewer, or custom agent | **Shipped** |\n", - "| `User.run() \u2192 str` | `Turn` with a prompt \u2014 each turn sends one prompt to one role | **Shipped** |\n", + "| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent | **Shipped** |\n", + "| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role | **Shipped** |\n", "| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n", "| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n", "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n", - "| 
`User.run() \u2192 None` (stop) | Fixed turn count only \u2014 no dynamic termination | **Gap** |\n", + "| `User.run() → None` (stop) | Fixed turn count only — no dynamic termination | **Gap** |\n", "| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n", "| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n", "| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n", @@ -270,7 +269,7 @@ "\n", "**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n", "for user roles, per-round verification, or inter-round trajectory inspection. These require\n", - "extending the Scene scheduler with callbacks \u2014 tracked for 0.4." + "extending the Scene scheduler with callbacks — tracked for 0.4." ] } ], diff --git a/docs/notebooks/scene-patterns.py b/docs/notebooks/scene-patterns.py index a95369b..961bd09 100644 --- a/docs/notebooks/scene-patterns.py +++ b/docs/notebooks/scene-patterns.py @@ -18,9 +18,6 @@ """ import os -import sys -import json -from pathlib import Path # ── The contract (constructed here, no external files) ────────────── diff --git a/experiments/reviewer_ablation.py b/experiments/reviewer_ablation.py index 5acaeca..8ab0cae 100644 --- a/experiments/reviewer_ablation.py +++ b/experiments/reviewer_ablation.py @@ -17,25 +17,26 @@ import os import sys import time -from datetime import datetime from pathlib import Path logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) +import contextlib + +from harbor.models.task.task import Task +from harbor.models.trial.paths import TrialPaths +from harbor.verifier.verifier import Verifier + from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_env import resolve_agent_env from benchflow._agent_setup import install_agent from 
benchflow._credentials import upload_subscription_auth, write_credential_files from benchflow._env_setup import _create_environment from benchflow._sandbox import setup_sandbox_user -from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.sdk import SDK -from harbor.models.task.task import Task -from harbor.verifier.verifier import Verifier -from harbor.models.trial.paths import TrialPaths logger = logging.getLogger(__name__) @@ -141,10 +142,8 @@ async def _run_acp(env, prompt: str, trial_dir: Path, timeout: int = 600) -> tup try: _, n_tools = await execute_prompts(acp_client, session, [prompt], timeout=timeout) finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass return n_tools, int(time.time() - t0) @@ -204,7 +203,7 @@ async def run_reviewer(task_path: Path, task_name: str, condition: str) -> dict: total_tools += n_tools # Read coder's outbox - outbox_result = await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") + await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") await env.exec("rm -rf /app/.outbox/*") # Phase 2: Reviewer diff --git a/experiments/validate_multi_scene_lifecycle.py b/experiments/validate_multi_scene_lifecycle.py index e55eea2..4e6bf87 100644 --- a/experiments/validate_multi_scene_lifecycle.py +++ b/experiments/validate_multi_scene_lifecycle.py @@ -162,7 +162,7 @@ async def main(): r3 = await test_followup_bench() results["followup"] = r3.rewards - logger.info(f"\n=== DOGFOOD RESULTS ===") + logger.info("\n=== DOGFOOD RESULTS ===") for name, rewards in results.items(): reward = (rewards or {}).get("reward", "N/A") logger.info(f" {name}: reward={reward}") diff --git a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py index 
1c85417..df887aa 100644 --- a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py @@ -6,7 +6,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py index 6debbc3..a4742c2 100644 --- a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py @@ -3,7 +3,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/run_comparison.py b/labs/benchjack-sandbox-hardening/run_comparison.py index e01b1df..04963bb 100755 --- a/labs/benchjack-sandbox-hardening/run_comparison.py +++ b/labs/benchjack-sandbox-hardening/run_comparison.py @@ -136,7 +136,7 @@ def main() -> int: return 1 n = len(PATTERNS) - total_steps = 2 + 2 * n # 2 venvs + 2 versions × n patterns + total_steps = 2 + 2 * n # 2 venvs + 2 versions x n patterns print(f"[1/{total_steps}] venv: benchflow==0.2.0 (PyPI)") _create_venv(VENVS_DIR / "bf-0.2.0", ["benchflow==0.2.0"]) diff --git a/labs/reward-hack-matrix/exploits/conftest_payload.py b/labs/reward-hack-matrix/exploits/conftest_payload.py index 30fe23f..7397066 100644 --- a/labs/reward-hack-matrix/exploits/conftest_payload.py +++ b/labs/reward-hack-matrix/exploits/conftest_payload.py @@ -16,7 +16,7 @@ terminal-bench-2 if its verifier ever touches conftest.py). 
""" -import pytest # noqa: F401 — pytest must see this as a real plugin module +import pytest def pytest_collection_modifyitems(config, items): diff --git a/labs/reward-hack-matrix/run_matrix.py b/labs/reward-hack-matrix/run_matrix.py index d5dfc43..53a86b0 100644 --- a/labs/reward-hack-matrix/run_matrix.py +++ b/labs/reward-hack-matrix/run_matrix.py @@ -30,6 +30,7 @@ import argparse import asyncio +import contextlib import json import os import re @@ -447,13 +448,11 @@ async def shutdown(self) -> None: if self.proc is None: return if self.proc.stdin and not self.proc.stdin.is_closing(): - try: + with contextlib.suppress(OSError, BrokenPipeError): self.proc.stdin.close() - except (OSError, BrokenPipeError): - pass try: await asyncio.wait_for(self.proc.wait(), timeout=30) - except asyncio.TimeoutError: + except TimeoutError: self.proc.kill() await self.proc.wait() if self.reader_task: @@ -717,7 +716,7 @@ def main() -> int: if args.sweep: summary_path = Path(args.summary_path) if args.summary_path else (JOBS_DIR / "matrix_sweep.json") print( - f"[sweep] {len(cells)} cells × {len(VERSIONS)} versions = " + f"[sweep] {len(cells)} cells x {len(VERSIONS)} versions = " f"{len(cells) * len(VERSIONS)} trials, concurrency={args.concurrency}" ) results = asyncio.run( diff --git a/pyproject.toml b/pyproject.toml index 0b032ab..306e71c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,14 @@ ignore = [ "RUF022", # __all__ unsorted — grouped by section for agent-friendliness ] +[tool.ruff.lint.per-file-ignores] +# Standalone scripts — sys.path manipulation before imports is intentional +"experiments/*.py" = ["E402"] +"tests/conformance/*.py" = ["E402"] +# Forward references resolved via __future__ annotations — ruff flags them +# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads. 
+"src/benchflow/runtime.py" = ["F821"] + [tool.ty.environment] python-version = "3.12" diff --git a/src/benchflow/__init__.py b/src/benchflow/__init__.py index 809292d..35388b5 100644 --- a/src/benchflow/__init__.py +++ b/src/benchflow/__init__.py @@ -19,13 +19,14 @@ ExecResult, Task, TaskConfig, - Trial, Verifier, VerifierResult, ) # benchflow's additions from benchflow._env_setup import stage_dockerfile_deps +from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene +from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.acp.client import ACPClient from benchflow.acp.session import ACPSession from benchflow.agents.registry import ( @@ -53,16 +54,15 @@ RuntimeResult, run, # bf.run(agent, env) — the primary 0.3 API ) -from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene -from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.sdk import SDK -from benchflow.trial import Trial, TrialConfig -from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn -from benchflow.trial_yaml import trial_config_from_yaml from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill from benchflow.trajectories.otel import OTelCollector from benchflow.trajectories.proxy import TrajectoryProxy from benchflow.trajectories.types import Trajectory +from benchflow.trial import Role as TrialRole +from benchflow.trial import Scene as TrialScene +from benchflow.trial import Trial, TrialConfig, Turn +from benchflow.trial_yaml import trial_config_from_yaml # Public API surface. Anything not in this list is implementation detail and # may change without notice. 
Names are grouped by source module to match the @@ -123,6 +123,7 @@ "TrialRole", "TrialScene", "Turn", + "trial_config_from_yaml", # SDK (backwards compat) "SDK", # Environments / dep staging diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index 8b64288..5602072 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -18,6 +18,7 @@ """ import asyncio +import contextlib import logging from pathlib import Path @@ -66,7 +67,6 @@ async def connect_acp( agent_launch = build_priv_drop_cmd(agent_launch, sandbox_user) logger.info(f"Agent sandboxed as: {sandbox_user}") - last_err: Exception | None = None acp_client: ACPClient | None = None for attempt in range(_ACP_CONNECT_MAX_RETRIES + 1): if attempt > 0: @@ -106,12 +106,9 @@ async def connect_acp( except ConnectionError as e: # Close the failed client before retrying if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass acp_client = None - last_err = e if attempt == _ACP_CONNECT_MAX_RETRIES: raise logger.warning(f"ACP connect failed (attempt {attempt + 1}): {e}") @@ -119,10 +116,8 @@ async def connect_acp( except Exception: # Non-retryable error — close client to prevent leak if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass raise if model: diff --git a/src/benchflow/acp/client.py b/src/benchflow/acp/client.py index abd4743..57a55eb 100644 --- a/src/benchflow/acp/client.py +++ b/src/benchflow/acp/client.py @@ -1,6 +1,5 @@ """ACP client — benchflow acts as the client, agents are ACP servers.""" -import json import logging from typing import Any diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 58b410f..17cea1d 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -31,7 +31,8 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model +from 
benchflow.job import DEFAULT_AGENT +from benchflow.job import effective_model as _effective_model console = Console() diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 96611bb..e9acc23 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -433,9 +433,9 @@ def skills_eval( typer.Argument(help="Path to skill directory containing evals/evals.json"), ], agent: Annotated[ - list[str], + list[str] | None, typer.Option("--agent", "-a", help="Agent(s) to evaluate (repeatable)"), - ] = ["claude-agent-acp"], + ] = None, model: Annotated[ list[str] | None, typer.Option("--model", "-m", help="Model(s) (matched 1:1 with agents)"), @@ -473,6 +473,8 @@ def skills_eval( """ from benchflow.skill_eval import SkillEvaluator, export_gepa_traces + if agent is None: + agent = ["claude-agent-acp"] if not (skill_dir / "evals" / "evals.json").exists(): console.print( f"[red]No evals/evals.json found in {skill_dir}[/red]\n" @@ -757,7 +759,7 @@ def eval_create( eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): - from benchflow.trial import Trial, TrialConfig, Scene + from benchflow.trial import Scene, Trial, TrialConfig config = TrialConfig( task_path=tasks_dir, diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 5429add..70dfb56 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -126,9 +126,7 @@ def should_retry(self, error: str | None) -> bool: return True if self.retry_on_pipe and category == PIPE_CLOSED: return True - if self.retry_on_acp and category == ACP_ERROR: - return True - return False + return bool(self.retry_on_acp and category == ACP_ERROR) def backoff_delay(self, attempt: int) -> float: """Exponential backoff delay for retry attempt.""" diff --git a/src/benchflow/mcp/reviewer_server.py b/src/benchflow/mcp/reviewer_server.py index 9c2dedc..210746b 100644 --- a/src/benchflow/mcp/reviewer_server.py +++ 
b/src/benchflow/mcp/reviewer_server.py @@ -24,8 +24,6 @@ import json import logging import os -import subprocess -import sys from pathlib import Path logger = logging.getLogger(__name__) @@ -52,11 +50,11 @@ def create_reviewer_server( """ try: from fastmcp import FastMCP - except ImportError: + except ImportError as e: raise ImportError( "fastmcp required for MCP reviewer server. " "Install with: pip install fastmcp" - ) + ) from e mcp = FastMCP("benchflow-reviewer") prompt = review_prompt or os.environ.get("REVIEWER_PROMPT", DEFAULT_REVIEW_PROMPT) diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 4a0e4be..197daa1 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -313,7 +313,7 @@ async def start( # Use the full compose base command (with -p, --project-directory, # and -f flags) so that exec can find the running project. if self._compose_cmd_base: - inner_parts = shlex.split(self._compose_cmd_base) + ["exec", "-i", "-T"] + inner_parts = [*shlex.split(self._compose_cmd_base), "exec", "-i", "-T"] else: inner_parts = ["docker", "compose", "exec", "-i", "-T"] if cwd: @@ -453,7 +453,7 @@ async def start( logger.info(f"DaytonaPtyProcess: PTY connected (session={session_id})") compose_parts = shlex.split(self._compose_cmd_base) if self._compose_cmd_base else ["docker", "compose"] - exec_parts = compose_parts + ["exec", "-i", "-T"] + exec_parts = [*compose_parts, "exec", "-i", "-T"] if cwd: exec_parts.extend(["-w", cwd]) # Write env vars to a file inside the container (not visible in ps aux), @@ -475,7 +475,7 @@ async def start( marker = f"__BENCHFLOW_ACP_{session_id}__" setup = f"stty -echo 2>/dev/null; echo '{marker}'; exec {exec_cmd}\n" await self._pty.send_input(setup) - logger.info(f"DaytonaPtyProcess: sent setup, waiting for marker...") + logger.info("DaytonaPtyProcess: sent setup, waiting for marker...") while True: try: @@ -484,10 +484,10 @@ async def start( logger.debug(f"DaytonaPtyProcess drain: {decoded[:120]}") if 
marker in decoded: break - except TimeoutError: - raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") + except TimeoutError as e: + raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") from e - logger.info(f"DaytonaPtyProcess: marker seen, agent starting") + logger.info("DaytonaPtyProcess: marker seen, agent starting") async def readline(self) -> bytes: if self._closed: @@ -495,10 +495,10 @@ async def readline(self) -> bytes: try: line = await asyncio.wait_for(self._line_buffer.get(), timeout=300) return line - except TimeoutError: - raise ConnectionError("PTY readline timeout (300s)") + except TimeoutError as e: + raise ConnectionError("PTY readline timeout (300s)") from e except Exception as e: - raise ConnectionError(f"PTY readline error: {e}") + raise ConnectionError(f"PTY readline error: {e}") from e async def writeline(self, data: str) -> None: if not self._pty or self._closed: @@ -508,14 +508,10 @@ async def writeline(self, data: str) -> None: async def close(self) -> None: self._closed = True if self._pty: - try: + with contextlib.suppress(Exception): await self._pty.kill() - except Exception: - pass - try: + with contextlib.suppress(Exception): await self._pty.disconnect() - except Exception: - pass logger.info("DaytonaPtyProcess terminated") @property diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py index 1e55f0c..14a785e 100644 --- a/src/benchflow/runtime.py +++ b/src/benchflow/runtime.py @@ -291,13 +291,13 @@ async def execute(self) -> RuntimeResult: async def run( - subject: "Agent | TrialConfig | str", - env: "Environment | str | None" = None, + subject: Agent | TrialConfig | str, + env: Environment | str | None = None, config: RuntimeConfig | None = None, *, - task_path: "str | Path | None" = None, + task_path: str | Path | None = None, model: str | None = None, -) -> "RuntimeResult | RunResult": +) -> RuntimeResult | RunResult: """Primary user-facing API — multiple calling 
conventions. Usage:: @@ -313,7 +313,6 @@ async def run( # 3. Agent name string (simplest) result = await bf.run("gemini", task_path="tasks/X") """ - from benchflow.models import RunResult from benchflow.trial import Scene, Trial, TrialConfig if isinstance(subject, TrialConfig): diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py index bd10a2d..28fa82d 100644 --- a/src/benchflow/sdk.py +++ b/src/benchflow/sdk.py @@ -98,33 +98,12 @@ from harbor.models.trial.paths import TrialPaths from harbor.verifier.verifier import Verifier -from benchflow._acp_run import connect_acp, execute_prompts -from benchflow._agent_env import resolve_agent_env -from benchflow._agent_setup import deploy_skills, install_agent -from benchflow._credentials import ( - upload_subscription_auth, - write_credential_files, -) from benchflow._env_setup import ( - _create_environment, - _inject_skills_into_dockerfile, _patch_harbor_dind, - stage_dockerfile_deps, ) from benchflow._sandbox import ( - _resolve_locked_paths, - _seed_verifier_workspace, - _snapshot_build_config, harden_before_verify, - lockdown_paths, - setup_sandbox_user, -) -from benchflow._trajectory import ( - _capture_session_trajectory, - _scrape_agent_trajectory, ) -from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource logger = logging.getLogger(__name__) diff --git a/src/benchflow/skill_eval.py b/src/benchflow/skill_eval.py index b848135..82e9686 100644 --- a/src/benchflow/skill_eval.py +++ b/src/benchflow/skill_eval.py @@ -6,6 +6,7 @@ result = await evaluator.run(agents=["claude-agent-acp"], environment="docker") """ +import contextlib import json import logging import shutil @@ -431,7 +432,7 @@ async def run( all_results: list[CaseResult] = [] # Run each agent - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): agent_label = agent.split("/")[-1] if "/" in agent else agent 
# With-skill run @@ -486,9 +487,9 @@ async def _run_job( with_skill: bool, ) -> list[CaseResult]: """Run a batch of tasks using Job for concurrency and retries.""" - from benchflow.job import Job, JobConfig, RetryConfig - import os + + from benchflow.job import Job, JobConfig, RetryConfig judge_env = {} for key in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY"): if os.environ.get(key): @@ -506,7 +507,7 @@ async def _run_job( agent_env=judge_env, ), ) - job_result = await j.run() + await j.run() results = [] # Walk the jobs directory to collect per-case results @@ -539,10 +540,8 @@ async def _run_job( # Read judge rubric details if available judge_file = trial_dir / "verifier" / "judge_result.json" if judge_file.exists(): - try: + with contextlib.suppress(json.JSONDecodeError, KeyError): rubric_results = json.loads(judge_file.read_text()).get("items") - except (json.JSONDecodeError, KeyError): - pass results.append( CaseResult( @@ -579,7 +578,7 @@ def _compute_lifts( ) -> list[AgentLift]: """Compute per-agent lift from case results.""" lifts = [] - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): with_results = [r for r in all_results if r.agent == agent and r.with_skill] baseline_results = [ r for r in all_results if r.agent == agent and not r.with_skill diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index d2af9ae..d2f1570 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -35,7 +35,7 @@ from __future__ import annotations -import asyncio +import contextlib import json import logging import shlex @@ -57,7 +57,6 @@ _resolve_locked_paths, _seed_verifier_workspace, _snapshot_build_config, - harden_before_verify, lockdown_paths, setup_sandbox_user, ) @@ -66,7 +65,7 @@ _scrape_agent_trajectory, ) from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH, AGENTS +from benchflow.agents.registry import AGENT_LAUNCH from 
benchflow.models import RunResult, TrajectorySource logger = logging.getLogger(__name__) @@ -110,7 +109,7 @@ def single( prompts: list[str | None] | None = None, role_name: str = "agent", skills_dir: str | Path | None = None, - ) -> "Scene": + ) -> Scene: """Shortcut for single-agent, single-role scene.""" prompts = prompts or [None] return cls( @@ -157,7 +156,7 @@ def from_legacy( prompts: list[str | None] | None = None, skills_dir: str | Path | None = None, **kwargs, - ) -> "TrialConfig": + ) -> TrialConfig: """Construct from flat SDK.run()-style args.""" return cls( task_path=task_path, @@ -440,10 +439,8 @@ async def disconnect(self) -> None: # Kill any lingering agent processes to prevent context bleed between scenes if self._env and self._agent_launch.strip(): agent_cmd = self._agent_launch.split()[0].split("/")[-1] - try: + with contextlib.suppress(Exception): await self._env.exec(f"pkill -f '{agent_cmd}' || true", timeout_sec=10) - except Exception: - pass self._session_tool_count = 0 self._session_traj_count = 0 self._phase = "installed" @@ -630,7 +627,7 @@ async def _run_scene(self, scene: Scene) -> None: inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} turn_counter = 0 - for i, turn in enumerate(scene.turns): + for _i, turn in enumerate(scene.turns): role = role_map.get(turn.role) if not role: raise ValueError(f"Turn references unknown role {turn.role!r}") diff --git a/tests/conformance/proof_multi_agent.py b/tests/conformance/proof_multi_agent.py index 902d2ca..9aa2729 100644 --- a/tests/conformance/proof_multi_agent.py +++ b/tests/conformance/proof_multi_agent.py @@ -18,6 +18,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src")) +import contextlib + from harbor.models.task.task import Task from benchflow._acp_run import connect_acp, execute_prompts @@ -73,7 +75,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: trial_dir = Path(f"/tmp/multi-agent-proof/{role.name}") trial_dir.mkdir(parents=True, 
exist_ok=True) launch_cmd = AGENT_LAUNCH.get(role.agent, role.agent) - acp_client, session, agent_name = await connect_acp( + acp_client, session, _agent_name = await connect_acp( env=env, agent=role.agent, agent_launch=launch_cmd, @@ -85,7 +87,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: agent_cwd="/app", ) try: - trajectory, n_tools = await execute_prompts( + _trajectory, n_tools = await execute_prompts( acp_client, session, [prompt], @@ -93,10 +95,8 @@ async def role_runner(env, role: Role, prompt: str) -> None: ) logging.info(f"[{role.name}] finished: {n_tools} tool calls") finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass async def main() -> None: diff --git a/tests/conformance/run_conformance.py b/tests/conformance/run_conformance.py index b8aa503..1f758a1 100644 --- a/tests/conformance/run_conformance.py +++ b/tests/conformance/run_conformance.py @@ -53,9 +53,7 @@ def has_creds(agent_name: str) -> bool: if any(os.environ.get(k) for k in keys): return True sub_file = SUBSCRIPTION_AUTH_FILES.get(agent_name) - if sub_file and Path(sub_file).expanduser().exists(): - return True - return False + return bool(sub_file and Path(sub_file).expanduser().exists()) async def run_one(agent_name: str) -> dict: diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 142c063..a61333d 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -21,11 +21,9 @@ from __future__ import annotations -import importlib from pathlib import Path from unittest.mock import AsyncMock, patch -import pytest from typer.testing import CliRunner diff --git a/tests/test_rewards_jsonl.py b/tests/test_rewards_jsonl.py index e81f316..bbc029d 100644 --- a/tests/test_rewards_jsonl.py +++ b/tests/test_rewards_jsonl.py @@ -93,7 +93,7 @@ def test_rubric_plus_terminal_no_rubric_in_meta(tmp_path: Path) -> None: json.loads(ln) for ln in (tmp_path / 
"rewards.jsonl").read_text().strip().splitlines() ] - terminal = [ln for ln in lines if ln["type"] == "terminal"][0] + terminal = next(ln for ln in lines if ln["type"] == "terminal") assert "rubric" not in terminal["meta"] diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py index 4e6d478..5b6f07a 100644 --- a/tests/test_scene_outbox_trial.py +++ b/tests/test_scene_outbox_trial.py @@ -11,8 +11,7 @@ import json from dataclasses import dataclass from pathlib import Path -from typing import Any -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock import pytest @@ -325,7 +324,6 @@ async def test_heterogeneous_agent_install(coder_reviewer_scene: Scene) -> None: trial._resolved_prompts = ["Solve the task"] installed_agents: list[str] = [] - original_connect_as = Trial.connect_as async def tracking_connect_as(self_inner, role): if role.agent != config.primary_agent: diff --git a/tests/test_skill_eval.py b/tests/test_skill_eval.py index c1f2b3e..0c8c3ed 100644 --- a/tests/test_skill_eval.py +++ b/tests/test_skill_eval.py @@ -133,7 +133,7 @@ def test_minimal_dataset(self, minimal_skill_dir): def test_missing_evals_json(self, tmp_path): skill = tmp_path / "no-evals" skill.mkdir() - with pytest.raises(FileNotFoundError, match="evals.json"): + with pytest.raises(FileNotFoundError, match=r"evals\.json"): load_eval_dataset(skill) def test_empty_cases(self, tmp_path): diff --git a/tests/test_skill_eval_dryrun.py b/tests/test_skill_eval_dryrun.py index 39b6a57..931a8e9 100644 --- a/tests/test_skill_eval_dryrun.py +++ b/tests/test_skill_eval_dryrun.py @@ -218,7 +218,7 @@ def test_gepa_export_roundtrip(self, mock_skill): # Verify trace files traces = list((gepa_dir / "traces").iterdir()) - assert len(traces) == 4 # 2 cases × 2 modes + assert len(traces) == 4 # 2 cases x 2 modes # Verify trace content trace = json.loads(traces[0].read_text()) @@ -228,6 +228,7 @@ def test_gepa_export_roundtrip(self, mock_skill): def 
test_cli_dryrun_loads_dataset(self, mock_skill): from typer.testing import CliRunner + from benchflow.cli.main import app runner = CliRunner() From 9fd286337eb500d13a656247fc6f8c522ae62bc9 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Sat, 25 Apr 2026 02:27:49 -0700 Subject: [PATCH 18/25] fix: merge cfg.agent_env into connect_as() env resolution (#191) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. * fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. * fix: oracle agent — chokepoint guard, drop orphan eval CLI, helper PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). 
Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. * test: regression suite pinning oracle chokepoint + orphan removal Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. - TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". * fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). 
* docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI * docs: add fix plan for connect_as() agent_env bug (#2) * docs: expand fix plan with eng review findings and test cases Add two edge-case test requirements (non-overlapping key merge, None safety) from /plan-eng-review. Append review report confirming 0 issues, 0 critical gaps — ready to implement. * fix: merge cfg.agent_env into connect_as() env resolution (#2) connect_as() passed only role.env to resolve_agent_env, losing all config-level env vars (e.g. BENCHFLOW_PROVIDER_BASE_URL from YAML). Merge cfg.agent_env as base with role.env overlay so role-specific vars win on overlap. * remove plan --------- Co-authored-by: Xiangyi Li Co-authored-by: Xiangyi Li --- src/benchflow/trial.py | 7 +- tests/test_connect_as_env.py | 140 +++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 tests/test_connect_as_env.py diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index d2f1570..fd70e5d 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -715,7 +715,12 @@ async def connect_as(self, role: Role) -> None: t0 = datetime.now() agent_launch = AGENT_LAUNCH.get(role.agent, role.agent) - agent_env = resolve_agent_env(role.agent, role.model, role.env or None) + # Merge cfg.agent_env (config-level) with role.env (role-specific) so + # provider creds from YAML reach the agent. role.env wins on overlap. + agent_env = resolve_agent_env( + role.agent, role.model, + {**(cfg.agent_env or {}), **(role.env or {})}, + ) if role.agent != cfg.primary_agent: agent_cfg = await install_agent(self._env, role.agent, self._trial_dir) diff --git a/tests/test_connect_as_env.py b/tests/test_connect_as_env.py new file mode 100644 index 0000000..97ab423 --- /dev/null +++ b/tests/test_connect_as_env.py @@ -0,0 +1,140 @@ +"""Regression tests for connect_as() agent_env merging (issue #2). 
+ +connect_as() must merge cfg.agent_env (config-level) with role.env +(role-level), with role-level keys winning on overlap. Before the fix, +role.env={} was truthy so resolve_agent_env received an empty dict, +discarding all config-level env vars. +""" + +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, TrialConfig + + +def _make_config(agent_env=None, role_env=None): + """Build a minimal TrialConfig with one scene.""" + role = Role(name="agent", agent="claude-agent-acp", model="test-model") + if role_env is not None: + role = Role( + name="agent", agent="claude-agent-acp", model="test-model", env=role_env + ) + scene = Scene(roles=[role]) + return TrialConfig( + task_path=Path("/fake/task"), + scenes=[scene], + agent_env=agent_env, + ) + + +class TestConnectAsEnvMerge: + """Verify connect_as() merges cfg.agent_env with role.env correctly.""" + + @pytest.fixture() + def _mock_trial(self, tmp_path): + """Return a Trial stub wired to capture the agent_env passed to connect_acp.""" + from benchflow.trial import Trial + + cfg = _make_config( + agent_env={"BENCHFLOW_PROVIDER_BASE_URL": "http://localhost:8080/v1"}, + ) + trial = Trial.__new__(Trial) + trial._config = cfg + trial._env = {} + trial._trial_dir = tmp_path + trial._timing = {} + trial._agent_cwd = None + trial._phase = "idle" + return trial + + @pytest.mark.asyncio + async def test_config_env_propagated_through_empty_role_env(self, _mock_trial): + """cfg.agent_env vars reach resolve_agent_env when role.env is {}.""" + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + 
assert "BENCHFLOW_PROVIDER_BASE_URL" in captured["env"] + assert captured["env"]["BENCHFLOW_PROVIDER_BASE_URL"] == "http://localhost:8080/v1" + + @pytest.mark.asyncio + async def test_role_env_overrides_config_env(self, _mock_trial): + """Role-level env wins over config-level on key overlap.""" + _mock_trial._config = _make_config( + agent_env={"KEY": "from-config", "SHARED": "config-val"}, + role_env={"SHARED": "role-val", "ROLE_ONLY": "yes"}, + ) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + env = captured["env"] + assert env["KEY"] == "from-config" + assert env["SHARED"] == "role-val" + assert env["ROLE_ONLY"] == "yes" + + @pytest.mark.asyncio + async def test_all_keys_present_in_merge(self, _mock_trial): + """Non-overlapping keys from both dicts are all present.""" + _mock_trial._config = _make_config( + agent_env={"A": "1", "B": "2"}, + role_env={"C": "3", "D": "4"}, + ) + captured = {} + + def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + env = captured["env"] + assert env == {"A": "1", "B": "2", "C": "3", "D": "4"} + + @pytest.mark.asyncio + async def test_none_config_env_with_empty_role_env(self, _mock_trial): + """cfg.agent_env=None + empty role.env does not crash.""" + _mock_trial._config = _make_config(agent_env=None, role_env={}) + captured = {} + + 
def fake_resolve(agent, model, env): + captured["env"] = env + return env or {} + + with ( + patch("benchflow.trial.resolve_agent_env", side_effect=fake_resolve), + patch("benchflow.trial.connect_acp", new_callable=AsyncMock) as mock_conn, + ): + mock_conn.return_value = (AsyncMock(), AsyncMock(), "agent") + role = _mock_trial._config.scenes[0].roles[0] + await _mock_trial.connect_as(role) + + assert captured["env"] == {} From 871bd212bf885324c062d7c714baaebb61e0c655 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sat, 25 Apr 2026 05:30:03 -0400 Subject: [PATCH 19/25] Fix/openhands sandbox launch (#182) * rebase on upstream/0.3 * openhand cli add * enhance api key security * refine tests Co-authored-by: Copilot --------- Co-authored-by: Copilot Co-authored-by: Xiangyi Li --- src/benchflow/_acp_run.py | 6 +- src/benchflow/_agent_env.py | 98 ++++++++++++++++++- src/benchflow/_agent_setup.py | 20 +++- src/benchflow/_sandbox.py | 4 +- src/benchflow/agents/registry.py | 37 +++++-- src/benchflow/metrics.py | 2 +- tests/conftest.py | 5 + tests/test_acp.py | 28 ++++++ tests/test_agent_registry.py | 34 +++++++ tests/test_agent_setup.py | 38 ++++++++ tests/test_registry_invariants.py | 1 + tests/test_sdk_internals.py | 155 +++++++++++++++++++++++++++--- tests/test_sdk_lockdown.py | 2 +- tests/test_skill_eval.py | 2 +- tests/test_subscription_auth.py | 4 + tests/test_verify.py | 27 +++--- 16 files changed, 414 insertions(+), 49 deletions(-) create mode 100644 tests/test_agent_setup.py diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index 5602072..e134e5f 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -27,6 +27,7 @@ from benchflow.acp.client import ACPClient from benchflow.acp.container_transport import ContainerTransport from benchflow.agents.providers import strip_provider_prefix +from benchflow.agents.registry import AGENTS from benchflow.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess logger = 
logging.getLogger(__name__) @@ -120,13 +121,16 @@ async def connect_acp( await acp_client.close() raise - if model: + agent_cfg = AGENTS.get(agent) + if model and (agent_cfg is None or agent_cfg.supports_acp_set_model): acp_model_id = strip_provider_prefix(model) try: await asyncio.wait_for(acp_client.set_model(acp_model_id), timeout=60) logger.info(f"Model set to: {acp_model_id} (from {model})") except Exception as e: logger.warning(f"Failed to set model via ACP: {e}") + elif model: + logger.info(f"Skipping ACP set_model for {agent} — launch/env config owns model selection") return acp_client, session, agent_name diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 72d6498..e67c741 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -26,6 +26,32 @@ logger = logging.getLogger(__name__) +_AUTH_CONTEXT_GROUPS = ( + frozenset({"ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN"}), + frozenset({"GEMINI_API_KEY", "GOOGLE_API_KEY"}), +) +_EXPLICIT_AGENT_NATIVE_BRIDGE_KEYS = frozenset({"LLM_API_KEY"}) + + +def _normalize_openhands_model(model: str) -> str: + """Translate benchflow model IDs to OpenHands/LiteLLM model IDs. + + OpenHands expects provider-qualified model names for some providers even + when benchflow uses bare model IDs or its own provider prefixes. 
+ """ + from benchflow.agents.providers import strip_provider_prefix + from benchflow.agents.registry import is_vertex_model + + if model.startswith(("gemini/", "vertex_ai/", "openhands/")): + return model + stripped = strip_provider_prefix(model) + lower = model.lower() + if is_vertex_model(model) and "gemini" in lower: + return f"vertex_ai/{stripped}" + if "gemini" in lower: + return f"gemini/{stripped}" + return stripped + def auto_inherit_env(agent_env: dict[str, str]) -> None: """Copy well-known API keys from host os.environ into agent_env.""" @@ -118,11 +144,22 @@ def resolve_provider_env( _key = agent_env.get(_prov_cfg.auth_env, "") if _key: agent_env.setdefault("BENCHFLOW_PROVIDER_API_KEY", _key) + else: + # No registered provider prefix — bridge the model's well-known API key + # to BENCHFLOW_PROVIDER_API_KEY so env_mapping can translate it to + # agent-native vars (e.g. GEMINI_API_KEY → LLM_API_KEY for openhands). + from benchflow.agents.registry import infer_env_key_for_model + + _inferred = infer_env_key_for_model(model) + if _inferred and _inferred in agent_env: + agent_env.setdefault("BENCHFLOW_PROVIDER_API_KEY", agent_env[_inferred]) # Apply agent env_mapping: translate BENCHFLOW_PROVIDER_* → agent-native vars if agent_cfg and agent_cfg.env_mapping: for src, dst in agent_cfg.env_mapping.items(): if src in agent_env: agent_env.setdefault(dst, agent_env[src]) + if agent == "openhands": + agent_env.setdefault("LLM_MODEL", _normalize_openhands_model(model)) def check_subscription_auth(agent: str, required_key: str) -> bool: @@ -136,6 +173,18 @@ def check_subscription_auth(agent: str, required_key: str) -> bool: return Path(sa.detect_file).expanduser().is_file() +def _shares_auth_context(required_key: str | None, candidate_key: str | None) -> bool: + """True when both keys represent the same provider auth context.""" + if not required_key or not candidate_key: + return False + if required_key == candidate_key: + return True + return any( + required_key in 
group and candidate_key in group + for group in _AUTH_CONTEXT_GROUPS + ) + + def resolve_agent_env( agent: str, model: str | None, @@ -143,19 +192,61 @@ def resolve_agent_env( ) -> dict[str, str]: """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) + explicit_agent_env_keys = set(agent_env) auto_inherit_env(agent_env) + pre_provider_env = dict(agent_env) + agent_cfg = AGENTS.get(agent) # Oracle runs solve.sh and never calls an LLM — model env vars and # API-key validation are skipped even if a caller forwards a model. if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) + if agent_cfg and agent_cfg.env_mapping: + for src, dst in agent_cfg.env_mapping.items(): + if src in agent_env and dst not in explicit_agent_env_keys: + # Provider resolution must override unrelated host-native + # vars auto-inherited from the environment, but preserve + # explicit agent_env overrides supplied by the caller. 
+ agent_env[dst] = agent_env[src] # Validate required API key for the chosen model from benchflow.agents.registry import infer_env_key_for_model required_key = infer_env_key_for_model(model) - # CLAUDE_CODE_OAUTH_TOKEN is an alternative auth path for Claude agents - has_oauth = "CLAUDE_CODE_OAUTH_TOKEN" in agent_env or "ANTHROPIC_AUTH_TOKEN" in agent_env - if required_key and required_key not in agent_env and not has_oauth: + mapped_provider_key = ( + agent_cfg.env_mapping.get("BENCHFLOW_PROVIDER_API_KEY") + if agent_cfg + else None + ) + has_agent_native_bridge_key = bool( + mapped_provider_key + and pre_provider_env.get(mapped_provider_key) + and ( + _shares_auth_context(required_key, mapped_provider_key) + or ( + mapped_provider_key in _EXPLICIT_AGENT_NATIVE_BRIDGE_KEYS + and mapped_provider_key in explicit_agent_env_keys + ) + ) + ) + if has_agent_native_bridge_key: + # Only pre-existing same-provider aliases or explicit generic bridge + # keys can satisfy provider auth. Values synthesized by env_mapping + # or inherited from another provider context must not bypass the + # model's required credential. 
+ agent_env.setdefault( + "BENCHFLOW_PROVIDER_API_KEY", + pre_provider_env[mapped_provider_key], + ) + has_oauth = any( + key in agent_env and _shares_auth_context(required_key, key) + for key in ("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_AUTH_TOKEN") + ) + if ( + required_key + and required_key not in agent_env + and not has_oauth + and not has_agent_native_bridge_key + ): if check_subscription_auth(agent, required_key): agent_env["_BENCHFLOW_SUBSCRIPTION_AUTH"] = "1" logger.info( @@ -183,7 +274,6 @@ def resolve_agent_env( ) else: # No model specified — still check subscription auth for required env vars - agent_cfg = AGENTS.get(agent) if agent_cfg: for req_key in agent_cfg.requires_env: if req_key not in agent_env and check_subscription_auth(agent, req_key): diff --git a/src/benchflow/_agent_setup.py b/src/benchflow/_agent_setup.py index 39b1720..0074d94 100644 --- a/src/benchflow/_agent_setup.py +++ b/src/benchflow/_agent_setup.py @@ -36,15 +36,29 @@ async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: agent_cfg = AGENTS.get(agent_base) if agent_base not in AGENT_INSTALLERS: return agent_cfg + install_cmd = AGENT_INSTALLERS[agent_base] install_timeout = agent_cfg.install_timeout if agent_cfg else 900 logger.info(f"Installing {agent_base} in sandbox (timeout={install_timeout}s)...") install_result = await env.exec( - AGENT_INSTALLERS[agent_base], + install_cmd, timeout_sec=install_timeout, ) install_log = trial_dir / "agent" / "install-stdout.txt" install_log.parent.mkdir(parents=True, exist_ok=True) - install_log.write_text(install_result.stdout or "") + stdout = install_result.stdout or "" + stderr = install_result.stderr or "" + parts = [f"$ {install_cmd}\n"] + if stdout: + parts.append("=== stdout ===\n") + parts.append(stdout) + if not stdout.endswith("\n"): + parts.append("\n") + if stderr: + parts.append("=== stderr ===\n") + parts.append(stderr) + if not stderr.endswith("\n"): + parts.append("\n") + 
install_log.write_text("".join(parts)) if install_result.return_code != 0: diag = await env.exec( "echo 'OS:' && cat /etc/os-release 2>/dev/null | head -2; " @@ -55,7 +69,7 @@ async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: raise AgentInstallError( agent=agent_base, return_code=install_result.return_code, - stdout=install_result.stdout or "", + stdout="".join(parts), diagnostics=diag.stdout or "", log_path=str(install_log), ) diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index cef2f42..130a0c3 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -87,9 +87,7 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: setpriv (util-linux) execs directly; su -l is the fallback for Alpine/BusyBox. No outer sh -c wrapper — DockerProcess wraps in bash -c already. """ - inner = ( - f"export HOME=/home/{sandbox_user} && {agent_launch}" - ) + inner = f"export HOME=/home/{sandbox_user} && {agent_launch}" quoted = shlex.quote(inner) return ( f"if setpriv --help 2>&1 | grep -q reuid; then" diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index 1326726..9b977de 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -160,6 +160,9 @@ class AgentConfig: subscription_auth: SubscriptionAuth | None = None # Host CLI login that can substitute for an API key (e.g. OAuth tokens # from `claude login`). Detected automatically; API keys take precedence. + supports_acp_set_model: bool = True + # Some ACP agents configure the model through env/config at launch time and + # do not implement session/set_model (e.g. OpenHands CLI ACP). 
# Agent registry — all supported agents @@ -311,23 +314,35 @@ class AgentConfig: "openhands": AgentConfig( name="openhands", description="OpenHands agent via ACP (multi-model, Python-based)", - skill_paths=[], + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/.agents/skills"], home_dirs=[".openhands"], install_cmd=( "export DEBIAN_FRONTEND=noninteractive && " "export PATH=\"$HOME/.local/bin:$PATH\" && " + "( command -v curl >/dev/null 2>&1 || " + " ( apt-get update -qq && " + " apt-get install -y -qq curl ca-certificates >/dev/null 2>&1 ) ) && " "( command -v openhands >/dev/null 2>&1 || ( " - " ( command -v uv >/dev/null 2>&1 || ( " - " ( command -v curl >/dev/null 2>&1 || " - " (apt-get update -qq && apt-get install -y -qq curl ca-certificates >/dev/null 2>&1) ) && " - " curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 " - " ) ) && " - " uv tool install openhands --python 3.12 >/dev/null 2>&1 " + " UV_OK=0; " + " if command -v uv >/dev/null 2>&1; then " + " UV_VER=$(uv --version 2>/dev/null | awk '{print $2}'); " + " if [ -n \"$UV_VER\" ] && " + " [ \"$(printf '%s\\n' 0.11.6 \"$UV_VER\" | sort -V | head -n1)\" = \"0.11.6\" ]; then " + " UV_OK=1; " + " fi; " + " fi; " + " if [ \"$UV_OK\" = 0 ]; then " + " curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && " + " export PATH=\"$HOME/.local/bin:$PATH\"; " + " fi && " + " ( uv tool list 2>/dev/null | grep -q '^openhands\\b' || " + " uv tool install openhands --python 3.12 >/dev/null 2>&1 || " + " curl -fsSL https://install.openhands.dev/install.sh | sh >/dev/null 2>&1 ) " ") ) && " - # Let sandbox user traverse to the uv-managed Python interpreter + # Let sandbox user traverse to uv-managed Python interpreter path. "chmod o+x /root /root/.local /root/.local/share " "/root/.local/share/uv /root/.local/share/uv/tools 2>/dev/null; " - # Seed agent_settings.json so _is_authenticated() passes + # Seed config so OpenHands ACP auth check passes before env override. 
"mkdir -p ~/.openhands && " "echo '{\"llm\":{\"model\":\"placeholder\",\"api_key\":\"placeholder\"}}' " "> ~/.openhands/agent_settings.json && " @@ -346,8 +361,8 @@ class AgentConfig: env_mapping={ "BENCHFLOW_PROVIDER_BASE_URL": "LLM_BASE_URL", "BENCHFLOW_PROVIDER_API_KEY": "LLM_API_KEY", - "BENCHFLOW_PROVIDER_MODEL": "LLM_MODEL", }, + supports_acp_set_model=False, ), } @@ -519,6 +534,7 @@ def register_agent( credential_files: list[CredentialFile] | None = None, home_dirs: list[str] | None = None, subscription_auth: SubscriptionAuth | None = None, + supports_acp_set_model: bool = True, ) -> AgentConfig: """Register a custom agent at runtime. @@ -548,6 +564,7 @@ def register_agent( credential_files=credential_files or [], home_dirs=home_dirs or [], subscription_auth=subscription_auth, + supports_acp_set_model=supports_acp_set_model, ) AGENTS[name] = config AGENT_INSTALLERS[name] = install_cmd diff --git a/src/benchflow/metrics.py b/src/benchflow/metrics.py index 43bd445..413eb7f 100644 --- a/src/benchflow/metrics.py +++ b/src/benchflow/metrics.py @@ -180,7 +180,7 @@ def collect_metrics( results_dir = Path(results_dir) best: dict[str, dict] = {} - for rfile in results_dir.rglob("result.json"): + for rfile in sorted(results_dir.rglob("result.json")): try: r = json.loads(rfile.read_text()) task = r["task_name"] diff --git a/tests/conftest.py b/tests/conftest.py index a324652..6f5ee12 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,17 @@ """Test fixtures.""" import json +import sys from datetime import datetime from pathlib import Path import pytest REPO_ROOT = Path(__file__).parent.parent +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + REF_TASKS = REPO_ROOT / ".ref" / "harbor" / "examples" / "tasks" diff --git a/tests/test_acp.py b/tests/test_acp.py index e0f4729..cea79c5 100644 --- a/tests/test_acp.py +++ b/tests/test_acp.py @@ -347,3 +347,31 @@ async def test_model_id_selection(self, model_in, 
expected_model, tmp_path): ) mock_acp.set_model.assert_awaited_once_with(expected_model) + + @pytest.mark.asyncio + async def test_openhands_skips_set_model(self, tmp_path): + from benchflow._acp_run import connect_acp + + mock_acp = self._make_mocks() + mock_env = AsyncMock() + with ( + patch( + "benchflow._acp_run.DockerProcess.from_harbor_env", + return_value=MagicMock(), + ), + patch("benchflow._acp_run.ContainerTransport", return_value=MagicMock()), + patch("benchflow._acp_run.ACPClient", return_value=mock_acp), + ): + await connect_acp( + env=mock_env, + agent="openhands", + agent_launch="openhands acp --always-approve --override-with-envs", + agent_env={}, + sandbox_user=None, + model="gemini-3.1-flash-lite-preview", + trial_dir=tmp_path, + environment="docker", + agent_cwd="/app", + ) + + mock_acp.set_model.assert_not_awaited() diff --git a/tests/test_agent_registry.py b/tests/test_agent_registry.py index 4fe5904..ab06cf1 100644 --- a/tests/test_agent_registry.py +++ b/tests/test_agent_registry.py @@ -5,6 +5,7 @@ test_registry_invariants.py — search there for the consolidated tripwire. """ +from benchflow._agent_env import resolve_provider_env from benchflow.agents.providers import PROVIDERS from benchflow.agents.registry import AGENTS @@ -33,6 +34,39 @@ def test_gemini_has_mapping(self): assert cfg.env_mapping["BENCHFLOW_PROVIDER_BASE_URL"] == "GEMINI_API_BASE_URL" assert cfg.env_mapping["BENCHFLOW_PROVIDER_API_KEY"] == "GOOGLE_API_KEY" + def test_openhands_has_mapping(self): + cfg = AGENTS["openhands"] + assert cfg.env_mapping["BENCHFLOW_PROVIDER_BASE_URL"] == "LLM_BASE_URL" + assert cfg.env_mapping["BENCHFLOW_PROVIDER_API_KEY"] == "LLM_API_KEY" + # OpenHands model is normalized in _normalize_openhands_model(). 
+ assert "BENCHFLOW_PROVIDER_MODEL" not in cfg.env_mapping + + def test_openhands_normalizes_model(self): + env = {} + resolve_provider_env( + agent="openhands", + model="zai/glm-5", + agent_env=env, + ) + + assert env["LLM_MODEL"] == "glm-5" + +class TestOpenHandsConfig: + def test_openhands_uses_agentskills_paths(self): + cfg = AGENTS["openhands"] + assert "$HOME/.agents/skills" in cfg.skill_paths + assert "$WORKSPACE/.agents/skills" in cfg.skill_paths + + def test_openhands_install_cmd_has_uv_and_binary_fallbacks(self): + cfg = AGENTS["openhands"] + assert "apt-get install -y -qq curl ca-certificates" in cfg.install_cmd + assert "uv tool install openhands --python 3.12" in cfg.install_cmd + assert "install.openhands.dev/install.sh" in cfg.install_cmd + + def test_openhands_skips_acp_set_model(self): + cfg = AGENTS["openhands"] + assert cfg.supports_acp_set_model is False + class TestAgentCredentialFiles: def test_codex_has_auth_json(self): diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py new file mode 100644 index 0000000..ce00d6b --- /dev/null +++ b/tests/test_agent_setup.py @@ -0,0 +1,38 @@ +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest + +from benchflow._agent_setup import install_agent +from benchflow.models import AgentInstallError + + +@pytest.mark.asyncio +async def test_install_agent_writes_command_stdout_and_stderr_on_failure(tmp_path: Path): + env = SimpleNamespace() + env.exec = AsyncMock( + side_effect=[ + SimpleNamespace(return_code=1, stdout="", stderr="uv: command not found\n"), + SimpleNamespace( + stdout="OS:\nID=ubuntu\nNode:\nv22.0.0\nAgent:\nnot found\n", + stderr="", + return_code=0, + ), + ] + ) + + with pytest.raises(AgentInstallError) as exc_info: + await install_agent(env, "openhands", tmp_path) + + err = exc_info.value + log_path = Path(err.log_path) + assert log_path == tmp_path / "agent" / "install-stdout.txt" + assert log_path.exists() + log_text = 
log_path.read_text() + assert log_text.startswith("$ ") + assert "uv tool install openhands --python 3.12" in log_text + assert "=== stderr ===" in log_text + assert "uv: command not found" in log_text + assert err.stdout == log_text + assert "ID=ubuntu" in err.diagnostics diff --git a/tests/test_registry_invariants.py b/tests/test_registry_invariants.py index 023945b..5b708d3 100644 --- a/tests/test_registry_invariants.py +++ b/tests/test_registry_invariants.py @@ -49,6 +49,7 @@ def test_agent_field_shapes(name, cfg): f"api_protocol={cfg.api_protocol!r} not in {VALID_API_PROTOCOLS}" ) assert isinstance(cfg.install_timeout, int) and cfg.install_timeout > 0 + assert isinstance(cfg.supports_acp_set_model, bool) @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) diff --git a/tests/test_sdk_internals.py b/tests/test_sdk_internals.py index fe000cf..f5abd3a 100644 --- a/tests/test_sdk_internals.py +++ b/tests/test_sdk_internals.py @@ -21,8 +21,26 @@ def _resolve(self, agent="claude-agent-acp", model=None, agent_env=None): return resolve_agent_env(agent, model, agent_env) - def test_env_mapping_applied_after_provider(self): + def _patch_expanduser(self, monkeypatch, tmp_path): + orig_expanduser = Path.expanduser + + def fake_expanduser(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig_expanduser(self) + + monkeypatch.setattr(Path, "expanduser", fake_expanduser) + + def test_env_mapping_applied_after_provider(self, monkeypatch): """env_mapping translates BENCHFLOW_PROVIDER_* → agent-native vars.""" + for key in ( + "ZAI_API_KEY", + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + ): + monkeypatch.delenv(key, raising=False) result = self._resolve( agent="claude-agent-acp", model="zai/glm-5", @@ -33,21 +51,136 @@ def test_env_mapping_applied_after_provider(self): assert "ANTHROPIC_AUTH_TOKEN" in result assert result["ANTHROPIC_AUTH_TOKEN"] == "zk-test" + def 
test_agent_native_api_key_satisfies_model_check(self, monkeypatch): + """Agent-native mapped key (LLM_API_KEY) can satisfy provider auth check.""" + for key in ("OPENAI_API_KEY", "LLM_API_KEY"): + monkeypatch.delenv(key, raising=False) + result = self._resolve( + agent="openhands", + model="openai/gpt-4.1-mini", + agent_env={"LLM_API_KEY": "test-llm-key"}, + ) + assert result["LLM_API_KEY"] == "test-llm-key" + assert result["BENCHFLOW_PROVIDER_API_KEY"] == "test-llm-key" + + def test_same_provider_native_alias_satisfies_model_check(self, monkeypatch): + """Provider-native aliases remain valid for the same auth context.""" + for key in ("GEMINI_API_KEY", "GOOGLE_API_KEY"): + monkeypatch.delenv(key, raising=False) + result = self._resolve( + agent="gemini", + model="gemini-2.5-flash", + agent_env={"GOOGLE_API_KEY": "test-google-key"}, + ) + assert result["GOOGLE_API_KEY"] == "test-google-key" + assert result["BENCHFLOW_PROVIDER_API_KEY"] == "test-google-key" + + @pytest.mark.parametrize( + ("agent", "host_key"), + [ + pytest.param("codex-acp", "OPENAI_API_KEY", id="codex-openai-key"), + pytest.param( + "claude-agent-acp", + "ANTHROPIC_AUTH_TOKEN", + id="claude-auth-token", + ), + pytest.param("gemini", "GOOGLE_API_KEY", id="gemini-google-key"), + ], + ) + def test_cross_provider_host_native_key_does_not_bypass_required_key( + self, monkeypatch, tmp_path, agent, host_key + ): + """Host-native keys for another provider must not satisfy zai auth.""" + for key in ( + "ZAI_API_KEY", + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(key, raising=False) + monkeypatch.setenv(host_key, "host-native-key") + self._patch_expanduser(monkeypatch, tmp_path) + + with pytest.raises(ValueError, match="ZAI_API_KEY required"): + self._resolve(agent=agent, model="zai/glm-5", agent_env={}) + + def test_auto_inherited_generic_bridge_key_does_not_bypass_required_key( + self, 
monkeypatch, tmp_path + ): + """Generic agent-native keys must be passed explicitly to bridge auth.""" + for key in ("ZAI_API_KEY", "LLM_API_KEY"): + monkeypatch.delenv(key, raising=False) + monkeypatch.setenv("LLM_API_KEY", "host-llm-key") + self._patch_expanduser(monkeypatch, tmp_path) + + with pytest.raises(ValueError, match="ZAI_API_KEY required"): + self._resolve(agent="openhands", model="zai/glm-5", agent_env={}) + + def test_openhands_gemini_model_is_prefixed_for_google_ai_studio(self, monkeypatch): + """OpenHands expects Gemini models in gemini/ format.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) + result = self._resolve( + agent="openhands", + model="gemini-3.1-flash-lite-preview", + agent_env={"GEMINI_API_KEY": "test-gemini-key"}, + ) + assert result["LLM_MODEL"] == "gemini/gemini-3.1-flash-lite-preview" + assert result["LLM_API_KEY"] == "test-gemini-key" + + def test_openhands_explicit_llm_model_is_preserved(self, monkeypatch): + """User-provided LLM_MODEL must win over derived normalization.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) + result = self._resolve( + agent="openhands", + model="gemini-3.1-flash-lite-preview", + agent_env={ + "GEMINI_API_KEY": "test-gemini-key", + "LLM_MODEL": "litellm/custom-format", + }, + ) + assert result["LLM_MODEL"] == "litellm/custom-format" + + def test_openhands_vertex_model_is_prefixed_for_vertex(self, monkeypatch, tmp_path): + """OpenHands expects Vertex Gemini models in vertex_ai/ format.""" + adc_dir = tmp_path / ".config" / "gcloud" + adc_dir.mkdir(parents=True) + (adc_dir / "application_default_credentials.json").write_text("{}") + monkeypatch.setattr("pathlib.Path.home", staticmethod(lambda: tmp_path)) + result = self._resolve( + agent="openhands", + model="google-vertex/gemini-2.5-flash", + agent_env={"GOOGLE_CLOUD_PROJECT": "my-proj"}, + ) + assert result["LLM_MODEL"] == "vertex_ai/gemini-2.5-flash" + + def test_provider_bridge_key_alone_does_not_bypass_required_model_key( + self, 
monkeypatch + ): + """Only mapped agent-native keys can bypass provider-specific key checks.""" + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + with pytest.raises(ValueError, match="OPENAI_API_KEY required"): + self._resolve( + agent="openclaw", + model="openai/gpt-4.1-mini", + agent_env={"BENCHFLOW_PROVIDER_API_KEY": "x"}, + ) + def test_required_key_missing_raises(self, monkeypatch, tmp_path): """Missing required API key raises ValueError when no subscription auth.""" # Clear any auto-inherited keys from the environment - for key in ("ANTHROPIC_API_KEY", "ZAI_API_KEY", "OPENAI_API_KEY"): + for key in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "ZAI_API_KEY", + "OPENAI_API_KEY", + ): monkeypatch.delenv(key, raising=False) # Ensure no host subscription auth files are found - orig_expanduser = Path.expanduser - - def fake_expanduser(self): - s = str(self) - if s.startswith("~"): - return tmp_path / s[2:] - return orig_expanduser(self) - - monkeypatch.setattr(Path, "expanduser", fake_expanduser) + self._patch_expanduser(monkeypatch, tmp_path) # Anthropic model with pytest.raises(ValueError, match="ANTHROPIC_API_KEY required"): self._resolve( diff --git a/tests/test_sdk_lockdown.py b/tests/test_sdk_lockdown.py index 91cb7d4..0d4f92c 100644 --- a/tests/test_sdk_lockdown.py +++ b/tests/test_sdk_lockdown.py @@ -261,7 +261,7 @@ def test_inner_command_is_shlex_quoted(self): import shlex cmd = build_priv_drop_cmd("agent --flag value", "agent") - inner = "export HOME=/home/agent && cd /home/agent && agent --flag value" + inner = "export HOME=/home/agent && agent --flag value" assert shlex.quote(inner) in cmd def test_single_quotes_in_launch(self): diff --git a/tests/test_skill_eval.py b/tests/test_skill_eval.py index 0c8c3ed..94518d1 100644 --- a/tests/test_skill_eval.py +++ b/tests/test_skill_eval.py @@ -111,7 +111,7 @@ def test_loads_valid_dataset(self, skill_dir): ds = load_eval_dataset(skill_dir) assert ds.skill_name == 
"calculator" assert len(ds.cases) == 2 - assert ds.judge_model == "gemini-3.1-flash-lite" + assert ds.judge_model == "claude-haiku-4-5-20251001" assert ds.timeout_sec == 120 def test_parses_cases(self, skill_dir): diff --git a/tests/test_subscription_auth.py b/tests/test_subscription_auth.py index 7f322b9..39ff056 100644 --- a/tests/test_subscription_auth.py +++ b/tests/test_subscription_auth.py @@ -98,6 +98,8 @@ def test_subscription_auth_detected(self, monkeypatch, tmp_path): """When host auth file exists and no API key, subscription auth is used.""" for k in ( "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", @@ -118,6 +120,8 @@ def test_no_auth_file_raises(self, monkeypatch, tmp_path): """When no API key and no host auth file, raises ValueError.""" for k in ( "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", diff --git a/tests/test_verify.py b/tests/test_verify.py index c3a790f..93f4a51 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -476,9 +476,9 @@ def sdk_run_mocks(self, tmp_path): def _patch_sdk_run(self, sdk, mock_env, extra_patches): """Apply shared + extra patches for SDK.run() internals.""" patches = [ - patch("benchflow.sdk._create_environment", return_value=mock_env), + patch("benchflow.trial._create_environment", return_value=mock_env), patch( - "benchflow.sdk.install_agent", + "benchflow.trial.install_agent", new_callable=AsyncMock, return_value=MagicMock( credential_files={}, @@ -487,8 +487,8 @@ def _patch_sdk_run(self, sdk, mock_env, extra_patches): env_mapping={}, ), ), - patch("benchflow.sdk.write_credential_files", new_callable=AsyncMock), - patch("benchflow.sdk.deploy_skills", new_callable=AsyncMock), + patch("benchflow.trial.write_credential_files", new_callable=AsyncMock), + patch("benchflow.trial.deploy_skills", new_callable=AsyncMock), *extra_patches, ] with 
contextlib.ExitStack() as stack: @@ -516,23 +516,22 @@ async def test_scraped_trajectory_preserves_n_tool_calls( mock_env, [ patch( - "benchflow.sdk.connect_acp", + "benchflow.trial.connect_acp", new_callable=AsyncMock, return_value=(mock_acp, mock_session, "test-agent"), ), patch( - "benchflow.sdk.execute_prompts", + "benchflow.trial.execute_prompts", new_callable=AsyncMock, return_value=([], 5), ), patch( - "benchflow.sdk._scrape_agent_trajectory", + "benchflow.trial._scrape_agent_trajectory", new_callable=AsyncMock, return_value=forged, ), - patch.object( - sdk, - "_verify", + patch( + "benchflow.sdk.SDK._verify", new_callable=AsyncMock, return_value=({"reward": 1.0}, None), ), @@ -568,21 +567,21 @@ async def test_partial_acp_uses_session_tool_calls(self, sdk_run_mocks): mock_env, [ patch( - "benchflow.sdk.connect_acp", + "benchflow.trial.connect_acp", new_callable=AsyncMock, return_value=(mock_acp, mock_session, "test-agent"), ), patch( - "benchflow.sdk.execute_prompts", + "benchflow.trial.execute_prompts", new_callable=AsyncMock, side_effect=ConnectionError("lost"), ), patch( - "benchflow.sdk._capture_session_trajectory", + "benchflow.trial._capture_session_trajectory", return_value=partial_events, ), patch( - "benchflow.sdk._scrape_agent_trajectory", + "benchflow.trial._scrape_agent_trajectory", new_callable=AsyncMock, return_value=[], ), From d4d61ae1b1d2757343ca209787c4058c857517f9 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 05:36:39 -0400 Subject: [PATCH 20/25] docs: use uv tool install instead of pip install (#176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). 
This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. * fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. * fix: oracle agent — chokepoint guard, drop orphan eval CLI, helper PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. * test: regression suite pinning oracle chokepoint + orphan removal Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. 
- TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". * fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). * docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI * docs: use `uv tool install` instead of `pip install` benchflow is a CLI tool with entry points — uv tool install gives users an isolated environment (like pipx) without managing venvs manually. 
--------- Co-authored-by: Yifeng He --- .claude/skills/benchflow/SKILL.md | 4 ++-- .../tasks/benchflow-knowledge/environment/benchflow/SKILL.md | 4 ++-- .../tasks/create-simple-task/environment/benchflow/SKILL.md | 4 ++-- README.md | 4 ++-- docs/api-reference.md | 2 +- docs/quickstart.md | 4 ++-- docs/skill-eval-guide.md | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.claude/skills/benchflow/SKILL.md b/.claude/skills/benchflow/SKILL.md index 1f21b5f..c740791 100644 --- a/.claude/skills/benchflow/SKILL.md +++ b/.claude/skills/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -199,7 +199,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md b/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md index 7d3817b..57af0d8 100644 --- a/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md +++ b/.claude/skills/benchflow/tasks/benchflow-knowledge/environment/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. 
Show recent job results if any exist in `jobs/` @@ -182,7 +182,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md b/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md index 7d3817b..57af0d8 100644 --- a/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md +++ b/.claude/skills/benchflow/tasks/create-simple-task/environment/benchflow/SKILL.md @@ -23,7 +23,7 @@ Arguments passed: `$ARGUMENTS` ### No args or `status` — show current state -1. Check if benchflow is installed: `pip show benchflow` +1. Check if benchflow is installed: `uv tool list | grep benchflow` 2. Check if `.env` exists with API keys 3. Check available agents: `benchflow agents` 4. Show recent job results if any exist in `jobs/` @@ -182,7 +182,7 @@ asyncio.run(main()) ## Setup ```bash -pip install benchflow # or: pip install -e . (from source) +uv tool install benchflow # or: uv tool install -e . (from source) source .env # ANTHROPIC_API_KEY, DAYTONA_API_KEY ``` diff --git a/README.md b/README.md index 4c38823..e56d455 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It s ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` -Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`. +Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). For cloud sandboxes, set `DAYTONA_API_KEY`. ## Quick Start diff --git a/docs/api-reference.md b/docs/api-reference.md index 5248459..786f0c8 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -5,7 +5,7 @@ The Trial/Scene API is the primary way to run agent benchmarks programmatically. 
## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Quick Start diff --git a/docs/quickstart.md b/docs/quickstart.md index e087ea7..34fc2dc 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -4,14 +4,14 @@ Get a benchmark result in under 5 minutes. ## Prerequisites -- Python 3.12+ +- Python 3.12+ and [uv](https://docs.astral.sh/uv/) - A Daytona API key (`DAYTONA_API_KEY`) for cloud sandboxes - An agent API key (e.g. `GEMINI_API_KEY` for Gemini) ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Run your first evaluation diff --git a/docs/skill-eval-guide.md b/docs/skill-eval-guide.md index f0a04c6..1268222 100644 --- a/docs/skill-eval-guide.md +++ b/docs/skill-eval-guide.md @@ -5,7 +5,7 @@ Test whether your agent skill actually helps agents perform better. ## Install ```bash -pip install benchflow==0.3.0a3 +uv tool install benchflow ``` ## Overview @@ -382,7 +382,7 @@ BenchFlow generates everything ephemeral — only results persist. **CI integration:** ```bash # In your skill's CI pipeline -pip install benchflow==0.3.0a3 +uv tool install benchflow bench skills eval . -a claude-agent-acp --no-baseline # Exit code 1 if any case scores < 0.5 ``` From 1fccf70f1e14c6349a3a4ad6ce4c7cc4c2c2b01b Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Sat, 25 Apr 2026 02:36:42 -0700 Subject: [PATCH 21/25] feat: wire sandbox_setup_timeout through all configs (#180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. 
* fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. * fix: oracle agent — chokepoint guard, drop orphan eval CLI, helper PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. * test: regression suite pinning oracle chokepoint + orphan removal Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. - TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. 
- TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". * fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). * docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI * test: cover sandbox setup timeout wiring * docs: document sandbox setup timeout * feat: wire sandbox setup timeout through configs `setup_sandbox_user()` already accepted a `timeout_sec` kwarg (default 120s) but no live call site surfaced it — the knob was unreachable for normal runs. Under heavy sandbox bootstrap (parallel containers copying large tool caches into /home/) the 120s cap was hit with no user override. 
Add `sandbox_setup_timeout: int = 120` to TrialConfig, JobConfig, and RuntimeConfig, and forward it through: - trial YAML (`trial_config_from_dict`) - job YAML (both native and Harbor-compatible loaders) - `SDK.run(..., sandbox_setup_timeout=...)` - `bench eval create --sandbox-setup-timeout` - `Trial.install_agent()` into both `setup_sandbox_user()` call sites (oracle + normal agent) The value is also recorded in the run's `config.json` snapshot to aid post-hoc diagnosis. Default stays at 120s — this change is about making the value configurable, not changing runtime behavior. --------- Co-authored-by: Xiangyi Li Co-authored-by: Xiangyi Li --- docs/api-reference.md | 29 +++++++- docs/cli-reference.md | 7 +- docs/quickstart.md | 17 ++++- src/benchflow/cli/main.py | 9 +++ src/benchflow/job.py | 7 ++ src/benchflow/runtime.py | 3 + src/benchflow/sdk.py | 4 + src/benchflow/trial.py | 12 ++- src/benchflow/trial_yaml.py | 1 + tests/test_runtime.py | 4 +- tests/test_sdk_internals.py | 39 ++++++++++ tests/test_trial_install_agent_timeout.py | 91 +++++++++++++++++++++++ tests/test_yaml_config.py | 6 ++ 13 files changed, 222 insertions(+), 7 deletions(-) create mode 100644 tests/test_trial_install_agent_timeout.py diff --git a/docs/api-reference.md b/docs/api-reference.md index 786f0c8..18fd862 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -34,6 +34,7 @@ config = TrialConfig( task_path=Path("tasks/my-task"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=120, ) # Multi-scene BYOS (skill-gen → solve) @@ -46,9 +47,13 @@ config = TrialConfig( turns=[Turn("solver")]), ], environment="daytona", + sandbox_setup_timeout=120, ) ``` +Set `sandbox_setup_timeout` when sandbox user setup needs more than the default 120 seconds. +The same field is also available on `JobConfig` and `RuntimeConfig`. + ### Scene One interaction region — roles take turns executing prompts. 
@@ -98,6 +103,20 @@ await trial.verify() await trial.cleanup() ``` +### RuntimeConfig + +Runtime-level configuration for the `Agent + Environment` execution path. + +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +config = RuntimeConfig(sandbox_setup_timeout=300) +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/X", backend="daytona") +runtime = Runtime(env, agent, config=config) +result = await runtime.execute() +``` + ### bf.run() Convenience function — multiple calling conventions: @@ -111,10 +130,16 @@ result = await bf.run(config) # 2. Agent + Environment (0.3 style) agent = bf.Agent("gemini", model="gemini-3.1-flash-lite-preview") env = bf.Environment.from_task("tasks/X", backend="daytona") -result = await bf.run(agent, env) +runtime_config = bf.RuntimeConfig(sandbox_setup_timeout=300) +result = await bf.run(agent, env, runtime_config) # 3. String shortcut (simplest) -result = await bf.run("gemini", task_path="tasks/X", model="gemini-3.1-flash-lite-preview") +result = await bf.run( + "gemini", + task_path="tasks/X", + model="gemini-3.1-flash-lite-preview", + config=bf.RuntimeConfig(sandbox_setup_timeout=300), +) ``` ## Trial Lifecycle diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 86c702e..4b38788 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -40,7 +40,8 @@ bench eval create \ -a gemini \ -m gemini-3.1-flash-lite-preview \ -e daytona \ - -c 64 + -c 64 \ + --sandbox-setup-timeout 300 ``` | Flag | Default | Description | @@ -53,6 +54,7 @@ bench eval create \ | `--concurrency`, `-c` | `4` | Max concurrent tasks (batch mode only) | | `--jobs-dir`, `-o` | `jobs` | Output directory | | `--sandbox-user` | `agent` | Sandbox user (null for root) | +| `--sandbox-setup-timeout` | `120` | Timeout in seconds for sandbox user setup | ### bench eval list @@ -145,6 +147,7 @@ bench environment list task_dir: .ref/terminal-bench-2 environment: daytona 
concurrency: 64 +sandbox_setup_timeout: 300 scenes: - name: solve @@ -165,6 +168,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ### Multi-scene (BYOS skill generation) @@ -173,6 +177,7 @@ max_retries: 2 task_dir: tasks/ environment: daytona concurrency: 10 +sandbox_setup_timeout: 300 scenes: - name: skill-gen diff --git a/docs/quickstart.md b/docs/quickstart.md index 34fc2dc..502fc94 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -26,7 +26,8 @@ bench eval create \ -t .ref/terminal-bench-2/regex-log \ -a gemini \ -m gemini-3.1-flash-lite-preview \ - -e daytona + -e daytona \ + --sandbox-setup-timeout 300 ``` BenchFlow will: @@ -52,6 +53,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ## Python API @@ -70,11 +72,23 @@ config = TrialConfig( task_path=Path("tasks/regex-log"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=300, ) trial = await Trial.create(config) result = await trial.run() ``` +If you are using the `Agent + Environment` path directly, pass the timeout through `RuntimeConfig`: + +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/regex-log", backend="daytona") +runtime = Runtime(env, agent, config=RuntimeConfig(sandbox_setup_timeout=300)) +result = await runtime.execute() +``` + ## Multi-agent (reviewer pattern) ```python @@ -95,6 +109,7 @@ config = TrialConfig( ]), ], environment="daytona", + sandbox_setup_timeout=300, ) result = await bf.run(config) ``` diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index e9acc23..9c9b7f2 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -740,6 +740,13 @@ def eval_create( str | None, 
typer.Option("--sandbox-user", help="Sandbox user (null for root)"), ] = "agent", + sandbox_setup_timeout: Annotated[ + int, + typer.Option( + "--sandbox-setup-timeout", + help="Timeout (seconds) for sandbox user setup inside the environment.", + ), + ] = 120, skills_dir: Annotated[ Path | None, typer.Option("--skills-dir", "-s", help="Skills directory to deploy"), @@ -767,6 +774,7 @@ def eval_create( skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, jobs_dir=jobs_dir, agent=agent, model=eff_model, @@ -796,6 +804,7 @@ async def _run(): environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, skills_dir=str(skills_dir) if skills_dir else None, ), ) diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 70dfb56..bd53a0b 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -166,6 +166,7 @@ class JobConfig: skills_dir: str | None = None sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + sandbox_setup_timeout: int = 120 context_root: str | None = None exclude_tasks: set[str] = field(default_factory=set) @@ -291,6 +292,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": exclude = set(raw.get("exclude", [])) sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( @@ -304,6 +306,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, exclude_tasks=exclude, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -350,6 +353,7 @@ def 
_from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) config = JobConfig( agent=agent_name, @@ -361,6 +365,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -428,6 +433,7 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) trial = await Trial.create(trial_config) @@ -447,6 +453,7 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py index 14a785e..69be399 100644 --- a/src/benchflow/runtime.py +++ b/src/benchflow/runtime.py @@ -142,6 +142,7 @@ class RuntimeConfig: """Configuration for a Runtime execution.""" sandbox_user: str | None = "agent" + sandbox_setup_timeout: int = 120 max_rounds: int = 10 snapshot_policy: str = "none" reward_stream: bool = True @@ -263,6 +264,7 @@ async def execute(self) -> RuntimeResult: environment=self.env.backend, sandbox_user=config.sandbox_user, sandbox_locked_paths=config.sandbox_locked_paths, + sandbox_setup_timeout=config.sandbox_setup_timeout, jobs_dir=config.jobs_dir, context_root=config.context_root, 
pre_agent_hooks=config.pre_agent_hooks, @@ -338,6 +340,7 @@ async def run( environment=env if isinstance(env, str) else "docker", sandbox_user=rc.sandbox_user, sandbox_locked_paths=rc.sandbox_locked_paths, + sandbox_setup_timeout=rc.sandbox_setup_timeout, jobs_dir=rc.jobs_dir, context_root=rc.context_root, pre_agent_hooks=rc.pre_agent_hooks, diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py index 28fa82d..b55a72f 100644 --- a/src/benchflow/sdk.py +++ b/src/benchflow/sdk.py @@ -222,6 +222,7 @@ def _write_config( sandbox_user: str | None, context_root: str | Path | None, sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, timeout: int, started_at: datetime, agent_env: dict[str, str], @@ -241,6 +242,7 @@ def _write_config( "skills_dir": str(skills_dir) if skills_dir else None, "sandbox_user": sandbox_user, "sandbox_locked_paths": sandbox_locked_paths, + "sandbox_setup_timeout": sandbox_setup_timeout, "context_root": str(context_root) if context_root else None, "timeout_sec": timeout, "started_at": str(started_at), @@ -442,6 +444,7 @@ async def run( skills_dir: str | Path | None = None, sandbox_user: str | None = "agent", sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, pre_agent_hooks: list | None = None, context_root: str | Path | None = None, ) -> RunResult: @@ -490,6 +493,7 @@ async def run( skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, pre_agent_hooks=pre_agent_hooks, context_root=context_root, ) diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index fd70e5d..7449fc9 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -132,6 +132,7 @@ class TrialConfig: environment: str = "docker" sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + sandbox_setup_timeout: int = 120 services: list[str] | None = None job_name: str | None = None trial_name: str | 
None = None @@ -329,6 +330,7 @@ async def setup(self) -> None: sandbox_user=cfg.sandbox_user, context_root=cfg.context_root, sandbox_locked_paths=self._effective_locked, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, timeout=self._timeout, started_at=self._started_at, agent_env=self._agent_env, @@ -368,7 +370,10 @@ async def install_agent(self) -> None: if cfg.primary_agent == "oracle": if cfg.sandbox_user: await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) @@ -390,7 +395,10 @@ async def install_agent(self) -> None: if cfg.sandbox_user: self._agent_cwd = await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) diff --git a/src/benchflow/trial_yaml.py b/src/benchflow/trial_yaml.py index 5b8ca4a..707978b 100644 --- a/src/benchflow/trial_yaml.py +++ b/src/benchflow/trial_yaml.py @@ -103,6 +103,7 @@ def trial_config_from_dict( environment=raw.get("environment", "docker"), sandbox_user=raw.get("sandbox_user", "agent"), sandbox_locked_paths=raw.get("sandbox_locked_paths"), + sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120), job_name=raw.get("job_name"), trial_name=raw.get("trial_name"), jobs_dir=raw.get("jobs_dir", "jobs"), diff --git a/tests/test_runtime.py b/tests/test_runtime.py index 16046fe..37819f7 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -36,6 +36,7 @@ def test_agent_env_default_empty() -> None: def test_runtime_config_defaults() -> 
None: c = RuntimeConfig() assert c.sandbox_user == "agent" + assert c.sandbox_setup_timeout == 120 assert c.max_rounds == 10 assert c.snapshot_policy == "none" assert c.reward_stream is True @@ -143,9 +144,10 @@ def test_runtime_custom_config() -> None: if not (task_path / "task.toml").exists(): return env = Environment.from_task(task_path, backend="daytona") - config = RuntimeConfig(sandbox_user=None, timeout=1800) + config = RuntimeConfig(sandbox_user=None, timeout=1800, sandbox_setup_timeout=45) runtime = Runtime(env, agent, config) assert runtime.config.sandbox_user is None + assert runtime.config.sandbox_setup_timeout == 45 assert runtime.config.timeout == 1800 diff --git a/tests/test_sdk_internals.py b/tests/test_sdk_internals.py index f5abd3a..6eff44a 100644 --- a/tests/test_sdk_internals.py +++ b/tests/test_sdk_internals.py @@ -7,6 +7,7 @@ import json from datetime import datetime from pathlib import Path +from unittest.mock import AsyncMock import pytest @@ -341,6 +342,7 @@ def test_config_json_written(self, tmp_path): skills_dir=None, sandbox_user=None, context_root=None, + sandbox_setup_timeout=33, timeout=300, started_at=datetime(2026, 4, 8, 12, 0), agent_env={"ANTHROPIC_API_KEY": "sk-secret", "SOME_VAR": "visible"}, @@ -354,6 +356,7 @@ def test_config_json_written(self, tmp_path): "skills_dir", "sandbox_user", "sandbox_locked_paths", + "sandbox_setup_timeout", "context_root", "timeout_sec", "started_at", @@ -365,6 +368,7 @@ def test_config_json_written(self, tmp_path): assert data["agent"] == "claude-agent-acp" assert data["model"] == "claude-haiku-4-5-20251001" assert data["environment"] == "docker" + assert data["sandbox_setup_timeout"] == 33 assert data["timeout_sec"] == 300 def test_secrets_filtered(self, tmp_path): @@ -399,6 +403,41 @@ def test_secrets_filtered(self, tmp_path): assert recorded["SAFE_VAR"] == "visible" +# ── run wiring ── + + +class TestRunWiring: + """Tests for SDK.run() argument forwarding into TrialConfig.""" + + 
@pytest.mark.asyncio + async def test_run_forwards_sandbox_setup_timeout_to_trial_config( + self, monkeypatch, tmp_path + ): + from benchflow.models import RunResult + from benchflow.sdk import SDK + + seen = {} + + async def fake_create(config): + seen["config"] = config + trial = AsyncMock() + trial.run = AsyncMock( + return_value=RunResult(task_name="task-1", rewards={"reward": 1.0}) + ) + return trial + + monkeypatch.setattr("benchflow.trial.Trial.create", fake_create) + + result = await SDK().run( + task_path=tmp_path, + sandbox_setup_timeout=77, + ) + + assert result.rewards == {"reward": 1.0} + assert seen["config"].sandbox_setup_timeout == 77 + assert seen["config"].task_path == tmp_path + + # ── _build_result ── diff --git a/tests/test_trial_install_agent_timeout.py b/tests/test_trial_install_agent_timeout.py new file mode 100644 index 0000000..40db22c --- /dev/null +++ b/tests/test_trial_install_agent_timeout.py @@ -0,0 +1,91 @@ +"""Tests for Trial.install_agent timeout wiring.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow.trial import Trial, TrialConfig + + +def _make_trial(tmp_path, *, agent: str, sandbox_setup_timeout: int) -> Trial: + config = TrialConfig.from_legacy( + task_path=tmp_path / "task", + agent=agent, + prompts=[None], + sandbox_user="agent", + sandbox_setup_timeout=sandbox_setup_timeout, + ) + trial = Trial(config) + trial._env = MagicMock() + trial._env.exec = AsyncMock(return_value=MagicMock(stdout="/workspace\n")) + trial._trial_dir = tmp_path / "trial" + trial._trial_dir.mkdir() + trial._trial_paths = MagicMock() + trial._task = MagicMock() + trial._effective_locked = [] + return trial + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("agent", "expected_setup_return"), + [ + ("claude-agent-acp", "/home/agent"), + ("oracle", None), + ], +) +async def test_install_agent_forwards_sandbox_setup_timeout( + tmp_path, monkeypatch, agent, expected_setup_return +): + trial = _make_trial(tmp_path, 
agent=agent, sandbox_setup_timeout=41) + + install_agent_mock = AsyncMock(return_value=MagicMock()) + write_credential_files_mock = AsyncMock() + upload_subscription_auth_mock = AsyncMock() + snapshot_build_config_mock = AsyncMock() + seed_verifier_workspace_mock = AsyncMock() + deploy_skills_mock = AsyncMock() + lockdown_paths_mock = AsyncMock() + setup_sandbox_user_mock = AsyncMock(return_value=expected_setup_return) + + monkeypatch.setattr("benchflow.trial.install_agent", install_agent_mock) + monkeypatch.setattr( + "benchflow.trial.write_credential_files", write_credential_files_mock + ) + monkeypatch.setattr( + "benchflow.trial.upload_subscription_auth", upload_subscription_auth_mock + ) + monkeypatch.setattr( + "benchflow.trial._snapshot_build_config", snapshot_build_config_mock + ) + monkeypatch.setattr( + "benchflow.trial._seed_verifier_workspace", seed_verifier_workspace_mock + ) + monkeypatch.setattr("benchflow.trial.deploy_skills", deploy_skills_mock) + monkeypatch.setattr("benchflow.trial.lockdown_paths", lockdown_paths_mock) + monkeypatch.setattr( + "benchflow.trial.setup_sandbox_user", setup_sandbox_user_mock + ) + + await trial.install_agent() + + setup_sandbox_user_mock.assert_awaited_once() + args, kwargs = setup_sandbox_user_mock.await_args + assert args[1] == "agent" + assert kwargs["timeout_sec"] == 41 + assert kwargs["workspace"] == "/workspace" + + if agent == "oracle": + install_agent_mock.assert_not_awaited() + write_credential_files_mock.assert_not_awaited() + deploy_skills_mock.assert_not_awaited() + assert trial._agent_cwd == "/workspace" + else: + install_agent_mock.assert_awaited_once() + write_credential_files_mock.assert_awaited_once() + deploy_skills_mock.assert_awaited_once() + assert trial._agent_cwd == "/home/agent" + + snapshot_build_config_mock.assert_awaited_once() + seed_verifier_workspace_mock.assert_awaited_once() + lockdown_paths_mock.assert_awaited_once() diff --git a/tests/test_yaml_config.py b/tests/test_yaml_config.py 
index d362296..0de12aa 100644 --- a/tests/test_yaml_config.py +++ b/tests/test_yaml_config.py @@ -24,6 +24,7 @@ def native_yaml(tmp_path): environment: daytona concurrency: 32 max_retries: 1 +sandbox_setup_timeout: 45 prompts: - null - "Review your solution." @@ -46,6 +47,7 @@ def harbor_yaml(tmp_path): orchestrator: type: local n_concurrent_trials: 8 +sandbox_setup_timeout: 75 environment: type: daytona env: @@ -69,6 +71,7 @@ def test_from_native_yaml(native_yaml): assert cfg.environment == "daytona" assert cfg.concurrency == 32 assert cfg.retry.max_retries == 1 + assert cfg.sandbox_setup_timeout == 45 assert cfg.prompts == [None, "Review your solution."] assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -85,6 +88,7 @@ def test_from_harbor_yaml(harbor_yaml): assert cfg.concurrency == 8 assert cfg.retry.max_retries == 1 # n_attempts=2 → max_retries=1 assert cfg.agent_env.get("ANTHROPIC_API_KEY") == "test-key" + assert cfg.sandbox_setup_timeout == 75 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -127,6 +131,7 @@ def test_from_harbor_yaml_defaults(tmp_path): assert cfg.agent == "pi-acp" assert cfg.environment == "docker" assert cfg.concurrency == 4 + assert cfg.sandbox_setup_timeout == 120 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("jobs") @@ -253,3 +258,4 @@ def test_defaults_when_omitted(self, tmp_path): assert job._config.exclude_tasks == set() assert job._config.agent_env == {} assert job._config.sandbox_user == "agent" + assert job._config.sandbox_setup_timeout == 120 From e66537ca3b5f02c4824369adc3072324829f40d4 Mon Sep 17 00:00:00 2001 From: Yifeng He Date: Sat, 25 Apr 2026 02:49:02 -0700 Subject: [PATCH 22/25] fix: stop copying root tool installs into sandbox home (#181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: skip model/API-key validation for oracle agent The oracle agent runs solution/solve.sh and never calls an 
LLM, but resolve_agent_env() was validating API keys for whatever model the CLI defaulted to (claude-haiku-4-5-20251001). This made `bench eval create -a oracle` fail without ANTHROPIC_API_KEY set, even though oracle doesn't need it. * fix: don't assign default model to oracle agent Move the fix from resolve_agent_env to the CLI layer: oracle runs solve.sh and never calls an LLM, so it should not receive DEFAULT_MODEL at all. Both _run_single and _run_batch now pass model=None for oracle. Widen JobConfig.model to str | None to support this. * fix: oracle agent — chokepoint guard, drop orphan eval CLI, helper PR #173 moved the oracle/DEFAULT_MODEL guard from resolve_agent_env to cli/eval.py, but cli/eval.py is orphaned (never imported into the live CLI), so `bench eval create` still passes DEFAULT_MODEL to oracle and trips ANTHROPIC_API_KEY validation. Three changes: - Restore the `agent != "oracle"` guard in resolve_agent_env so the chokepoint defends against any caller that forwards a model. - Delete the orphan cli/eval.py and its tests — the live eval_create lives in cli/main.py and was the actual code path users hit. - Add effective_model(agent, model) helper, change JobConfig.model default to None, replace seven `model or DEFAULT_MODEL` sites in cli/main.py and job.py YAML loaders so oracle gets honest model=None end-to-end (in result/summary JSON, prints, and downstream Trial). Regression test in test_resolve_env_helpers.py pins the chokepoint by calling resolve_agent_env("oracle", DEFAULT_MODEL, {}) with no API key and no host auth — verified to fail on main with the user-facing ANTHROPIC_API_KEY error and pass after the fix. 
* test: regression suite pinning oracle chokepoint + orphan removal Bundle 14 tests in tests/test_oracle_chokepoint.py that pin each layer of the prior fix at the right altitude: - TestOrphanRemoval — cli/eval.py is gone (ModuleNotFoundError) and no src/ file references benchflow.cli.eval, guarding against a future re-introduction that could swallow the next bug fix the same way. - TestEvalCreateRouting — `bench eval create` callback lives in cli/main.py:eval_create. Pins the architectural fact PR #173 missed. - TestEffectiveModel — unit tests for the helper: oracle drops model, non-oracle falls back to DEFAULT_MODEL, empty string treated as unset. - TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None for both native and Harbor formats; non-oracle backwards-compat preserved. - TestEvalCreateOracleCLI — end-to-end: live eval_create(agent="oracle") with no API key in env does not raise. Mocks Trial.create and resets the asyncio loop after to avoid polluting pre-existing tests that use the deprecated asyncio.get_event_loop() pattern. Verified to fail on main in the right shape: 9 of 14 fail (each pinning a deleted/added behavior), 5 pass (asserting structural facts already true). The CLI test fails on main with the user-reported error "ANTHROPIC_API_KEY required for model 'claude-haiku-4-5-20251001'…". * fix: restore cli/eval.py and test_eval_cli.py, apply oracle guard The previous commit deleted cli/eval.py and its tests as orphans, but they are intentionally kept. Restore both from main, update eval.py to use the effective_model() helper for the oracle chokepoint fix, and replace the "module is gone" regression test with a guard that cli/main.py does not import cli/eval (the actual invariant). 
* docs: clarify cli/eval.py and test_eval_cli.py are not wired into live CLI * docs(plan): add plan to fix sandbox io problem * test: lock sandbox setup contract Plan step 1/6: Lock the new sandbox contract in tests * fix: stop copying root tool installs into sandbox home Plan step 2/6: Narrow setup_sandbox_user() to user state only * refactor: derive sandbox home dirs from registry config Plan step 3/6: Align registry semantics with the new contract * refactor: symlink skills into sandbox, enforce shared install prefixes Replace per-trial skill-tree copies with ln -sfn into a shared /skills (or task skills_dir) root, drop skill_paths from get_sandbox_home_dirs(), and add registry + sandbox-setup invariants that keep agent binaries on /usr/local/* rather than /root-only home paths. Updates task-authoring and api-reference docs to describe the new lightweight sandbox contract. * chore: remove completed sandbox plan doc --------- Co-authored-by: Xiangyi Li Co-authored-by: Xiangyi Li --- docs/api-reference.md | 4 + docs/task-authoring.md | 2 + src/benchflow/_agent_setup.py | 80 +++++++++++------ src/benchflow/_sandbox.py | 33 ++++--- src/benchflow/agents/registry.py | 13 ++- tests/test_agent_setup.py | 142 +++++++++++++++++++++++++++++- tests/test_registry_invariants.py | 16 ++++ tests/test_sandbox.py | 65 ++++++++++---- tests/test_sandbox_setup.py | 106 ++++++++++++++++++++++ 9 files changed, 400 insertions(+), 61 deletions(-) create mode 100644 tests/test_sandbox_setup.py diff --git a/docs/api-reference.md b/docs/api-reference.md index 18fd862..00325e6 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -150,6 +150,10 @@ Trial.run() ├─ setup() — resolve config, create env object ├─ start() — spin up sandbox, upload task files, start services ├─ install_agent() — install agent binary, credentials, sandbox user + │ (sandbox user setup: create non-root user, prepare + │ small config/auth dirs, chown the workspace — no + │ recursive copy of /root tool 
trees; agent binaries + │ must live on shared prefixes like /usr/local/bin) ├─ for scene in scenes: │ └─ _run_scene(scene) │ ├─ setup /app/.outbox/ — (multi-role scenes only) diff --git a/docs/task-authoring.md b/docs/task-authoring.md index adfb641..9081987 100644 --- a/docs/task-authoring.md +++ b/docs/task-authoring.md @@ -50,6 +50,8 @@ env = { OPENAI_API_KEY = "${OPENAI_API_KEY}" } # host vars to injec **Built-in mock services** — if the Dockerfile references a service binary (`claw-gmail`, `claw-slack`, `claw-gcal`, `claw-gdoc`, `claw-gdrive`), BenchFlow starts it automatically. No `[services]` section needed. +**Install tooling to shared prefixes, not `/root`** — when a task image ships Node.js, Python tools, or agent binaries that the sandbox user must execute, install them to `/usr/local/bin`, `/usr/local/lib`, or `/opt`, not `/root/.nvm` or `/root/.local/bin`. `setup_sandbox_user()` creates the non-root user, prepares small config/auth dirs, and chowns the workspace — it does not clone `/root` into the sandbox home. Legacy images that already install tools under `/root` still work via a narrow symlink fallback, but shared prefixes are the supported path. Pre-creating the sandbox user in the Dockerfile is an optional speedup, not a requirement. 
+ --- ## instruction.md diff --git a/src/benchflow/_agent_setup.py b/src/benchflow/_agent_setup.py index 0074d94..0c04a5a 100644 --- a/src/benchflow/_agent_setup.py +++ b/src/benchflow/_agent_setup.py @@ -30,6 +30,47 @@ logger = logging.getLogger(__name__) +def _skill_link_cmd(source: str, dest: str) -> str: + """Link a shared skills tree into an agent discovery path.""" + if source == dest: + return f"mkdir -p {shlex.quote(dest)}" + + parent = shlex.quote(str(Path(dest).parent)) + q_source = shlex.quote(source) + q_dest = shlex.quote(dest) + return ( + f"mkdir -p {parent} && " + f"rm -rf {q_dest} && " + f"ln -sfn {q_source} {q_dest}" + ) + + +async def _link_skill_paths(env, source: str, skill_paths: list[str], home: str, cwd: str) -> int: + """Link one shared skills tree into each configured discovery path.""" + parts = [] + for sp in skill_paths: + expanded = sp.replace("$HOME", home).replace("$WORKSPACE", cwd) + parts.append(_skill_link_cmd(source, expanded)) + if parts: + cmd = " && ".join(parts) + result = await env.exec(cmd, timeout_sec=15) + if result.return_code != 0: + stdout = (getattr(result, "stdout", "") or "").strip() + stderr = (getattr(result, "stderr", "") or "").strip() + details = [ + f"exit code {result.return_code}", + f"command: {cmd}", + ] + if stdout: + details.append(f"stdout: {stdout}") + if stderr: + details.append(f"stderr: {stderr}") + raise RuntimeError( + f"Failed to link skills from {source}: {'; '.join(details)}" + ) + return len(parts) + + async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None: """Install agent in sandbox and return its config.""" agent_base = agent.split()[0] @@ -86,6 +127,9 @@ async def deploy_skills( task: "Task", ) -> None: """Deploy and distribute skills into sandbox.""" + task_skills_dir = task.config.environment.skills_dir + effective_skills = task_skills_dir + # Runtime upload (fallback if not baked into Dockerfile) if skills_dir: dockerfile = task_path / "environment" / 
"Dockerfile" @@ -98,38 +142,24 @@ async def deploy_skills( if skills_path.is_dir(): logger.info(f"Deploying skills via runtime upload from {skills_path}") await env.upload_dir(skills_path, "/skills") - if agent_cfg and agent_cfg.skill_paths: - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", "/root").replace( - "$WORKSPACE", "/app" - ) - parent = str(Path(expanded).parent) - parts.append( - f"mkdir -p '{parent}' && ln -sf /skills '{expanded}'" - ) - await env.exec(" && ".join(parts), timeout_sec=10) - logger.info("Skills deployed to /skills and symlinked") + logger.info("Skills deployed to /skills") + effective_skills = "/skills" else: logger.warning(f"Skills dir not found: {skills_path}") else: logger.info("Skills already injected via Dockerfile") # Distribute to agent-specific discovery paths - task_skills_dir = task.config.environment.skills_dir - effective_skills = "/skills" if skills_dir else task_skills_dir if effective_skills and agent_cfg and agent_cfg.skill_paths: home = f"/home/{sandbox_user}" if sandbox_user else "/root" - parts = [] - for sp in agent_cfg.skill_paths: - expanded = sp.replace("$HOME", home).replace("$WORKSPACE", agent_cwd) - q_expanded = shlex.quote(expanded) - q_skills = shlex.quote(effective_skills) - parts.append( - f"mkdir -p {q_expanded} && cp -r {q_skills}/. {q_expanded}/ 2>/dev/null" - ) - if parts: - await env.exec("; ".join(parts), timeout_sec=15) + count = await _link_skill_paths( + env, + effective_skills, + agent_cfg.skill_paths, + home, + agent_cwd, + ) + if count: logger.info( - f"Skills distributed to {len(parts)} paths for {agent_cfg.name}" + f"Skills distributed to {count} paths for {agent_cfg.name}" ) diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 130a0c3..63fbcde 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -1,7 +1,7 @@ """Sandbox user setup, path lockdown, and verifier hardening. 
Owns the "agent runs as non-root" lifecycle: - - Creating the sandbox user and copying root's tooling into its home + - Creating the sandbox user and preparing minimal home state it needs - Building the privilege-drop wrapper (setpriv / su) for agent launch - Locking down solution/test paths so the sandbox user cannot read them - Hardening the environment before the verifier runs @@ -98,6 +98,20 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str: ) +def _legacy_root_tool_link_cmd(source: str, dest: str) -> str: + """Link legacy root-only tool dirs into the sandbox home when needed.""" + src = shlex.quote(source) + dst = shlex.quote(dest) + parent = shlex.quote(str(Path(dest).parent)) + return ( + f"if [ -e {src} ] && [ ! -L {dst} ]; then " + f"mkdir -p {parent} && " + f"rmdir {dst} 2>/dev/null || true; " + f"[ -e {dst} ] || ln -s {src} {dst}; " + "fi" + ) + + async def setup_sandbox_user( env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120 ) -> str: @@ -107,18 +121,17 @@ async def setup_sandbox_user( f"Invalid sandbox_user: {sandbox_user!r} (must be alphanumeric)" ) logger.info(f"Setting up sandbox user: {sandbox_user}") + home = f"/home/{sandbox_user}" + home_dirs = sorted(d for d in get_sandbox_home_dirs() if d != ".local") await env.exec( f"id -u {sandbox_user} >/dev/null 2>&1 || " f"useradd -m -s /bin/bash {sandbox_user} && " - f"mkdir -p /home/{sandbox_user}/.local/bin && " - "if [ -d /root/.local/bin ]; then " - f"cp -aL /root/.local/bin/. /home/{sandbox_user}/.local/bin/ 2>/dev/null || true; fi && " - "if [ -d /root/.nvm ]; then " - f"cp -a /root/.nvm/. /home/{sandbox_user}/.nvm/ 2>/dev/null || true; fi && " - f"for d in {' '.join(sorted(get_sandbox_home_dirs()))}; do " - f"if [ -d /root/$d ]; then mkdir -p /home/{sandbox_user}/$d && " - f"cp -a /root/$d/. 
/home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && " - f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && " + f"{_legacy_root_tool_link_cmd('/root/.local/bin', f'{home}/.local/bin')} && " + f"{_legacy_root_tool_link_cmd('/root/.nvm', f'{home}/.nvm')} && " + f"for d in {' '.join(home_dirs)}; do " + f"if [ -d /root/$d ]; then mkdir -p {home}/$d && " + f"cp -a /root/$d/. {home}/$d/ 2>/dev/null || true; fi; done && " + f"chown -R {sandbox_user}:{sandbox_user} {home} && " f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}", timeout_sec=timeout_sec, ) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index 9b977de..ca24348 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -374,21 +374,18 @@ class AgentConfig: def get_sandbox_home_dirs() -> set[str]: - """Collect all dot-dirs under $HOME that sandbox user setup should copy. + """Collect user home config/auth dirs BenchFlow may materialize for the sandbox user. Derives from three sources across all registered agents: - - skill_paths: $HOME/.foo/... → ".foo" - credential_files: {home}/.foo/... → ".foo" + - subscription_auth.files: {home}/.foo/... → ".foo" - home_dirs: explicit extras (e.g. ".openclaw") - Always includes ".local" (pip scripts, etc.). + Skill paths are excluded: deploy_skills() now links those paths directly to a + shared skills tree instead of relying on sandbox-home copies. """ - dirs: set[str] = {".local"} + dirs: set[str] = set() for cfg in AGENTS.values(): - for sp in cfg.skill_paths: - if sp.startswith("$HOME/."): - dirname = sp.removeprefix("$HOME/").split("/")[0] - dirs.add(dirname) for cf in cfg.credential_files: # path uses {home}/.foo/... 
placeholder path = cf.path diff --git a/tests/test_agent_setup.py b/tests/test_agent_setup.py index ce00d6b..3b99ce6 100644 --- a/tests/test_agent_setup.py +++ b/tests/test_agent_setup.py @@ -1,13 +1,151 @@ +"""Tests for agent install and skill deployment setup helpers.""" + from pathlib import Path from types import SimpleNamespace -from unittest.mock import AsyncMock +from unittest.mock import AsyncMock, MagicMock import pytest -from benchflow._agent_setup import install_agent +from benchflow._agent_setup import deploy_skills, install_agent +from benchflow.agents.registry import AgentConfig from benchflow.models import AgentInstallError +def _make_task(skills_dir: str | None): + return SimpleNamespace( + config=SimpleNamespace( + environment=SimpleNamespace( + skills_dir=skills_dir, + ) + ) + ) + + +@pytest.mark.asyncio +async def test_deploy_skills_symlinks_agent_skill_paths_instead_of_copying(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + cmd = env.exec.await_args.args[0] + assert "cp -r" not in cmd + assert " && " in cmd + assert ";" not in cmd + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in cmd + assert "ln -sfn /opt/benchflow/skills /app/skills" in cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_uploads_runtime_skills_and_links_shared_tree(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + 
install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + skills_dir = tmp_path / "skills" + skills_dir.mkdir() + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=skills_dir, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_awaited_once_with(skills_dir, "/skills") + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert " && " in distributed_link_cmd + assert ";" not in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" in distributed_link_cmd + assert "/root/.agents/skills" not in distributed_link_cmd + assert "/app/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def test_deploy_skills_falls_back_when_local_skills_dir_is_missing(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout="")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"], + ) + + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=tmp_path / "missing-skills", + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/workspace", + task=_make_task("/opt/benchflow/skills"), + ) + + env.upload_dir.assert_not_called() + env.exec.assert_awaited_once() + + distributed_link_cmd = env.exec.await_args.args[0] + assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in distributed_link_cmd + assert "ln -sfn /opt/benchflow/skills /workspace/skills" in distributed_link_cmd + assert "ln -sfn /skills /home/agent/.agents/skills" not in distributed_link_cmd + assert "ln -sfn /skills /workspace/skills" not in distributed_link_cmd + + +@pytest.mark.asyncio +async def 
test_deploy_skills_raises_when_skill_linking_fails(tmp_path): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(return_code=17, stdout="link failed")) + env.upload_dir = AsyncMock() + agent_cfg = AgentConfig( + name="test-agent", + install_cmd="true", + launch_cmd="true", + skill_paths=["$HOME/.agents/skills"], + ) + + with pytest.raises(RuntimeError, match="Failed to link skills"): + await deploy_skills( + env=env, + task_path=tmp_path, + skills_dir=None, + agent_cfg=agent_cfg, + sandbox_user="agent", + agent_cwd="/app", + task=_make_task("/opt/benchflow/skills"), + ) + + @pytest.mark.asyncio async def test_install_agent_writes_command_stdout_and_stderr_on_failure(tmp_path: Path): env = SimpleNamespace() diff --git a/tests/test_registry_invariants.py b/tests/test_registry_invariants.py index 5b708d3..868ffb3 100644 --- a/tests/test_registry_invariants.py +++ b/tests/test_registry_invariants.py @@ -76,6 +76,22 @@ def test_agent_collection_invariants(name, cfg): assert d.startswith("."), f"home_dirs entry {d!r} must start with '.'" +@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) +def test_agent_install_cmd_targets_shared_paths(name, cfg): + """Installed binaries must land in shared prefixes, not a root-only home. + + setup_sandbox_user() no longer recursively copies /root/.nvm or + /root/.local/bin into the sandbox home. If an install_cmd placed its + binary there, the sandbox user would silently lose access to the agent. 
+ """ + forbidden_binary_prefixes = ("/root/.nvm/", "/root/.local/bin/", "$HOME/.nvm/") + for prefix in forbidden_binary_prefixes: + assert prefix not in cfg.install_cmd, ( + f"{name!r} install_cmd writes under {prefix!r}; use /usr/local/bin " + f"or another shared prefix so the sandbox user inherits the tool" + ) + + @pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys())) def test_agent_credential_and_subscription_auth(name, cfg): """Optional credential_files and subscription_auth structures.""" diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 4c7f9f2..36a8c80 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -1,23 +1,21 @@ -"""Tests for sandbox user directory derivation from agent registry.""" +"""Tests for sandbox user config/auth directory derivation from agent registry.""" from benchflow.agents.registry import ( AGENTS, AgentConfig, + HostAuthFile, + SubscriptionAuth, get_sandbox_home_dirs, ) class TestSandboxDirs: - def test_dirs_derived_from_skill_paths(self): - """All $HOME skill_paths dirs from AGENTS registry are included.""" + def test_skill_only_dirs_not_included(self): + """Skill-only home dirs should not be copied during sandbox user setup.""" dirs = get_sandbox_home_dirs() - # claude-agent-acp has $HOME/.claude/skills - assert ".claude" in dirs - # gemini has $HOME/.gemini/skills - assert ".gemini" in dirs - # pi-acp has $HOME/.pi/agent/skills and $HOME/.agents/skills - assert ".pi" in dirs - assert ".agents" in dirs + + assert ".pi" not in dirs + assert ".agents" not in dirs def test_credential_file_dirs_included(self): """Dirs from credential_files paths are included.""" @@ -31,13 +29,25 @@ def test_home_dirs_included(self): # openclaw has home_dirs=[".openclaw"] assert ".openclaw" in dirs - def test_always_includes_local(self): - """.local is always in the dir list.""" + def test_does_not_include_legacy_local_tool_dir(self): + """.local is not included unless an agent registry path derives 
it.""" + dirs = get_sandbox_home_dirs() + assert ".local" not in dirs + + def test_only_includes_top_level_home_dirs(self): + """Derived entries stay at $HOME top-level, not nested tool subpaths.""" dirs = get_sandbox_home_dirs() - assert ".local" in dirs + assert ".local/bin" not in dirs - def test_new_agent_auto_included(self): - """Adding an agent with skill_paths=$HOME/.newagent/skills includes .newagent.""" + def test_dirs_represent_registry_backed_home_config_or_auth(self): + """Returned dirs are registry-derived user home config/auth roots.""" + dirs = get_sandbox_home_dirs() + assert {".claude", ".codex", ".gemini", ".openclaw"}.issubset(dirs) + assert ".agents" not in dirs + assert ".pi" not in dirs + + def test_new_agent_skill_path_not_auto_included(self): + """Skill-only home dirs should not become sandbox copy targets.""" AGENTS["_test_agent"] = AgentConfig( name="_test_agent", install_cmd="true", @@ -46,10 +56,33 @@ def test_new_agent_auto_included(self): ) try: dirs = get_sandbox_home_dirs() - assert ".newagent" in dirs + assert ".newagent" not in dirs finally: del AGENTS["_test_agent"] + def test_subscription_auth_file_dirs_included(self): + """Dirs from subscription_auth.files container paths are included.""" + AGENTS["_test_agent_subscription_auth"] = AgentConfig( + name="_test_agent_subscription_auth", + install_cmd="true", + launch_cmd="true", + subscription_auth=SubscriptionAuth( + replaces_env="TEST_API_KEY", + detect_file="~/.subauth/login.json", + files=[ + HostAuthFile( + "~/.subauth/login.json", + "{home}/.subauth/login.json", + ) + ], + ), + ) + try: + dirs = get_sandbox_home_dirs() + assert ".subauth" in dirs + finally: + del AGENTS["_test_agent_subscription_auth"] + def test_workspace_paths_excluded(self): """$WORKSPACE paths are not included (only $HOME paths).""" dirs = get_sandbox_home_dirs() diff --git a/tests/test_sandbox_setup.py b/tests/test_sandbox_setup.py new file mode 100644 index 0000000..f81b392 --- /dev/null +++ 
b/tests/test_sandbox_setup.py @@ -0,0 +1,106 @@ +"""Focused contract tests for setup_sandbox_user() shell command generation.""" + +import re +import shlex +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow._sandbox import setup_sandbox_user +from benchflow.agents.registry import get_sandbox_home_dirs + + +async def _run_setup_sandbox_user(*, sandbox_user: str = "agent", workspace: str = "/app"): + env = MagicMock() + env.exec = AsyncMock(return_value=MagicMock(stdout="", stderr="", exit_code=0)) + + await setup_sandbox_user(env, sandbox_user, workspace) + + env.exec.assert_awaited_once() + return env.exec.call_args.args[0], env.exec.call_args.kwargs + + +def _assert_conditional_legacy_symlink(cmd: str, *, source: str, dest: str) -> None: + """Legacy tool dirs should link only when a root-only install exists.""" + assert re.search( + rf"if \[ -e [\"']?{re.escape(source)}[\"']? \].*ln -s(?:f|[a-zA-Z-])* [\"']?{re.escape(source)}[\"']? [\"']?{re.escape(dest)}[\"']?.*fi", + cmd, + ), f"expected explicit symlink from {source} to {dest} in setup command: {cmd}" + + +def _get_copy_loop_dirs(cmd: str) -> list[str]: + """Extract the general home-dir copy loop payload from the shell command.""" + match = re.search(r"for d in (?P.*?); do", cmd) + assert match, f"expected general home-dir copy loop in setup command: {cmd}" + return match.group("dirs").split() + + +class TestSetupSandboxUser: + @pytest.mark.asyncio + async def test_setup_command_avoids_recursive_root_tool_copies(self): + """Heavy root-owned tool dirs should no longer be recursively copied.""" + cmd, kwargs = await _run_setup_sandbox_user() + + assert "cp -aL /root/.local/bin/." not in cmd + assert "cp -a /root/.nvm/." 
not in cmd + assert kwargs["timeout_sec"] == 120 + + @pytest.mark.asyncio + async def test_setup_command_still_creates_user_prepares_home_and_chowns_workspace(self): + """The non-copy setup contract still creates the user and grants access.""" + cmd, _ = await _run_setup_sandbox_user() + + assert "id -u agent >/dev/null 2>&1 || useradd -m -s /bin/bash agent" in cmd + assert "mkdir -p /home/agent/.local/bin" not in cmd + assert "chown -R agent:agent /home/agent" in cmd + assert f"chown -R agent:agent {shlex.quote('/app')}" in cmd + + @pytest.mark.asyncio + async def test_setup_command_keeps_heavy_root_tool_dirs_on_shared_paths(self): + """Legacy root-only tool dirs should use conditional symlinks, not duplication.""" + cmd, _ = await _run_setup_sandbox_user() + + _assert_conditional_legacy_symlink( + cmd, + source="/root/.local/bin", + dest="/home/agent/.local/bin", + ) + _assert_conditional_legacy_symlink( + cmd, source="/root/.nvm", dest="/home/agent/.nvm" + ) + assert "cp -aL /root/.local/bin/. /home/agent/.local/bin/" not in cmd + assert "cp -a /root/.nvm/. /home/agent/.nvm/" not in cmd + + @pytest.mark.asyncio + async def test_setup_command_copy_loop_excludes_local_dir(self): + """General home-dir copying should narrow to small config/auth dirs only.""" + cmd, _ = await _run_setup_sandbox_user() + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + + assert copy_loop_dirs == sorted( + d for d in get_sandbox_home_dirs() if d != ".local" + ) + assert ".local" not in copy_loop_dirs + assert "mkdir -p /home/agent/$d" in cmd + assert "cp -a /root/$d/. /home/agent/$d/ 2>/dev/null || true" in cmd + + @pytest.mark.asyncio + async def test_setup_command_does_not_copy_heavy_tool_trees_into_home(self): + """BenchFlow-installed agents must not rely on sandbox-home tool copies. + + Agent binaries are placed in /usr/local/bin by the registered install_cmd + values, so setup_sandbox_user() must not bulk-copy heavyweight tool trees + (e.g. 
/root/.nvm, /root/.local/bin) to make them executable for the user. + """ + cmd, _ = await _run_setup_sandbox_user() + + for heavy_source in ("/root/.nvm", "/root/.local/bin"): + assert f"cp -a {heavy_source}/." not in cmd + assert f"cp -aL {heavy_source}/." not in cmd + + copy_loop_dirs = _get_copy_loop_dirs(cmd) + for heavy_dir in (".nvm", ".local"): + assert heavy_dir not in copy_loop_dirs, ( + f"sandbox copy loop must not include heavy tool dir {heavy_dir!r}" + ) From 176212378727202a2318f5071a10fd654d9183e3 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 05:49:18 -0400 Subject: [PATCH 23/25] feat: BaseUser abstraction + per-task verifier hardening opt-outs (#194) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: BaseUser abstraction for progressive-disclosure trial loops Add User as a first-class participant in the trial loop — a Python callback that produces prompts, sees test results between rounds, and decides when to stop. This is the infrastructure Josh (GitHub/Microsoft) needs for SWE-bench Pro progressive disclosure. New types (user.py): - BaseUser with setup(instruction, solution) and run(round, instruction, round_result) - RoundResult dataclass with trajectory, rewards, verifier output - PassthroughUser (backward-compat default, single round) - FunctionUser (wraps a plain callback for lightweight use) Trial changes: - TrialConfig gains user, max_user_rounds, oracle_access fields - Trial._run_user_loop(): user.run() → connect → execute → disconnect → soft_verify() → build RoundResult → repeat until None or max rounds - Trial.soft_verify(): runs Harbor verifier WITHOUT hardening so agent stays alive between rounds. Final verify() still does full hardening. - Multi-role + User raises ValueError (deferred to future phase) 16 new tests, 0 regressions on existing 618 tests. * fix: address self-review — 5 bugs in user abstraction 1. 
Reorder: disconnect() before soft_verify() — agent process is already dead when soft_verify runs, so soft_verify's docstring was misleading. Now disconnect → soft_verify is the explicit flow. 2. soft_verify() now runs CLEANUP_CMD (conftest/pth/sitecustomize purge) before the verifier. Prevents agent from gaming intermediate test results by injecting test-patching files. 3. FunctionUser: use inspect.isawaitable() instead of asyncio.iscoroutine() — handles asyncio.Task, Future, and any __await__ object, not just coroutines. 4. oracle_access: cat /solution now runs as user="root" — /solution is locked (root:700) after install_agent, so the read would silently fail without root. 5. try/finally around connect/execute/disconnect in user loop — ensures disconnect() always runs even if execute() raises. * feat: add user_dogfood.py — progressive disclosure on regex-log Demonstrates the FunctionUser abstraction: - Round 0: terse 2-sentence prompt - Round 1: hints about edge cases on failure - Round 2: full instruction on continued failure - Stops early if tests pass * fix: address Devin review — remove tautological tests, fix model name - Remove 4 tautological tests (pure dataclass reads) per CLAUDE.md convention: TestRoundResult.test_defaults, test_with_data, TestTrialConfigUser.test_user_field_defaults_to_none, test_user_field_set - Fix dogfood model name: gemini-2.5-flash (not expired preview) - Note: iscoroutine→isawaitable was already fixed in 51d6c61 * fix: address code review — oracle safety, unused import, soft_verify tests 1. Oracle /solution is now moved (not deleted) before agent runs and restored before final verify(). Prevents breaking verifiers that need /solution to compute rewards. 2. Remove unused asyncio import from user.py. 3. Add 4 soft_verify tests: timeout, crash, success, and CLEANUP_CMD execution verification. soft_verify is no longer untested. 
* feat: dogfood results — progressive disclosure on regex-log via Daytona 3-round progressive disclosure with Gemini Flash on regex-log: Round 0: terse prompt (2 tool calls) → reward=0.0 Round 1: hint prompt (3 tool calls) → reward=0.0 Round 2: full instruction (3 tool calls) → reward=0.0 Final verify: reward=0.0 Agent scored 0.0 on all rounds — regex-log is a hard task. But the infrastructure works end-to-end: user loop, soft_verify, fresh ACP sessions per round, user_rounds.jsonl persistence, final hardened verify. No errors. * feat: add opencode agent to registry OpenCode (opencode-ai) is an open-source TypeScript coding agent with ACP support. Skills path: $HOME/.opencode/skills (updated from .opencode/skill per skillsbench #718). Closes skillsbench #718 for the benchflow side. * fix: opencode ACP returns 0 tool calls — model format mismatch Root cause: OpenCode's ACP parseModel() splits modelId on "/" to extract providerID and modelID. When benchflow sent "gemini-3.1-pro-preview" (no slash), opencode parsed it as providerID="gemini-3.1-pro-preview" with modelID="" — an invalid config that silently returned end_turn. Fix: Add acp_model_format field to AgentConfig. When set to "provider/model" (opencode), _format_acp_model() infers the models.dev provider from the bare model name (e.g. "gemini" → "google") and sends "google/gemini-3.1-pro-preview" to set_model. Also: opencode requires_env is now empty (inferred from model at runtime, not hardcoded to ANTHROPIC_API_KEY). * feat: executed notebook — SWE-bench Pro progressive disclosure analysis OpenCode + gemini-3.1-pro-preview on qutebrowser SWE-bench Pro: Baseline (full prompt, 1 round): 40 tools, 736s, reward=0.0 Progressive (3 rounds): 185 tools, 1154s, reward=0.0 Round 0 (terse): 86 tools (81 bash + 5 edit) Round 1 (hints): 76 tools (66 bash + 10 edit) Round 2 (full): 23 tools (16 bash + 7 edit) Both scored 0.0 due to verifier infrastructure bug (rootdir=/tests instead of /app, pytest couldn't find config). 
Agent's fixes were likely correct — demonstrated passing tests in own environment. Key findings: - Progressive disclosure changed agent behavior (86→76→23 tools) - _reset_cache implemented only after Round 1 hint - OpenCode handled 185 tool calls without token limits - Verifier rootdir bug needs investigation * fix: replace hand-curated pytest plugin whitelist with auto-discovery The old mechanism (4 dicts + 4 functions + 1 regex) required manual code changes for every new benchmark with an undeclared pytest plugin. SWE-bench Pro tasks failed because pytest-benchmark wasn't whitelisted. New mechanism: one container-side script + one async function. At hardening time, enumerate all pytest11 entry points from root-owned system packages. Only root-owned dist-info directories are trusted — editable installs from agent-writable /testbed are excluded. PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 stays in place. Security preserved. task.toml pytest_plugins kept as fallback. Deleted: _PYTEST_PLUGIN_ALIASES, _PYTEST_OPTION_PLUGINS, _PYTEST_INSTALLED_PLUGINS, _PIP_INSTALL_RE, _normalize_pytest_plugin, _plugins_from_verifier_script, _declared_pytest_plugins, _pytest_plugin_flags, tomllib import. Added: _DISCOVER_PYTEST_PLUGINS_SCRIPT, _discover_pytest_plugin_flags. * fix: handle Python 3.9 importlib.metadata API in plugin discovery Python 3.9's entry_points() doesn't accept keyword arguments — returns a dict instead. Fall back to entry_points().get('pytest11', []) when the keyword style raises TypeError. * fix: simplify plugin discovery — skip ownership check The uid==0 check was failing on Python 3.9 containers where ep.dist._path doesn't exist. Simplified to just enumerate all pytest11 entry points — sandbox_user prevents agent pip installs, so all discovered plugins are image-authored. * feat: updated notebook with fixed-verifier results Both progressive + baseline rerun with working verifier (15 plugins discovered). 
Results with honest scoring: Progressive (3 rounds): 284 tools, 970s, reward=0.0 Round 0: 94 tools, Round 1: 92 tools, Round 2: 98 tools Baseline (1 round): 73 tools, 611s, reward=0.0 Both failed due to agent code errors (circular imports), not verifier infrastructure. Progressive used 4x more compute for same outcome on this task. * fix: preserve trusted PYTHONPATH entries during verifier hardening VERIFIER_ENV cleared PYTHONPATH="" which broke SWE-bench Pro tasks where the Dockerfile sets PYTHONPATH=/app/lib:/app for project imports. New: _trusted_verifier_pythonpath() filters PYTHONPATH using the same root-owned validation as PATH, but does NOT block the workspace — /app is already importable via CWD/pytest sys.path insertion, so clearing it only breaks imports without security benefit. /tmp, /var/tmp, /home/agent are still blocked. Re-pinned after task-env merge like PATH. * fix: address review comments on BaseUser PR - soft_verify: chmod 777 /logs/verifier so non-root verifier can write - soft_verify: restore /solution before verify, re-hide after (oracle access) - validate empty roles (!=1) and multi-scene configs in user loop - remove tautological test_setup_is_noop - remove opencode BENCHFLOW_PROVIDER_API_KEY→ANTHROPIC_API_KEY mapping (wrong for non-Anthropic models; native keys inherited via auto_inherit_env) - warn on unknown provider fallback in _format_acp_model - remove --rootdir=/tests from VERIFIER_ENV (cherry-pick from PR #187) - fix printenv PYTHONPATH crash when unset - fix stale plugin discovery docstring * feat: add SWE-bench Pro oracle validation + baseline experiment script Runs oracle (gold solution) on all 4 testable tasks to verify the --rootdir fix, then runs a single-round agent baseline for comparison with progressive disclosure. Results to CSV. * fix: address Codex review on PR #184 — oracle safety + warnings Three Codex review findings on the BaseUser abstraction: 1. 
oracle_access=True with user=None silently leaves /solution exposed to the agent for the entire trial. Add a logger.warning at setup time so misconfigurations surface immediately. 2. Oracle restore (mv /solution_oracle_backup /solution) was outside any finally block. If _run_user_loop() raised, /solution was never restored. Move the user/scene execution into try/finally so the restore always runs before the final verify(). 3. Oracle read used a wildcard fallback (cat /solution/* || true) that could leak unintended files (binaries, credentials). Narrow to solve.sh — the canonical SWE-bench Pro oracle path. Bugs Codex flagged that were FALSE POSITIVES (verified against code): - "session counter reset" — disconnect() already resets both counters - "None instruction" — _resolve_prompts returns [instruction] not [None] Tests still pass: 15 user + 58 sandbox = 73 total. * feat: per-task verifier hardening opt-outs + restore --rootdir=/app Two related changes addressing SWE-bench Pro oracle compatibility: 1) Restore --rootdir=/app in PYTEST_ADDOPTS Removing --rootdir entirely (PR #187) made pytest fall back to /dev as rootdir (from -c /dev/null), producing test node IDs like ../dev/::test_foo instead of /::test_foo. The verifier expects full-path IDs and reported 0 passing tests on openlibrary even though all 18 tests passed. --rootdir=/app anchors test IDs to the canonical Harbor repo root while -c /dev/null still blocks pyproject.toml/pytest.ini discovery and --confcutdir=/tests still blocks conftest walk-up beyond /tests. 2) Per-task [verifier.hardening] opt-outs in task.toml The cleanup that deletes agent-injected conftest.py also deletes legitimate repo conftest.py files. qutebrowser ships conftest.py that sets up import order to break a real circular dependency between qutebrowser.browser.inspector and qutebrowser.misc.miscwidgets — without them, pytest collection fails on a type annotation in miscwidgets.py:419. 
Tasks now declare opt-outs in task.toml: [verifier.hardening] cleanup_conftests = false # qutebrowser Defaults remain secure (all True). New helpers in _sandbox.py: - HARDENING_DEFAULTS: dict of feature flags - _read_hardening_config(task_dir): parse task.toml [verifier.hardening] - _build_cleanup_cmd(hardening): build cleanup honoring opt-outs CLEANUP_CMD constant kept as backward-compat alias. Both harden_before_verify() and Trial.soft_verify() now read per-task hardening config before running cleanup. Validation on SWE-bench Pro oracle (Daytona): Before: 2/4 (ansible, flipt) — openlibrary failed test ID format, qutebrowser failed conftest deletion After: 5/5 (ansible, flipt, openlibrary, qutebrowser, navidrome) Tests: 80 passing (15 user + 65 sandbox including 7 new opt-out tests). * docs: add progressive-disclosure guide + SWE-bench Pro demo notebook For Josh's SWE-bench Pro use case (and Harbor #1316 parity in the no-second-LLM case): - docs/progressive-disclosure.md: dedicated guide for the BaseUser abstraction. Covers the API, oracle access, [verifier.hardening] opt-outs, and when to choose BaseUser vs multi-role Scene. - docs/use-cases.md: brief mention in §1 (Interactive User Simulation) pointing to progressive-disclosure.md for the lighter-weight callback-based pattern. - examples/swebench_pro_progressive_disclosure.ipynb: clean rewrite of the existing notebook. Shows the API, oracle 5/5, baseline 4 tasks, per-task hardening opt-out example, and a placeholder cell that auto- loads the latest progressive-disclosure run from /tmp/swebench-pro-jobs/progressive when one exists. Executes top-to- bottom against the current oracle/baseline CSV. - examples/swebench_pro_user_dogfood.py: ready-to-run script for progressive disclosure on any of the 5 working SWE-bench Pro tasks. Three-round user: terse → failing tests + half spec → full spec. - experiments/swebench-pro-results.csv: oracle + baseline results from 2026-04-24 Daytona run. 
qutebrowser entry is pre-fix (verified post- fix separately, noted in notebook). * docs: add progressive-disclosure.md to CLAUDE.md docs index --- CLAUDE.md | 2 +- docs/progressive-disclosure.md | 214 ++++++++++ docs/use-cases.md | 4 + .../swebench_pro_progressive_disclosure.ipynb | 369 ++++++++++++++++++ examples/swebench_pro_user_dogfood.py | 135 +++++++ examples/user_dogfood.py | 93 +++++ examples/user_dogfood_results/result.json | 25 ++ .../user_dogfood_results/user_rounds.jsonl | 3 + experiments/swebench-pro-results.csv | 9 + .../swebench_pro_oracle_and_baseline.py | 263 +++++++++++++ pyproject.toml | 2 + src/benchflow/__init__.py | 6 + src/benchflow/_acp_run.py | 48 ++- src/benchflow/_agent_env.py | 4 + src/benchflow/_sandbox.py | 290 ++++++++------ src/benchflow/agents/registry.py | 31 ++ src/benchflow/trial.py | 219 ++++++++++- src/benchflow/user.py | 101 +++++ tests/test_sandbox_hardening.py | 209 +++++----- tests/test_user.py | 336 ++++++++++++++++ 20 files changed, 2142 insertions(+), 221 deletions(-) create mode 100644 docs/progressive-disclosure.md create mode 100644 examples/swebench_pro_progressive_disclosure.ipynb create mode 100644 examples/swebench_pro_user_dogfood.py create mode 100644 examples/user_dogfood.py create mode 100644 examples/user_dogfood_results/result.json create mode 100644 examples/user_dogfood_results/user_rounds.jsonl create mode 100644 experiments/swebench-pro-results.csv create mode 100644 experiments/swebench_pro_oracle_and_baseline.py create mode 100644 src/benchflow/user.py create mode 100644 tests/test_user.py diff --git a/CLAUDE.md b/CLAUDE.md index 7695ddc..b089ffe 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ Multi-turn agent benchmarking with ACP. -Docs: `docs/quickstart.md`, `docs/cli-reference.md`, `docs/api-reference.md`, `docs/task-authoring.md`, `docs/use-cases.md`. 
+Docs: `docs/quickstart.md`, `docs/cli-reference.md`, `docs/api-reference.md`, `docs/task-authoring.md`, `docs/use-cases.md`, `docs/progressive-disclosure.md`. ## Setup diff --git a/docs/progressive-disclosure.md b/docs/progressive-disclosure.md new file mode 100644 index 0000000..13e3c78 --- /dev/null +++ b/docs/progressive-disclosure.md @@ -0,0 +1,214 @@ +# Progressive Disclosure with `BaseUser` + +A pattern for multi-round agent runs where a Python callback drives the loop, deciding what to tell the agent next based on what happened in the previous round. + +This is BenchFlow's lightweight alternative to multi-agent "user simulation" Scenes (see [use-cases.md](./use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)). Use a `BaseUser` callback when: + +- You need programmatic control over the loop (e.g. terse prompt → hints on test failure → stop on pass). +- You don't want to spin up a second LLM just to play the "user" role. +- Your "user" logic is rule-based or oracle-guided rather than open-ended. + +For comparison: a Scene-based simulated user is another LLM with its own tool access, useful for nuanced feedback. A `BaseUser` is a sync/async Python function, useful for deterministic, scriptable progressive disclosure. + +--- + +## Why this exists + +This was built for [Josh's SWE-bench Pro use case](https://github.com/swe-bench-pro/swe-bench-pro): the dataset's instructions are long structured specs that overwhelm agents in a single turn. A `BaseUser` lets you compress the spec to a terse prompt for round 0, watch which tests fail, then disclose hints from the spec on subsequent rounds. + +It is also benchflow's parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case — Harbor's proposal required a FastMCP sidecar; BenchFlow's `BaseUser` is in-process Python. 
+ +--- + +## Quick start + +```python +import asyncio +from pathlib import Path +import benchflow as bf +from benchflow import FunctionUser, RoundResult +from benchflow.trial import TrialConfig, Scene + + +def my_user(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + # Round 0: terse prompt, no hints + return instruction.split("\n")[0] + if rr and rr.rewards and rr.rewards.get("reward", 0) >= 1.0: + return None # passed, stop + if round >= 3: + return None # cap at 3 rounds + # Otherwise: show the failing tests as a hint for next round + return ( + f"The previous attempt failed these tests:\n{rr.verifier_output}\n" + f"Here is the full spec for context:\n{instruction}" + ) + + +config = TrialConfig( + task_path=Path(".ref/swebenchpro/instance_flipt-io__flipt-..."), + scenes=[Scene.single(agent="opencode", model="anthropic/claude-sonnet-4-6")], + user=FunctionUser(my_user), + max_user_rounds=3, + environment="daytona", +) +result = asyncio.run(bf.run(config)) +``` + +--- + +## API + +### `BaseUser` + +Subclass and override `run()`. Optionally override `setup()` for one-time initialization. + +```python +from benchflow import BaseUser, RoundResult + + +class MyUser(BaseUser): + async def setup(self, instruction: str, solution: str | None = None) -> None: + """Called once before the first round. + + instruction — the original task instruction (from instruction.md) + solution — the gold answer if oracle_access=True, else None + """ + self.spec_lines = instruction.split("\n") + self.gold = solution # only set if oracle_access=True + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + """Produce the next prompt, or None to stop the loop. + + round — 0-indexed round number + instruction — the original task instruction + round_result — None on round 0; previous round's outcome on subsequent rounds + """ + ... 
# return prompt str or None +``` + +### `RoundResult` + +Dataclass passed to `run()` from round 1 onward. + +```python +@dataclass +class RoundResult: + round: int # 0-indexed + trajectory: list[dict] # ACP events from this round only + rewards: dict[str, Any] | None # verifier rewards (None if verifier crashed) + verifier_output: str | None # raw verifier stdout/log content + verifier_error: str | None # exception message if verifier failed + n_tool_calls: int # tool calls in this round +``` + +### `PassthroughUser` + +Sends the instruction unchanged on round 0, stops on round 1. Backward-compatible single-round behavior. + +### `FunctionUser` + +Wraps a plain function as a `BaseUser`. Sync and async both supported (via `inspect.isawaitable`). + +```python +def fn(round, instruction, rr): return None if round > 0 else instruction +user = FunctionUser(fn) + +async def afn(round, instruction, rr): ... +user = FunctionUser(afn) +``` + +### `TrialConfig` fields + +```python +user: BaseUser | None = None # the callback +max_user_rounds: int = 5 # hard cap on rounds (loop stops earlier if user returns None) +oracle_access: bool = False # expose gold solution to user.setup() +``` + +A `User` requires a single-scene, single-role config. Multi-scene or multi-role configs raise `ValueError`. + +--- + +## Oracle access + +When `oracle_access=True`, the trial: + +1. Reads `/solution/solve.sh` before the agent starts and passes its content to `user.setup(instruction, solution=...)`. +2. Moves `/solution` → `/solution_oracle_backup` so the agent cannot read it during its rounds. +3. Temporarily restores `/solution` for `soft_verify()` between rounds (and re-hides it). +4. Restores `/solution` permanently before the final `verify()`. + +Step 4 is wrapped in a `try/finally`, so if a round throws, the restore still runs. + +> ⚠️ Setting `oracle_access=True` without a `User` is a misconfiguration — the solution stays exposed to the agent for the entire trial. 
BenchFlow logs a `WARNING` at setup time when this happens. + +Use cases for oracle access: +- Dataset generation: have the user generate optimal prompts based on knowing the answer. +- Curriculum learning: progressively reveal hints from the gold solution. +- Research: study how much oracle information is needed for an agent to succeed. + +--- + +## Per-task hardening opt-outs + +The verifier's pre-run cleanup deletes `conftest.py` files outside `/tests/` to prevent agent reward-hacking. Some tasks (e.g. qutebrowser) ship legitimate `conftest.py` that sets up Python's import order to break a real circular dependency. The default cleanup deletes them, breaking pytest collection. + +Tasks declare opt-outs in `task.toml`: + +```toml +[verifier] +timeout_sec = 3000 + +[verifier.hardening] +cleanup_conftests = false +``` + +Available flags (all default `true` — secure-by-default): + +| Flag | Effect when `false` | +|------|---------------------| +| `cleanup_conftests` | Don't delete `conftest.py` outside `/tests/` before verify | + +Other cleanup steps (`sitecustomize.py`, `.pth` files, `*.py` in `/tmp`) always run — they have no legitimate use case in repo source trees and broaden the attack surface if disabled. + +Unknown keys in `[verifier.hardening]` are logged as warnings and ignored. String values for boolean flags are rejected (must be TOML `true` / `false`). + +--- + +## Failure modes + +The user loop catches exceptions from `user.run()` and logs them as the trial error, breaking out of the loop: + +```python +[User] round 2: prompt='Try again, focusing on...' +ERROR: user.run() failed at round 2: KeyError: 'spec_section' +``` + +`soft_verify()` between rounds catches its own timeouts and crashes — they surface as `RoundResult.verifier_error`, not as trial-level failures. The next round still runs; the user sees the error and decides whether to continue. + +Trajectory and tool counts are sliced per round from `Trial._trajectory`. 
The session counters reset on `disconnect()` between rounds, so each round's `RoundResult.trajectory` and `n_tool_calls` reflect only that round's events. + +--- + +## Worked example + +See [`examples/swebench_pro_progressive_disclosure.ipynb`](../examples/swebench_pro_progressive_disclosure.ipynb) for a 5-task SWE-bench Pro comparison: oracle vs single-round baseline vs 3-round progressive disclosure on flipt and openlibrary. + +For a minimal end-to-end script, see [`examples/user_dogfood.py`](../examples/user_dogfood.py). + +--- + +## Comparison with multi-agent Scene-based user simulation + +| Pattern | When to use | +|---------|-------------| +| `BaseUser` callback (this doc) | Programmatic, rule-based, deterministic. No second LLM. Cheap. | +| Multi-role Scene with simulated-user role ([use-cases.md §1](./use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)) | Open-ended, conversational. The "user" is another LLM with full tool access. Better for nuanced human-like interaction. | + +Both patterns coexist. Choose `BaseUser` for the lighter-weight case; choose Scenes when you actually want a second agent in the loop. diff --git a/docs/use-cases.md b/docs/use-cases.md index 63fa243..fbdf108 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -84,6 +84,10 @@ result = await bf.run(config) - The user agent is a real LLM with full tool access -- it can read files, check outputs, and give nuanced feedback, not just templated responses. - Same task folder works for single-turn (baseline) and interactive (with user) via different YAML configs. +### Lighter-weight alternative: `BaseUser` callback + +When you don't need a second LLM and your "user" logic is rule-based or oracle-guided (e.g. compress instruction → show test failures as hints → stop on pass), use a `BaseUser` Python callback instead of a multi-role Scene. See [progressive-disclosure.md](./progressive-disclosure.md). 
This is benchflow's no-second-LLM parity answer to Harbor #1316 and was built for the SWE-bench Pro progressive-disclosure use case. + --- ## 2. Code Review Loop (followup-bench) diff --git a/examples/swebench_pro_progressive_disclosure.ipynb b/examples/swebench_pro_progressive_disclosure.ipynb new file mode 100644 index 0000000..3b6fe2c --- /dev/null +++ b/examples/swebench_pro_progressive_disclosure.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Progressive Disclosure on SWE-bench Pro\n", + "\n", + "Demo of the `BaseUser` abstraction for multi-round agent runs on SWE-bench Pro.\n", + "\n", + "**Tasks:** ansible, flipt, openlibrary, navidrome, qutebrowser (5 oracle-passing tasks).\n", + "**Baseline agent:** Gemini 3.1 Pro Preview, single-round.\n", + "**Progressive:** `BaseUser` callback, up to 3 rounds, hints disclosed on test failure.\n", + "\n", + "Built for [Josh's GitHub/Microsoft SWE-bench Pro use case](https://github.com/swe-bench-pro/swe-bench-pro). Parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case — see [`docs/progressive-disclosure.md`](../docs/progressive-disclosure.md).\n", + "\n", + "## Setup history (2026-04-24)\n", + "\n", + "| Issue | Fix |\n", + "|-------|-----|\n", + "| `--rootdir` removed → test IDs came out as `../dev/::test_name`, verifier saw 0 passes despite all tests passing | Restored `--rootdir=/app` in `PYTEST_ADDOPTS` |\n", + "| `conftest.py` cleanup deleted qutebrowser's legitimate import-order conftests, breaking pytest collection (circular import) | Per-task `[verifier.hardening]` opt-out in `task.toml`: `cleanup_conftests = false` |\n", + "\n", + "Result: oracle 2/4 → 5/5." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
The `BaseUser` API" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:06.729067Z", + "iopub.status.busy": "2026-04-25T01:10:06.728964Z", + "iopub.status.idle": "2026-04-25T01:10:10.646349Z", + "shell.execute_reply": "2026-04-25T01:10:10.645629Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BaseUser: (self, round: 'int', instruction: 'str', round_result: 'RoundResult | None' = None) -> 'str | None'\n", + "RoundResult fields: ['round', 'trajectory', 'rewards', 'verifier_output', 'verifier_error', 'n_tool_calls']\n", + "TrialConfig user fields: ['user', 'max_user_rounds', 'oracle_access']\n" + ] + } + ], + "source": [ + "import inspect\n", + "import os\n", + "\n", + "os.chdir('/workspace/repos/benchflow')\n", + "\n", + "from benchflow import BaseUser, FunctionUser, RoundResult\n", + "from benchflow.trial import TrialConfig\n", + "\n", + "print('BaseUser:', inspect.signature(BaseUser.run))\n", + "print('RoundResult fields:', list(RoundResult.__dataclass_fields__))\n", + "print('TrialConfig user fields:', [f for f in TrialConfig.__dataclass_fields__\n", + " if f in ('user', 'max_user_rounds', 'oracle_access')])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. A minimal progressive-disclosure user\n", + "\n", + "Three-round progressive disclosure: terse prompt → hints with failing tests → full spec." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.667007Z", + "iopub.status.busy": "2026-04-25T01:10:10.666654Z", + "iopub.status.idle": "2026-04-25T01:10:10.670952Z", + "shell.execute_reply": "2026-04-25T01:10:10.670403Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User type: FunctionUser\n" + ] + } + ], + "source": [ + "def progressive(round, instruction, rr):\n", + " if round == 0:\n", + " first_line = instruction.split('\\n', 1)[0].strip()\n", + " return f'{first_line}\\n\\nRead /app/ to understand the codebase, then implement the fix. Run tests when you think you are done.'\n", + "\n", + " if rr and rr.rewards:\n", + " score = rr.rewards.get('reward', rr.rewards.get('exact_match', 0))\n", + " if score >= 1.0:\n", + " return None # passed, stop\n", + "\n", + " if round == 1:\n", + " half = len(instruction) // 2\n", + " return (\n", + " f'Tests failed:\\n{(rr.verifier_output or \"\")[:1500]}\\n\\n'\n", + " f'First half of spec:\\n{instruction[:half]}'\n", + " )\n", + "\n", + " if round == 2:\n", + " return f'Full spec:\\n{instruction}'\n", + "\n", + " return None\n", + "\n", + "user = FunctionUser(progressive)\n", + "print(f'User type: {type(user).__name__}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Oracle validation: 5/5 SWE-bench Pro tasks pass\n", + "\n", + "Verifies the gold solution (`solve.sh`) reaches reward=1.0 on every task. Required before trusting baseline/progressive numbers — without a working oracle we can't tell if low rewards mean agent failure or verifier failure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.672135Z", + "iopub.status.busy": "2026-04-25T01:10:10.672040Z", + "iopub.status.idle": "2026-04-25T01:10:10.676265Z", + "shell.execute_reply": "2026-04-25T01:10:10.675540Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Reward Time(s) Notes\n", + "------------------------------------------------------------\n", + "ansible 1.0 54.3 \n", + "flipt 1.0 480.5 \n", + "openlibrary 1.0 147.7 \n", + "qutebrowser 1.0 24.5 after [verifier.hardening] cleanup_conftests=false\n", + "navidrome 1.0 ~120 verified separately\n" + ] + } + ], + "source": [ + "import csv\n", + "from pathlib import Path\n", + "\n", + "results = list(csv.DictReader(open('experiments/swebench-pro-results.csv')))\n", + "\n", + "# qutebrowser oracle in CSV is from BEFORE the hardening opt-out fix.\n", + "# Post-fix oracle: 1.0 (verified separately on 2026-04-24).\n", + "qutebrowser_post_fix = {'task': 'qutebrowser', 'experiment': 'oracle',\n", + " 'reward': '1.0', 'elapsed_s': '24.5',\n", + " 'note': 'after [verifier.hardening] cleanup_conftests=false'}\n", + "navidrome_oracle = {'task': 'navidrome', 'experiment': 'oracle',\n", + " 'reward': '1.0', 'elapsed_s': '~120',\n", + " 'note': 'verified separately'}\n", + "\n", + "oracle = [r for r in results if r['experiment'] == 'oracle' and r['task'] != 'qutebrowser']\n", + "for r in oracle:\n", + " r['note'] = ''\n", + "oracle.append(qutebrowser_post_fix)\n", + "oracle.append(navidrome_oracle)\n", + "\n", + "print(f'{\"Task\":<14} {\"Reward\":<8} {\"Time(s)\":<8} Notes')\n", + "print('-' * 60)\n", + "for r in oracle:\n", + " print(f'{r[\"task\"]:<14} {r[\"reward\"]:<8} {r[\"elapsed_s\"]:<8} {r[\"note\"]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Baseline: Gemini 3.1 Pro single-round\n", + "\n", + "Each task gets the full SWE-bench Pro instruction in one shot. No user, no rounds." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.677398Z", + "iopub.status.busy": "2026-04-25T01:10:10.677287Z", + "iopub.status.idle": "2026-04-25T01:10:10.681221Z", + "shell.execute_reply": "2026-04-25T01:10:10.680559Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Reward Tools Time(s) \n", + "------------------------------------------\n", + "ansible 1.0 23 206.8 \n", + "flipt 0.0 61 1443.7 \n", + "openlibrary 1.0 32 340.0 \n", + "qutebrowser 0.0 72 543.6 \n", + "\n", + "Baseline (Gemini 3.1 Pro, 1 round): 2/4 passed\n" + ] + } + ], + "source": [ + "baseline = [r for r in results if r['experiment'] == 'baseline']\n", + "\n", + "print(f'{\"Task\":<14} {\"Reward\":<8} {\"Tools\":<6} {\"Time(s)\":<8}')\n", + "print('-' * 42)\n", + "for r in baseline:\n", + " print(f'{r[\"task\"]:<14} {r[\"reward\"]:<8} {r[\"n_tool_calls\"]:<6} {r[\"elapsed_s\"]:<8}')\n", + "\n", + "passed = sum(1 for r in baseline if float(r['reward']) >= 1.0)\n", + "print(f'\\nBaseline (Gemini 3.1 Pro, 1 round): {passed}/{len(baseline)} passed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Progressive disclosure: where it should help\n", + "\n", + "**flipt** is the interesting case: oracle passes (gold `solve.sh` works) but Gemini's single-round baseline fails after 61 tool calls and 24 minutes. The agent had the full spec but didn't solve it. Hypothesis: progressive disclosure with failing-test feedback could let the agent course-correct.\n", + "\n", + "**openlibrary** is the regression test: baseline already passes, so progressive disclosure should also pass. 
If progressive breaks something the baseline solves, that's a bug in the user loop.\n", + "\n", + "Run the script:\n", + "```bash\n", + "GEMINI_API_KEY=... python examples/swebench_pro_user_dogfood.py --task flipt --max-rounds 3\n", + "```\n", + "\n", + "Results land in `/tmp/swebench-pro-jobs/progressive/<timestamp>/<instance>/`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-25T01:10:10.682639Z", + "iopub.status.busy": "2026-04-25T01:10:10.682546Z", + "iopub.status.idle": "2026-04-25T01:10:10.686008Z", + "shell.execute_reply": "2026-04-25T01:10:10.685533Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latest trial: /tmp/swebench-pro-jobs/progressive/2026-04-24__18-07-33/instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1__0684d916\n", + " (no user_rounds.jsonl yet — run is in progress or failed before any round)\n" + ] + } + ], + "source": [ + "# Once you have a progressive run, point at its trial dir and load round results.\n", + "# Each round's outcome is logged to user_rounds.jsonl.\n", + "\n", + "import json\n", + "\n", + "PROGRESSIVE_ROOT = Path('/tmp/swebench-pro-jobs/progressive')\n", + "\n", + "if PROGRESSIVE_ROOT.exists():\n", + " runs = sorted(PROGRESSIVE_ROOT.iterdir())\n", + " if runs:\n", + " latest_job = runs[-1]\n", + " instances = list(latest_job.iterdir())\n", + " if instances:\n", + " trial = instances[0]\n", + " print(f'Latest trial: {trial}')\n", + " rounds_log = trial / 'user_rounds.jsonl'\n", + " if rounds_log.exists():\n", + " rounds = [json.loads(l) for l in rounds_log.read_text().splitlines()]\n", + " print(f'\\n{\"Round\":<7} {\"Reward\":<10} {\"Tools\":<7} Verifier error')\n", + " print('-' * 60)\n", + " for r in rounds:\n", + " score = (r.get('rewards') or {}).get('reward', '?')\n", + " err = (r.get('verifier_error') or '')[:30]\n", + " print(f'{r[\"round\"]:<7} {score!s:<10} {r[\"n_tool_calls\"]:<7} {err}')\n", + " 
else:\n", + " print(' (no user_rounds.jsonl yet — run is in progress or failed before any round)')\n", + "else:\n", + " print(f'No progressive runs at {PROGRESSIVE_ROOT} yet.')\n", + " print('Run examples/swebench_pro_user_dogfood.py to generate one.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Per-task hardening opt-outs\n", + "\n", + "qutebrowser ships legitimate `conftest.py` files that fix a circular import in its own source. The default verifier hardening deletes them, breaking pytest collection. The task opts out:\n", + "\n", + "```toml\n", + "# .ref/swebenchpro/instance_qutebrowser__.../task.toml\n", + "[verifier.hardening]\n", + "cleanup_conftests = false\n", + "```\n", + "\n", + "Available flags (all default `true` — secure-by-default):\n", + "\n", + "| Flag | Effect when `false` |\n", + "|------|---------------------|\n", + "| `cleanup_conftests` | Don't delete `conftest.py` outside `/tests/` before verify |\n", + "\n", + "See [`docs/progressive-disclosure.md`](../docs/progressive-disclosure.md) for the full design rationale." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Comparison with multi-agent simulated user\n", + "\n", + "BenchFlow has two patterns for multi-round agent runs:\n", + "\n", + "| Pattern | When to use |\n", + "|---------|-------------|\n", + "| `BaseUser` callback (this notebook) | Programmatic, rule-based, deterministic. No second LLM. Cheap. |\n", + "| Multi-role Scene with simulated-user role ([use-cases.md §1](../docs/use-cases.md#1-interactive-user-simulation-harbor-1316-equivalent)) | Open-ended, conversational. The 'user' is another LLM with full tool access. |\n", + "\n", + "Both are functionally at parity with [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316), avoiding the FastMCP sidecar requirement. Choose based on whether you want a deterministic callback or another LLM agent." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/swebench_pro_user_dogfood.py b/examples/swebench_pro_user_dogfood.py new file mode 100644 index 0000000..997ccc8 --- /dev/null +++ b/examples/swebench_pro_user_dogfood.py @@ -0,0 +1,135 @@ +"""Dogfood: SWE-bench Pro progressive disclosure with BaseUser. + +Demonstrates the BaseUser abstraction on a SWE-bench Pro task — the original +motivation for this feature (Josh's GitHub/Microsoft use case). + +The user: + Round 0: terse problem description (one sentence from the spec). + Round 1+: if tests failed, append the failing test names and a section of + the original spec as a hint. + Stops when reward >= 1.0 or max_user_rounds hit. + +Usage: + GEMINI_API_KEY=... python examples/swebench_pro_user_dogfood.py + GEMINI_API_KEY=... 
python examples/swebench_pro_user_dogfood.py --task openlibrary + +Tasks available (oracle-validated 5/5 on 2026-04-24): + ansible, flipt, openlibrary, navidrome, qutebrowser +""" + +import argparse +import asyncio +import logging +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig +from benchflow.user import FunctionUser, RoundResult + +logging.basicConfig(level=logging.INFO, format="%(name)s %(message)s") + + +SWEBENCH_PRO_TASKS = { + "ansible": "instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2", + "flipt": "instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1", + "openlibrary": "instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26", + "navidrome": "instance_navidrome__navidrome-0130c6dc13438b48cf0fdfab08a89e357b5517c9", + "qutebrowser": "instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171", +} + + +def make_progressive_user() -> FunctionUser: + """User that compresses the instruction on round 0 and discloses hints on failure.""" + + def progressive( + round: int, instruction: str, rr: RoundResult | None + ) -> str | None: + # Round 0: terse — first line of the spec. + if round == 0: + first_line = instruction.split("\n", 1)[0].strip() + return ( + f"{first_line}\n\n" + "Read /app/ to understand the codebase, then implement the fix. " + "Run tests when you think you're done." + ) + + # Stop on success. + if rr and rr.rewards: + score = rr.rewards.get("reward", rr.rewards.get("exact_match", 0)) + if score >= 1.0: + print(f" [User] reward={score} at round {round}, stopping.") + return None + + # Round 1: show failing tests + first half of spec. 
+ if round == 1: + half = len(instruction) // 2 + return ( + "The verifier reported these issues:\n\n" + f"{(rr.verifier_output or '')[:1500]}\n\n" + "Here is the first half of the spec for context:\n\n" + f"{instruction[:half]}\n\n" + "Continue working in /app/." + ) + + # Round 2: full spec. + if round == 2: + return ( + "Still failing. Full specification:\n\n" + f"{instruction}\n\n" + "Address every requirement listed above." + ) + + return None + + return FunctionUser(progressive) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="flipt", + choices=list(SWEBENCH_PRO_TASKS), + help="Which SWE-bench Pro task to run", + ) + parser.add_argument("--agent", default="gemini") + parser.add_argument("--model", default="gemini-3.1-pro-preview") + parser.add_argument("--backend", default="daytona") + parser.add_argument("--max-rounds", type=int, default=3) + args = parser.parse_args() + + task_dir = SWEBENCH_PRO_TASKS[args.task] + task_path = Path(".ref/swebenchpro") / task_dir + if not task_path.exists(): + print(f"Task not found: {task_path}") + return + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=args.agent, model=args.model)], + environment=args.backend, + sandbox_user="agent", + user=make_progressive_user(), + max_user_rounds=args.max_rounds, + jobs_dir="/tmp/swebench-pro-jobs/progressive", + ) + + print(f"Progressive disclosure on {args.task}") + print(f" Agent: {args.agent} / {args.model}") + print(f" Backend: {args.backend}") + print(f" Rounds: up to {args.max_rounds}") + print() + + result = await bf.run(config) + + print("\n=== RESULT ===") + print(f" Task: {result.task_name}") + print(f" Rewards: {result.rewards}") + print(f" Tool calls: {result.n_tool_calls}") + print(f" Error: {result.error}") + if result.trial_dir: + print(f" Trial dir: {result.trial_dir}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/user_dogfood.py 
b/examples/user_dogfood.py new file mode 100644 index 0000000..517afc0 --- /dev/null +++ b/examples/user_dogfood.py @@ -0,0 +1,93 @@ +"""Dogfood: run regex-log task with progressive-disclosure User. + +Demonstrates the BaseUser abstraction — a FunctionUser that: + Round 0: gives a terse version of the instruction + Round 1+: if tests failed, gives a hint based on the full instruction + Stops when tests pass or max_rounds hit. + +Usage: + GEMINI_API_KEY=... python examples/user_dogfood.py +""" + +import asyncio +import logging +from pathlib import Path + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig +from benchflow.user import FunctionUser, RoundResult + +logging.basicConfig(level=logging.INFO, format="%(name)s %(message)s") + + +def progressive_user( + round: int, instruction: str, rr: RoundResult | None +) -> str | None: + if round == 0: + return ( + "Write a regex that matches dates (YYYY-MM-DD) in log lines " + "containing an IPv4 address. Save it to /app/regex.txt" + ) + + if rr and rr.rewards: + score = rr.rewards.get("exact_match", rr.rewards.get("reward", 0)) + if score >= 1.0: + print(f" [User] Tests passed at round {round}! Stopping.") + return None + + if round == 1: + return ( + "The tests failed. Important details you may have missed:\n" + "- Match only the LAST date in each line\n" + "- Feb can have up to 29 days\n" + "- Dates/IPs must not be preceded/followed by alphanumeric chars\n" + "- Use re.findall with re.MULTILINE\n" + "Fix /app/regex.txt" + ) + + if round == 2: + return ( + "Still failing. 
Here's the full instruction:\n\n" + instruction + ) + + return None + + +async def main(): + task_path = Path(".ref/terminal-bench-2/regex-log") + if not task_path.exists(): + print(f"Task not found at {task_path}") + return + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent="gemini", model="gemini-2.5-flash")], + environment="daytona", + user=FunctionUser(progressive_user), + max_user_rounds=4, + ) + + print("Running progressive-disclosure trial on regex-log...") + print(" Agent: gemini/flash") + print(f" Max rounds: {config.max_user_rounds}") + print(f" Environment: {config.environment}") + print() + + result = await bf.run(config) + + print("\n=== RESULT ===") + print(f" Task: {result.task_name}") + print(f" Rewards: {result.rewards}") + print(f" Error: {result.error}") + print(f" Tool calls: {result.n_tool_calls}") + + if result.rewards: + score = result.rewards.get("exact_match", result.rewards.get("reward", 0)) + if score >= 1.0: + print(" PASSED (final verify)") + else: + print(f" FAILED (score={score})") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/user_dogfood_results/result.json b/examples/user_dogfood_results/result.json new file mode 100644 index 0000000..0f36059 --- /dev/null +++ b/examples/user_dogfood_results/result.json @@ -0,0 +1,25 @@ +{ + "task_name": "regex-log", + "trial_name": "regex-log__bab1cbf6", + "rewards": { + "reward": 0.0 + }, + "agent": "gemini", + "agent_name": "gemini-cli", + "model": "gemini-2.5-flash", + "n_tool_calls": 8, + "n_prompts": 1, + "error": null, + "verifier_error": null, + "partial_trajectory": false, + "trajectory_source": "acp", + "started_at": "2026-04-22 22:37:42.034339", + "finished_at": "2026-04-22 22:40:58.748485", + "timing": { + "environment_setup": 1.3, + "agent_setup": 3.1, + "agent_execution": 15.6, + "verifier": 6.4, + "total": 196.7 + } +} \ No newline at end of file diff --git a/examples/user_dogfood_results/user_rounds.jsonl 
b/examples/user_dogfood_results/user_rounds.jsonl new file mode 100644 index 0000000..258dfaf --- /dev/null +++ b/examples/user_dogfood_results/user_rounds.jsonl @@ -0,0 +1,3 @@ +{"round": 0, "prompt": "Write a regex that matches dates (YYYY-MM-DD) in log lines containing an IPv4 address. Save it to /app/regex.txt", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 2, "n_trajectory_events": 4} +{"round": 1, "prompt": "The tests failed. Important details you may have missed:\n- Match only the LAST date in each line\n- Feb can have up to 29 days\n- Dates/IPs must not be preceded/followed by alphanumeric chars\n- Use re.findall with re.MULTILINE\nFix /app/regex.txt", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 3, "n_trajectory_events": 5} +{"round": 2, "prompt": "Still failing. Here's the full instruction:\n\nWrite a regex expression that matches dates in the format YYYY-MM-DD appearing in lines that contain an IPv4 address in a log file.\nIf multiple dates are present in a line, the regex should match only the last date in that line.\nAssume that February can have up to 29 days in all years, without distinguishing leap years from non-leap years.\nIPv4 addresses use normal decimal notation without leading zeros in each octet.\n\nNote: Be careful that there might be text in the log that looks similar to dates or IPv4 addresses but is not (e.g., user 1134-12-1234). 
\nTo avoid false matches, ensure that valid dates and IPv4 addresses are not immediately preceded or followed by alphanumeric characters.\n\nSave your regex in /app/regex.txt\nThe regex will be read from the file and applied to the log file contents using Python's re.findall with the re.MULTILINE flag.\nExample Python usage:\n```\nimport re\n\nwith open(\"/app/regex.txt\") as f:\n pattern = f.read().strip()\n\nmatches = re.findall(pattern, log_text, re.MULTILINE)\n```", "rewards": {"reward": 0.0}, "verifier_error": null, "n_tool_calls": 3, "n_trajectory_events": 5} diff --git a/experiments/swebench-pro-results.csv b/experiments/swebench-pro-results.csv new file mode 100644 index 0000000..2c10f0b --- /dev/null +++ b/experiments/swebench-pro-results.csv @@ -0,0 +1,9 @@ +agent,elapsed_s,error,experiment,model,n_tool_calls,reward,task,task_full +,54.3,,oracle,,0,1.0,ansible,instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2 +,480.5,,oracle,,0,1.0,flipt,instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1 +,147.7,,oracle,,0,1.0,openlibrary,instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26 +,24.3,,oracle,,0,0.0,qutebrowser,instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171 +gemini,206.8,,baseline,gemini-3.1-pro-preview,23,1.0,ansible,instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2 +gemini,1443.7,,baseline,gemini-3.1-pro-preview,61,0.0,flipt,instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1 +gemini,340.0,,baseline,gemini-3.1-pro-preview,32,1.0,openlibrary,instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26 
+gemini,543.6,,baseline,gemini-3.1-pro-preview,72,0.0,qutebrowser,instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171 diff --git a/experiments/swebench_pro_oracle_and_baseline.py b/experiments/swebench_pro_oracle_and_baseline.py new file mode 100644 index 0000000..00b865d --- /dev/null +++ b/experiments/swebench_pro_oracle_and_baseline.py @@ -0,0 +1,263 @@ +"""SWE-bench Pro: oracle validation + Harbor baseline. + +Two experiments on the 4 testable SWE-bench Pro tasks: + + 1) Oracle validation — runs gold solution (solve.sh), verifies reward=1.0 + for all 4 tasks. Confirms the --rootdir=/tests fix resolved qutebrowser + and openlibrary failures. + + 2) Harbor baseline — single-round agent evaluation (no progressive + disclosure). Runs the same 4 tasks with a real agent to establish + baseline pass rates for comparison with progressive disclosure. + +Usage: + # Oracle only (no API key needed): + python experiments/swebench_pro_oracle_and_baseline.py --oracle-only + + # Full run (oracle + baseline): + GEMINI_API_KEY=... python experiments/swebench_pro_oracle_and_baseline.py + + # Customize agent/model/backend: + GEMINI_API_KEY=... 
python experiments/swebench_pro_oracle_and_baseline.py \ + --agent gemini --model gemini-3.1-pro-preview --backend daytona + +Results → experiments/swebench-pro-results.csv +""" + +import argparse +import asyncio +import csv +import logging +import os +import sys +import time +from pathlib import Path + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s %(message)s", +) +logger = logging.getLogger(__name__) + +sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) + +import benchflow as bf +from benchflow.trial import Scene, TrialConfig + +SWEBENCH_PRO_ROOT = Path(__file__).resolve().parents[1] / ".ref" / "swebenchpro" + +TASKS = [ + "instance_ansible__ansible-0ea40e09d1b35bcb69ff4d9cecf3d0defa4b36e8-v30a923fb5c164d6cd18280c02422f75e611e8fb2", + "instance_flipt-io__flipt-02e21636c58e86c51119b63e0fb5ca7b813b07b1", + "instance_internetarchive__openlibrary-00bec1e7c8f3272c469a58e1377df03f955ed478-v13642507b4fc1f8d234172bf8129942da2c2ca26", + "instance_qutebrowser__qutebrowser-01d1d1494411380d97cac14614a829d3a69cecaf-v2ef375ac784985212b1805e1d0431dc8f1b3c171", +] + +TASK_LABELS = { + "instance_ansible": "ansible", + "instance_flipt": "flipt", + "instance_internet": "openlibrary", + "instance_qutebrowser": "qutebrowser", +} + +RESULTS_FILE = Path(__file__).parent / "swebench-pro-results.csv" +JOBS_DIR = Path("/tmp/swebench-pro-jobs") + + +def task_label(task_name: str) -> str: + for prefix, label in TASK_LABELS.items(): + if task_name.startswith(prefix): + return label + return task_name[:30] + + +async def run_oracle(task_path: Path, backend: str) -> dict: + """Run oracle (gold solution) on a single task. 
Returns result dict.""" + label = task_label(task_path.name) + logger.info(f"[oracle] {label}: starting") + t0 = time.time() + + config = TrialConfig( + task_path=task_path, + agent="oracle", + environment=backend, + sandbox_user="agent", + jobs_dir=str(JOBS_DIR / "oracle"), + ) + + try: + result = await bf.run(config) + elapsed = time.time() - t0 + reward = None + if result.rewards: + reward = result.rewards.get("reward", result.rewards.get("exact_match")) + logger.info( + f"[oracle] {label}: reward={reward} " + f"error={result.error!r} ({elapsed:.0f}s)" + ) + return { + "experiment": "oracle", + "task": label, + "task_full": task_path.name, + "reward": reward, + "error": result.error or "", + "elapsed_s": round(elapsed, 1), + "n_tool_calls": result.n_tool_calls, + } + except Exception as e: + elapsed = time.time() - t0 + logger.error(f"[oracle] {label}: crashed: {e}", exc_info=True) + return { + "experiment": "oracle", + "task": label, + "task_full": task_path.name, + "reward": None, + "error": str(e), + "elapsed_s": round(elapsed, 1), + "n_tool_calls": 0, + } + + +async def run_baseline( + task_path: Path, agent: str, model: str, backend: str +) -> dict: + """Run single-round baseline (no progressive disclosure) on a single task.""" + label = task_label(task_path.name) + logger.info(f"[baseline] {label}: starting ({agent}/{model})") + t0 = time.time() + + config = TrialConfig( + task_path=task_path, + scenes=[Scene.single(agent=agent, model=model)], + environment=backend, + sandbox_user="agent", + jobs_dir=str(JOBS_DIR / "baseline"), + ) + + try: + result = await bf.run(config) + elapsed = time.time() - t0 + reward = None + if result.rewards: + reward = result.rewards.get("reward", result.rewards.get("exact_match")) + logger.info( + f"[baseline] {label}: reward={reward} " + f"tools={result.n_tool_calls} error={result.error!r} ({elapsed:.0f}s)" + ) + return { + "experiment": "baseline", + "task": label, + "task_full": task_path.name, + "agent": agent, + "model": 
model, + "reward": reward, + "error": result.error or "", + "elapsed_s": round(elapsed, 1), + "n_tool_calls": result.n_tool_calls, + } + except Exception as e: + elapsed = time.time() - t0 + logger.error(f"[baseline] {label}: crashed: {e}", exc_info=True) + return { + "experiment": "baseline", + "task": label, + "task_full": task_path.name, + "agent": agent, + "model": model, + "reward": None, + "error": str(e), + "elapsed_s": round(elapsed, 1), + "n_tool_calls": 0, + } + + +async def main(): + parser = argparse.ArgumentParser(description="SWE-bench Pro oracle + baseline") + parser.add_argument("--oracle-only", action="store_true", help="Skip baseline") + parser.add_argument("--agent", default=os.environ.get("AGENT", "gemini")) + parser.add_argument("--model", default=os.environ.get("MODEL", "gemini-3.1-pro-preview")) + parser.add_argument("--backend", default=os.environ.get("BACKEND", "docker")) + parser.add_argument("--concurrency", type=int, default=2) + args = parser.parse_args() + + task_paths = [SWEBENCH_PRO_ROOT / t for t in TASKS] + missing = [p for p in task_paths if not p.exists()] + if missing: + logger.error(f"Missing tasks: {[p.name for p in missing]}") + sys.exit(1) + + JOBS_DIR.mkdir(parents=True, exist_ok=True) + results: list[dict] = [] + + # Phase 1: Oracle validation (all 4 tasks in parallel) + logger.info("=" * 60) + logger.info("Phase 1: Oracle validation (gold solution)") + logger.info("=" * 60) + + sem = asyncio.Semaphore(args.concurrency) + + async def bounded_oracle(tp): + async with sem: + return await run_oracle(tp, args.backend) + + oracle_results = await asyncio.gather( + *[bounded_oracle(tp) for tp in task_paths] + ) + results.extend(oracle_results) + + oracle_pass = sum(1 for r in oracle_results if r["reward"] == 1.0) + logger.info(f"Oracle: {oracle_pass}/{len(oracle_results)} passed") + + if oracle_pass < len(oracle_results): + failed = [r["task"] for r in oracle_results if r["reward"] != 1.0] + logger.warning(f"Oracle failures: 
{failed}") + + # Phase 2: Baseline (single-round agent) + if not args.oracle_only: + logger.info("") + logger.info("=" * 60) + logger.info(f"Phase 2: Baseline ({args.agent}/{args.model})") + logger.info("=" * 60) + + async def bounded_baseline(tp): + async with sem: + return await run_baseline(tp, args.agent, args.model, args.backend) + + baseline_results = await asyncio.gather( + *[bounded_baseline(tp) for tp in task_paths] + ) + results.extend(baseline_results) + + baseline_pass = sum(1 for r in baseline_results if r.get("reward", 0) == 1.0) + logger.info(f"Baseline: {baseline_pass}/{len(baseline_results)} passed") + + # Write CSV + if results: + fieldnames = list(results[0].keys()) + all_keys = set() + for r in results: + all_keys.update(r.keys()) + fieldnames = sorted(all_keys) + + with open(RESULTS_FILE, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + logger.info(f"Results written to {RESULTS_FILE}") + + # Summary table + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"{'Experiment':<12} {'Task':<15} {'Reward':<10} {'Tools':<8} {'Time':<8} {'Error'}") + print("-" * 80) + for r in results: + reward_str = str(r.get("reward", "—")) + tools_str = str(r.get("n_tool_calls", "—")) + time_str = f"{r.get('elapsed_s', 0):.0f}s" + error_str = (r.get("error", "") or "")[:40] + print(f"{r['experiment']:<12} {r['task']:<15} {reward_str:<10} {tools_str:<8} {time_str:<8} {error_str}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 306e71c..62f8f46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,6 +100,8 @@ ignore = [ # Standalone scripts — sys.path manipulation before imports is intentional "experiments/*.py" = ["E402"] "tests/conformance/*.py" = ["E402"] +# Notebooks: cell-local imports + short loop vars are notebook conventions +"examples/*.ipynb" = ["E402", "E741", "SIM115"] # Forward references resolved 
via __future__ annotations — ruff flags them # but they work at runtime; explicit TYPE_CHECKING imports would force eager loads. "src/benchflow/runtime.py" = ["F821"] diff --git a/src/benchflow/__init__.py b/src/benchflow/__init__.py index 35388b5..1d9129a 100644 --- a/src/benchflow/__init__.py +++ b/src/benchflow/__init__.py @@ -63,6 +63,7 @@ from benchflow.trial import Scene as TrialScene from benchflow.trial import Trial, TrialConfig, Turn from benchflow.trial_yaml import trial_config_from_yaml +from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult # Public API surface. Anything not in this list is implementation detail and # may change without notice. Names are grouped by source module to match the @@ -124,6 +125,11 @@ "TrialScene", "Turn", "trial_config_from_yaml", + # User abstraction (progressive disclosure) + "BaseUser", + "FunctionUser", + "PassthroughUser", + "RoundResult", # SDK (backwards compat) "SDK", # Environments / dep staging diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index e134e5f..58b95be 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -36,6 +36,52 @@ _ACP_CONNECT_MAX_RETRIES = 3 _ACP_CONNECT_BASE_DELAY = 2.0 +# models.dev provider inference — used when acp_model_format="provider/model" +# to reconstruct "provider/model" from a bare model name. +_MODELSDEV_PROVIDER_HEURISTICS: list[tuple[str, str]] = [ + # (substring in model name, models.dev provider ID) + ("gemini", "google"), + ("gemma", "google"), + ("gpt", "openai"), + ("o1", "openai"), + ("o3", "openai"), + ("o4", "openai"), + ("claude", "anthropic"), + ("haiku", "anthropic"), + ("sonnet", "anthropic"), + ("opus", "anthropic"), + ("mistral", "mistral"), + ("codestral", "mistral"), +] + + +def _format_acp_model(model: str, agent: str) -> str: + """Format a model ID for ACP session/set_model based on agent requirements. + + Most agents expect a bare model name (e.g. "claude-sonnet-4-6"). 
+ Agents with acp_model_format="provider/model" (e.g. opencode) need the + models.dev provider prefix (e.g. "google/gemini-3.1-pro-preview"). + + Strips benchflow's custom provider prefixes first, then re-adds the + models.dev provider prefix when the agent requires it. + """ + bare = strip_provider_prefix(model) + agent_cfg = AGENTS.get(agent) + if not agent_cfg or agent_cfg.acp_model_format != "provider/model": + return bare + # Already has a slash — assume it's provider/model already + if "/" in bare: + return bare + # Infer the models.dev provider from the bare model name + m = bare.lower() + for substring, provider in _MODELSDEV_PROVIDER_HEURISTICS: + if substring in m: + return f"{provider}/{bare}" + logger.warning( + "Cannot infer models.dev provider for %r — defaulting to anthropic/", bare + ) + return f"anthropic/{bare}" + async def connect_acp( env, @@ -123,7 +169,7 @@ async def connect_acp( agent_cfg = AGENTS.get(agent) if model and (agent_cfg is None or agent_cfg.supports_acp_set_model): - acp_model_id = strip_provider_prefix(model) + acp_model_id = _format_acp_model(model, agent) try: await asyncio.wait_for(acp_client.set_model(acp_model_id), timeout=60) logger.info(f"Model set to: {acp_model_id} (from {model})") diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index e67c741..0229b17 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -64,6 +64,7 @@ def auto_inherit_env(agent_env: dict[str, str]) -> None: "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", "GOOGLE_CLOUD_PROJECT", "GOOGLE_CLOUD_LOCATION", "LLM_API_KEY", @@ -80,6 +81,9 @@ def auto_inherit_env(agent_env: dict[str, str]) -> None: # Mirror GEMINI_API_KEY as GOOGLE_API_KEY (some agents expect one or the other) if "GEMINI_API_KEY" in agent_env and "GOOGLE_API_KEY" not in agent_env: agent_env["GOOGLE_API_KEY"] = agent_env["GEMINI_API_KEY"] + # Mirror GEMINI_API_KEY as GOOGLE_GENERATIVE_AI_API_KEY 
(opencode/models.dev convention) + if "GEMINI_API_KEY" in agent_env and "GOOGLE_GENERATIVE_AI_API_KEY" not in agent_env: + agent_env["GOOGLE_GENERATIVE_AI_API_KEY"] = agent_env["GEMINI_API_KEY"] # CLAUDE_CODE_OAUTH_TOKEN is a separate auth path — Claude CLI reads it # directly. Don't map to ANTHROPIC_API_KEY (different auth mechanism). diff --git a/src/benchflow/_sandbox.py b/src/benchflow/_sandbox.py index 63fbcde..7294a3d 100644 --- a/src/benchflow/_sandbox.py +++ b/src/benchflow/_sandbox.py @@ -16,7 +16,6 @@ import os import re import shlex -import tomllib from pathlib import Path from typing import TYPE_CHECKING @@ -305,7 +304,7 @@ async def _refresh_verifier_workspace(env, workspace: str) -> None: "PYTEST_ADDOPTS": ( "-c /dev/null " # block pyproject.toml/pytest.ini/tox.ini/setup.cfg discovery "--confcutdir=/tests " # block conftest.py walk-up beyond /tests - "--rootdir=/tests " + "--rootdir=/app " # anchor test node IDs to repo root (not /dev from -c) "-p no:cacheprovider" ), # Block pytest11 entry-point plugins. An agent can modify a pre-installed @@ -347,34 +346,24 @@ async def _refresh_verifier_workspace(env, workspace: str) -> None: _SAFE_VERIFIER_PATH_PARTS = tuple(_SAFE_VERIFIER_PATH.split(":")) _RUNTIME_PATH_PREFIXES = ("/tmp", "/var/tmp", "/logs", "/testbed") -# pytest plugin names are not always the same as the PyPI distribution name -# or the option they register. These aliases cover the common benchmark -# verifier plugins while preserving PYTEST_DISABLE_PLUGIN_AUTOLOAD=1. 
-_PYTEST_PLUGIN_ALIASES = { - "ctrf": "ctrf", - "pytest-json-ctrf": "ctrf", - "pytest_json_ctrf": "ctrf", - "pytest_json_ctrf.plugin": "ctrf", - "pytest-json-report": "pytest_jsonreport", - "pytest_json_report": "pytest_jsonreport", - "pytest_jsonreport": "pytest_jsonreport", - "pytest_jsonreport.plugin": "pytest_jsonreport", -} -_PYTEST_OPTION_PLUGINS = { - "--ctrf": "ctrf", - "--json-report": "pytest_jsonreport", - "--json-report-file": "pytest_jsonreport", -} - -# Pytest plugins worth auto-loading when test.sh pip-installs them but the -# task author forgot to declare pytest_plugins in task.toml. Map distribution -# name (as it appears in `pip install pytest-foo`) to importable plugin name. -_PYTEST_INSTALLED_PLUGINS = { - "pytest-asyncio": "pytest_asyncio", - "pytest-anyio": "anyio.pytest_plugin", - "pytest-trio": "pytest_trio", -} -_PIP_INSTALL_RE = re.compile(r"\bpip3?\s+install\b[^\n;|&]*", re.IGNORECASE) +# Container-side script to enumerate pre-installed pytest11 entry points. +# Runs after sandbox_user processes are killed, so the agent cannot install +# new packages between enumeration and verification. The sandbox_user cannot +# pip install (not root), so all discovered plugins are image-authored. +_DISCOVER_PYTEST_PLUGINS_SCRIPT = r""" +import json, sys +try: + from importlib.metadata import entry_points + try: + eps = list(entry_points(group='pytest11')) + except TypeError: + eps = list(entry_points().get('pytest11', [])) + names = sorted(set(ep.name for ep in eps)) + print(json.dumps(names)) +except Exception as e: + print(json.dumps({"error": str(e)}), file=sys.stderr) + print("[]") +""".strip() def _under_path(path: str, prefix: str) -> bool: @@ -394,6 +383,21 @@ def _blocked_verifier_path_prefixes( return tuple(dict.fromkeys(prefixes)) +def _blocked_verifier_pythonpath_prefixes( + sandbox_user: str | None, +) -> tuple[str, ...]: + """Paths blocked from verifier PYTHONPATH. 
+ + Unlike PATH, the workspace is NOT blocked: PYTHONPATH entries like /app + are set by the Dockerfile for project imports, and the workspace is + already importable via CWD/pytest sys.path insertion regardless. + """ + prefixes = list(_RUNTIME_PATH_PREFIXES) + if sandbox_user: + prefixes.append(f"/home/{sandbox_user}") + return tuple(dict.fromkeys(prefixes)) + + def _merge_trusted_verifier_path(extras: list[str]) -> str: """Prepend validated image PATH entries to the verifier allowlist.""" kept: list[str] = [] @@ -462,76 +466,38 @@ def _trusted_path_extras_cmd(raw_path: str, blocked_prefixes: tuple[str, ...]) - ) -def _normalize_pytest_plugin(name: object) -> str | None: - """Return the importable pytest plugin name for a task declaration.""" - if not isinstance(name, str): - return None - clean = name.strip() - if not clean: - return None - return _PYTEST_PLUGIN_ALIASES.get(clean, clean) +async def _discover_pytest_plugin_flags(env, task: "Task") -> str: + """Auto-discover pytest plugins from root-owned system packages. - -def _plugins_from_verifier_script(task: "Task") -> list[str]: - """Infer known pytest plugins needed by legacy verifier scripts. - - Older SkillsBench/TB2 tasks predate task-level pytest plugin metadata and - call options such as --ctrf directly from tests/test.sh. With pytest entry - point autoload disabled, those options must be backed by explicit -p flags. + Runs a container-side script that enumerates pytest11 entry points and + filters to only those whose dist-info is in a root-owned directory. + Falls back to task.toml pytest_plugins declarations if discovery fails. + Replaces the previous hand-curated whitelist mechanism. 
""" - task_dir = getattr(task, "task_dir", None) - if not isinstance(task_dir, (str, os.PathLike)): - return [] - test_sh = Path(task_dir) / "tests" / "test.sh" - try: - content = test_sh.read_text() - except OSError: - return [] - plugins: list[str] = [] - for option, plugin in _PYTEST_OPTION_PLUGINS.items(): - if option in content and plugin not in plugins: - plugins.append(plugin) - # Detect pip-installed pytest plugins so PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 - # doesn't silently drop them. Only matches the exact installer line so - # arbitrary text mentioning the plugin name is ignored. - for match in _PIP_INSTALL_RE.findall(content): - for dist, plugin in _PYTEST_INSTALLED_PLUGINS.items(): - if dist in match and plugin not in plugins: - plugins.append(plugin) - return plugins - - -def _declared_pytest_plugins(task: "Task") -> list[object]: - """Return pytest_plugins from the model, falling back to raw task.toml.""" - declared = getattr(task.config.verifier, "pytest_plugins", None) - if declared: - return list(declared) - task_dir = getattr(task, "task_dir", None) - if not isinstance(task_dir, (str, os.PathLike)): - return [] - config_path = Path(task_dir) / "task.toml" + # Container-side auto-discovery try: - data = tomllib.loads(config_path.read_text()) - except (OSError, tomllib.TOMLDecodeError): - return [] - plugins = data.get("verifier", {}).get("pytest_plugins", []) - if isinstance(plugins, list): - return plugins - return [] + result = await env.exec( + f"python3 -c {shlex.quote(_DISCOVER_PYTEST_PLUGINS_SCRIPT)}", + user="root", timeout_sec=15, + ) + if result.stderr: + logger.debug(f"Plugin discovery stderr: {result.stderr.strip()}") + discovered = _json.loads(result.stdout or "[]") + if isinstance(discovered, list): + plugins.extend(p for p in discovered if isinstance(p, str) and p.strip()) + logger.info(f"Discovered {len(plugins)} pytest plugins from container") + except Exception as e: + logger.warning(f"Pytest plugin discovery failed, using task.toml 
fallback: {e}") + # Merge task.toml declarations as fallback + declared = getattr(task.config.verifier, "pytest_plugins", None) + if declared: + for name in declared: + if isinstance(name, str) and name.strip() and name.strip() not in plugins: + plugins.append(name.strip()) -def _pytest_plugin_flags(task: "Task") -> str: - """Build deterministic -p flags for inferred and declared pytest plugins.""" - plugins: list[str] = [] - for plugin in _plugins_from_verifier_script(task): - if plugin not in plugins: - plugins.append(plugin) - for plugin in _declared_pytest_plugins(task): - normalized = _normalize_pytest_plugin(plugin) - if normalized and normalized not in plugins: - plugins.append(normalized) return " ".join(f"-p {shlex.quote(p)}" for p in plugins) @@ -596,6 +562,31 @@ async def _trusted_verifier_path( return _merge_trusted_verifier_path([e for e in extras if isinstance(e, str)]) +async def _trusted_verifier_pythonpath( + env, sandbox_user: str | None, +) -> str: + """Return filtered PYTHONPATH preserving only trusted image entries. + + Same root-owned, non-world-writable validation as PATH, but does not + block the workspace — it is already importable via CWD/pytest and + is chowned to root before verification. + """ + pp_result = await env.exec("printenv PYTHONPATH 2>/dev/null || true", user="root", timeout_sec=10) + raw_pp = (pp_result.stdout or "").strip() + if not raw_pp: + return "" + blocked = _blocked_verifier_pythonpath_prefixes(sandbox_user) + cmd = _trusted_path_extras_cmd(raw_pp, blocked) + result = await env.exec(cmd, user="root", timeout_sec=10) + try: + extras = _json.loads(result.stdout or "[]") + except _json.JSONDecodeError: + return "" + if not isinstance(extras, list): + return "" + return ":".join(e for e in extras if isinstance(e, str)) + + # Wipe and recreate /logs/verifier/ before the verifier runs. # rm -rf severs hardlinks, removes symlink replacements, and eliminates # variant filenames/subdirs the agent may have pre-staged. 
@@ -603,26 +594,87 @@ async def _trusted_verifier_path( "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier" ) -# Remove injected conftest.py, sitecustomize.py/usercustomize.py, and .pth -# files from writable sys.path entries (preserves /usr/lib, /usr/local/lib). -# Also purge *.py from temp dirs: covers module-shadow via non-workspace cwd. -CLEANUP_CMD = ( - "find / -name conftest.py -not -path '/tests/*' -delete 2>/dev/null; " - "find /tmp /var/tmp -name '*.py' -delete 2>/dev/null; " - 'python3 -c "' - "import sys,os;" - "[os.remove(os.path.join(d,f)) " - " for d in sys.path " - " for f in ('sitecustomize.py','usercustomize.py') " - " if d and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " - " and os.path.isfile(os.path.join(d,f))];" - "[os.remove(os.path.join(d,f)) " - " for d in sys.path if d and os.path.isdir(d) " - " for f in os.listdir(d) if f.endswith('.pth') " - " and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " - " and os.path.isfile(os.path.join(d,f))]" - '" 2>/dev/null || true' -) +# Per-task hardening opt-outs. Tasks declare these in task.toml under +# [verifier.hardening] when their legitimate test setup conflicts with the +# default cleanup (e.g. qutebrowser ships a real conftest.py that the cleanup +# would otherwise delete, breaking pytest collection). +# +# Defaults are secure (all True). Tasks opt out individually: +# +# [verifier.hardening] +# cleanup_conftests = false # don't delete conftest.py before verify +HARDENING_DEFAULTS: dict[str, bool] = { + "cleanup_conftests": True, +} + + +def _read_hardening_config(task_dir: "Path | str | None") -> dict[str, bool]: + """Read [verifier.hardening] section from task.toml. 
Returns merged defaults.""" + import tomllib + from pathlib import Path as _Path + + result = dict(HARDENING_DEFAULTS) + if task_dir is None: + return result + toml_path = _Path(task_dir) / "task.toml" + if not toml_path.exists(): + return result + try: + with open(toml_path, "rb") as f: + data = tomllib.load(f) + except Exception as e: + logger.warning(f"task.toml parse error in {task_dir}: {e}") + return result + overrides = data.get("verifier", {}).get("hardening", {}) + for k, v in overrides.items(): + if k in result and isinstance(v, bool): + result[k] = v + else: + logger.warning( + f"task.toml [verifier.hardening] unknown/invalid: {k}={v!r}" + ) + return result + + +def _build_cleanup_cmd(hardening: dict[str, bool] | None = None) -> str: + """Build the cleanup shell command, honoring per-task hardening opt-outs. + + Steps: + - conftest.py removal outside /tests (skippable via cleanup_conftests=false) + - *.py purge from /tmp /var/tmp (always — covers module-shadow via cwd) + - sitecustomize.py/usercustomize.py removal from writable sys.path + - .pth removal from writable sys.path + + sitecustomize/usercustomize/.pth always run — opt-outs there would broaden + the attack surface beyond what real-world tasks need. 
+ """ + h = hardening or HARDENING_DEFAULTS + parts: list[str] = [] + if h.get("cleanup_conftests", True): + parts.append( + "find / -name conftest.py -not -path '/tests/*' -delete 2>/dev/null" + ) + parts.append("find /tmp /var/tmp -name '*.py' -delete 2>/dev/null") + parts.append( + 'python3 -c "' + "import sys,os;" + "[os.remove(os.path.join(d,f)) " + " for d in sys.path " + " for f in ('sitecustomize.py','usercustomize.py') " + " if d and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " + " and os.path.isfile(os.path.join(d,f))];" + "[os.remove(os.path.join(d,f)) " + " for d in sys.path if d and os.path.isdir(d) " + " for f in os.listdir(d) if f.endswith('.pth') " + " and not d.startswith('/usr/lib') and not d.startswith('/usr/local/lib') " + " and os.path.isfile(os.path.join(d,f))]" + '" 2>/dev/null || true' + ) + return "; ".join(parts) + + +# Backward-compat: the all-defaults cleanup command. +CLEANUP_CMD = _build_cleanup_cmd() async def harden_before_verify( @@ -722,9 +774,11 @@ async def harden_before_verify( f"chown -R root:root {shlex.quote(workspace)}", user="root", ) - await env.exec(CLEANUP_CMD, user="root", timeout_sec=10) + hardening = _read_hardening_config(getattr(task, "task_dir", None)) + await env.exec(_build_cleanup_cmd(hardening), user="root", timeout_sec=10) hardened_path = await _trusted_verifier_path(env, sandbox_user, workspace) + hardened_pythonpath = await _trusted_verifier_pythonpath(env, sandbox_user) distro_env = await _distro_pip_env(env) verifier_env = dict(VERIFIER_ENV) @@ -736,16 +790,16 @@ async def harden_before_verify( # plugin loading, or inject code via breakpoint()/coverage/Django/Celery # startup hooks. 
verifier_env["PATH"] = hardened_path + verifier_env["PYTHONPATH"] = hardened_pythonpath verifier_env["PYTEST_DISABLE_PLUGIN_AUTOLOAD"] = "1" verifier_env["PYTHONBREAKPOINT"] = "0" verifier_env["COVERAGE_PROCESS_START"] = "" verifier_env["DJANGO_SETTINGS_MODULE"] = "" verifier_env["CELERY_CONFIG_MODULE"] = "" - # Re-enable known verifier plugins by appending -p flags to the hardened - # base — never to a task-supplied PYTEST_ADDOPTS. Legacy task sets are - # inferred from tests/test.sh; newer tasks may declare pytest_plugins. + # Auto-discover pytest plugins from root-owned system packages and + # task.toml declarations. Appends -p flags to the hardened base. base_addopts = VERIFIER_ENV["PYTEST_ADDOPTS"] - flags = _pytest_plugin_flags(task) + flags = await _discover_pytest_plugin_flags(env, task) if flags: verifier_env["PYTEST_ADDOPTS"] = base_addopts + f" {flags}" else: diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index ca24348..83bf2c1 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -157,6 +157,13 @@ class AgentConfig: home_dirs: list[str] = field(default_factory=list) # Extra dot-dirs under $HOME to copy to sandbox user (for dirs not # derivable from skill_paths or credential_files, e.g. ".openclaw"). + acp_model_format: str = "bare" + # How the agent expects the modelId in session/set_model: + # "bare" — just the model name (e.g. "claude-sonnet-4-6"). + # Default; works for claude-agent-acp, codex-acp. + # "provider/model" — models.dev convention (e.g. "google/gemini-3.1-pro-preview"). + # Required by opencode, which uses Provider.parseModel() + # to split on "/" and treats the first segment as provider ID. subscription_auth: SubscriptionAuth | None = None # Host CLI login that can substitute for an API key (e.g. OAuth tokens # from `claude login`). Detected automatically; API keys take precedence. 
@@ -311,6 +318,28 @@ class AgentConfig: ], ), ), + "opencode": AgentConfig( + name="opencode", + description="OpenCode via ACP — open-source coding agent (TypeScript)", + skill_paths=["$HOME/.opencode/skills"], + home_dirs=[".opencode"], + install_cmd=( + f"{_NODE_INSTALL} && " + "( command -v opencode >/dev/null 2>&1 || " + "npm install -g opencode-ai@latest >/dev/null 2>&1 ) && " + "command -v opencode >/dev/null 2>&1" + ), + launch_cmd="opencode acp", + protocol="acp", + requires_env=[], # inferred from --model at runtime + acp_model_format="provider/model", + # OpenCode uses models.dev provider IDs — its parseModel() splits + # modelId on "/" so set_model must send "google/gemini-3.1-pro-preview", + # not just "gemini-3.1-pro-preview". + env_mapping={ + "BENCHFLOW_PROVIDER_BASE_URL": "OPENAI_BASE_URL", + }, + ), "openhands": AgentConfig( name="openhands", description="OpenHands agent via ACP (multi-model, Python-based)", @@ -531,6 +560,7 @@ def register_agent( credential_files: list[CredentialFile] | None = None, home_dirs: list[str] | None = None, subscription_auth: SubscriptionAuth | None = None, + acp_model_format: str = "bare", supports_acp_set_model: bool = True, ) -> AgentConfig: """Register a custom agent at runtime. 
@@ -561,6 +591,7 @@ def register_agent( credential_files=credential_files or [], home_dirs=home_dirs or [], subscription_auth=subscription_auth, + acp_model_format=acp_model_format, supports_acp_set_model=supports_acp_set_model, ) AGENTS[name] = config diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index 7449fc9..b255105 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -35,6 +35,7 @@ from __future__ import annotations +import asyncio import contextlib import json import logging @@ -67,6 +68,7 @@ from benchflow.acp.client import ACPClient, ACPError from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource +from benchflow.user import BaseUser, RoundResult logger = logging.getLogger(__name__) @@ -140,6 +142,11 @@ class TrialConfig: context_root: str | Path | None = None pre_agent_hooks: list | None = None + # User-driven progressive-disclosure loop + user: BaseUser | None = None + max_user_rounds: int = 5 + oracle_access: bool = False + # Legacy compat fields — used by SDK.run() shim. Ignored when scenes is set. agent: str = "claude-agent-acp" prompts: list[str | None] | None = None @@ -275,6 +282,11 @@ async def setup(self) -> None: logger.warning( "sandbox_user=None — agent runs as root with no path lockdown." ) + if cfg.oracle_access and cfg.user is None: + logger.warning( + "oracle_access=True without a User — /solution stays visible " + "to the agent for the entire trial." + ) self._effective_locked = _resolve_locked_paths( cfg.sandbox_user, cfg.sandbox_locked_paths @@ -517,6 +529,69 @@ async def verify(self) -> dict | None: self._phase = "verified" return self._rewards + async def soft_verify(self) -> tuple[dict | None, str | None, str | None]: + """Run the verifier without full hardening — for intermediate feedback. 
+ + Skips process kill and workspace restore/chown (so the sandbox + stays usable for the next round), but DOES purge agent-injected + conftest.py / sitecustomize.py / .pth files to prevent the agent + from gaming intermediate test results. + + Returns (rewards, verifier_output, verifier_error). The final + verify() still does full hardening. + """ + from harbor import Verifier + + from benchflow._sandbox import _build_cleanup_cmd, _read_hardening_config + + self._trial_paths.verifier_dir.mkdir(parents=True, exist_ok=True) + # Clean verifier output dir — chmod 777 so non-root verifier processes can write + await self._env.exec( + "rm -rf /logs/verifier && mkdir -p /logs/verifier && chmod 777 /logs/verifier", + user="root", timeout_sec=10, + ) + # Purge agent-injected conftest/sitecustomize/.pth without + # killing processes or restoring workspace. + # Honor per-task [verifier.hardening] opt-outs from task.toml. + hardening = _read_hardening_config(getattr(self._task, "task_dir", None)) + await self._env.exec( + _build_cleanup_cmd(hardening), user="root", timeout_sec=10 + ) + + rewards = None + verifier_output = None + verifier_error = None + try: + verifier = Verifier( + task=self._task, + trial_paths=self._trial_paths, + environment=self._env, + ) + verifier_result = await asyncio.wait_for( + verifier.verify(), + timeout=self._task.config.verifier.timeout_sec, + ) + rewards = verifier_result.rewards + # Capture raw verifier output for the user + cat = await self._env.exec( + "cat /logs/verifier/*.log 2>/dev/null || " + "cat /logs/verifier/output.txt 2>/dev/null || true", + timeout_sec=10, + ) + verifier_output = (cat.stdout or "").strip() or None + logger.info(f"[soft_verify] rewards={rewards}") + except TimeoutError: + verifier_error = ( + f"soft verifier timed out after " + f"{self._task.config.verifier.timeout_sec}s" + ) + logger.warning(verifier_error) + except Exception as e: + verifier_error = f"soft verifier crashed: {e}" + logger.warning(verifier_error) + 
+ return rewards, verifier_output, verifier_error + # ── Phase 5: CLEANUP ── async def cleanup(self) -> None: @@ -575,8 +650,18 @@ async def run(self) -> RunResult: ) else: await self.install_agent() - for scene in cfg.effective_scenes: - await self._run_scene(scene) + try: + if cfg.user is not None: + await self._run_user_loop() + else: + for scene in cfg.effective_scenes: + await self._run_scene(scene) + finally: + if cfg.oracle_access: + await self._env.exec( + "mv /solution_oracle_backup /solution 2>/dev/null || true", + user="root", timeout_sec=10, + ) await self.verify() @@ -712,6 +797,136 @@ async def _read_scene_outbox(self, sender: str) -> list[tuple[str, str]]: await self._env.exec(f"rm -f {quoted}", timeout_sec=10) return messages + async def _run_user_loop(self) -> None: + """Execute a user-driven progressive-disclosure loop. + + Each round: user.run() → connect → agent.execute() → disconnect → + soft_verify() → build RoundResult → repeat. Stops when user.run() + returns None or max_user_rounds is reached. + """ + cfg = self._config + user = cfg.user + assert user is not None + + if len(cfg.effective_scenes) > 1: + raise ValueError( + "User-driven loops operate on a single scene. " + f"Got {len(cfg.effective_scenes)} scenes." + ) + scene = cfg.effective_scenes[0] + if len(scene.roles) != 1: + raise ValueError( + "User-driven loops require a single-role scene. " + f"Got {len(scene.roles)} roles." 
+ ) + role = scene.roles[0] + + instruction = self._resolved_prompts[0] if self._resolved_prompts else ( + "Solve the task described in /app/instruction.md" + ) + + # Oracle access: read /solution before the agent runs, then remove it + solution: str | None = None + if cfg.oracle_access: + cat = await self._env.exec( + "cat /solution/solve.sh 2>/dev/null || true", + user="root", timeout_sec=10, + ) + solution = (cat.stdout or "").strip() or None + + await user.setup(instruction, solution) + + # Hide oracle files from agent — move rather than delete so the + # final verify() can still access /solution if the verifier needs it. + if cfg.oracle_access: + await self._env.exec( + "mv /solution /solution_oracle_backup 2>/dev/null || true", + user="root", timeout_sec=10, + ) + + round_result: RoundResult | None = None + rounds_log: list[dict] = [] + + for round_num in range(cfg.max_user_rounds): + try: + prompt = await user.run(round_num, instruction, round_result) + except Exception as e: + self._error = f"user.run() failed at round {round_num}: {e}" + logger.error(self._error, exc_info=True) + break + + if prompt is None: + logger.info(f"[User] stopped at round {round_num}") + break + + logger.info( + f"[User] round {round_num}: prompt={prompt[:80]!r}..." + if len(prompt) > 80 else f"[User] round {round_num}: prompt={prompt!r}" + ) + + # Fresh ACP session each round — agent starts clean but sees + # its previous workspace changes in the shared sandbox. + traj_before = len(self._trajectory) + try: + await self.connect_as(role) + await self.execute(prompts=[prompt]) + finally: + await self.disconnect() + + round_trajectory = self._trajectory[traj_before:] + round_tools = sum( + 1 for e in round_trajectory + if isinstance(e, dict) and e.get("type") == "tool_call" + ) + + # Soft verify: run tests after agent disconnected but before + # next round. Temporarily restore /solution so the verifier can + # access it, then re-hide before the next agent round. 
+ if cfg.oracle_access: + await self._env.exec( + "mv /solution_oracle_backup /solution 2>/dev/null || true", + user="root", timeout_sec=10, + ) + try: + rewards, verifier_output, verifier_error = await self.soft_verify() + finally: + if cfg.oracle_access: + await self._env.exec( + "mv /solution /solution_oracle_backup 2>/dev/null || true", + user="root", timeout_sec=10, + ) + + round_result = RoundResult( + round=round_num, + trajectory=round_trajectory, + rewards=rewards, + verifier_output=verifier_output, + verifier_error=verifier_error, + n_tool_calls=round_tools, + ) + + rounds_log.append({ + "round": round_num, + "prompt": prompt, + "rewards": rewards, + "verifier_error": verifier_error, + "n_tool_calls": round_tools, + "n_trajectory_events": len(round_trajectory), + }) + + logger.info( + f"[User] round {round_num} done: " + f"rewards={rewards}, tools={round_tools}" + ) + + # Persist round log + if rounds_log and self._trial_dir: + log_path = self._trial_dir / "user_rounds.jsonl" + with log_path.open("w") as f: + for entry in rounds_log: + f.write(json.dumps(entry) + "\n") + logger.info(f"[User] {len(rounds_log)} rounds → {log_path}") + async def connect_as(self, role: Role) -> None: """Open an ACP connection for a specific role. diff --git a/src/benchflow/user.py b/src/benchflow/user.py new file mode 100644 index 0000000..2c8fde4 --- /dev/null +++ b/src/benchflow/user.py @@ -0,0 +1,101 @@ +"""User abstraction for progressive-disclosure trial loops. + +A User is a Python callback that participates in the trial loop alongside the +agent. Each round: user.run() produces a prompt → agent executes → verifier +checks → user sees the result and decides what to say next (or stops). + +This is distinct from multi-role scenes (PR #179) where multiple ACP agents +collaborate via outbox files. The User runs in the scheduler process, not in +the sandbox. 
+""" + +from __future__ import annotations + +import inspect +from collections.abc import Awaitable, Callable +from dataclasses import dataclass, field +from typing import Any, cast + + +@dataclass +class RoundResult: + """Outcome of one agent round, passed to User.run() for the next decision.""" + + round: int + trajectory: list[dict] = field(default_factory=list) + rewards: dict[str, Any] | None = None + verifier_output: str | None = None + verifier_error: str | None = None + n_tool_calls: int = 0 + + +class BaseUser: + """Abstract user that drives a progressive-disclosure trial loop. + + Subclass and implement ``run()`` to control what prompt the agent sees + each round. Return ``None`` from ``run()`` to stop the loop early. + """ + + async def setup(self, instruction: str, solution: str | None = None) -> None: + """Called once before the first round. + + ``instruction`` is the original task instruction (from instruction.md). + ``solution`` is the gold answer if oracle access is enabled, else None. + """ + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + """Produce the next prompt for the agent, or None to stop. + + ``round`` starts at 0. ``round_result`` is None on the first call + and contains the previous round's outcome on subsequent calls. + """ + raise NotImplementedError + + +class PassthroughUser(BaseUser): + """Sends the original instruction unchanged. Single-round, backward compatible.""" + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + if round == 0: + return instruction + return None + + +class FunctionUser(BaseUser): + """Wraps a plain function as a User — for lightweight one-off use. + + The function signature matches ``BaseUser.run()``: + fn(round, instruction, round_result) -> str | None + + Both sync and async functions are supported. 
+ """ + + def __init__( + self, + fn: Callable[ + [int, str, RoundResult | None], + str | None | Awaitable[str | None], + ], + ) -> None: + self._fn = fn + + async def run( + self, + round: int, + instruction: str, + round_result: RoundResult | None = None, + ) -> str | None: + result = self._fn(round, instruction, round_result) + if inspect.isawaitable(result): + return cast(str | None, await result) + return cast(str | None, result) diff --git a/tests/test_sandbox_hardening.py b/tests/test_sandbox_hardening.py index c12b957..6396c4e 100644 --- a/tests/test_sandbox_hardening.py +++ b/tests/test_sandbox_hardening.py @@ -152,7 +152,7 @@ async def test_with_sandbox_user(self, harness): assert "sitecustomize.py" in cleanup_cmd and ".pth" in cleanup_cmd assert "-not -path '/tests/*'" in cleanup_cmd injected = task.config.verifier.env - assert "--rootdir=/tests" in injected["PYTEST_ADDOPTS"] + assert "--rootdir=/app" in injected["PYTEST_ADDOPTS"] assert "-p no:cacheprovider" in injected["PYTEST_ADDOPTS"] assert injected["PYTHONPATH"] == "" assert "PYTHONHOME" not in injected # breaks Py_Initialize if set to "" @@ -171,7 +171,7 @@ async def test_without_sandbox_user(self, harness): assert all("pkill" not in c for c in cmds) assert any("conftest.py" in c for c in cmds) addopts = task.config.verifier.env["PYTEST_ADDOPTS"] - assert "--rootdir=/tests" in addopts + assert "--rootdir=/app" in addopts assert "-p no:cacheprovider" in addopts @pytest.mark.asyncio @@ -629,7 +629,7 @@ def test_env_contract(self): assert "-c /dev/null" in addopts assert "--confcutdir=/tests" in addopts - assert "--rootdir=/tests" in addopts + assert "--rootdir=/app" in addopts assert "-p no:cacheprovider" in addopts assert ( "PYTHONSAFEPATH" not in VERIFIER_ENV @@ -662,34 +662,41 @@ async def test_distro_pip_env_fedora(self): ) assert await _distro_pip_env(env) == {"PIP_PREFIX": "/usr/local"} - def test_pip_installed_pytest_plugin_inferred(self, tmp_path): - """`pip install pytest-asyncio` in 
test.sh auto-loads pytest_asyncio.""" - from benchflow._sandbox import _plugins_from_verifier_script + @pytest.mark.asyncio + async def test_container_plugin_discovery_merged_into_addopts(self): + """Plugins discovered from root-owned container packages appear as -p flags.""" + from benchflow._sandbox import harden_before_verify - task_dir = tmp_path / "task" - (task_dir / "tests").mkdir(parents=True) - (task_dir / "tests" / "test.sh").write_text( - "pip3 install --break-system-packages pytest==8.3.4 pytest-asyncio==0.24.0\n" - "pytest /tests/test_perf.py\n" - ) - task = MagicMock() - task.task_dir = str(task_dir) - plugins = _plugins_from_verifier_script(task) - assert "pytest_asyncio" in plugins + def side_effect(cmd, **kwargs): + if "_DISCOVER_PYTEST" in str(cmd) or "importlib.metadata" in str(cmd): + return MagicMock( + stdout='["benchmark", "xdist"]', stderr="", exit_code=0 + ) + return MagicMock(stdout="", stderr="", exit_code=0) - def test_pip_install_unrelated_mention_not_inferred(self, tmp_path): - """A bare mention of 'pytest-asyncio' in a comment must not auto-load it.""" - from benchflow._sandbox import _plugins_from_verifier_script + env = _make_env(side_effect=side_effect) + task = _make_task() + await harden_before_verify(env, task, sandbox_user=None) - task_dir = tmp_path / "task" - (task_dir / "tests").mkdir(parents=True) - (task_dir / "tests" / "test.sh").write_text( - "# tests using pytest-asyncio decorators (already installed in image)\n" - "pytest /tests/test_perf.py\n" - ) - task = MagicMock() - task.task_dir = str(task_dir) - assert _plugins_from_verifier_script(task) == [] + addopts = task.config.verifier.env["PYTEST_ADDOPTS"] + assert "-p benchmark" in addopts + assert "-p xdist" in addopts + + @pytest.mark.asyncio + async def test_plugin_discovery_failure_graceful(self): + """If container-side discovery fails, hardening proceeds without extra plugins.""" + from benchflow._sandbox import VERIFIER_ENV, harden_before_verify + + def 
side_effect(cmd, **kwargs): + if "importlib.metadata" in str(cmd): + raise RuntimeError("no python3") + return MagicMock(stdout="", stderr="", exit_code=0) + + env = _make_env(side_effect=side_effect) + task = _make_task() + await harden_before_verify(env, task, sandbox_user=None) + + assert task.config.verifier.env["PYTEST_ADDOPTS"] == VERIFIER_ENV["PYTEST_ADDOPTS"] @pytest.mark.asyncio async def test_distro_pip_env_ubuntu(self): @@ -831,77 +838,20 @@ async def test_plugin_autoload_disabled_survives_task_env_override(self): @pytest.mark.asyncio async def test_per_task_plugins_appended_to_addopts(self): - """pytest_plugins are translated to -p flags; PYTEST_DISABLE_PLUGIN_AUTOLOAD must survive.""" + """pytest_plugins from task.toml are translated to -p flags.""" from benchflow._sandbox import harden_before_verify env = _make_env(side_effect=_manifest_env(_blank_manifest())) task = _make_task() - task.config.verifier.pytest_plugins = ["pytest-json-ctrf", "myplug"] + task.config.verifier.pytest_plugins = ["ctrf", "myplug"] await harden_before_verify(env, task, sandbox_user=None, workspace=None) final_env = task.config.verifier.env addopts = final_env.get("PYTEST_ADDOPTS", "") assert "-p ctrf" in addopts assert "-p myplug" in addopts - # The security flag must still be present after the plugin flags are appended. 
assert final_env.get("PYTEST_DISABLE_PLUGIN_AUTOLOAD") == "1" - @pytest.mark.asyncio - async def test_legacy_ctrf_script_gets_allowlisted_plugin(self, tmp_path): - """Legacy verifier scripts using --ctrf get -p ctrf without pytest autoload.""" - from benchflow._sandbox import harden_before_verify - - tests_dir = tmp_path / "tests" - tests_dir.mkdir() - (tests_dir / "test.sh").write_text( - "uvx --with pytest-json-ctrf==0.3.5 " - "pytest --ctrf /logs/verifier/ctrf.json /tests/test_outputs.py\n" - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - final_env = task.config.verifier.env - assert "-p ctrf" in final_env["PYTEST_ADDOPTS"] - assert final_env.get("PYTEST_DISABLE_PLUGIN_AUTOLOAD") == "1" - - @pytest.mark.asyncio - async def test_legacy_json_report_script_gets_allowlisted_plugin(self, tmp_path): - """Legacy verifier scripts using --json-report get the right import name.""" - from benchflow._sandbox import harden_before_verify - - tests_dir = tmp_path / "tests" - tests_dir.mkdir() - (tests_dir / "test.sh").write_text( - "pytest --json-report " - "--json-report-file=/logs/verifier/report.json /tests/test_outputs.py\n" - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - task.task_dir = tmp_path - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - assert "-p pytest_jsonreport" in task.config.verifier.env["PYTEST_ADDOPTS"] - - @pytest.mark.asyncio - async def test_task_toml_pytest_plugins_fallback(self, tmp_path): - """Raw task.toml pytest_plugins work even if Harbor ignores the extra field.""" - from benchflow._sandbox import harden_before_verify - - (tmp_path / "task.toml").write_text( - '[verifier]\npytest_plugins = ["pytest_json_ctrf", "pytest-json-report"]\n' - ) - env = _make_env(side_effect=_manifest_env(_blank_manifest())) - task = _make_task() - 
task.task_dir = tmp_path - task.config.verifier.pytest_plugins = None - await harden_before_verify(env, task, sandbox_user=None, workspace=None) - - addopts = task.config.verifier.env["PYTEST_ADDOPTS"] - assert "-p ctrf" in addopts - assert "-p pytest_jsonreport" in addopts - @pytest.mark.asyncio @pytest.mark.parametrize("plugins", [None, []]) async def test_no_extra_addopts_when_no_plugins(self, plugins): @@ -964,7 +914,7 @@ async def test_pytest_addopts_task_override_with_plugins(self): env = _make_env() task = _make_task() task.config.verifier.env = {"PYTEST_ADDOPTS": "--rootdir=/evil"} - task.config.verifier.pytest_plugins = ["pytest-json-ctrf"] + task.config.verifier.pytest_plugins = ["ctrf"] await harden_before_verify(env, task, sandbox_user=None) addopts = task.config.verifier.env["PYTEST_ADDOPTS"] @@ -1163,22 +1113,83 @@ def test_devnull_blocks_hostile_pyproject(self, tmp_path): ) -class TestSandboxFailureModes: - """Recovery paths when untrusted inputs (task.toml, PATH extras) are malformed.""" +class TestHardeningOptOuts: + """Per-task [verifier.hardening] opt-outs from task.toml.""" + + def test_defaults_when_no_task_dir(self): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config - def test_harden_before_verify_survives_manifest_read_failure(self, tmp_path): - """Truncated task.toml must not propagate TOMLDecodeError; treat as no plugins.""" - from benchflow._sandbox import _declared_pytest_plugins + assert _read_hardening_config(None) == HARDENING_DEFAULTS + + def test_defaults_when_no_hardening_section(self, tmp_path): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config - # Truncated inline-table → tomllib.TOMLDecodeError (tmp_path / "task.toml").write_text( - "[verifier]\npytest_plugins = [\n", + "[verifier]\ntimeout_sec = 60\n" ) - task = _make_task() - task.task_dir = str(tmp_path) - task.config.verifier.pytest_plugins = None + assert _read_hardening_config(tmp_path) == HARDENING_DEFAULTS + + def 
test_opt_out_cleanup_conftests(self, tmp_path): + from benchflow._sandbox import _read_hardening_config - assert _declared_pytest_plugins(task) == [] + (tmp_path / "task.toml").write_text( + "[verifier.hardening]\ncleanup_conftests = false\n" + ) + cfg = _read_hardening_config(tmp_path) + assert cfg["cleanup_conftests"] is False + + def test_unknown_key_logged_not_applied(self, tmp_path, caplog): + from benchflow._sandbox import HARDENING_DEFAULTS, _read_hardening_config + + (tmp_path / "task.toml").write_text( + "[verifier.hardening]\nbogus_flag = true\n" + ) + cfg = _read_hardening_config(tmp_path) + assert cfg == HARDENING_DEFAULTS # bogus key ignored + assert any("bogus_flag" in r.message for r in caplog.records) + + def test_invalid_value_type_rejected(self, tmp_path, caplog): + from benchflow._sandbox import _read_hardening_config + + (tmp_path / "task.toml").write_text( + '[verifier.hardening]\ncleanup_conftests = "false"\n' + ) + cfg = _read_hardening_config(tmp_path) + # String "false" is not bool — rejected, default applied + assert cfg["cleanup_conftests"] is True + + def test_build_cleanup_includes_conftest_by_default(self): + from benchflow._sandbox import _build_cleanup_cmd + + cmd = _build_cleanup_cmd() + assert "find / -name conftest.py" in cmd + assert "find /tmp /var/tmp" in cmd + assert "sitecustomize.py" in cmd + + def test_build_cleanup_skips_conftest_when_disabled(self): + from benchflow._sandbox import _build_cleanup_cmd + + cmd = _build_cleanup_cmd({"cleanup_conftests": False}) + assert "find / -name conftest.py" not in cmd + # Other steps still run + assert "find /tmp /var/tmp" in cmd + assert "sitecustomize.py" in cmd + + +class TestSandboxFailureModes: + """Recovery paths when untrusted inputs (task.toml, PATH extras) are malformed.""" + + @pytest.mark.asyncio + async def test_plugin_discovery_bad_json_graceful(self): + """Malformed JSON from container plugin discovery falls back gracefully.""" + from benchflow._sandbox import 
_discover_pytest_plugin_flags + + env = _make_env(side_effect=lambda cmd, **kw: MagicMock( + stdout="not valid json", stderr="", exit_code=0 + )) + task = _make_task() + flags = await _discover_pytest_plugin_flags(env, task) + assert flags == "" @pytest.mark.asyncio async def test_trusted_path_extras_malformed_json_falls_back(self): diff --git a/tests/test_user.py b/tests/test_user.py new file mode 100644 index 0000000..8ebf1c4 --- /dev/null +++ b/tests/test_user.py @@ -0,0 +1,336 @@ +"""Tests for the User abstraction — BaseUser, PassthroughUser, FunctionUser, RoundResult.""" + +from __future__ import annotations + +import tempfile +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from benchflow.trial import Role, Scene, Trial, TrialConfig, Turn +from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult + +# ── Unit tests for User types ── + + +class TestPassthroughUser: + @pytest.mark.asyncio + async def test_sends_instruction_once(self): + user = PassthroughUser() + result = await user.run(0, "Fix the bug") + assert result == "Fix the bug" + + @pytest.mark.asyncio + async def test_stops_after_first_round(self): + user = PassthroughUser() + await user.run(0, "Fix the bug") + result = await user.run(1, "Fix the bug", RoundResult(round=0)) + assert result is None + + + +class TestFunctionUser: + @pytest.mark.asyncio + async def test_sync_function(self): + def my_fn(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + return "terse: " + instruction[:10] + return None + + user = FunctionUser(my_fn) + assert await user.run(0, "Fix the authentication bug") == "terse: Fix the au" + assert await user.run(1, "Fix the authentication bug", RoundResult(round=0)) is None + + @pytest.mark.asyncio + async def test_async_function(self): + async def my_fn(round: int, instruction: str, rr: RoundResult | None) -> str | None: + if round == 0: + return 
instruction + if rr and rr.rewards and rr.rewards.get("exact_match", 0) < 1.0: + return "Try again, tests failed" + return None + + user = FunctionUser(my_fn) + assert await user.run(0, "task") == "task" + + failing = RoundResult(round=0, rewards={"exact_match": 0.0}) + assert await user.run(1, "task", failing) == "Try again, tests failed" + + passing = RoundResult(round=1, rewards={"exact_match": 1.0}) + assert await user.run(2, "task", passing) is None + + +class TestBaseUser: + @pytest.mark.asyncio + async def test_not_implemented(self): + user = BaseUser() + with pytest.raises(NotImplementedError): + await user.run(0, "task") + + +# ── Integration tests for user loop in Trial ── + + +@dataclass +class FakeExecResult: + stdout: str = "" + stderr: str = "" + return_code: int = 0 + + +class FakeEnv: + """Minimal sandbox mock for user loop tests.""" + + def __init__(self) -> None: + self._exec_log: list[str] = [] + + async def exec(self, cmd: str, **kwargs) -> FakeExecResult: + self._exec_log.append(cmd) + if "cat /solution" in cmd: + return FakeExecResult(stdout="gold answer here") + if "rm -rf /solution" in cmd: + return FakeExecResult() + if "/logs/verifier" in cmd: + return FakeExecResult() + if "cat /logs/verifier" in cmd: + return FakeExecResult(stdout="") + return FakeExecResult() + + async def stop(self, **kwargs): + pass + + +def _make_user_trial( + user: BaseUser, max_rounds: int = 5, oracle: bool = False, tmp_path: Path | None = None, +) -> Trial: + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[Scene.single(agent="gemini", model="flash")], + environment="docker", + user=user, + max_user_rounds=max_rounds, + oracle_access=oracle, + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["Solve the task described in /app/instruction.md"] + trial_dir = tmp_path or Path(tempfile.mkdtemp(prefix="benchflow-test-")) + trial._trial_dir = trial_dir + verifier_dir = trial_dir / "verifier" + verifier_dir.mkdir(parents=True, 
exist_ok=True) + trial._trial_paths = type("P", (), {"verifier_dir": verifier_dir})() + trial._task = type("T", (), { + "config": type("C", (), { + "verifier": type("V", (), { + "timeout_sec": 30, + "env": {}, + })(), + "agent": type("A", (), {"timeout_sec": 60})(), + })(), + })() + trial._agent_cwd = "/app" + return trial + + +class RecordingUser(BaseUser): + """User that records calls and stops after a fixed number of rounds.""" + + def __init__(self, max_rounds: int = 2, prompts: list[str] | None = None): + self.setup_calls: list[tuple[str, str | None]] = [] + self.run_calls: list[tuple[int, str, RoundResult | None]] = [] + self._max = max_rounds + self._prompts = prompts or ["Do the thing", "Try harder"] + + async def setup(self, instruction: str, solution: str | None = None) -> None: + self.setup_calls.append((instruction, solution)) + + async def run( + self, round: int, instruction: str, round_result: RoundResult | None = None + ) -> str | None: + self.run_calls.append((round, instruction, round_result)) + if round >= self._max: + return None + return self._prompts[min(round, len(self._prompts) - 1)] + + +class TestUserLoop: + @pytest.mark.asyncio + async def test_user_loop_calls_setup_and_run(self): + user = RecordingUser(max_rounds=1) + trial = _make_user_trial(user, max_rounds=3) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=({"exact_match": 1.0}, None, None)): + + await trial._run_user_loop() + + assert len(user.setup_calls) == 1 + assert user.setup_calls[0][0] == "Solve the task described in /app/instruction.md" + assert len(user.run_calls) == 2 # round 0 → prompt, round 1 → None + assert user.run_calls[0][0] == 0 # round number + assert user.run_calls[1][0] == 1 + + @pytest.mark.asyncio + async def 
test_user_loop_passes_round_result(self): + user = RecordingUser(max_rounds=2) + trial = _make_user_trial(user, max_rounds=5) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=({"exact_match": 0.5}, "1 failed", None)): + + await trial._run_user_loop() + + # First call has no round_result + assert user.run_calls[0][2] is None + # Second call has round_result from round 0 + rr = user.run_calls[1][2] + assert isinstance(rr, RoundResult) + assert rr.round == 0 + assert rr.rewards == {"exact_match": 0.5} + assert rr.verifier_output == "1 failed" + + @pytest.mark.asyncio + async def test_user_loop_respects_max_rounds(self): + """User that never stops is capped by max_user_rounds.""" + def never_stop(r, instr, rr): + return "keep going" + + user = FunctionUser(never_stop) + trial = _make_user_trial(user, max_rounds=3) + + call_count = 0 + async def mock_execute(prompts=None): + nonlocal call_count + call_count += 1 + return [], 0 + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", side_effect=mock_execute), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert call_count == 3 + + @pytest.mark.asyncio + async def test_oracle_access(self): + user = RecordingUser(max_rounds=0) + trial = _make_user_trial(user, oracle=True) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await 
trial._run_user_loop() + + assert user.setup_calls[0][1] == "gold answer here" + + @pytest.mark.asyncio + async def test_multi_role_raises(self): + user = RecordingUser() + config = TrialConfig( + task_path=Path("tasks/fake"), + scenes=[Scene( + name="multi", + roles=[Role("a", "gemini"), Role("b", "gemini")], + turns=[Turn("a"), Turn("b")], + )], + user=user, + ) + trial = Trial(config) + trial._env = FakeEnv() + trial._resolved_prompts = ["task"] + + with pytest.raises(ValueError, match="single-role"): + await trial._run_user_loop() + + @pytest.mark.asyncio + async def test_user_run_exception_stops_loop(self): + class FailingUser(BaseUser): + async def run(self, round, instruction, rr=None): + raise RuntimeError("oops") + + trial = _make_user_trial(FailingUser()) + + with patch.object(trial, "connect_as", new_callable=AsyncMock), \ + patch.object(trial, "execute", new_callable=AsyncMock, return_value=([], 0)), \ + patch.object(trial, "disconnect", new_callable=AsyncMock), \ + patch.object(trial, "soft_verify", new_callable=AsyncMock, return_value=(None, None, None)): + + await trial._run_user_loop() + + assert "user.run() failed" in trial._error + + +class TestSoftVerify: + @pytest.mark.asyncio + async def test_soft_verify_timeout(self): + trial = _make_user_trial(PassthroughUser()) + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(side_effect=TimeoutError()) + + rewards, output, error = await trial.soft_verify() + + assert rewards is None + assert output is None + assert "timed out" in error + + @pytest.mark.asyncio + async def test_soft_verify_crash(self): + trial = _make_user_trial(PassthroughUser()) + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(side_effect=RuntimeError("boom")) + + rewards, 
_output, error = await trial.soft_verify() + + assert rewards is None + assert "crashed" in error + assert "boom" in error + + @pytest.mark.asyncio + async def test_soft_verify_success(self): + trial = _make_user_trial(PassthroughUser()) + + mock_result = type("VR", (), {"rewards": {"exact_match": 1.0}})() + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox.CLEANUP_CMD", "true"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(return_value=mock_result) + + rewards, _output, error = await trial.soft_verify() + + assert rewards == {"exact_match": 1.0} + assert error is None + + @pytest.mark.asyncio + async def test_soft_verify_runs_cleanup_cmd(self): + trial = _make_user_trial(PassthroughUser()) + + mock_result = type("VR", (), {"rewards": {}})() + + with patch("harbor.Verifier") as MockVerifier, \ + patch("benchflow._sandbox._build_cleanup_cmd", + return_value="echo cleanup_sentinel"): + mock_instance = MockVerifier.return_value + mock_instance.verify = AsyncMock(return_value=mock_result) + + await trial.soft_verify() + + # Verify cleanup command was executed + exec_log = trial._env._exec_log + assert any("cleanup_sentinel" in cmd for cmd in exec_log) From 5c583edd220b34a23c41a91bc04c691ea4b6681f Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 06:00:27 -0400 Subject: [PATCH 24/25] Infra fixes for SkillsBench Apr 2026 trials: PTY timeout + Daytona logs retry (#196) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Bump DaytonaPtyProcess readline timeout 300s→900s Long-running TTS/audio tasks (e.g. pg-essay-to-audiobook) generate extended quiet periods on stdout while ffmpeg/whisper run. The 300s PTY readline timeout fires before the per-task agent timeout (900s), prematurely killing healthy runs. Align readline timeout with the standard agent timeout so the PTY only fails when the inner process is actually wedged. 
* Daytona SDK: retry SessionCommandLogsResponse ValidationError The Daytona server occasionally returns an empty string instead of a JSON object when fetching session command logs, which causes pydantic to raise ValidationError inside AsyncProcess.get_session_command_logs. We've reproduced this on SDK 0.168.x and 0.169.x; the surface is most visible in skillsbench tasks that ask the verifier for command output (e.g. latex-formula-extraction). Patch the SDK method at runtime with a small bounded retry. After four malformed payloads we fall back to an empty (but valid) response so callers can still inspect exit_code via get_session_command — a silent missing-logs is preferable to taking a whole trial as ERROR on an upstream marshalling glitch. Patch is applied lazily from _create_environment so we never touch the SDK on Docker-only runs. * Daytona retry: catch DaytonaError wrapping the malformed-logs ValidationError The first version of this patch only matched on pydantic ValidationError, but AsyncProcess.get_session_command_logs is decorated by intercept_errors at class-definition time — every inner exception is converted to DaytonaError before our patched bound method ever sees it. Verified against latex-formula-extraction on Daytona: the patch wrapper was being called, but the except-clause never matched, so the run still failed. Match on DaytonaError whose message contains 'SessionCommandLogsResponse' in addition to bare ValidationError, and drop the wrapper to 2 attempts (harbor already wraps the call in tenacity x3 — extra retries here are wasted on a deterministic malformed payload). Empty-fallback unchanged. 
--- src/benchflow/_daytona_patches.py | 104 ++++++++++++++++++++++++++++++ src/benchflow/_env_setup.py | 4 ++ src/benchflow/process.py | 4 +- 3 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 src/benchflow/_daytona_patches.py diff --git a/src/benchflow/_daytona_patches.py b/src/benchflow/_daytona_patches.py new file mode 100644 index 0000000..2541bf0 --- /dev/null +++ b/src/benchflow/_daytona_patches.py @@ -0,0 +1,104 @@ +"""Idempotent patches to the Daytona SDK to work around upstream bugs. + +Imported (and applied) lazily from ``_env_setup._create_environment`` so the +SDK is only touched when a Daytona environment is actually being built. +""" + +import asyncio +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +_PATCHED = False + + +def apply() -> None: + """Install workarounds for known Daytona SDK bugs. + + Currently: + * ``AsyncProcess.get_session_command_logs`` occasionally raises + ``pydantic.ValidationError`` because the server returns an empty + string instead of a JSON object for ``SessionCommandLogsResponse``. + Reproduces in SDK 0.168.x and 0.169.x. Wrap with a small bounded + retry that returns an empty-but-valid response if every attempt + fails — callers can still observe the command's exit_code via + ``get_session_command``, so a missing logs payload is recoverable. 
+ """ + global _PATCHED + if _PATCHED: + return + + try: + from daytona._async.process import AsyncProcess + from daytona.common.errors import DaytonaError + from daytona.common.process import SessionCommandLogsResponse + except Exception: # pragma: no cover — SDK not installed / layout changed + logger.debug("daytona SDK not importable; skipping patches", exc_info=True) + return + + try: + from pydantic import ValidationError + except Exception: # pragma: no cover + return + + # AsyncProcess.get_session_command_logs is decorated by intercept_errors + # at class definition, which converts every inner exception (including + # the pydantic ValidationError we care about) into a DaytonaError. The + # decorated bound method is what we capture here, so we have to match + # on the wrapped DaytonaError shape too — not just ValidationError. + original = AsyncProcess.get_session_command_logs + + _MALFORMED_MARKER = "SessionCommandLogsResponse" + + def _is_malformed_logs_error(exc: BaseException) -> bool: + if isinstance(exc, ValidationError): + return True + if isinstance(exc, DaytonaError) and _MALFORMED_MARKER in str(exc): + return True + return False + + async def _patched_get_session_command_logs( + self: Any, session_id: str, command_id: str + ) -> SessionCommandLogsResponse: + # Harbor already wraps this call in tenacity (3 attempts), so + # additional retries here are usually wasted on a deterministic + # malformed payload. Try once more with a small delay in case it + # IS transient, then return an empty-but-valid response so the + # caller can still observe the command's exit_code via + # get_session_command. Original error is logged for triage. 
+ attempts = 2 + delay = 0.5 + last_exc: BaseException | None = None + for attempt in range(1, attempts + 1): + try: + return await original(self, session_id, command_id) + except (ValidationError, DaytonaError) as exc: + if not _is_malformed_logs_error(exc): + raise + last_exc = exc + logger.warning( + "daytona get_session_command_logs malformed payload " + "(attempt %d/%d) for session=%s command=%s: %s", + attempt, + attempts, + session_id, + command_id, + exc, + ) + if attempt < attempts: + await asyncio.sleep(delay) + + logger.error( + "daytona get_session_command_logs malformed %d times for " + "session=%s command=%s; falling back to empty logs (%s)", + attempts, + session_id, + command_id, + last_exc, + ) + return SessionCommandLogsResponse(output="", stdout="", stderr="") + + AsyncProcess.get_session_command_logs = _patched_get_session_command_logs # type: ignore[method-assign] + _PATCHED = True + logger.debug("daytona SDK patches applied") diff --git a/src/benchflow/_env_setup.py b/src/benchflow/_env_setup.py index bfc5283..9800ffc 100644 --- a/src/benchflow/_env_setup.py +++ b/src/benchflow/_env_setup.py @@ -261,6 +261,10 @@ def _create_environment( elif environment_type == "daytona": from harbor.environments.daytona import DaytonaEnvironment + from benchflow._daytona_patches import apply as _apply_daytona_patches + + _apply_daytona_patches() + env_config = task.config.environment if env_config.cpus > _DAYTONA_MAX_CPUS: logger.warning( diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 197daa1..f387c52 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -493,10 +493,10 @@ async def readline(self) -> bytes: if self._closed: raise ConnectionError("PTY closed") try: - line = await asyncio.wait_for(self._line_buffer.get(), timeout=300) + line = await asyncio.wait_for(self._line_buffer.get(), timeout=900) return line except TimeoutError as e: - raise ConnectionError("PTY readline timeout (300s)") from e + raise 
ConnectionError("PTY readline timeout (900s)") from e except Exception as e: raise ConnectionError(f"PTY readline error: {e}") from e From 46d5eda65162563dfd8ab617a4c64ccb5135519d Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Sat, 25 Apr 2026 06:35:11 -0400 Subject: [PATCH 25/25] fix: env-file path mismatch in DinD compose mode (#198) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: env-file path mismatch in DinD compose mode Devin caught a real bug introduced by PR #193 (DinD compose ACP): src/benchflow/process.py:325 sets remote_env_path = "/tmp/benchflow_env_$$.env" expecting the remote shell to expand $$ to its PID. But shlex.join() at line 329 single-quotes the --env-file argument, so docker compose receives the literal string "/tmp/benchflow_env_$$.env" while the cat heredoc that writes the file (line 339, raw f-string) does expand $$. The file is written to /tmp/benchflow_env_.env and read from /tmp/benchflow_env_$$.env — silent mismatch, env vars (incl. API keys) silently dropped in DinD compose tasks. Fix: use uuid.uuid4().hex[:16] for the unique suffix instead of relying on shell-side $$ expansion. The path is then a literal that survives quoting. Apply the same fix to the direct (non-DinD) Daytona branch even though it was working — uniformity makes the path robust against future quoting changes. Also fix a pre-existing SIM103 lint error in _daytona_patches.py that ruff caught while validating the test changes. Tests: tests/test_process.py +2 regression tests pinning that no remote command contains a literal "$$" — would catch this exact regression. 8/8 process tests pass; ruff clean. * test: reference PR #193 / #198 in regression test docstring Devin caught: CLAUDE.md mandates regression tests name the commit/PR they guard. Updated TestDaytonaProcessEnvFilePath docstring to cite PR #198 (the fix) and PR #193 / commit cdccac7 (the regression). 
--- src/benchflow/_daytona_patches.py | 4 +- src/benchflow/process.py | 13 ++++- tests/test_process.py | 96 +++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/src/benchflow/_daytona_patches.py b/src/benchflow/_daytona_patches.py index 2541bf0..56abe78 100644 --- a/src/benchflow/_daytona_patches.py +++ b/src/benchflow/_daytona_patches.py @@ -54,9 +54,7 @@ def apply() -> None: def _is_malformed_logs_error(exc: BaseException) -> bool: if isinstance(exc, ValidationError): return True - if isinstance(exc, DaytonaError) and _MALFORMED_MARKER in str(exc): - return True - return False + return isinstance(exc, DaytonaError) and _MALFORMED_MARKER in str(exc) async def _patched_get_session_command_logs( self: Any, session_id: str, command_id: str diff --git a/src/benchflow/process.py b/src/benchflow/process.py index f387c52..b9d2c21 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -12,6 +12,7 @@ import logging import os import shlex +import uuid from abc import ABC, abstractmethod from typing import Any @@ -320,9 +321,15 @@ async def start( inner_parts.extend(["-w", cwd]) # Write env vars to a temp file on the remote VM instead of passing # as -e K=V args (which are visible in ps aux on the remote host). + # Use a Python-generated unique suffix instead of a shell `$$` + # expansion: shlex.join() (below) single-quotes the --env-file arg, + # so `$$` would survive as a literal in the docker compose call + # while the cat > ... heredoc would expand it — the file would be + # written to one path and read from another. uuid.uuid4 sidesteps + # the entire shell-expansion-vs-quoting problem. 
remote_env_path = None if env: - remote_env_path = "/tmp/benchflow_env_$$.env" + remote_env_path = f"/tmp/benchflow_env_{uuid.uuid4().hex[:16]}.env" env_lines = "\n".join(f"{k}={v}" for k, v in env.items()) inner_parts.extend(["--env-file", remote_env_path]) inner_parts.extend(["main", "bash", "-c", command]) @@ -349,7 +356,9 @@ async def start( env_prefix = "" remote_env_path = None if env: - remote_env_path = "/tmp/benchflow_env_$$.env" + # Python-generated unique suffix; see DinD branch above for why + # $$ shell expansion is fragile across quoting boundaries. + remote_env_path = f"/tmp/benchflow_env_{uuid.uuid4().hex[:16]}.env" env_lines = "\n".join( f"export {k}={shlex.quote(v)}" for k, v in env.items() ) diff --git a/tests/test_process.py b/tests/test_process.py index c604a58..68ea0a0 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -202,3 +202,99 @@ async def capture_communicate(data=None): assert ( f"export SINGLE_QUOTE={shlex.quote('it' + chr(39) + 's a test')}" in content ) + + +class TestDaytonaProcessEnvFilePath: + """Regression: env-file path must be unique without relying on shell `$$` expansion. + + Guards the fix from PR #198 against the regression introduced by PR #193 + (DinD compose ACP via Daytona PTY WebSocket, commit cdccac7). + + The DinD branch builds an inner `docker compose exec --env-file PATH ...` + command and runs it through `shlex.join()`, which single-quotes any `$$` + (preventing remote shell expansion). The `cat > PATH` heredoc that writes + the file uses raw f-string interpolation where `$$` IS expanded. If the + path contains `$$`, the file is written to one path and read from another + — env vars silently disappear. + + The direct (non-DinD) branch uses raw f-string in both write and read, so + `$$` would expand consistently — but uuid is robust against future quoting + changes. Pin both branches. 
+ """ + + @pytest.mark.asyncio + async def test_dind_env_file_path_does_not_use_shell_pid_expansion(self): + """DinD path must not use $$ — shlex.join would quote it literally.""" + from unittest.mock import MagicMock + + from benchflow.process import DaytonaProcess + + sandbox = MagicMock() + sandbox.create_ssh_access = AsyncMock( + return_value=MagicMock(token="abc") + ) + proc = DaytonaProcess( + sandbox=sandbox, + is_dind=True, + compose_cmd_prefix="", + compose_cmd_base="docker compose -p test", + ) + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.append(list(args)) + mock_proc = AsyncMock() + mock_proc.pid = 12345 + mock_proc.returncode = 0 + mock_proc.stdin = AsyncMock() + mock_proc.stdout = AsyncMock() + mock_proc.stderr = AsyncMock() + mock_proc.communicate = AsyncMock(return_value=(b"", b"")) + return mock_proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + await proc.start(command="echo hi", env={"FOO": "bar"}) + + # Last arg of ssh is the remote command. Search it for $$ + remote_cmd = captured[0][-1] + assert "$$" not in remote_cmd, ( + "$$ in remote command — shlex.join() will quote it, mismatching " + f"the cat heredoc that does expand it. 
Got: {remote_cmd[:200]!r}" + ) + # And: a real path was used (literal hex suffix, no shell variable) + assert "/tmp/benchflow_env_" in remote_cmd + assert "--env-file" in remote_cmd + + @pytest.mark.asyncio + async def test_direct_sandbox_env_file_path_does_not_use_shell_pid_expansion(self): + """Direct (non-DinD) path is currently safe with $$, but pin the uuid form for robustness.""" + from unittest.mock import MagicMock + + from benchflow.process import DaytonaProcess + + sandbox = MagicMock() + sandbox.create_ssh_access = AsyncMock( + return_value=MagicMock(token="abc") + ) + proc = DaytonaProcess(sandbox=sandbox, is_dind=False) + + captured = [] + + async def fake_exec(*args, **kwargs): + captured.append(list(args)) + mock_proc = AsyncMock() + mock_proc.pid = 12345 + mock_proc.returncode = 0 + mock_proc.stdin = AsyncMock() + mock_proc.stdout = AsyncMock() + mock_proc.stderr = AsyncMock() + mock_proc.communicate = AsyncMock(return_value=(b"", b"")) + return mock_proc + + with patch("asyncio.create_subprocess_exec", side_effect=fake_exec): + await proc.start(command="echo hi", env={"FOO": "bar"}) + + remote_cmd = captured[0][-1] + assert "$$" not in remote_cmd + assert "/tmp/benchflow_env_" in remote_cmd