diff --git a/docs/api-reference.md b/docs/api-reference.md index 5248459..79db5e1 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -34,6 +34,7 @@ config = TrialConfig( task_path=Path("tasks/my-task"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=120, ) # Multi-scene BYOS (skill-gen → solve) @@ -46,9 +47,13 @@ config = TrialConfig( turns=[Turn("solver")]), ], environment="daytona", + sandbox_setup_timeout=120, ) ``` +Set `sandbox_setup_timeout` when sandbox user setup needs more than the default 120 seconds. +The same field is also available on `JobConfig` and `RuntimeConfig`. + ### Scene One interaction region — roles take turns executing prompts. @@ -98,6 +103,20 @@ await trial.verify() await trial.cleanup() ``` +### RuntimeConfig + +Runtime-level configuration for the `Agent + Environment` execution path. + +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +config = RuntimeConfig(sandbox_setup_timeout=300) +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/X", backend="daytona") +runtime = Runtime(env, agent, config=config) +result = await runtime.execute() +``` + ### bf.run() Convenience function — multiple calling conventions: @@ -111,10 +130,16 @@ result = await bf.run(config) # 2. Agent + Environment (0.3 style) agent = bf.Agent("gemini", model="gemini-3.1-flash-lite-preview") env = bf.Environment.from_task("tasks/X", backend="daytona") -result = await bf.run(agent, env) +runtime_config = bf.RuntimeConfig(sandbox_setup_timeout=300) +result = await bf.run(agent, env, runtime_config) # 3. String shortcut (simplest) -result = await bf.run("gemini", task_path="tasks/X", model="gemini-3.1-flash-lite-preview") +result = await bf.run( + "gemini", + task_path="tasks/X", + model="gemini-3.1-flash-lite-preview", + config=bf.RuntimeConfig(sandbox_setup_timeout=300), +) ``` ## Trial Lifecycle diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 86c702e..4b38788 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -40,7 +40,8 @@ bench eval create \ -a gemini \ -m gemini-3.1-flash-lite-preview \ -e daytona \ - -c 64 + -c 64 \ + --sandbox-setup-timeout 300 ``` | Flag | Default | Description | @@ -53,6 +54,7 @@ bench eval create \ | `--concurrency`, `-c` | `4` | Max concurrent tasks (batch mode only) | | `--jobs-dir`, `-o` | `jobs` | Output directory | | `--sandbox-user` | `agent` | Sandbox user (null for root) | +| `--sandbox-setup-timeout` | `120` | Timeout in seconds for sandbox user setup | ### bench eval list @@ -145,6 +147,7 @@ bench environment list task_dir: .ref/terminal-bench-2 environment: daytona concurrency: 64 +sandbox_setup_timeout: 300 scenes: - name: solve @@ -165,6 +168,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ### Multi-scene (BYOS skill generation) @@ -173,6 +177,7 @@ max_retries: 2 task_dir: tasks/ environment: daytona concurrency: 10 +sandbox_setup_timeout: 300 scenes: - name: skill-gen diff --git a/docs/quickstart.md b/docs/quickstart.md index e087ea7..a1a9e11 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -26,7 +26,8 @@ bench eval create \ -t .ref/terminal-bench-2/regex-log \ -a gemini \ -m gemini-3.1-flash-lite-preview \ - -e daytona + -e daytona \ + --sandbox-setup-timeout 300 ``` BenchFlow will: @@ -52,6 +53,7 @@ model: gemini-3.1-flash-lite-preview environment: daytona concurrency: 64 max_retries: 2 +sandbox_setup_timeout: 300 ``` ## Python API @@ -70,11 +72,23 @@ config = TrialConfig( task_path=Path("tasks/regex-log"), scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")], environment="daytona", + sandbox_setup_timeout=300, ) trial = await Trial.create(config) result = await trial.run() ``` +If you are using the `Agent + Environment` path directly, pass the timeout through `RuntimeConfig`: + +```python +from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig + +agent = Agent("gemini", model="gemini-3.1-flash-lite-preview") +env = Environment.from_task("tasks/regex-log", backend="daytona") +runtime = Runtime(env, agent, config=RuntimeConfig(sandbox_setup_timeout=300)) +result = await runtime.execute() +``` + ## Multi-agent (reviewer pattern) ```python @@ -95,6 +109,7 @@ config = TrialConfig( ]), ], environment="daytona", + sandbox_setup_timeout=300, ) result = await bf.run(config) ``` diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index e9acc23..9c9b7f2 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -740,6 +740,13 @@ def eval_create( str | None, typer.Option("--sandbox-user", help="Sandbox user (null for root)"), ] = "agent", + sandbox_setup_timeout: Annotated[ + int, + typer.Option( + "--sandbox-setup-timeout", + help="Timeout (seconds) for sandbox user setup inside the environment.", + ), + ] = 120, skills_dir: Annotated[ Path | None, typer.Option("--skills-dir", "-s", help="Skills directory to deploy"), @@ -767,6 +774,7 @@ def eval_create( skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, jobs_dir=jobs_dir, agent=agent, model=eff_model, @@ -796,6 +804,7 @@ async def _run(): environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, + sandbox_setup_timeout=sandbox_setup_timeout, skills_dir=str(skills_dir) if skills_dir else None, ), ) diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 70dfb56..bd53a0b 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -166,6 +166,7 @@ class JobConfig: skills_dir: str | None = None sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + sandbox_setup_timeout: int = 120 context_root: str | None = None exclude_tasks: set[str] = field(default_factory=set) @@ -291,6 +292,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": exclude = set(raw.get("exclude", [])) sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( @@ -304,6 +306,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, exclude_tasks=exclude, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -350,6 +353,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120) config = JobConfig( agent=agent_name, @@ -361,6 +365,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, ) return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs) @@ -428,6 +433,7 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult: skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) trial = await Trial.create(trial_config) @@ -447,6 +453,7 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir), sandbox_user=cfg.sandbox_user, sandbox_locked_paths=cfg.sandbox_locked_paths, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, context_root=cfg.context_root, ) diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py index 14a785e..69be399 100644 --- a/src/benchflow/runtime.py +++ b/src/benchflow/runtime.py @@ -142,6 +142,7 @@ class RuntimeConfig: """Configuration for a Runtime execution.""" sandbox_user: str | None = "agent" + sandbox_setup_timeout: int = 120 max_rounds: int = 10 snapshot_policy: str = "none" reward_stream: bool = True @@ -263,6 +264,7 @@ async def execute(self) -> RuntimeResult: environment=self.env.backend, sandbox_user=config.sandbox_user, sandbox_locked_paths=config.sandbox_locked_paths, + sandbox_setup_timeout=config.sandbox_setup_timeout, jobs_dir=config.jobs_dir, context_root=config.context_root, pre_agent_hooks=config.pre_agent_hooks, @@ -338,6 +340,7 @@ async def run( environment=env if isinstance(env, str) else "docker", sandbox_user=rc.sandbox_user, sandbox_locked_paths=rc.sandbox_locked_paths, + sandbox_setup_timeout=rc.sandbox_setup_timeout, jobs_dir=rc.jobs_dir, context_root=rc.context_root, pre_agent_hooks=rc.pre_agent_hooks, diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py index 28fa82d..b55a72f 100644 --- a/src/benchflow/sdk.py +++ b/src/benchflow/sdk.py @@ -222,6 +222,7 @@ def _write_config( sandbox_user: str | None, context_root: str | Path | None, sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, timeout: int, started_at: datetime, agent_env: dict[str, str], @@ -241,6 +242,7 @@ def _write_config( "skills_dir": str(skills_dir) if skills_dir else None, "sandbox_user": sandbox_user, "sandbox_locked_paths": sandbox_locked_paths, + "sandbox_setup_timeout": sandbox_setup_timeout, "context_root": str(context_root) if context_root else None, "timeout_sec": timeout, "started_at": str(started_at), @@ -442,6 +444,7 @@ async def run( skills_dir: str | Path | None = None, sandbox_user: str | None = "agent", sandbox_locked_paths: list[str] | None = None, + sandbox_setup_timeout: int = 120, pre_agent_hooks: list | None = None, context_root: str | Path | None = None, ) -> RunResult: @@ -490,6 +493,7 @@ async def run( skills_dir=skills_dir, sandbox_user=sandbox_user, sandbox_locked_paths=sandbox_locked_paths, + sandbox_setup_timeout=sandbox_setup_timeout, pre_agent_hooks=pre_agent_hooks, context_root=context_root, ) diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index fd70e5d..7449fc9 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -132,6 +132,7 @@ class TrialConfig: environment: str = "docker" sandbox_user: str | None = "agent" sandbox_locked_paths: list[str] | None = None + sandbox_setup_timeout: int = 120 services: list[str] | None = None job_name: str | None = None trial_name: str | None = None @@ -329,6 +330,7 @@ async def setup(self) -> None: sandbox_user=cfg.sandbox_user, context_root=cfg.context_root, sandbox_locked_paths=self._effective_locked, + sandbox_setup_timeout=cfg.sandbox_setup_timeout, timeout=self._timeout, started_at=self._started_at, agent_env=self._agent_env, @@ -368,7 +370,10 @@ async def install_agent(self) -> None: if cfg.primary_agent == "oracle": if cfg.sandbox_user: await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) @@ -390,7 +395,10 @@ async def install_agent(self) -> None: if cfg.sandbox_user: self._agent_cwd = await setup_sandbox_user( - self._env, cfg.sandbox_user, workspace=self._agent_cwd + self._env, + cfg.sandbox_user, + workspace=self._agent_cwd, + timeout_sec=cfg.sandbox_setup_timeout, ) await _snapshot_build_config(self._env, workspace=self._agent_cwd) await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user) diff --git a/src/benchflow/trial_yaml.py b/src/benchflow/trial_yaml.py index 5b8ca4a..707978b 100644 --- a/src/benchflow/trial_yaml.py +++ b/src/benchflow/trial_yaml.py @@ -103,6 +103,7 @@ def trial_config_from_dict( environment=raw.get("environment", "docker"), sandbox_user=raw.get("sandbox_user", "agent"), sandbox_locked_paths=raw.get("sandbox_locked_paths"), + sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120), job_name=raw.get("job_name"), trial_name=raw.get("trial_name"), jobs_dir=raw.get("jobs_dir", "jobs"), diff --git a/tests/test_runtime.py b/tests/test_runtime.py index 16046fe..37819f7 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -36,6 +36,7 @@ def test_agent_env_default_empty() -> None: def test_runtime_config_defaults() -> None: c = RuntimeConfig() assert c.sandbox_user == "agent" + assert c.sandbox_setup_timeout == 120 assert c.max_rounds == 10 assert c.snapshot_policy == "none" assert c.reward_stream is True @@ -143,9 +144,10 @@ def test_runtime_custom_config() -> None: if not (task_path / "task.toml").exists(): return env = Environment.from_task(task_path, backend="daytona") - config = RuntimeConfig(sandbox_user=None, timeout=1800) + config = RuntimeConfig(sandbox_user=None, timeout=1800, sandbox_setup_timeout=45) runtime = Runtime(env, agent, config) assert runtime.config.sandbox_user is None + assert runtime.config.sandbox_setup_timeout == 45 assert runtime.config.timeout == 1800 diff --git a/tests/test_sdk_internals.py b/tests/test_sdk_internals.py index f5abd3a..6eff44a 100644 --- a/tests/test_sdk_internals.py +++ b/tests/test_sdk_internals.py @@ -7,6 +7,7 @@ import json from datetime import datetime from pathlib import Path +from unittest.mock import AsyncMock import pytest @@ -341,6 +342,7 @@ def test_config_json_written(self, tmp_path): skills_dir=None, sandbox_user=None, context_root=None, + sandbox_setup_timeout=33, timeout=300, started_at=datetime(2026, 4, 8, 12, 0), agent_env={"ANTHROPIC_API_KEY": "sk-secret", "SOME_VAR": "visible"}, @@ -354,6 +356,7 @@ def test_config_json_written(self, tmp_path): "skills_dir", "sandbox_user", "sandbox_locked_paths", + "sandbox_setup_timeout", "context_root", "timeout_sec", "started_at", @@ -365,6 +368,7 @@ def test_config_json_written(self, tmp_path): assert data["agent"] == "claude-agent-acp" assert data["model"] == "claude-haiku-4-5-20251001" assert data["environment"] == "docker" + assert data["sandbox_setup_timeout"] == 33 assert data["timeout_sec"] == 300 def test_secrets_filtered(self, tmp_path): @@ -399,6 +403,41 @@ def test_secrets_filtered(self, tmp_path): assert recorded["SAFE_VAR"] == "visible" +# ── run wiring ── + + +class TestRunWiring: + """Tests for SDK.run() argument forwarding into TrialConfig.""" + + @pytest.mark.asyncio + async def test_run_forwards_sandbox_setup_timeout_to_trial_config( + self, monkeypatch, tmp_path + ): + from benchflow.models import RunResult + from benchflow.sdk import SDK + + seen = {} + + async def fake_create(config): + seen["config"] = config + trial = AsyncMock() + trial.run = AsyncMock( + return_value=RunResult(task_name="task-1", rewards={"reward": 1.0}) + ) + return trial + + monkeypatch.setattr("benchflow.trial.Trial.create", fake_create) + + result = await SDK().run( + task_path=tmp_path, + sandbox_setup_timeout=77, + ) + + assert result.rewards == {"reward": 1.0} + assert seen["config"].sandbox_setup_timeout == 77 + assert seen["config"].task_path == tmp_path + + # ── _build_result ── diff --git a/tests/test_trial_install_agent_timeout.py b/tests/test_trial_install_agent_timeout.py new file mode 100644 index 0000000..40db22c --- /dev/null +++ b/tests/test_trial_install_agent_timeout.py @@ -0,0 +1,91 @@ +"""Tests for Trial.install_agent timeout wiring.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from benchflow.trial import Trial, TrialConfig + + +def _make_trial(tmp_path, *, agent: str, sandbox_setup_timeout: int) -> Trial: + config = TrialConfig.from_legacy( + task_path=tmp_path / "task", + agent=agent, + prompts=[None], + sandbox_user="agent", + sandbox_setup_timeout=sandbox_setup_timeout, + ) + trial = Trial(config) + trial._env = MagicMock() + trial._env.exec = AsyncMock(return_value=MagicMock(stdout="/workspace\n")) + trial._trial_dir = tmp_path / "trial" + trial._trial_dir.mkdir() + trial._trial_paths = MagicMock() + trial._task = MagicMock() + trial._effective_locked = [] + return trial + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("agent", "expected_setup_return"), + [ + ("claude-agent-acp", "/home/agent"), + ("oracle", None), + ], +) +async def test_install_agent_forwards_sandbox_setup_timeout( + tmp_path, monkeypatch, agent, expected_setup_return +): + trial = _make_trial(tmp_path, agent=agent, sandbox_setup_timeout=41) + + install_agent_mock = AsyncMock(return_value=MagicMock()) + write_credential_files_mock = AsyncMock() + upload_subscription_auth_mock = AsyncMock() + snapshot_build_config_mock = AsyncMock() + seed_verifier_workspace_mock = AsyncMock() + deploy_skills_mock = AsyncMock() + lockdown_paths_mock = AsyncMock() + setup_sandbox_user_mock = AsyncMock(return_value=expected_setup_return) + + monkeypatch.setattr("benchflow.trial.install_agent", install_agent_mock) + monkeypatch.setattr( + "benchflow.trial.write_credential_files", write_credential_files_mock + ) + monkeypatch.setattr( + "benchflow.trial.upload_subscription_auth", upload_subscription_auth_mock + ) + monkeypatch.setattr( + "benchflow.trial._snapshot_build_config", snapshot_build_config_mock + ) + monkeypatch.setattr( + "benchflow.trial._seed_verifier_workspace", seed_verifier_workspace_mock + ) + monkeypatch.setattr("benchflow.trial.deploy_skills", deploy_skills_mock) + monkeypatch.setattr("benchflow.trial.lockdown_paths", lockdown_paths_mock) + monkeypatch.setattr( + "benchflow.trial.setup_sandbox_user", setup_sandbox_user_mock + ) + + await trial.install_agent() + + setup_sandbox_user_mock.assert_awaited_once() + args, kwargs = setup_sandbox_user_mock.await_args + assert args[1] == "agent" + assert kwargs["timeout_sec"] == 41 + assert kwargs["workspace"] == "/workspace" + + if agent == "oracle": + install_agent_mock.assert_not_awaited() + write_credential_files_mock.assert_not_awaited() + deploy_skills_mock.assert_not_awaited() + assert trial._agent_cwd == "/workspace" + else: + install_agent_mock.assert_awaited_once() + write_credential_files_mock.assert_awaited_once() + deploy_skills_mock.assert_awaited_once() + assert trial._agent_cwd == "/home/agent" + + snapshot_build_config_mock.assert_awaited_once() + seed_verifier_workspace_mock.assert_awaited_once() + lockdown_paths_mock.assert_awaited_once() diff --git a/tests/test_yaml_config.py b/tests/test_yaml_config.py index d362296..0de12aa 100644 --- a/tests/test_yaml_config.py +++ b/tests/test_yaml_config.py @@ -24,6 +24,7 @@ def native_yaml(tmp_path): environment: daytona concurrency: 32 max_retries: 1 +sandbox_setup_timeout: 45 prompts: - null - "Review your solution." @@ -46,6 +47,7 @@ def harbor_yaml(tmp_path): orchestrator: type: local n_concurrent_trials: 8 +sandbox_setup_timeout: 75 environment: type: daytona env: @@ -69,6 +71,7 @@ def test_from_native_yaml(native_yaml): assert cfg.environment == "daytona" assert cfg.concurrency == 32 assert cfg.retry.max_retries == 1 + assert cfg.sandbox_setup_timeout == 45 assert cfg.prompts == [None, "Review your solution."] assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -85,6 +88,7 @@ def test_from_harbor_yaml(harbor_yaml): assert cfg.concurrency == 8 assert cfg.retry.max_retries == 1 # n_attempts=2 → max_retries=1 assert cfg.agent_env.get("ANTHROPIC_API_KEY") == "test-key" + assert cfg.sandbox_setup_timeout == 75 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("output") @@ -127,6 +131,7 @@ def test_from_harbor_yaml_defaults(tmp_path): assert cfg.agent == "pi-acp" assert cfg.environment == "docker" assert cfg.concurrency == 4 + assert cfg.sandbox_setup_timeout == 120 assert job._tasks_dir == Path("tasks") assert job._jobs_dir == Path("jobs") @@ -253,3 +258,4 @@ def test_defaults_when_omitted(self, tmp_path): assert job._config.exclude_tasks == set() assert job._config.agent_env == {} assert job._config.sandbox_user == "agent" + assert job._config.sandbox_setup_timeout == 120