Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions docs/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ config = TrialConfig(
task_path=Path("tasks/my-task"),
scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")],
environment="daytona",
sandbox_setup_timeout=120,
)

# Multi-scene BYOS (skill-gen → solve)
Expand All @@ -46,9 +47,13 @@ config = TrialConfig(
turns=[Turn("solver")]),
],
environment="daytona",
sandbox_setup_timeout=120,
)
```

`sandbox_setup_timeout` is the time limit, in seconds, for sandbox user setup. The default is 120, which the examples above pass explicitly; raise it when setup needs longer.
The same field is also available on `JobConfig` and `RuntimeConfig`.

### Scene

One interaction region — roles take turns executing prompts.
Expand Down Expand Up @@ -98,6 +103,20 @@ await trial.verify()
await trial.cleanup()
```

### RuntimeConfig

Runtime-level configuration for the `Agent + Environment` execution path.

```python
from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig

config = RuntimeConfig(sandbox_setup_timeout=300)
agent = Agent("gemini", model="gemini-3.1-flash-lite-preview")
env = Environment.from_task("tasks/X", backend="daytona")
runtime = Runtime(env, agent, config=config)
result = await runtime.execute()
```

### bf.run()

Convenience function — multiple calling conventions:
Expand All @@ -111,10 +130,16 @@ result = await bf.run(config)
# 2. Agent + Environment (0.3 style)
agent = bf.Agent("gemini", model="gemini-3.1-flash-lite-preview")
env = bf.Environment.from_task("tasks/X", backend="daytona")
result = await bf.run(agent, env)
runtime_config = bf.RuntimeConfig(sandbox_setup_timeout=300)
result = await bf.run(agent, env, runtime_config)

# 3. String shortcut (simplest)
result = await bf.run("gemini", task_path="tasks/X", model="gemini-3.1-flash-lite-preview")
result = await bf.run(
"gemini",
task_path="tasks/X",
model="gemini-3.1-flash-lite-preview",
config=bf.RuntimeConfig(sandbox_setup_timeout=300),
)
```

## Trial Lifecycle
Expand Down
7 changes: 6 additions & 1 deletion docs/cli-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ bench eval create \
-a gemini \
-m gemini-3.1-flash-lite-preview \
-e daytona \
-c 64
-c 64 \
--sandbox-setup-timeout 300
```

| Flag | Default | Description |
Expand All @@ -53,6 +54,7 @@ bench eval create \
| `--concurrency`, `-c` | `4` | Max concurrent tasks (batch mode only) |
| `--jobs-dir`, `-o` | `jobs` | Output directory |
| `--sandbox-user` | `agent` | Sandbox user (null for root) |
| `--sandbox-setup-timeout` | `120` | Timeout in seconds for sandbox user setup |

### bench eval list

Expand Down Expand Up @@ -145,6 +147,7 @@ bench environment list
task_dir: .ref/terminal-bench-2
environment: daytona
concurrency: 64
sandbox_setup_timeout: 300

scenes:
- name: solve
Expand All @@ -165,6 +168,7 @@ model: gemini-3.1-flash-lite-preview
environment: daytona
concurrency: 64
max_retries: 2
sandbox_setup_timeout: 300
```

### Multi-scene (BYOS skill generation)
Expand All @@ -173,6 +177,7 @@ max_retries: 2
task_dir: tasks/
environment: daytona
concurrency: 10
sandbox_setup_timeout: 300

scenes:
- name: skill-gen
Expand Down
17 changes: 16 additions & 1 deletion docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ bench eval create \
-t .ref/terminal-bench-2/regex-log \
-a gemini \
-m gemini-3.1-flash-lite-preview \
-e daytona
-e daytona \
--sandbox-setup-timeout 300
```

BenchFlow will:
Expand All @@ -52,6 +53,7 @@ model: gemini-3.1-flash-lite-preview
environment: daytona
concurrency: 64
max_retries: 2
sandbox_setup_timeout: 300
```

## Python API
Expand All @@ -70,11 +72,23 @@ config = TrialConfig(
task_path=Path("tasks/regex-log"),
scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")],
environment="daytona",
sandbox_setup_timeout=300,
)
trial = await Trial.create(config)
result = await trial.run()
```

If you are using the `Agent + Environment` path directly, pass the timeout through `RuntimeConfig`:

```python
from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig

agent = Agent("gemini", model="gemini-3.1-flash-lite-preview")
env = Environment.from_task("tasks/regex-log", backend="daytona")
runtime = Runtime(env, agent, config=RuntimeConfig(sandbox_setup_timeout=300))
result = await runtime.execute()
```

## Multi-agent (reviewer pattern)

```python
Expand All @@ -95,6 +109,7 @@ config = TrialConfig(
]),
],
environment="daytona",
sandbox_setup_timeout=300,
)
result = await bf.run(config)
```
Expand Down
9 changes: 9 additions & 0 deletions src/benchflow/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,13 @@ def eval_create(
str | None,
typer.Option("--sandbox-user", help="Sandbox user (null for root)"),
] = "agent",
sandbox_setup_timeout: Annotated[
int,
typer.Option(
"--sandbox-setup-timeout",
help="Timeout (seconds) for sandbox user setup inside the environment.",
),
] = 120,
skills_dir: Annotated[
Path | None,
typer.Option("--skills-dir", "-s", help="Skills directory to deploy"),
Expand Down Expand Up @@ -767,6 +774,7 @@ def eval_create(
skills_dir=str(skills_dir) if skills_dir else None)],
environment=environment,
sandbox_user=sandbox_user,
sandbox_setup_timeout=sandbox_setup_timeout,
jobs_dir=jobs_dir,
agent=agent,
model=eff_model,
Expand Down Expand Up @@ -796,6 +804,7 @@ async def _run():
environment=environment,
concurrency=concurrency,
sandbox_user=sandbox_user,
sandbox_setup_timeout=sandbox_setup_timeout,
skills_dir=str(skills_dir) if skills_dir else None,
),
)
Expand Down
7 changes: 7 additions & 0 deletions src/benchflow/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class JobConfig:
skills_dir: str | None = None
sandbox_user: str | None = "agent"
sandbox_locked_paths: list[str] | None = None
sandbox_setup_timeout: int = 120
context_root: str | None = None
exclude_tasks: set[str] = field(default_factory=set)

Expand Down Expand Up @@ -291,6 +292,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
exclude = set(raw.get("exclude", []))
sandbox_user = raw.get("sandbox_user", "agent")
sandbox_locked_paths = raw.get("sandbox_locked_paths")
sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120)

agent_name = raw.get("agent", DEFAULT_AGENT)
config = JobConfig(
Expand All @@ -304,6 +306,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None,
sandbox_user=sandbox_user,
sandbox_locked_paths=sandbox_locked_paths,
sandbox_setup_timeout=sandbox_setup_timeout,
exclude_tasks=exclude,
)
return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)
Expand Down Expand Up @@ -350,6 +353,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None
sandbox_user = raw.get("sandbox_user", "agent")
sandbox_locked_paths = raw.get("sandbox_locked_paths")
sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120)

config = JobConfig(
agent=agent_name,
Expand All @@ -361,6 +365,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
skills_dir=skills_dir,
sandbox_user=sandbox_user,
sandbox_locked_paths=sandbox_locked_paths,
sandbox_setup_timeout=sandbox_setup_timeout,
)
return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)

Expand Down Expand Up @@ -428,6 +433,7 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult:
skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir),
sandbox_user=cfg.sandbox_user,
sandbox_locked_paths=cfg.sandbox_locked_paths,
sandbox_setup_timeout=cfg.sandbox_setup_timeout,
context_root=cfg.context_root,
)
trial = await Trial.create(trial_config)
Expand All @@ -447,6 +453,7 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe
skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir),
sandbox_user=cfg.sandbox_user,
sandbox_locked_paths=cfg.sandbox_locked_paths,
sandbox_setup_timeout=cfg.sandbox_setup_timeout,
context_root=cfg.context_root,
)

Expand Down
3 changes: 3 additions & 0 deletions src/benchflow/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class RuntimeConfig:
"""Configuration for a Runtime execution."""

sandbox_user: str | None = "agent"
sandbox_setup_timeout: int = 120
max_rounds: int = 10
snapshot_policy: str = "none"
reward_stream: bool = True
Expand Down Expand Up @@ -263,6 +264,7 @@ async def execute(self) -> RuntimeResult:
environment=self.env.backend,
sandbox_user=config.sandbox_user,
sandbox_locked_paths=config.sandbox_locked_paths,
sandbox_setup_timeout=config.sandbox_setup_timeout,
jobs_dir=config.jobs_dir,
context_root=config.context_root,
pre_agent_hooks=config.pre_agent_hooks,
Expand Down Expand Up @@ -338,6 +340,7 @@ async def run(
environment=env if isinstance(env, str) else "docker",
sandbox_user=rc.sandbox_user,
sandbox_locked_paths=rc.sandbox_locked_paths,
sandbox_setup_timeout=rc.sandbox_setup_timeout,
jobs_dir=rc.jobs_dir,
context_root=rc.context_root,
pre_agent_hooks=rc.pre_agent_hooks,
Expand Down
4 changes: 4 additions & 0 deletions src/benchflow/sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ def _write_config(
sandbox_user: str | None,
context_root: str | Path | None,
sandbox_locked_paths: list[str] | None = None,
sandbox_setup_timeout: int = 120,
timeout: int,
started_at: datetime,
agent_env: dict[str, str],
Expand All @@ -241,6 +242,7 @@ def _write_config(
"skills_dir": str(skills_dir) if skills_dir else None,
"sandbox_user": sandbox_user,
"sandbox_locked_paths": sandbox_locked_paths,
"sandbox_setup_timeout": sandbox_setup_timeout,
"context_root": str(context_root) if context_root else None,
"timeout_sec": timeout,
"started_at": str(started_at),
Expand Down Expand Up @@ -442,6 +444,7 @@ async def run(
skills_dir: str | Path | None = None,
sandbox_user: str | None = "agent",
sandbox_locked_paths: list[str] | None = None,
sandbox_setup_timeout: int = 120,
pre_agent_hooks: list | None = None,
context_root: str | Path | None = None,
) -> RunResult:
Expand Down Expand Up @@ -490,6 +493,7 @@ async def run(
skills_dir=skills_dir,
sandbox_user=sandbox_user,
sandbox_locked_paths=sandbox_locked_paths,
sandbox_setup_timeout=sandbox_setup_timeout,
pre_agent_hooks=pre_agent_hooks,
context_root=context_root,
)
Expand Down
12 changes: 10 additions & 2 deletions src/benchflow/trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class TrialConfig:
environment: str = "docker"
sandbox_user: str | None = "agent"
sandbox_locked_paths: list[str] | None = None
sandbox_setup_timeout: int = 120
services: list[str] | None = None
job_name: str | None = None
trial_name: str | None = None
Expand Down Expand Up @@ -329,6 +330,7 @@ async def setup(self) -> None:
sandbox_user=cfg.sandbox_user,
context_root=cfg.context_root,
sandbox_locked_paths=self._effective_locked,
sandbox_setup_timeout=cfg.sandbox_setup_timeout,
timeout=self._timeout,
started_at=self._started_at,
agent_env=self._agent_env,
Expand Down Expand Up @@ -368,7 +370,10 @@ async def install_agent(self) -> None:
if cfg.primary_agent == "oracle":
if cfg.sandbox_user:
await setup_sandbox_user(
self._env, cfg.sandbox_user, workspace=self._agent_cwd
self._env,
cfg.sandbox_user,
workspace=self._agent_cwd,
timeout_sec=cfg.sandbox_setup_timeout,
)
await _snapshot_build_config(self._env, workspace=self._agent_cwd)
await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user)
Expand All @@ -390,7 +395,10 @@ async def install_agent(self) -> None:

if cfg.sandbox_user:
self._agent_cwd = await setup_sandbox_user(
self._env, cfg.sandbox_user, workspace=self._agent_cwd
self._env,
cfg.sandbox_user,
workspace=self._agent_cwd,
timeout_sec=cfg.sandbox_setup_timeout,
)
await _snapshot_build_config(self._env, workspace=self._agent_cwd)
await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user)
Expand Down
1 change: 1 addition & 0 deletions src/benchflow/trial_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def trial_config_from_dict(
environment=raw.get("environment", "docker"),
sandbox_user=raw.get("sandbox_user", "agent"),
sandbox_locked_paths=raw.get("sandbox_locked_paths"),
sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120),
job_name=raw.get("job_name"),
trial_name=raw.get("trial_name"),
jobs_dir=raw.get("jobs_dir", "jobs"),
Expand Down
4 changes: 3 additions & 1 deletion tests/test_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def test_agent_env_default_empty() -> None:
def test_runtime_config_defaults() -> None:
c = RuntimeConfig()
assert c.sandbox_user == "agent"
assert c.sandbox_setup_timeout == 120
assert c.max_rounds == 10
assert c.snapshot_policy == "none"
assert c.reward_stream is True
Expand Down Expand Up @@ -143,9 +144,10 @@ def test_runtime_custom_config() -> None:
if not (task_path / "task.toml").exists():
return
env = Environment.from_task(task_path, backend="daytona")
config = RuntimeConfig(sandbox_user=None, timeout=1800)
config = RuntimeConfig(sandbox_user=None, timeout=1800, sandbox_setup_timeout=45)
runtime = Runtime(env, agent, config)
assert runtime.config.sandbox_user is None
assert runtime.config.sandbox_setup_timeout == 45
assert runtime.config.timeout == 1800


Expand Down
Loading