benchflow-ai · xdotli · Apr 25, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/docs/api-reference.md b/docs/api-reference.md
@@ -34,6 +34,7 @@ config = TrialConfig(
     task_path=Path("tasks/my-task"),
     scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")],
     environment="daytona",
+    sandbox_setup_timeout=120,
 )
 
 # Multi-scene BYOS (skill-gen → solve)
@@ -46,9 +47,13 @@ config = TrialConfig(
               turns=[Turn("solver")]),
     ],
     environment="daytona",
+    sandbox_setup_timeout=120,
 )
 ```
 
+Set `sandbox_setup_timeout` (in seconds) when sandbox user setup needs more than the default of 120 seconds; the examples above pass the default explicitly.
+The same field is also available on `JobConfig` and `RuntimeConfig`.
+
 ### Scene
 
 One interaction region — roles take turns executing prompts.
@@ -98,6 +103,20 @@ await trial.verify()
 await trial.cleanup()
 ```
 
+### RuntimeConfig
+
+Runtime-level configuration for the `Agent + Environment` execution path.
+
+```python
+from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig
+
+config = RuntimeConfig(sandbox_setup_timeout=300)
+agent = Agent("gemini", model="gemini-3.1-flash-lite-preview")
+env = Environment.from_task("tasks/X", backend="daytona")
+runtime = Runtime(env, agent, config=config)
+result = await runtime.execute()
+```
+
 ### bf.run()
 
 Convenience function — multiple calling conventions:
@@ -111,10 +130,16 @@ result = await bf.run(config)
 # 2. Agent + Environment (0.3 style)
 agent = bf.Agent("gemini", model="gemini-3.1-flash-lite-preview")
 env = bf.Environment.from_task("tasks/X", backend="daytona")
-result = await bf.run(agent, env)
+runtime_config = bf.RuntimeConfig(sandbox_setup_timeout=300)
+result = await bf.run(agent, env, runtime_config)
 
 # 3. String shortcut (simplest)
-result = await bf.run("gemini", task_path="tasks/X", model="gemini-3.1-flash-lite-preview")
+result = await bf.run(
+    "gemini",
+    task_path="tasks/X",
+    model="gemini-3.1-flash-lite-preview",
+    config=bf.RuntimeConfig(sandbox_setup_timeout=300),
+)
 ```
 
 ## Trial Lifecycle

diff --git a/docs/cli-reference.md b/docs/cli-reference.md
@@ -40,7 +40,8 @@ bench eval create \
   -a gemini \
   -m gemini-3.1-flash-lite-preview \
   -e daytona \
-  -c 64
+  -c 64 \
+  --sandbox-setup-timeout 300
 ```
 
 | Flag | Default | Description |
@@ -53,6 +54,7 @@ bench eval create \
 | `--concurrency`, `-c` | `4` | Max concurrent tasks (batch mode only) |
 | `--jobs-dir`, `-o` | `jobs` | Output directory |
 | `--sandbox-user` | `agent` | Sandbox user (null for root) |
+| `--sandbox-setup-timeout` | `120` | Timeout in seconds for sandbox user setup |
 
 ### bench eval list
 
@@ -145,6 +147,7 @@ bench environment list
 task_dir: .ref/terminal-bench-2
 environment: daytona
 concurrency: 64
+sandbox_setup_timeout: 300
 
 scenes:
   - name: solve
@@ -165,6 +168,7 @@ model: gemini-3.1-flash-lite-preview
 environment: daytona
 concurrency: 64
 max_retries: 2
+sandbox_setup_timeout: 300
 ```
 
 ### Multi-scene (BYOS skill generation)
@@ -173,6 +177,7 @@ max_retries: 2
 task_dir: tasks/
 environment: daytona
 concurrency: 10
+sandbox_setup_timeout: 300
 
 scenes:
   - name: skill-gen

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -26,7 +26,8 @@ bench eval create \
   -t .ref/terminal-bench-2/regex-log \
   -a gemini \
   -m gemini-3.1-flash-lite-preview \
-  -e daytona
+  -e daytona \
+  --sandbox-setup-timeout 300
 ```
 
 BenchFlow will:
@@ -52,6 +53,7 @@ model: gemini-3.1-flash-lite-preview
 environment: daytona
 concurrency: 64
 max_retries: 2
+sandbox_setup_timeout: 300
 ```
 
 ## Python API
@@ -70,11 +72,23 @@ config = TrialConfig(
     task_path=Path("tasks/regex-log"),
     scenes=[Scene.single(agent="gemini", model="gemini-3.1-flash-lite-preview")],
     environment="daytona",
+    sandbox_setup_timeout=300,
 )
 trial = await Trial.create(config)
 result = await trial.run()
 ```
 
+If you are using the `Agent + Environment` path directly, pass the timeout through `RuntimeConfig`:
+
+```python
+from benchflow.runtime import Agent, Environment, Runtime, RuntimeConfig
+
+agent = Agent("gemini", model="gemini-3.1-flash-lite-preview")
+env = Environment.from_task("tasks/regex-log", backend="daytona")
+runtime = Runtime(env, agent, config=RuntimeConfig(sandbox_setup_timeout=300))
+result = await runtime.execute()
+```
+
 ## Multi-agent (reviewer pattern)
 
 ```python
@@ -95,6 +109,7 @@ config = TrialConfig(
               ]),
     ],
     environment="daytona",
+    sandbox_setup_timeout=300,
 )
 result = await bf.run(config)
 ```

diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py
@@ -740,6 +740,13 @@ def eval_create(
         str | None,
         typer.Option("--sandbox-user", help="Sandbox user (null for root)"),
     ] = "agent",
+    sandbox_setup_timeout: Annotated[
+        int,
+        typer.Option(
+            "--sandbox-setup-timeout",
+            help="Timeout (seconds) for sandbox user setup inside the environment.",
+        ),
+    ] = 120,
     skills_dir: Annotated[
         Path | None,
         typer.Option("--skills-dir", "-s", help="Skills directory to deploy"),
@@ -767,6 +774,7 @@ def eval_create(
                                      skills_dir=str(skills_dir) if skills_dir else None)],
                 environment=environment,
                 sandbox_user=sandbox_user,
+                sandbox_setup_timeout=sandbox_setup_timeout,
                 jobs_dir=jobs_dir,
                 agent=agent,
                 model=eff_model,
@@ -796,6 +804,7 @@ async def _run():
                     environment=environment,
                     concurrency=concurrency,
                     sandbox_user=sandbox_user,
+                    sandbox_setup_timeout=sandbox_setup_timeout,
                     skills_dir=str(skills_dir) if skills_dir else None,
                 ),
             )

diff --git a/src/benchflow/job.py b/src/benchflow/job.py
@@ -166,6 +166,7 @@ class JobConfig:
     skills_dir: str | None = None
     sandbox_user: str | None = "agent"
     sandbox_locked_paths: list[str] | None = None
+    sandbox_setup_timeout: int = 120
     context_root: str | None = None
     exclude_tasks: set[str] = field(default_factory=set)
 
@@ -291,6 +292,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
         exclude = set(raw.get("exclude", []))
         sandbox_user = raw.get("sandbox_user", "agent")
         sandbox_locked_paths = raw.get("sandbox_locked_paths")
+        sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120)
 
         agent_name = raw.get("agent", DEFAULT_AGENT)
         config = JobConfig(
@@ -304,6 +306,7 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
             skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None,
             sandbox_user=sandbox_user,
             sandbox_locked_paths=sandbox_locked_paths,
+            sandbox_setup_timeout=sandbox_setup_timeout,
             exclude_tasks=exclude,
         )
         return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)
@@ -350,6 +353,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
         skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None
         sandbox_user = raw.get("sandbox_user", "agent")
         sandbox_locked_paths = raw.get("sandbox_locked_paths")
+        sandbox_setup_timeout = raw.get("sandbox_setup_timeout", 120)
 
         config = JobConfig(
             agent=agent_name,
@@ -361,6 +365,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
             skills_dir=skills_dir,
             sandbox_user=sandbox_user,
             sandbox_locked_paths=sandbox_locked_paths,
+            sandbox_setup_timeout=sandbox_setup_timeout,
         )
         return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)
 
@@ -428,6 +433,7 @@ async def _run_single_task(self, task_dir: Path, cfg: JobConfig) -> RunResult:
             skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir),
             sandbox_user=cfg.sandbox_user,
             sandbox_locked_paths=cfg.sandbox_locked_paths,
+            sandbox_setup_timeout=cfg.sandbox_setup_timeout,
             context_root=cfg.context_root,
         )
         trial = await Trial.create(trial_config)
@@ -447,6 +453,7 @@ async def _run_single_task_legacy(self, task_dir: Path, cfg: JobConfig) -> RunRe
             skills_dir=self._resolve_skills_dir(task_dir, cfg.skills_dir),
             sandbox_user=cfg.sandbox_user,
             sandbox_locked_paths=cfg.sandbox_locked_paths,
+            sandbox_setup_timeout=cfg.sandbox_setup_timeout,
             context_root=cfg.context_root,
         )
 

diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py
@@ -142,6 +142,7 @@ class RuntimeConfig:
     """Configuration for a Runtime execution."""
 
     sandbox_user: str | None = "agent"
+    sandbox_setup_timeout: int = 120
     max_rounds: int = 10
     snapshot_policy: str = "none"
     reward_stream: bool = True
@@ -263,6 +264,7 @@ async def execute(self) -> RuntimeResult:
             environment=self.env.backend,
             sandbox_user=config.sandbox_user,
             sandbox_locked_paths=config.sandbox_locked_paths,
+            sandbox_setup_timeout=config.sandbox_setup_timeout,
             jobs_dir=config.jobs_dir,
             context_root=config.context_root,
             pre_agent_hooks=config.pre_agent_hooks,
@@ -338,6 +340,7 @@ async def run(
             environment=env if isinstance(env, str) else "docker",
             sandbox_user=rc.sandbox_user,
             sandbox_locked_paths=rc.sandbox_locked_paths,
+            sandbox_setup_timeout=rc.sandbox_setup_timeout,
             jobs_dir=rc.jobs_dir,
             context_root=rc.context_root,
             pre_agent_hooks=rc.pre_agent_hooks,

diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py
@@ -222,6 +222,7 @@ def _write_config(
         sandbox_user: str | None,
         context_root: str | Path | None,
         sandbox_locked_paths: list[str] | None = None,
+        sandbox_setup_timeout: int = 120,
         timeout: int,
         started_at: datetime,
         agent_env: dict[str, str],
@@ -241,6 +242,7 @@ def _write_config(
             "skills_dir": str(skills_dir) if skills_dir else None,
             "sandbox_user": sandbox_user,
             "sandbox_locked_paths": sandbox_locked_paths,
+            "sandbox_setup_timeout": sandbox_setup_timeout,
             "context_root": str(context_root) if context_root else None,
             "timeout_sec": timeout,
             "started_at": str(started_at),
@@ -442,6 +444,7 @@ async def run(
         skills_dir: str | Path | None = None,
         sandbox_user: str | None = "agent",
         sandbox_locked_paths: list[str] | None = None,
+        sandbox_setup_timeout: int = 120,
         pre_agent_hooks: list | None = None,
         context_root: str | Path | None = None,
     ) -> RunResult:
@@ -490,6 +493,7 @@ async def run(
             skills_dir=skills_dir,
             sandbox_user=sandbox_user,
             sandbox_locked_paths=sandbox_locked_paths,
+            sandbox_setup_timeout=sandbox_setup_timeout,
             pre_agent_hooks=pre_agent_hooks,
             context_root=context_root,
         )

diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py
@@ -132,6 +132,7 @@ class TrialConfig:
     environment: str = "docker"
     sandbox_user: str | None = "agent"
     sandbox_locked_paths: list[str] | None = None
+    sandbox_setup_timeout: int = 120
     services: list[str] | None = None
     job_name: str | None = None
     trial_name: str | None = None
@@ -329,6 +330,7 @@ async def setup(self) -> None:
             sandbox_user=cfg.sandbox_user,
             context_root=cfg.context_root,
             sandbox_locked_paths=self._effective_locked,
+            sandbox_setup_timeout=cfg.sandbox_setup_timeout,
             timeout=self._timeout,
             started_at=self._started_at,
             agent_env=self._agent_env,
@@ -368,7 +370,10 @@ async def install_agent(self) -> None:
         if cfg.primary_agent == "oracle":
             if cfg.sandbox_user:
                 await setup_sandbox_user(
-                    self._env, cfg.sandbox_user, workspace=self._agent_cwd
+                    self._env,
+                    cfg.sandbox_user,
+                    workspace=self._agent_cwd,
+                    timeout_sec=cfg.sandbox_setup_timeout,
                 )
             await _snapshot_build_config(self._env, workspace=self._agent_cwd)
             await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user)
@@ -390,7 +395,10 @@ async def install_agent(self) -> None:
 
         if cfg.sandbox_user:
             self._agent_cwd = await setup_sandbox_user(
-                self._env, cfg.sandbox_user, workspace=self._agent_cwd
+                self._env,
+                cfg.sandbox_user,
+                workspace=self._agent_cwd,
+                timeout_sec=cfg.sandbox_setup_timeout,
             )
         await _snapshot_build_config(self._env, workspace=self._agent_cwd)
         await _seed_verifier_workspace(self._env, workspace=self._agent_cwd, sandbox_user=cfg.sandbox_user)

diff --git a/src/benchflow/trial_yaml.py b/src/benchflow/trial_yaml.py
@@ -103,6 +103,7 @@ def trial_config_from_dict(
         environment=raw.get("environment", "docker"),
         sandbox_user=raw.get("sandbox_user", "agent"),
         sandbox_locked_paths=raw.get("sandbox_locked_paths"),
+        sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120),
         job_name=raw.get("job_name"),
         trial_name=raw.get("trial_name"),
         jobs_dir=raw.get("jobs_dir", "jobs"),

diff --git a/tests/test_runtime.py b/tests/test_runtime.py
@@ -36,6 +36,7 @@ def test_agent_env_default_empty() -> None:
 def test_runtime_config_defaults() -> None:
     c = RuntimeConfig()
     assert c.sandbox_user == "agent"
+    assert c.sandbox_setup_timeout == 120
     assert c.max_rounds == 10
     assert c.snapshot_policy == "none"
     assert c.reward_stream is True
@@ -143,9 +144,10 @@ def test_runtime_custom_config() -> None:
     if not (task_path / "task.toml").exists():
         return
     env = Environment.from_task(task_path, backend="daytona")
-    config = RuntimeConfig(sandbox_user=None, timeout=1800)
+    config = RuntimeConfig(sandbox_user=None, timeout=1800, sandbox_setup_timeout=45)
     runtime = Runtime(env, agent, config)
     assert runtime.config.sandbox_user is None
+    assert runtime.config.sandbox_setup_timeout == 45
     assert runtime.config.timeout == 1800