Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ Trial.run()
├─ setup() — resolve config, create env object
├─ start() — spin up sandbox, upload task files, start services
├─ install_agent() — install agent binary, credentials, sandbox user
│ (sandbox user setup: create non-root user, prepare
│ small config/auth dirs, chown the workspace — no
│ recursive copy of /root tool trees; agent binaries
│ must live on shared prefixes like /usr/local/bin)
├─ for scene in scenes:
│ └─ _run_scene(scene)
│ ├─ setup /app/.outbox/ — (multi-role scenes only)
Expand Down
2 changes: 2 additions & 0 deletions docs/task-authoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ env = { OPENAI_API_KEY = "${OPENAI_API_KEY}" } # host vars to injec

**Built-in mock services** — if the Dockerfile references a service binary (`claw-gmail`, `claw-slack`, `claw-gcal`, `claw-gdoc`, `claw-gdrive`), BenchFlow starts it automatically. No `[services]` section needed.

**Install tooling to shared prefixes, not `/root`** — when a task image ships Node.js, Python tools, or agent binaries that the sandbox user must execute, install them to `/usr/local/bin`, `/usr/local/lib`, or `/opt`, not `/root/.nvm` or `/root/.local/bin`. `setup_sandbox_user()` creates the non-root user, prepares small config/auth dirs, and chowns the workspace — it does not clone `/root` into the sandbox home. Legacy images that already install tools under `/root` still work via a narrow symlink fallback, but shared prefixes are the supported path. Pre-creating the sandbox user in the Dockerfile is an optional speedup, not a requirement.

---

## instruction.md
Expand Down
80 changes: 55 additions & 25 deletions src/benchflow/_agent_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,47 @@
logger = logging.getLogger(__name__)


def _skill_link_cmd(source: str, dest: str) -> str:
"""Link a shared skills tree into an agent discovery path."""
if source == dest:
return f"mkdir -p {shlex.quote(dest)}"

parent = shlex.quote(str(Path(dest).parent))
q_source = shlex.quote(source)
q_dest = shlex.quote(dest)
return (
f"mkdir -p {parent} && "
f"rm -rf {q_dest} && "
f"ln -sfn {q_source} {q_dest}"
)


async def _link_skill_paths(env, source: str, skill_paths: list[str], home: str, cwd: str) -> int:
"""Link one shared skills tree into each configured discovery path."""
parts = []
for sp in skill_paths:
expanded = sp.replace("$HOME", home).replace("$WORKSPACE", cwd)
parts.append(_skill_link_cmd(source, expanded))
if parts:
cmd = " && ".join(parts)
result = await env.exec(cmd, timeout_sec=15)
if result.return_code != 0:
stdout = (getattr(result, "stdout", "") or "").strip()
stderr = (getattr(result, "stderr", "") or "").strip()
details = [
f"exit code {result.return_code}",
f"command: {cmd}",
]
if stdout:
details.append(f"stdout: {stdout}")
if stderr:
details.append(f"stderr: {stderr}")
raise RuntimeError(
f"Failed to link skills from {source}: {'; '.join(details)}"
)
return len(parts)


async def install_agent(env, agent: str, trial_dir: Path) -> AgentConfig | None:
"""Install agent in sandbox and return its config."""
agent_base = agent.split()[0]
Expand Down Expand Up @@ -86,6 +127,9 @@ async def deploy_skills(
task: "Task",
) -> None:
"""Deploy and distribute skills into sandbox."""
task_skills_dir = task.config.environment.skills_dir
effective_skills = task_skills_dir

# Runtime upload (fallback if not baked into Dockerfile)
if skills_dir:
dockerfile = task_path / "environment" / "Dockerfile"
Expand All @@ -98,38 +142,24 @@ async def deploy_skills(
if skills_path.is_dir():
logger.info(f"Deploying skills via runtime upload from {skills_path}")
await env.upload_dir(skills_path, "/skills")
if agent_cfg and agent_cfg.skill_paths:
parts = []
for sp in agent_cfg.skill_paths:
expanded = sp.replace("$HOME", "/root").replace(
"$WORKSPACE", "/app"
)
parent = str(Path(expanded).parent)
parts.append(
f"mkdir -p '{parent}' && ln -sf /skills '{expanded}'"
)
await env.exec(" && ".join(parts), timeout_sec=10)
logger.info("Skills deployed to /skills and symlinked")
logger.info("Skills deployed to /skills")
effective_skills = "/skills"
else:
logger.warning(f"Skills dir not found: {skills_path}")
else:
logger.info("Skills already injected via Dockerfile")
Comment on lines 149 to 150
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 deploy_skills uses wrong source path when skills are already injected via Dockerfile

When skills_dir is provided AND the Dockerfile already contains COPY _deps/skills /skills/ (the already_injected path at src/benchflow/_agent_setup.py:126), effective_skills stays set to task_skills_dir (line 117) instead of being updated to "/skills". In the old code, effective_skills = "/skills" if skills_dir else task_skills_dir unconditionally set it to "/skills" whenever skills_dir was truthy, which correctly covered this case.

If task_skills_dir is None (common when the task doesn't declare environment.skills_dir), the condition at line 139 (if effective_skills and ...) is False, so no skill linking happens at all — skills are silently not distributed to agent discovery paths despite being present at /skills in the container.

Suggested change
else:
logger.info("Skills already injected via Dockerfile")
else:
logger.info("Skills already injected via Dockerfile")
effective_skills = "/skills"
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


# Distribute to agent-specific discovery paths
task_skills_dir = task.config.environment.skills_dir
effective_skills = "/skills" if skills_dir else task_skills_dir
if effective_skills and agent_cfg and agent_cfg.skill_paths:
home = f"/home/{sandbox_user}" if sandbox_user else "/root"
parts = []
for sp in agent_cfg.skill_paths:
expanded = sp.replace("$HOME", home).replace("$WORKSPACE", agent_cwd)
q_expanded = shlex.quote(expanded)
q_skills = shlex.quote(effective_skills)
parts.append(
f"mkdir -p {q_expanded} && cp -r {q_skills}/. {q_expanded}/ 2>/dev/null"
)
if parts:
await env.exec("; ".join(parts), timeout_sec=15)
count = await _link_skill_paths(
env,
effective_skills,
agent_cfg.skill_paths,
home,
agent_cwd,
)
if count:
logger.info(
f"Skills distributed to {len(parts)} paths for {agent_cfg.name}"
f"Skills distributed to {count} paths for {agent_cfg.name}"
)
33 changes: 23 additions & 10 deletions src/benchflow/_sandbox.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Sandbox user setup, path lockdown, and verifier hardening.

Owns the "agent runs as non-root" lifecycle:
- Creating the sandbox user and copying root's tooling into its home
- Creating the sandbox user and preparing minimal home state it needs
- Building the privilege-drop wrapper (setpriv / su) for agent launch
- Locking down solution/test paths so the sandbox user cannot read them
- Hardening the environment before the verifier runs
Expand Down Expand Up @@ -98,6 +98,20 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str:
)


def _legacy_root_tool_link_cmd(source: str, dest: str) -> str:
"""Link legacy root-only tool dirs into the sandbox home when needed."""
src = shlex.quote(source)
dst = shlex.quote(dest)
parent = shlex.quote(str(Path(dest).parent))
return (
f"if [ -e {src} ] && [ ! -L {dst} ]; then "
f"mkdir -p {parent} && "
f"rmdir {dst} 2>/dev/null || true; "
f"[ -e {dst} ] || ln -s {src} {dst}; "
"fi"
)


async def setup_sandbox_user(
env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120
) -> str:
Expand All @@ -107,18 +121,17 @@ async def setup_sandbox_user(
f"Invalid sandbox_user: {sandbox_user!r} (must be alphanumeric)"
)
logger.info(f"Setting up sandbox user: {sandbox_user}")
home = f"/home/{sandbox_user}"
home_dirs = sorted(d for d in get_sandbox_home_dirs() if d != ".local")
await env.exec(
f"id -u {sandbox_user} >/dev/null 2>&1 || "
f"useradd -m -s /bin/bash {sandbox_user} && "
f"mkdir -p /home/{sandbox_user}/.local/bin && "
"if [ -d /root/.local/bin ]; then "
f"cp -aL /root/.local/bin/. /home/{sandbox_user}/.local/bin/ 2>/dev/null || true; fi && "
"if [ -d /root/.nvm ]; then "
f"cp -a /root/.nvm/. /home/{sandbox_user}/.nvm/ 2>/dev/null || true; fi && "
f"for d in {' '.join(sorted(get_sandbox_home_dirs()))}; do "
f"if [ -d /root/$d ]; then mkdir -p /home/{sandbox_user}/$d && "
f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && "
f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && "
f"{_legacy_root_tool_link_cmd('/root/.local/bin', f'{home}/.local/bin')} && "
f"{_legacy_root_tool_link_cmd('/root/.nvm', f'{home}/.nvm')} && "
f"for d in {' '.join(home_dirs)}; do "
f"if [ -d /root/$d ]; then mkdir -p {home}/$d && "
f"cp -a /root/$d/. {home}/$d/ 2>/dev/null || true; fi; done && "
f"chown -R {sandbox_user}:{sandbox_user} {home} && "
f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}",
timeout_sec=timeout_sec,
)
Expand Down
13 changes: 5 additions & 8 deletions src/benchflow/agents/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,21 +374,18 @@ class AgentConfig:


def get_sandbox_home_dirs() -> set[str]:
"""Collect all dot-dirs under $HOME that sandbox user setup should copy.
"""Collect user home config/auth dirs BenchFlow may materialize for the sandbox user.

Derives from three sources across all registered agents:
- skill_paths: $HOME/.foo/... → ".foo"
- credential_files: {home}/.foo/... → ".foo"
- subscription_auth.files: {home}/.foo/... → ".foo"
- home_dirs: explicit extras (e.g. ".openclaw")

Always includes ".local" (pip scripts, etc.).
Skill paths are excluded: deploy_skills() now links those paths directly to a
shared skills tree instead of relying on sandbox-home copies.
"""
dirs: set[str] = {".local"}
dirs: set[str] = set()
for cfg in AGENTS.values():
for sp in cfg.skill_paths:
if sp.startswith("$HOME/."):
dirname = sp.removeprefix("$HOME/").split("/")[0]
dirs.add(dirname)
for cf in cfg.credential_files:
# path uses {home}/.foo/... placeholder
path = cf.path
Expand Down
142 changes: 140 additions & 2 deletions tests/test_agent_setup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,151 @@
"""Tests for agent install and skill deployment setup helpers."""

from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock
from unittest.mock import AsyncMock, MagicMock

import pytest

from benchflow._agent_setup import install_agent
from benchflow._agent_setup import deploy_skills, install_agent
from benchflow.agents.registry import AgentConfig
from benchflow.models import AgentInstallError


def _make_task(skills_dir: str | None):
return SimpleNamespace(
config=SimpleNamespace(
environment=SimpleNamespace(
skills_dir=skills_dir,
)
)
)


@pytest.mark.asyncio
async def test_deploy_skills_symlinks_agent_skill_paths_instead_of_copying(tmp_path):
env = MagicMock()
env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout=""))
env.upload_dir = AsyncMock()
agent_cfg = AgentConfig(
name="test-agent",
install_cmd="true",
launch_cmd="true",
skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"],
)

await deploy_skills(
env=env,
task_path=tmp_path,
skills_dir=None,
agent_cfg=agent_cfg,
sandbox_user="agent",
agent_cwd="/app",
task=_make_task("/opt/benchflow/skills"),
)

env.upload_dir.assert_not_called()
env.exec.assert_awaited_once()

cmd = env.exec.await_args.args[0]
assert "cp -r" not in cmd
assert " && " in cmd
assert ";" not in cmd
assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in cmd
assert "ln -sfn /opt/benchflow/skills /app/skills" in cmd


@pytest.mark.asyncio
async def test_deploy_skills_uploads_runtime_skills_and_links_shared_tree(tmp_path):
env = MagicMock()
env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout=""))
env.upload_dir = AsyncMock()
agent_cfg = AgentConfig(
name="test-agent",
install_cmd="true",
launch_cmd="true",
skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"],
)
skills_dir = tmp_path / "skills"
skills_dir.mkdir()

await deploy_skills(
env=env,
task_path=tmp_path,
skills_dir=skills_dir,
agent_cfg=agent_cfg,
sandbox_user="agent",
agent_cwd="/workspace",
task=_make_task("/opt/benchflow/skills"),
)

env.upload_dir.assert_awaited_once_with(skills_dir, "/skills")
env.exec.assert_awaited_once()

distributed_link_cmd = env.exec.await_args.args[0]
assert " && " in distributed_link_cmd
assert ";" not in distributed_link_cmd
assert "ln -sfn /skills /home/agent/.agents/skills" in distributed_link_cmd
assert "ln -sfn /skills /workspace/skills" in distributed_link_cmd
assert "/root/.agents/skills" not in distributed_link_cmd
assert "/app/skills" not in distributed_link_cmd


@pytest.mark.asyncio
async def test_deploy_skills_falls_back_when_local_skills_dir_is_missing(tmp_path):
env = MagicMock()
env.exec = AsyncMock(return_value=MagicMock(return_code=0, stdout=""))
env.upload_dir = AsyncMock()
agent_cfg = AgentConfig(
name="test-agent",
install_cmd="true",
launch_cmd="true",
skill_paths=["$HOME/.agents/skills", "$WORKSPACE/skills"],
)

await deploy_skills(
env=env,
task_path=tmp_path,
skills_dir=tmp_path / "missing-skills",
agent_cfg=agent_cfg,
sandbox_user="agent",
agent_cwd="/workspace",
task=_make_task("/opt/benchflow/skills"),
)

env.upload_dir.assert_not_called()
env.exec.assert_awaited_once()

distributed_link_cmd = env.exec.await_args.args[0]
assert "ln -sfn /opt/benchflow/skills /home/agent/.agents/skills" in distributed_link_cmd
assert "ln -sfn /opt/benchflow/skills /workspace/skills" in distributed_link_cmd
assert "ln -sfn /skills /home/agent/.agents/skills" not in distributed_link_cmd
assert "ln -sfn /skills /workspace/skills" not in distributed_link_cmd


@pytest.mark.asyncio
async def test_deploy_skills_raises_when_skill_linking_fails(tmp_path):
env = MagicMock()
env.exec = AsyncMock(return_value=MagicMock(return_code=17, stdout="link failed"))
env.upload_dir = AsyncMock()
agent_cfg = AgentConfig(
name="test-agent",
install_cmd="true",
launch_cmd="true",
skill_paths=["$HOME/.agents/skills"],
)

with pytest.raises(RuntimeError, match="Failed to link skills"):
await deploy_skills(
env=env,
task_path=tmp_path,
skills_dir=None,
agent_cfg=agent_cfg,
sandbox_user="agent",
agent_cwd="/app",
task=_make_task("/opt/benchflow/skills"),
)


@pytest.mark.asyncio
async def test_install_agent_writes_command_stdout_and_stderr_on_failure(tmp_path: Path):
env = SimpleNamespace()
Expand Down
16 changes: 16 additions & 0 deletions tests/test_registry_invariants.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,22 @@ def test_agent_collection_invariants(name, cfg):
assert d.startswith("."), f"home_dirs entry {d!r} must start with '.'"


@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys()))
def test_agent_install_cmd_targets_shared_paths(name, cfg):
"""Installed binaries must land in shared prefixes, not a root-only home.

setup_sandbox_user() no longer recursively copies /root/.nvm or
/root/.local/bin into the sandbox home. If an install_cmd placed its
binary there, the sandbox user would silently lose access to the agent.
"""
forbidden_binary_prefixes = ("/root/.nvm/", "/root/.local/bin/", "$HOME/.nvm/")
for prefix in forbidden_binary_prefixes:
assert prefix not in cfg.install_cmd, (
f"{name!r} install_cmd writes under {prefix!r}; use /usr/local/bin "
f"or another shared prefix so the sandbox user inherits the tool"
)


@pytest.mark.parametrize("name,cfg", AGENTS.items(), ids=list(AGENTS.keys()))
def test_agent_credential_and_subscription_auth(name, cfg):
"""Optional credential_files and subscription_auth structures."""
Expand Down
Loading