Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/followup-bench/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from benchflow._acp_run import connect_acp, execute_prompts
from benchflow._agent_setup import install_agent
from benchflow._scene import Role, Scene
from benchflow.agents.registry import AGENTS, AGENT_LAUNCH
from benchflow.agents.registry import AGENT_LAUNCH, AGENTS
from benchflow.runtime import Environment

logger = logging.getLogger(__name__)
Expand Down
1 change: 0 additions & 1 deletion docs/notebooks/nanofirm-task/tests/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Evaluate contract risk analysis quality."""
import json
import sys

ANALYSIS_PATH = "/app/analysis.json"
REWARD_PATH = "/logs/verifier/reward.txt"
Expand Down
47 changes: 23 additions & 24 deletions docs/notebooks/scene-patterns.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@
"source": [
"# BenchFlow Scene Patterns\n",
"\n",
"Three evaluation patterns \u2014 baseline, self-review (multi-turn), coder-reviewer (multi-round) \u2014 run end-to-end with `bf.run()`.\n",
"Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n",
"\n",
"| Term | Definition | Example |\n",
"|------|-----------|--------|\n",
"| **Turn** | One prompt in one ACP session | Coder writes code |\n",
"| **Multi-turn** | Same role, multiple turns | Self-review: agent \u2192 agent |\n",
"| **Round** | One A\u2192B exchange between roles | Coder \u2192 Reviewer |\n",
"| **Multi-round** | Different roles exchanging turns | Coder \u2192 Reviewer \u2192 Coder |\n",
"| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n",
"| **Round** | One A→B exchange between roles | Coder → Reviewer |\n",
"| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n",
"| **Scene** | Interaction region with roles + turns | A code-review scene |\n",
"| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen \u2192 Solve |\n",
"| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n",
"\n",
"**Prerequisites:**\n",
"- `pip install benchflow`\n",
Expand All @@ -30,15 +30,14 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: gemini-3.1-flash-lite-preview\n"
]
}
],
"source": [
"import asyncio\n",
"import logging\n",
"from pathlib import Path\n",
"\n",
Expand Down Expand Up @@ -71,8 +70,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 1.0\n",
"Tool calls: 3\n",
Expand Down Expand Up @@ -102,7 +101,7 @@
"source": [
"## Pattern 2: Multi-Turn Self-Review\n",
"\n",
"Same agent gets a second turn to re-examine its own work. This is **multi-turn** \u2014 one role, multiple prompts.\n",
"Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n",
"\n",
"Use when: a second pass catches what the first missed. Cost: 2x baseline."
]
Expand All @@ -113,8 +112,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 1.0\n",
"Tool calls: 7\n",
Expand Down Expand Up @@ -152,7 +151,7 @@
"source": [
"## Pattern 3: Coder + Reviewer (Multi-Round)\n",
"\n",
"Two roles \u2014 coder and reviewer \u2014 in a shared sandbox. This is **multi-round** \u2014 different roles exchanging turns.\n",
"Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n",
"\n",
"Communication uses the **outbox convention**:\n",
"1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n",
Expand All @@ -168,8 +167,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 0.0\n",
"Tool calls: 13\n",
Expand All @@ -195,7 +194,7 @@
" '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n",
" Turn(\"coder\",\n",
" \"Read the reviewer's feedback and fix the issues. \"\n",
" \"Focus only on what was flagged \u2014 don't start over.\"),\n",
" \"Focus only on what was flagged don't start over.\"),\n",
" ],\n",
" )],\n",
" environment=ENV,\n",
Expand Down Expand Up @@ -223,26 +222,26 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Pattern Reward Tools Error\n",
"------------------------------------------------------------\n",
"baseline 1.0 3 \u2014\n",
"self_review 1.0 7 \u2014\n",
"coder_reviewer 0.0 13 \u2014\n"
"baseline 1.0 3 \n",
"self_review 1.0 7 \n",
"coder_reviewer 0.0 13 \n"
]
}
],
"source": [
"print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n",
"print(\"-\" * 60)\n",
"for name, r in results.items():\n",
" reward = (r.rewards or {}).get(\"reward\", \"\u2014\")\n",
" err = r.error or \"\u2014\"\n",
" reward = (r.rewards or {}).get(\"reward\", \"\")\n",
" err = r.error or \"\"\n",
" if len(err) > 30:\n",
" err = err[:27] + \"...\"\n",
" print(f\"{name:<20} {str(reward):>8} {r.n_tool_calls:>6} {err}\")"
" print(f\"{name:<20} {reward!s:>8} {r.n_tool_calls:>6} {err}\")"
]
},
{
Expand All @@ -255,12 +254,12 @@
"\n",
"| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n",
"|-------------------|---------------|--------|\n",
"| `BaseUser` | `Role` \u2014 any role can be a user, reviewer, or custom agent | **Shipped** |\n",
"| `User.run() \u2192 str` | `Turn` with a prompt \u2014 each turn sends one prompt to one role | **Shipped** |\n",
"| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent | **Shipped** |\n",
"| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role | **Shipped** |\n",
"| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n",
"| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n",
"| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n",
"| `User.run() \u2192 None` (stop) | Fixed turn count only \u2014 no dynamic termination | **Gap** |\n",
"| `User.run() → None` (stop) | Fixed turn count only — no dynamic termination | **Gap** |\n",
"| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n",
"| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n",
"| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n",
Expand All @@ -270,7 +269,7 @@
"\n",
"**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n",
"for user roles, per-round verification, or inter-round trajectory inspection. These require\n",
"extending the Scene scheduler with callbacks \u2014 tracked for 0.4."
"extending the Scene scheduler with callbacks — tracked for 0.4."
]
}
],
Expand Down
3 changes: 0 additions & 3 deletions docs/notebooks/scene-patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
"""

import os
import sys
import json
from pathlib import Path

# ── The contract (constructed here, no external files) ──────────────

Expand Down
19 changes: 9 additions & 10 deletions experiments/reviewer_ablation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,26 @@
import os
import sys
import time
from datetime import datetime
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src"))

import contextlib

from harbor.models.task.task import Task
from harbor.models.trial.paths import TrialPaths
from harbor.verifier.verifier import Verifier

from benchflow._acp_run import connect_acp, execute_prompts
from benchflow._agent_env import resolve_agent_env
from benchflow._agent_setup import install_agent
from benchflow._credentials import upload_subscription_auth, write_credential_files
from benchflow._env_setup import _create_environment
from benchflow._sandbox import setup_sandbox_user
from benchflow._scene import Role, Scene
from benchflow.agents.registry import AGENTS, AGENT_LAUNCH
from benchflow.agents.registry import AGENT_LAUNCH, AGENTS
from benchflow.sdk import SDK
from harbor.models.task.task import Task
from harbor.verifier.verifier import Verifier
from harbor.models.trial.paths import TrialPaths

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -141,10 +142,8 @@ async def _run_acp(env, prompt: str, trial_dir: Path, timeout: int = 600) -> tup
try:
_, n_tools = await execute_prompts(acp_client, session, [prompt], timeout=timeout)
finally:
try:
with contextlib.suppress(Exception):
await acp_client.close()
except Exception:
pass
return n_tools, int(time.time() - t0)


Expand Down Expand Up @@ -204,7 +203,7 @@ async def run_reviewer(task_path: Path, task_name: str, condition: str) -> dict:
total_tools += n_tools

# Read coder's outbox
outbox_result = await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'")
await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'")
await env.exec("rm -rf /app/.outbox/*")

# Phase 2: Reviewer
Expand Down
2 changes: 1 addition & 1 deletion experiments/validate_multi_scene_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ async def main():
r3 = await test_followup_bench()
results["followup"] = r3.rewards

logger.info(f"\n=== DOGFOOD RESULTS ===")
logger.info("\n=== DOGFOOD RESULTS ===")
for name, rewards in results.items():
reward = (rewards or {}).get("reward", "N/A")
logger.info(f" {name}: reward={reward}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def test_arithmetic_is_broken():


def test_false_is_true():
assert False
raise AssertionError()


def test_pi_is_rational():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ def test_arithmetic_is_broken():


def test_false_is_true():
assert False
raise AssertionError()


def test_pi_is_rational():
Expand Down
2 changes: 1 addition & 1 deletion labs/benchjack-sandbox-hardening/run_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def main() -> int:
return 1

n = len(PATTERNS)
total_steps = 2 + 2 * n # 2 venvs + 2 versions × n patterns
total_steps = 2 + 2 * n # 2 venvs + 2 versions x n patterns

print(f"[1/{total_steps}] venv: benchflow==0.2.0 (PyPI)")
_create_venv(VENVS_DIR / "bf-0.2.0", ["benchflow==0.2.0"])
Expand Down
2 changes: 1 addition & 1 deletion labs/reward-hack-matrix/exploits/conftest_payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
terminal-bench-2 if its verifier ever touches conftest.py).
"""

import pytest # noqa: F401 — pytest must see this as a real plugin module
import pytest


def pytest_collection_modifyitems(config, items):
Expand Down
9 changes: 4 additions & 5 deletions labs/reward-hack-matrix/run_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import argparse
import asyncio
import contextlib
import json
import os
import re
Expand Down Expand Up @@ -447,13 +448,11 @@ async def shutdown(self) -> None:
if self.proc is None:
return
if self.proc.stdin and not self.proc.stdin.is_closing():
try:
with contextlib.suppress(OSError, BrokenPipeError):
self.proc.stdin.close()
except (OSError, BrokenPipeError):
pass
try:
await asyncio.wait_for(self.proc.wait(), timeout=30)
except asyncio.TimeoutError:
except TimeoutError:
self.proc.kill()
await self.proc.wait()
if self.reader_task:
Expand Down Expand Up @@ -717,7 +716,7 @@ def main() -> int:
if args.sweep:
summary_path = Path(args.summary_path) if args.summary_path else (JOBS_DIR / "matrix_sweep.json")
print(
f"[sweep] {len(cells)} cells × {len(VERSIONS)} versions = "
f"[sweep] {len(cells)} cells x {len(VERSIONS)} versions = "
f"{len(cells) * len(VERSIONS)} trials, concurrency={args.concurrency}"
)
results = asyncio.run(
Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ ignore = [
"RUF022", # __all__ unsorted — grouped by section for agent-friendliness
]

[tool.ruff.lint.per-file-ignores]
# Standalone scripts — sys.path manipulation before imports is intentional
"experiments/*.py" = ["E402"]
"tests/conformance/*.py" = ["E402"]
# Forward references resolved via __future__ annotations — ruff flags them
# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads.
"src/benchflow/runtime.py" = ["F821"]

[tool.ty.environment]
python-version = "3.12"

Expand Down
13 changes: 7 additions & 6 deletions src/benchflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
ExecResult,
Task,
TaskConfig,
Trial,
Verifier,
VerifierResult,
)

# benchflow's additions
from benchflow._env_setup import stage_dockerfile_deps
from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
from benchflow._snapshot import list_snapshots, restore, snapshot
from benchflow.acp.client import ACPClient
from benchflow.acp.session import ACPSession
from benchflow.agents.registry import (
Expand Down Expand Up @@ -53,16 +54,15 @@
RuntimeResult,
run, # bf.run(agent, env) — the primary 0.3 API
)
from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
from benchflow._snapshot import list_snapshots, restore, snapshot
from benchflow.sdk import SDK
from benchflow.trial import Trial, TrialConfig
from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn
from benchflow.trial_yaml import trial_config_from_yaml
from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
from benchflow.trajectories.otel import OTelCollector
from benchflow.trajectories.proxy import TrajectoryProxy
from benchflow.trajectories.types import Trajectory
from benchflow.trial import Role as TrialRole
from benchflow.trial import Scene as TrialScene
from benchflow.trial import Trial, TrialConfig, Turn
from benchflow.trial_yaml import trial_config_from_yaml

# Public API surface. Anything not in this list is implementation detail and
# may change without notice. Names are grouped by source module to match the
Expand Down Expand Up @@ -123,6 +123,7 @@
"TrialRole",
"TrialScene",
"Turn",
"trial_config_from_yaml",
# SDK (backwards compat)
"SDK",
# Environments / dep staging
Expand Down
Loading