Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/followup-bench/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from benchflow._acp_run import connect_acp, execute_prompts
from benchflow._agent_setup import install_agent
from benchflow._scene import Role, Scene
from benchflow.agents.registry import AGENTS, AGENT_LAUNCH
from benchflow.agents.registry import AGENT_LAUNCH, AGENTS
from benchflow.runtime import Environment

logger = logging.getLogger(__name__)
Expand Down
1 change: 0 additions & 1 deletion docs/notebooks/nanofirm-task/tests/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Evaluate contract risk analysis quality."""
import json
import sys

ANALYSIS_PATH = "/app/analysis.json"
REWARD_PATH = "/logs/verifier/reward.txt"
Expand Down
47 changes: 23 additions & 24 deletions docs/notebooks/scene-patterns.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@
"source": [
"# BenchFlow Scene Patterns\n",
"\n",
"Three evaluation patterns \u2014 baseline, self-review (multi-turn), coder-reviewer (multi-round) \u2014 run end-to-end with `bf.run()`.\n",
"Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n",
"\n",
"| Term | Definition | Example |\n",
"|------|-----------|--------|\n",
"| **Turn** | One prompt in one ACP session | Coder writes code |\n",
"| **Multi-turn** | Same role, multiple turns | Self-review: agent \u2192 agent |\n",
"| **Round** | One A\u2192B exchange between roles | Coder \u2192 Reviewer |\n",
"| **Multi-round** | Different roles exchanging turns | Coder \u2192 Reviewer \u2192 Coder |\n",
"| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n",
"| **Round** | One A→B exchange between roles | Coder → Reviewer |\n",
"| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n",
"| **Scene** | Interaction region with roles + turns | A code-review scene |\n",
"| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen \u2192 Solve |\n",
"| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n",
"\n",
"**Prerequisites:**\n",
"- `pip install benchflow`\n",
Expand All @@ -30,15 +30,14 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: gemini-3.1-flash-lite-preview\n"
]
}
],
"source": [
"import asyncio\n",
"import logging\n",
"from pathlib import Path\n",
"\n",
Expand Down Expand Up @@ -71,8 +70,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 1.0\n",
"Tool calls: 3\n",
Expand Down Expand Up @@ -102,7 +101,7 @@
"source": [
"## Pattern 2: Multi-Turn Self-Review\n",
"\n",
"Same agent gets a second turn to re-examine its own work. This is **multi-turn** \u2014 one role, multiple prompts.\n",
"Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n",
"\n",
"Use when: a second pass catches what the first missed. Cost: 2x baseline."
]
Expand All @@ -113,8 +112,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 1.0\n",
"Tool calls: 7\n",
Expand Down Expand Up @@ -152,7 +151,7 @@
"source": [
"## Pattern 3: Coder + Reviewer (Multi-Round)\n",
"\n",
"Two roles \u2014 coder and reviewer \u2014 in a shared sandbox. This is **multi-round** \u2014 different roles exchanging turns.\n",
"Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n",
"\n",
"Communication uses the **outbox convention**:\n",
"1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n",
Expand All @@ -168,8 +167,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 0.0\n",
"Tool calls: 13\n",
Expand All @@ -195,7 +194,7 @@
" '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n",
" Turn(\"coder\",\n",
" \"Read the reviewer's feedback and fix the issues. \"\n",
" \"Focus only on what was flagged \u2014 don't start over.\"),\n",
" \"Focus only on what was flagged don't start over.\"),\n",
" ],\n",
" )],\n",
" environment=ENV,\n",
Expand Down Expand Up @@ -223,26 +222,26 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Pattern Reward Tools Error\n",
"------------------------------------------------------------\n",
"baseline 1.0 3 \u2014\n",
"self_review 1.0 7 \u2014\n",
"coder_reviewer 0.0 13 \u2014\n"
"baseline 1.0 3 \n",
"self_review 1.0 7 \n",
"coder_reviewer 0.0 13 \n"
]
}
],
"source": [
"print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n",
"print(\"-\" * 60)\n",
"for name, r in results.items():\n",
" reward = (r.rewards or {}).get(\"reward\", \"\u2014\")\n",
" err = r.error or \"\u2014\"\n",
" reward = (r.rewards or {}).get(\"reward\", \"\")\n",
" err = r.error or \"\"\n",
" if len(err) > 30:\n",
" err = err[:27] + \"...\"\n",
" print(f\"{name:<20} {str(reward):>8} {r.n_tool_calls:>6} {err}\")"
" print(f\"{name:<20} {reward!s:>8} {r.n_tool_calls:>6} {err}\")"
]
},
{
Expand All @@ -255,12 +254,12 @@
"\n",
"| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n",
"|-------------------|---------------|--------|\n",
"| `BaseUser` | `Role` \u2014 any role can be a user, reviewer, or custom agent | **Shipped** |\n",
"| `User.run() \u2192 str` | `Turn` with a prompt \u2014 each turn sends one prompt to one role | **Shipped** |\n",
"| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent | **Shipped** |\n",
"| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role | **Shipped** |\n",
"| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n",
"| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n",
"| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n",
"| `User.run() \u2192 None` (stop) | Fixed turn count only \u2014 no dynamic termination | **Gap** |\n",
"| `User.run() → None` (stop) | Fixed turn count only — no dynamic termination | **Gap** |\n",
"| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n",
"| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n",
"| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n",
Expand All @@ -270,7 +269,7 @@
"\n",
"**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n",
"for user roles, per-round verification, or inter-round trajectory inspection. These require\n",
"extending the Scene scheduler with callbacks \u2014 tracked for 0.4."
"extending the Scene scheduler with callbacks — tracked for 0.4."
]
}
],
Expand Down
3 changes: 0 additions & 3 deletions docs/notebooks/scene-patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
"""

import os
import sys
import json
from pathlib import Path

# ── The contract (constructed here, no external files) ──────────────

Expand Down
19 changes: 9 additions & 10 deletions experiments/reviewer_ablation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,26 @@
import os
import sys
import time
from datetime import datetime
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src"))

import contextlib

from harbor.models.task.task import Task
from harbor.models.trial.paths import TrialPaths
from harbor.verifier.verifier import Verifier

from benchflow._acp_run import connect_acp, execute_prompts
from benchflow._agent_env import resolve_agent_env
from benchflow._agent_setup import install_agent
from benchflow._credentials import upload_subscription_auth, write_credential_files
from benchflow._env_setup import _create_environment
from benchflow._sandbox import setup_sandbox_user
from benchflow._scene import Role, Scene
from benchflow.agents.registry import AGENTS, AGENT_LAUNCH
from benchflow.agents.registry import AGENT_LAUNCH, AGENTS
from benchflow.sdk import SDK
from harbor.models.task.task import Task
from harbor.verifier.verifier import Verifier
from harbor.models.trial.paths import TrialPaths

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -141,10 +142,8 @@ async def _run_acp(env, prompt: str, trial_dir: Path, timeout: int = 600) -> tup
try:
_, n_tools = await execute_prompts(acp_client, session, [prompt], timeout=timeout)
finally:
try:
with contextlib.suppress(Exception):
await acp_client.close()
except Exception:
pass
return n_tools, int(time.time() - t0)


Expand Down Expand Up @@ -204,7 +203,7 @@ async def run_reviewer(task_path: Path, task_name: str, condition: str) -> dict:
total_tools += n_tools

# Read coder's outbox
outbox_result = await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'")
await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'")
await env.exec("rm -rf /app/.outbox/*")

# Phase 2: Reviewer
Expand Down
2 changes: 1 addition & 1 deletion experiments/validate_multi_scene_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ async def main():
r3 = await test_followup_bench()
results["followup"] = r3.rewards

logger.info(f"\n=== DOGFOOD RESULTS ===")
logger.info("\n=== DOGFOOD RESULTS ===")
for name, rewards in results.items():
reward = (rewards or {}).get("reward", "N/A")
logger.info(f" {name}: reward={reward}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def test_arithmetic_is_broken():


def test_false_is_true():
assert False
raise AssertionError()


def test_pi_is_rational():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ def test_arithmetic_is_broken():


def test_false_is_true():
assert False
raise AssertionError()


def test_pi_is_rational():
Expand Down
2 changes: 1 addition & 1 deletion labs/benchjack-sandbox-hardening/run_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def main() -> int:
return 1

n = len(PATTERNS)
total_steps = 2 + 2 * n # 2 venvs + 2 versions × n patterns
total_steps = 2 + 2 * n # 2 venvs + 2 versions x n patterns

print(f"[1/{total_steps}] venv: benchflow==0.2.0 (PyPI)")
_create_venv(VENVS_DIR / "bf-0.2.0", ["benchflow==0.2.0"])
Expand Down
2 changes: 1 addition & 1 deletion labs/reward-hack-matrix/exploits/conftest_payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
terminal-bench-2 if its verifier ever touches conftest.py).
"""

import pytest # noqa: F401 — pytest must see this as a real plugin module
import pytest


def pytest_collection_modifyitems(config, items):
Expand Down
9 changes: 4 additions & 5 deletions labs/reward-hack-matrix/run_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import argparse
import asyncio
import contextlib
import json
import os
import re
Expand Down Expand Up @@ -447,13 +448,11 @@ async def shutdown(self) -> None:
if self.proc is None:
return
if self.proc.stdin and not self.proc.stdin.is_closing():
try:
with contextlib.suppress(OSError, BrokenPipeError):
self.proc.stdin.close()
except (OSError, BrokenPipeError):
pass
try:
await asyncio.wait_for(self.proc.wait(), timeout=30)
except asyncio.TimeoutError:
except TimeoutError:
self.proc.kill()
await self.proc.wait()
if self.reader_task:
Expand Down Expand Up @@ -717,7 +716,7 @@ def main() -> int:
if args.sweep:
summary_path = Path(args.summary_path) if args.summary_path else (JOBS_DIR / "matrix_sweep.json")
print(
f"[sweep] {len(cells)} cells × {len(VERSIONS)} versions = "
f"[sweep] {len(cells)} cells x {len(VERSIONS)} versions = "
f"{len(cells) * len(VERSIONS)} trials, concurrency={args.concurrency}"
)
results = asyncio.run(
Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ ignore = [
"RUF022", # __all__ unsorted — grouped by section for agent-friendliness
]

[tool.ruff.lint.per-file-ignores]
# Standalone scripts — sys.path manipulation before imports is intentional
"experiments/*.py" = ["E402"]
"tests/conformance/*.py" = ["E402"]
# Forward references resolved via __future__ annotations — ruff flags them
# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads.
"src/benchflow/runtime.py" = ["F821"]

[tool.ty.environment]
python-version = "3.12"

Expand Down
13 changes: 7 additions & 6 deletions src/benchflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
ExecResult,
Task,
TaskConfig,
Trial,
Verifier,
VerifierResult,
)

# benchflow's additions
from benchflow._env_setup import stage_dockerfile_deps
from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
from benchflow._snapshot import list_snapshots, restore, snapshot
from benchflow.acp.client import ACPClient
from benchflow.acp.session import ACPSession
from benchflow.agents.registry import (
Expand Down Expand Up @@ -53,16 +54,15 @@
RuntimeResult,
run, # bf.run(agent, env) — the primary 0.3 API
)
from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
from benchflow._snapshot import list_snapshots, restore, snapshot
from benchflow.sdk import SDK
from benchflow.trial import Trial, TrialConfig
from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn
from benchflow.trial_yaml import trial_config_from_yaml
from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
from benchflow.trajectories.otel import OTelCollector
from benchflow.trajectories.proxy import TrajectoryProxy
from benchflow.trajectories.types import Trajectory
from benchflow.trial import Role as TrialRole
from benchflow.trial import Scene as TrialScene
from benchflow.trial import Trial, TrialConfig, Turn
from benchflow.trial_yaml import trial_config_from_yaml

# Public API surface. Anything not in this list is implementation detail and
# may change without notice. Names are grouped by source module to match the
Expand Down Expand Up @@ -123,6 +123,7 @@
"TrialRole",
"TrialScene",
"Turn",
"trial_config_from_yaml",
# SDK (backwards compat)
"SDK",
# Environments / dep staging
Expand Down
Loading