diff --git a/benchmarks/followup-bench/runner.py b/benchmarks/followup-bench/runner.py index e3558d51..d942ea6c 100644 --- a/benchmarks/followup-bench/runner.py +++ b/benchmarks/followup-bench/runner.py @@ -28,7 +28,7 @@ from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_setup import install_agent from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.runtime import Environment logger = logging.getLogger(__name__) diff --git a/docs/notebooks/nanofirm-task/tests/evaluate.py b/docs/notebooks/nanofirm-task/tests/evaluate.py index 2f4fd75f..254f70d3 100644 --- a/docs/notebooks/nanofirm-task/tests/evaluate.py +++ b/docs/notebooks/nanofirm-task/tests/evaluate.py @@ -1,6 +1,5 @@ """Evaluate contract risk analysis quality.""" import json -import sys ANALYSIS_PATH = "/app/analysis.json" REWARD_PATH = "/logs/verifier/reward.txt" diff --git a/docs/notebooks/scene-patterns.ipynb b/docs/notebooks/scene-patterns.ipynb index 9f1400c9..488869f2 100644 --- a/docs/notebooks/scene-patterns.ipynb +++ b/docs/notebooks/scene-patterns.ipynb @@ -6,16 +6,16 @@ "source": [ "# BenchFlow Scene Patterns\n", "\n", - "Three evaluation patterns \u2014 baseline, self-review (multi-turn), coder-reviewer (multi-round) \u2014 run end-to-end with `bf.run()`.\n", + "Three evaluation patterns — baseline, self-review (multi-turn), coder-reviewer (multi-round) — run end-to-end with `bf.run()`.\n", "\n", "| Term | Definition | Example |\n", "|------|-----------|--------|\n", "| **Turn** | One prompt in one ACP session | Coder writes code |\n", - "| **Multi-turn** | Same role, multiple turns | Self-review: agent \u2192 agent |\n", - "| **Round** | One A\u2192B exchange between roles | Coder \u2192 Reviewer |\n", - "| **Multi-round** | Different roles exchanging turns | Coder \u2192 Reviewer \u2192 Coder |\n", + "| **Multi-turn** | Same role, multiple turns | Self-review: agent → agent |\n", + "| **Round** | One A→B exchange between roles | Coder → Reviewer |\n", + "| **Multi-round** | Different roles exchanging turns | Coder → Reviewer → Coder |\n", "| **Scene** | Interaction region with roles + turns | A code-review scene |\n", - "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen \u2192 Solve |\n", + "| **Trial** | Sequence of scenes in a shared sandbox | Skill-gen → Solve |\n", "\n", "**Prerequisites:**\n", "- `pip install benchflow`\n", @@ -30,15 +30,14 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Task: .ref/terminal-bench-2/regex-log Env: daytona Agent: gemini Model: gemini-3.1-flash-lite-preview\n" ] } ], "source": [ - "import asyncio\n", "import logging\n", "from pathlib import Path\n", "\n", @@ -71,8 +70,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Reward: 1.0\n", "Tool calls: 3\n", @@ -102,7 +101,7 @@ "source": [ "## Pattern 2: Multi-Turn Self-Review\n", "\n", - "Same agent gets a second turn to re-examine its own work. This is **multi-turn** \u2014 one role, multiple prompts.\n", + "Same agent gets a second turn to re-examine its own work. This is **multi-turn** — one role, multiple prompts.\n", "\n", "Use when: a second pass catches what the first missed. Cost: 2x baseline." ] @@ -113,8 +112,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Reward: 1.0\n", "Tool calls: 7\n", @@ -152,7 +151,7 @@ "source": [ "## Pattern 3: Coder + Reviewer (Multi-Round)\n", "\n", - "Two roles \u2014 coder and reviewer \u2014 in a shared sandbox. This is **multi-round** \u2014 different roles exchanging turns.\n", + "Two roles — coder and reviewer — in a shared sandbox. This is **multi-round** — different roles exchanging turns.\n", "\n", "Communication uses the **outbox convention**:\n", "1. Reviewer writes feedback to `/app/.outbox/coder.json` as `{\"to\": \"coder\", \"content\": \"...\"}`\n", @@ -168,8 +167,8 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Reward: 0.0\n", "Tool calls: 13\n", @@ -195,7 +194,7 @@ " '{\"to\": \"coder\", \"content\": \"Your specific feedback here.\"}'),\n", " Turn(\"coder\",\n", " \"Read the reviewer's feedback and fix the issues. \"\n", - " \"Focus only on what was flagged \u2014 don't start over.\"),\n", + " \"Focus only on what was flagged — don't start over.\"),\n", " ],\n", " )],\n", " environment=ENV,\n", @@ -223,14 +222,14 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Pattern Reward Tools Error\n", "------------------------------------------------------------\n", - "baseline 1.0 3 \u2014\n", - "self_review 1.0 7 \u2014\n", - "coder_reviewer 0.0 13 \u2014\n" + "baseline 1.0 3 —\n", + "self_review 1.0 7 —\n", + "coder_reviewer 0.0 13 —\n" ] } ], @@ -238,11 +237,11 @@ "print(f\"{'Pattern':<20} {'Reward':>8} {'Tools':>6} {'Error'}\")\n", "print(\"-\" * 60)\n", "for name, r in results.items():\n", - " reward = (r.rewards or {}).get(\"reward\", \"\u2014\")\n", - " err = r.error or \"\u2014\"\n", + " reward = (r.rewards or {}).get(\"reward\", \"—\")\n", + " err = r.error or \"—\"\n", " if len(err) > 30:\n", " err = err[:27] + \"...\"\n", - " print(f\"{name:<20} {str(reward):>8} {r.n_tool_calls:>6} {err}\")" + " print(f\"{name:<20} {reward!s:>8} {r.n_tool_calls:>6} {err}\")" ] }, { @@ -255,12 +254,12 @@ "\n", "| Harbor (PR #1462) | BenchFlow 0.3 | Status |\n", "|-------------------|---------------|--------|\n", - "| `BaseUser` | `Role` \u2014 any role can be a user, reviewer, or custom agent | **Shipped** |\n", - "| `User.run() \u2192 str` | `Turn` with a prompt \u2014 each turn sends one prompt to one role | **Shipped** |\n", + "| `BaseUser` | `Role` — any role can be a user, reviewer, or custom agent | **Shipped** |\n", + "| `User.run() → str` | `Turn` with a prompt — each turn sends one prompt to one role | **Shipped** |\n", "| Per-round message passing | Outbox files + scheduler injection into next role's prompt | **Shipped** |\n", "| Per-round archiving | `scene_messages.jsonl` in trial directory | **Shipped** |\n", "| `--user` CLI flag | YAML scene config (`scenes:` key in trial config) | **Shipped** |\n", - "| `User.run() \u2192 None` (stop) | Fixed turn count only \u2014 no dynamic termination | **Gap** |\n", + "| `User.run() → None` (stop) | Fixed turn count only — no dynamic termination | **Gap** |\n", "| Oracle access (`/solution`) | Not available to user roles during setup | **Gap** |\n", "| Inter-round verification | `verify()` runs once after all scenes | **Gap** |\n", "| User inspects trajectory | User role cannot read prior agent trajectory between rounds | **Gap** |\n", @@ -270,7 +269,7 @@ "\n", "**What it does NOT deliver yet:** Dynamic termination (user decides when to stop), oracle access\n", "for user roles, per-round verification, or inter-round trajectory inspection. These require\n", - "extending the Scene scheduler with callbacks \u2014 tracked for 0.4." + "extending the Scene scheduler with callbacks — tracked for 0.4." ] } ], diff --git a/docs/notebooks/scene-patterns.py b/docs/notebooks/scene-patterns.py index a95369b8..961bd09a 100644 --- a/docs/notebooks/scene-patterns.py +++ b/docs/notebooks/scene-patterns.py @@ -18,9 +18,6 @@ """ import os -import sys -import json -from pathlib import Path # ── The contract (constructed here, no external files) ────────────── diff --git a/experiments/reviewer_ablation.py b/experiments/reviewer_ablation.py index 5acaecab..8ab0caec 100644 --- a/experiments/reviewer_ablation.py +++ b/experiments/reviewer_ablation.py @@ -17,25 +17,26 @@ import os import sys import time -from datetime import datetime from pathlib import Path logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") sys.path.insert(0, str(Path(__file__).resolve().parents[0].parent / "src")) +import contextlib + +from harbor.models.task.task import Task +from harbor.models.trial.paths import TrialPaths +from harbor.verifier.verifier import Verifier + from benchflow._acp_run import connect_acp, execute_prompts from benchflow._agent_env import resolve_agent_env from benchflow._agent_setup import install_agent from benchflow._credentials import upload_subscription_auth, write_credential_files from benchflow._env_setup import _create_environment from benchflow._sandbox import setup_sandbox_user -from benchflow._scene import Role, Scene -from benchflow.agents.registry import AGENTS, AGENT_LAUNCH +from benchflow.agents.registry import AGENT_LAUNCH, AGENTS from benchflow.sdk import SDK -from harbor.models.task.task import Task -from harbor.verifier.verifier import Verifier -from harbor.models.trial.paths import TrialPaths logger = logging.getLogger(__name__) @@ -141,10 +142,8 @@ async def _run_acp(env, prompt: str, trial_dir: Path, timeout: int = 600) -> tup try: _, n_tools = await execute_prompts(acp_client, session, [prompt], timeout=timeout) finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass return n_tools, int(time.time() - t0) @@ -204,7 +203,7 @@ async def run_reviewer(task_path: Path, task_name: str, condition: str) -> dict: total_tools += n_tools # Read coder's outbox - outbox_result = await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") + await env.exec("cat /app/.outbox/reviewer.json 2>/dev/null || echo '{}'") await env.exec("rm -rf /app/.outbox/*") # Phase 2: Reviewer diff --git a/experiments/validate_multi_scene_lifecycle.py b/experiments/validate_multi_scene_lifecycle.py index e55eea26..4e6bf877 100644 --- a/experiments/validate_multi_scene_lifecycle.py +++ b/experiments/validate_multi_scene_lifecycle.py @@ -162,7 +162,7 @@ async def main(): r3 = await test_followup_bench() results["followup"] = r3.rewards - logger.info(f"\n=== DOGFOOD RESULTS ===") + logger.info("\n=== DOGFOOD RESULTS ===") for name, rewards in results.items(): reward = (rewards or {}).get("reward", "N/A") logger.info(f" {name}: reward={reward}") diff --git a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py index 1c85417f..df887aa9 100644 --- a/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py @@ -6,7 +6,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py index 6debbc31..a4742c25 100644 --- a/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +++ b/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py @@ -3,7 +3,7 @@ def test_arithmetic_is_broken(): def test_false_is_true(): - assert False + raise AssertionError() def test_pi_is_rational(): diff --git a/labs/benchjack-sandbox-hardening/run_comparison.py b/labs/benchjack-sandbox-hardening/run_comparison.py index e01b1df8..04963bb8 100755 --- a/labs/benchjack-sandbox-hardening/run_comparison.py +++ b/labs/benchjack-sandbox-hardening/run_comparison.py @@ -136,7 +136,7 @@ def main() -> int: return 1 n = len(PATTERNS) - total_steps = 2 + 2 * n # 2 venvs + 2 versions × n patterns + total_steps = 2 + 2 * n # 2 venvs + 2 versions x n patterns print(f"[1/{total_steps}] venv: benchflow==0.2.0 (PyPI)") _create_venv(VENVS_DIR / "bf-0.2.0", ["benchflow==0.2.0"]) diff --git a/labs/reward-hack-matrix/exploits/conftest_payload.py b/labs/reward-hack-matrix/exploits/conftest_payload.py index 30fe23f7..7397066a 100644 --- a/labs/reward-hack-matrix/exploits/conftest_payload.py +++ b/labs/reward-hack-matrix/exploits/conftest_payload.py @@ -16,7 +16,7 @@ terminal-bench-2 if its verifier ever touches conftest.py). """ -import pytest # noqa: F401 — pytest must see this as a real plugin module +import pytest def pytest_collection_modifyitems(config, items): diff --git a/labs/reward-hack-matrix/run_matrix.py b/labs/reward-hack-matrix/run_matrix.py index d5dfc439..53a86b04 100644 --- a/labs/reward-hack-matrix/run_matrix.py +++ b/labs/reward-hack-matrix/run_matrix.py @@ -30,6 +30,7 @@ import argparse import asyncio +import contextlib import json import os import re @@ -447,13 +448,11 @@ async def shutdown(self) -> None: if self.proc is None: return if self.proc.stdin and not self.proc.stdin.is_closing(): - try: + with contextlib.suppress(OSError, BrokenPipeError): self.proc.stdin.close() - except (OSError, BrokenPipeError): - pass try: await asyncio.wait_for(self.proc.wait(), timeout=30) - except asyncio.TimeoutError: + except TimeoutError: self.proc.kill() await self.proc.wait() if self.reader_task: @@ -717,7 +716,7 @@ def main() -> int: if args.sweep: summary_path = Path(args.summary_path) if args.summary_path else (JOBS_DIR / "matrix_sweep.json") print( - f"[sweep] {len(cells)} cells × {len(VERSIONS)} versions = " + f"[sweep] {len(cells)} cells x {len(VERSIONS)} versions = " f"{len(cells) * len(VERSIONS)} trials, concurrency={args.concurrency}" ) results = asyncio.run( diff --git a/pyproject.toml b/pyproject.toml index 0b032ab2..306e71c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,14 @@ ignore = [ "RUF022", # __all__ unsorted — grouped by section for agent-friendliness ] +[tool.ruff.lint.per-file-ignores] +# Standalone scripts — sys.path manipulation before imports is intentional +"experiments/*.py" = ["E402"] +"tests/conformance/*.py" = ["E402"] +# Forward references resolved via __future__ annotations — ruff flags them +# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads. +"src/benchflow/runtime.py" = ["F821"] + [tool.ty.environment] python-version = "3.12" diff --git a/src/benchflow/__init__.py b/src/benchflow/__init__.py index 809292d7..35388b58 100644 --- a/src/benchflow/__init__.py +++ b/src/benchflow/__init__.py @@ -19,13 +19,14 @@ ExecResult, Task, TaskConfig, - Trial, Verifier, VerifierResult, ) # benchflow's additions from benchflow._env_setup import stage_dockerfile_deps +from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene +from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.acp.client import ACPClient from benchflow.acp.session import ACPSession from benchflow.agents.registry import ( @@ -53,16 +54,15 @@ RuntimeResult, run, # bf.run(agent, env) — the primary 0.3 API ) -from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene -from benchflow._snapshot import list_snapshots, restore, snapshot from benchflow.sdk import SDK -from benchflow.trial import Trial, TrialConfig -from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn -from benchflow.trial_yaml import trial_config_from_yaml from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill from benchflow.trajectories.otel import OTelCollector from benchflow.trajectories.proxy import TrajectoryProxy from benchflow.trajectories.types import Trajectory +from benchflow.trial import Role as TrialRole +from benchflow.trial import Scene as TrialScene +from benchflow.trial import Trial, TrialConfig, Turn +from benchflow.trial_yaml import trial_config_from_yaml # Public API surface. Anything not in this list is implementation detail and # may change without notice. Names are grouped by source module to match the @@ -123,6 +123,7 @@ "TrialRole", "TrialScene", "Turn", + "trial_config_from_yaml", # SDK (backwards compat) "SDK", # Environments / dep staging diff --git a/src/benchflow/_acp_run.py b/src/benchflow/_acp_run.py index 8b64288c..56020721 100644 --- a/src/benchflow/_acp_run.py +++ b/src/benchflow/_acp_run.py @@ -18,6 +18,7 @@ """ import asyncio +import contextlib import logging from pathlib import Path @@ -66,7 +67,6 @@ async def connect_acp( agent_launch = build_priv_drop_cmd(agent_launch, sandbox_user) logger.info(f"Agent sandboxed as: {sandbox_user}") - last_err: Exception | None = None acp_client: ACPClient | None = None for attempt in range(_ACP_CONNECT_MAX_RETRIES + 1): if attempt > 0: @@ -106,12 +106,9 @@ async def connect_acp( except ConnectionError as e: # Close the failed client before retrying if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass acp_client = None - last_err = e if attempt == _ACP_CONNECT_MAX_RETRIES: raise logger.warning(f"ACP connect failed (attempt {attempt + 1}): {e}") @@ -119,10 +116,8 @@ async def connect_acp( except Exception: # Non-retryable error — close client to prevent leak if acp_client: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass raise if model: diff --git a/src/benchflow/acp/client.py b/src/benchflow/acp/client.py index abd4743a..57a55eb9 100644 --- a/src/benchflow/acp/client.py +++ b/src/benchflow/acp/client.py @@ -1,6 +1,5 @@ """ACP client — benchflow acts as the client, agents are ACP servers.""" -import json import logging from typing import Any diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 58b410fc..17cea1dd 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -31,7 +31,8 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model +from benchflow.job import DEFAULT_AGENT +from benchflow.job import effective_model as _effective_model console = Console() diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 96611bbd..e9acc235 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -433,9 +433,9 @@ def skills_eval( typer.Argument(help="Path to skill directory containing evals/evals.json"), ], agent: Annotated[ - list[str], + list[str] | None, typer.Option("--agent", "-a", help="Agent(s) to evaluate (repeatable)"), - ] = ["claude-agent-acp"], + ] = None, model: Annotated[ list[str] | None, typer.Option("--model", "-m", help="Model(s) (matched 1:1 with agents)"), @@ -473,6 +473,8 @@ def skills_eval( """ from benchflow.skill_eval import SkillEvaluator, export_gepa_traces + if agent is None: + agent = ["claude-agent-acp"] if not (skill_dir / "evals" / "evals.json").exists(): console.print( f"[red]No evals/evals.json found in {skill_dir}[/red]\n" @@ -757,7 +759,7 @@ def eval_create( eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): - from benchflow.trial import Trial, TrialConfig, Scene + from benchflow.trial import Scene, Trial, TrialConfig config = TrialConfig( task_path=tasks_dir, diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 5429add2..70dfb560 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -126,9 +126,7 @@ def should_retry(self, error: str | None) -> bool: return True if self.retry_on_pipe and category == PIPE_CLOSED: return True - if self.retry_on_acp and category == ACP_ERROR: - return True - return False + return bool(self.retry_on_acp and category == ACP_ERROR) def backoff_delay(self, attempt: int) -> float: """Exponential backoff delay for retry attempt.""" diff --git a/src/benchflow/mcp/reviewer_server.py b/src/benchflow/mcp/reviewer_server.py index 9c2dedcc..210746b9 100644 --- a/src/benchflow/mcp/reviewer_server.py +++ b/src/benchflow/mcp/reviewer_server.py @@ -24,8 +24,6 @@ import json import logging import os -import subprocess -import sys from pathlib import Path logger = logging.getLogger(__name__) @@ -52,11 +50,11 @@ def create_reviewer_server( """ try: from fastmcp import FastMCP - except ImportError: + except ImportError as e: raise ImportError( "fastmcp required for MCP reviewer server. " "Install with: pip install fastmcp" - ) + ) from e mcp = FastMCP("benchflow-reviewer") prompt = review_prompt or os.environ.get("REVIEWER_PROMPT", DEFAULT_REVIEW_PROMPT) diff --git a/src/benchflow/process.py b/src/benchflow/process.py index 4a0e4be2..197daa1c 100644 --- a/src/benchflow/process.py +++ b/src/benchflow/process.py @@ -313,7 +313,7 @@ async def start( # Use the full compose base command (with -p, --project-directory, # and -f flags) so that exec can find the running project. if self._compose_cmd_base: - inner_parts = shlex.split(self._compose_cmd_base) + ["exec", "-i", "-T"] + inner_parts = [*shlex.split(self._compose_cmd_base), "exec", "-i", "-T"] else: inner_parts = ["docker", "compose", "exec", "-i", "-T"] if cwd: @@ -453,7 +453,7 @@ async def start( logger.info(f"DaytonaPtyProcess: PTY connected (session={session_id})") compose_parts = shlex.split(self._compose_cmd_base) if self._compose_cmd_base else ["docker", "compose"] - exec_parts = compose_parts + ["exec", "-i", "-T"] + exec_parts = [*compose_parts, "exec", "-i", "-T"] if cwd: exec_parts.extend(["-w", cwd]) # Write env vars to a file inside the container (not visible in ps aux), @@ -475,7 +475,7 @@ async def start( marker = f"__BENCHFLOW_ACP_{session_id}__" setup = f"stty -echo 2>/dev/null; echo '{marker}'; exec {exec_cmd}\n" await self._pty.send_input(setup) - logger.info(f"DaytonaPtyProcess: sent setup, waiting for marker...") + logger.info("DaytonaPtyProcess: sent setup, waiting for marker...") while True: try: @@ -484,10 +484,10 @@ async def start( logger.debug(f"DaytonaPtyProcess drain: {decoded[:120]}") if marker in decoded: break - except TimeoutError: - raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") + except TimeoutError as e: + raise ConnectionError("DaytonaPtyProcess: timeout waiting for agent start marker") from e - logger.info(f"DaytonaPtyProcess: marker seen, agent starting") + logger.info("DaytonaPtyProcess: marker seen, agent starting") async def readline(self) -> bytes: if self._closed: @@ -495,10 +495,10 @@ async def readline(self) -> bytes: try: line = await asyncio.wait_for(self._line_buffer.get(), timeout=300) return line - except TimeoutError: - raise ConnectionError("PTY readline timeout (300s)") + except TimeoutError as e: + raise ConnectionError("PTY readline timeout (300s)") from e except Exception as e: - raise ConnectionError(f"PTY readline error: {e}") + raise ConnectionError(f"PTY readline error: {e}") from e async def writeline(self, data: str) -> None: if not self._pty or self._closed: @@ -508,14 +508,10 @@ async def writeline(self, data: str) -> None: async def close(self) -> None: self._closed = True if self._pty: - try: + with contextlib.suppress(Exception): await self._pty.kill() - except Exception: - pass - try: + with contextlib.suppress(Exception): await self._pty.disconnect() - except Exception: - pass logger.info("DaytonaPtyProcess terminated") @property diff --git a/src/benchflow/runtime.py b/src/benchflow/runtime.py index 1e55f0c7..14a785ea 100644 --- a/src/benchflow/runtime.py +++ b/src/benchflow/runtime.py @@ -291,13 +291,13 @@ async def execute(self) -> RuntimeResult: async def run( - subject: "Agent | TrialConfig | str", - env: "Environment | str | None" = None, + subject: Agent | TrialConfig | str, + env: Environment | str | None = None, config: RuntimeConfig | None = None, *, - task_path: "str | Path | None" = None, + task_path: str | Path | None = None, model: str | None = None, -) -> "RuntimeResult | RunResult": +) -> RuntimeResult | RunResult: """Primary user-facing API — multiple calling conventions. Usage:: @@ -313,7 +313,6 @@ async def run( # 3. Agent name string (simplest) result = await bf.run("gemini", task_path="tasks/X") """ - from benchflow.models import RunResult from benchflow.trial import Scene, Trial, TrialConfig if isinstance(subject, TrialConfig): diff --git a/src/benchflow/sdk.py b/src/benchflow/sdk.py index bd10a2d0..28fa82d4 100644 --- a/src/benchflow/sdk.py +++ b/src/benchflow/sdk.py @@ -98,33 +98,12 @@ from harbor.models.trial.paths import TrialPaths from harbor.verifier.verifier import Verifier -from benchflow._acp_run import connect_acp, execute_prompts -from benchflow._agent_env import resolve_agent_env -from benchflow._agent_setup import deploy_skills, install_agent -from benchflow._credentials import ( - upload_subscription_auth, - write_credential_files, -) from benchflow._env_setup import ( - _create_environment, - _inject_skills_into_dockerfile, _patch_harbor_dind, - stage_dockerfile_deps, ) from benchflow._sandbox import ( - _resolve_locked_paths, - _seed_verifier_workspace, - _snapshot_build_config, harden_before_verify, - lockdown_paths, - setup_sandbox_user, -) -from benchflow._trajectory import ( - _capture_session_trajectory, - _scrape_agent_trajectory, ) -from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource logger = logging.getLogger(__name__) diff --git a/src/benchflow/skill_eval.py b/src/benchflow/skill_eval.py index b848135e..82e96869 100644 --- a/src/benchflow/skill_eval.py +++ b/src/benchflow/skill_eval.py @@ -6,6 +6,7 @@ result = await evaluator.run(agents=["claude-agent-acp"], environment="docker") """ +import contextlib import json import logging import shutil @@ -431,7 +432,7 @@ async def run( all_results: list[CaseResult] = [] # Run each agent - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): agent_label = agent.split("/")[-1] if "/" in agent else agent # With-skill run @@ -486,9 +487,9 @@ async def _run_job( with_skill: bool, ) -> list[CaseResult]: """Run a batch of tasks using Job for concurrency and retries.""" - from benchflow.job import Job, JobConfig, RetryConfig - import os + + from benchflow.job import Job, JobConfig, RetryConfig judge_env = {} for key in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY"): if os.environ.get(key): @@ -506,7 +507,7 @@ async def _run_job( agent_env=judge_env, ), ) - job_result = await j.run() + await j.run() results = [] # Walk the jobs directory to collect per-case results @@ -539,10 +540,8 @@ async def _run_job( # Read judge rubric details if available judge_file = trial_dir / "verifier" / "judge_result.json" if judge_file.exists(): - try: + with contextlib.suppress(json.JSONDecodeError, KeyError): rubric_results = json.loads(judge_file.read_text()).get("items") - except (json.JSONDecodeError, KeyError): - pass results.append( CaseResult( @@ -579,7 +578,7 @@ def _compute_lifts( ) -> list[AgentLift]: """Compute per-agent lift from case results.""" lifts = [] - for agent, model in zip(agents, models): + for agent, model in zip(agents, models, strict=False): with_results = [r for r in all_results if r.agent == agent and r.with_skill] baseline_results = [ r for r in all_results if r.agent == agent and not r.with_skill diff --git a/src/benchflow/trial.py b/src/benchflow/trial.py index d2af9ae0..d2f1570e 100644 --- a/src/benchflow/trial.py +++ b/src/benchflow/trial.py @@ -35,7 +35,7 @@ from __future__ import annotations -import asyncio +import contextlib import json import logging import shlex @@ -57,7 +57,6 @@ _resolve_locked_paths, _seed_verifier_workspace, _snapshot_build_config, - harden_before_verify, lockdown_paths, setup_sandbox_user, ) @@ -66,7 +65,7 @@ _scrape_agent_trajectory, ) from benchflow.acp.client import ACPClient, ACPError -from benchflow.agents.registry import AGENT_LAUNCH, AGENTS +from benchflow.agents.registry import AGENT_LAUNCH from benchflow.models import RunResult, TrajectorySource logger = logging.getLogger(__name__) @@ -110,7 +109,7 @@ def single( prompts: list[str | None] | None = None, role_name: str = "agent", skills_dir: str | Path | None = None, - ) -> "Scene": + ) -> Scene: """Shortcut for single-agent, single-role scene.""" prompts = prompts or [None] return cls( @@ -157,7 +156,7 @@ def from_legacy( prompts: list[str | None] | None = None, skills_dir: str | Path | None = None, **kwargs, - ) -> "TrialConfig": + ) -> TrialConfig: """Construct from flat SDK.run()-style args.""" return cls( task_path=task_path, @@ -440,10 +439,8 @@ async def disconnect(self) -> None: # Kill any lingering agent processes to prevent context bleed between scenes if self._env and self._agent_launch.strip(): agent_cmd = self._agent_launch.split()[0].split("/")[-1] - try: + with contextlib.suppress(Exception): await self._env.exec(f"pkill -f '{agent_cmd}' || true", timeout_sec=10) - except Exception: - pass self._session_tool_count = 0 self._session_traj_count = 0 self._phase = "installed" @@ -630,7 +627,7 @@ async def _run_scene(self, scene: Scene) -> None: inbox: dict[str, list[str]] = {r.name: [] for r in scene.roles} turn_counter = 0 - for i, turn in enumerate(scene.turns): + for _i, turn in enumerate(scene.turns): role = role_map.get(turn.role) if not role: raise ValueError(f"Turn references unknown role {turn.role!r}") diff --git a/tests/conformance/proof_multi_agent.py b/tests/conformance/proof_multi_agent.py index 902d2ca3..9aa2729c 100644 --- a/tests/conformance/proof_multi_agent.py +++ b/tests/conformance/proof_multi_agent.py @@ -18,6 +18,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src")) +import contextlib + from harbor.models.task.task import Task from benchflow._acp_run import connect_acp, execute_prompts @@ -73,7 +75,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: trial_dir = Path(f"/tmp/multi-agent-proof/{role.name}") trial_dir.mkdir(parents=True, exist_ok=True) launch_cmd = AGENT_LAUNCH.get(role.agent, role.agent) - acp_client, session, agent_name = await connect_acp( + acp_client, session, _agent_name = await connect_acp( env=env, agent=role.agent, agent_launch=launch_cmd, @@ -85,7 +87,7 @@ async def role_runner(env, role: Role, prompt: str) -> None: agent_cwd="/app", ) try: - trajectory, n_tools = await execute_prompts( + _trajectory, n_tools = await execute_prompts( acp_client, session, [prompt], @@ -93,10 +95,8 @@ async def role_runner(env, role: Role, prompt: str) -> None: ) logging.info(f"[{role.name}] finished: {n_tools} tool calls") finally: - try: + with contextlib.suppress(Exception): await acp_client.close() - except Exception: - pass async def main() -> None: diff --git a/tests/conformance/run_conformance.py b/tests/conformance/run_conformance.py index b8aa5038..1f758a13 100644 --- a/tests/conformance/run_conformance.py +++ b/tests/conformance/run_conformance.py @@ -53,9 +53,7 @@ def has_creds(agent_name: str) -> bool: if any(os.environ.get(k) for k in keys): return True sub_file = SUBSCRIPTION_AUTH_FILES.get(agent_name) - if sub_file and Path(sub_file).expanduser().exists(): - return True - return False + return bool(sub_file and Path(sub_file).expanduser().exists()) async def run_one(agent_name: str) -> dict: diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py index 142c0634..a61333da 100644 --- a/tests/test_oracle_chokepoint.py +++ b/tests/test_oracle_chokepoint.py @@ -21,11 +21,9 @@ from __future__ import annotations -import importlib from pathlib import Path from unittest.mock import AsyncMock, patch -import pytest from typer.testing import CliRunner diff --git a/tests/test_rewards_jsonl.py b/tests/test_rewards_jsonl.py index e81f3164..bbc029df 100644 --- a/tests/test_rewards_jsonl.py +++ b/tests/test_rewards_jsonl.py @@ -93,7 +93,7 @@ def test_rubric_plus_terminal_no_rubric_in_meta(tmp_path: Path) -> None: json.loads(ln) for ln in (tmp_path / "rewards.jsonl").read_text().strip().splitlines() ] - terminal = [ln for ln in lines if ln["type"] == "terminal"][0] + terminal = next(ln for ln in lines if ln["type"] == "terminal") assert "rubric" not in terminal["meta"] diff --git a/tests/test_scene_outbox_trial.py b/tests/test_scene_outbox_trial.py index 4e6d4780..5b6f07ad 100644 --- a/tests/test_scene_outbox_trial.py +++ b/tests/test_scene_outbox_trial.py @@ -11,8 +11,7 @@ import json from dataclasses import dataclass from pathlib import Path -from typing import Any -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock import pytest @@ -325,7 +324,6 @@ async def test_heterogeneous_agent_install(coder_reviewer_scene: Scene) -> None: trial._resolved_prompts = ["Solve the task"] installed_agents: list[str] = [] - original_connect_as = Trial.connect_as async def tracking_connect_as(self_inner, role): if role.agent != config.primary_agent: diff --git a/tests/test_skill_eval.py b/tests/test_skill_eval.py index c1f2b3e6..0c8c3edc 100644 --- a/tests/test_skill_eval.py +++ b/tests/test_skill_eval.py @@ -133,7 +133,7 @@ def test_minimal_dataset(self, minimal_skill_dir): def test_missing_evals_json(self, tmp_path): skill = tmp_path / "no-evals" skill.mkdir() - with pytest.raises(FileNotFoundError, match="evals.json"): + with pytest.raises(FileNotFoundError, match=r"evals\.json"): load_eval_dataset(skill) def test_empty_cases(self, tmp_path): diff --git a/tests/test_skill_eval_dryrun.py b/tests/test_skill_eval_dryrun.py index 39b6a57e..931a8e96 100644 --- a/tests/test_skill_eval_dryrun.py +++ b/tests/test_skill_eval_dryrun.py @@ -218,7 +218,7 @@ def test_gepa_export_roundtrip(self, mock_skill): # Verify trace files traces = list((gepa_dir / "traces").iterdir()) - assert len(traces) == 4 # 2 cases × 2 modes + assert len(traces) == 4 # 2 cases x 2 modes # Verify trace content trace = json.loads(traces[0].read_text()) @@ -228,6 +228,7 @@ def test_gepa_export_roundtrip(self, mock_skill): def test_cli_dryrun_loads_dataset(self, mock_skill): from typer.testing import CliRunner + from benchflow.cli.main import app runner = CliRunner()