diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py index 8885b5e..72d6498 100644 --- a/src/benchflow/_agent_env.py +++ b/src/benchflow/_agent_env.py @@ -144,7 +144,9 @@ def resolve_agent_env( """Resolve agent environment: auto-inherit keys, provider vars, env_mapping.""" agent_env = dict(agent_env or {}) auto_inherit_env(agent_env) - if model: + # Oracle runs solve.sh and never calls an LLM — model env vars and + # API-key validation are skipped even if a caller forwards a model. + if model and agent != "oracle": inject_vertex_credentials(agent_env, model) resolve_provider_env(agent_env, model, agent) # Validate required API key for the chosen model diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index a92c0be..58b410f 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -1,7 +1,12 @@ """`bf eval` — the benchflow eval-runner command group. -The future-facing entry point for running evaluations. Anthropic-style shape: -resource creation, one command, return the result or a job-id. +NOTE: This module is **not wired into the live CLI**. The active +``bench eval create`` command dispatches to ``cli/main.py:eval_create``. +This file is kept as the future-facing design for the eval sub-command +and must not be imported by ``cli/main.py`` (see +``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``). + +Design shape — Anthropic-style resource creation: bf eval create [flags] One-shot eval — creates an Agent + Environment + Trajectory under @@ -14,9 +19,6 @@ bf eval list Show recent eval runs (reads the jobs/ dir) bf eval retrieve ID Look up a specific trajectory by trial name - -Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run` -stays around as a deprecated alias for one release. """ from __future__ import annotations @@ -29,7 +31,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model console = Console() @@ -228,12 +230,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) + eff_model = _effective_model(agent, model) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=effective_model, + model=eff_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -269,10 +271,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig - effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) + eff_model = _effective_model(agent, model) config = JobConfig( agent=agent, - model=effective_model, + model=eff_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py index 9cf1198..96611bb 100644 --- a/src/benchflow/cli/main.py +++ b/src/benchflow/cli/main.py @@ -10,7 +10,7 @@ from rich.console import Console from rich.table import Table -from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL +from benchflow.job import DEFAULT_AGENT, effective_model console = Console() @@ -167,7 +167,7 @@ def job( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), @@ -343,7 +343,7 @@ def eval( jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, skills_dir=effective_skills, @@ -754,19 +754,20 @@ def eval_create( f"({result.score:.1%})[/bold], errors={result.errored}" ) elif tasks_dir: + eff_model = effective_model(agent, model) # Smart detection: if tasks_dir has task.toml, it's a single task if (tasks_dir / "task.toml").exists(): from benchflow.trial import Trial, TrialConfig, Scene config = TrialConfig( task_path=tasks_dir, - scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL, + scenes=[Scene.single(agent=agent, model=eff_model, skills_dir=str(skills_dir) if skills_dir else None)], environment=environment, sandbox_user=sandbox_user, jobs_dir=jobs_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, skills_dir=str(skills_dir) if skills_dir else None, ) @@ -777,7 +778,7 @@ async def _run(): run_result = asyncio.run(_run()) reward = (run_result.rewards or {}).get("reward") console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}") - console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})") + console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})") console.print(f"[bold]Reward:[/bold] {reward}") console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}") if run_result.error: @@ -789,7 +790,7 @@ async def _run(): jobs_dir=jobs_dir, config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=eff_model, environment=environment, concurrency=concurrency, sandbox_user=sandbox_user, @@ -915,7 +916,7 @@ def train_create( jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}", config=JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model(agent, model), environment=environment, concurrency=concurrency, ), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 95414fe..ad99ad4 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float: DEFAULT_MODEL = "claude-haiku-4-5-20251001" +def effective_model(agent: str, model: str | None) -> str | None: + """Resolve the model an agent should run with. + + Oracle runs solve.sh and never calls an LLM, so it never receives a model + (the chokepoint in resolve_agent_env defends, but callers should also stop + materializing DEFAULT_MODEL into oracle configs to keep the data honest — + e.g. result-summary JSON shows model=null instead of a bogus default). + """ + if agent == "oracle": + return None + return model or DEFAULT_MODEL + + @dataclass class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str | None = DEFAULT_MODEL + model: str | None = None environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job": sandbox_user = raw.get("sandbox_user", "agent") sandbox_locked_paths = raw.get("sandbox_locked_paths") + agent_name = raw.get("agent", DEFAULT_AGENT) config = JobConfig( - agent=raw.get("agent", DEFAULT_AGENT), - model=raw.get("model", DEFAULT_MODEL), + agent=agent_name, + model=effective_model(agent_name, raw.get("model")), environment=raw.get("environment", "docker"), concurrency=raw.get("concurrency", 4), prompts=prompts, @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job": agent_name = agent_cfg.get("name", DEFAULT_AGENT) # Model — keep provider prefix intact for downstream resolution - model = agent_cfg.get("model_name", "") or DEFAULT_MODEL + model = effective_model(agent_name, agent_cfg.get("model_name") or None) # Environment env_cfg = raw.get("environment", {}) diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py index 5a009e7..dfcb7db 100644 --- a/tests/test_eval_cli.py +++ b/tests/test_eval_cli.py @@ -1,9 +1,9 @@ -"""Tests for `bf eval` — the new eval-runner CLI. +"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module. -The create command takes a positional task reference and routes to either -a single SDK.run or a batch Job based on whether the reference is a single -task dir or a directory of task dirs. We don't exercise the actual runner -here (that's the parity tests' job); we only verify the resolver. +NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active +``bench eval create`` lives in ``cli/main.py``). These tests cover the +task-reference resolver (``_resolve_task_ref``) which will be used once +the module is promoted to the live entry point. """ from __future__ import annotations diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py new file mode 100644 index 0000000..142c063 --- /dev/null +++ b/tests/test_oracle_chokepoint.py @@ -0,0 +1,257 @@ +"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint. + +Pins the fix from this branch (post-PR #173 follow-up): + + Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the + chokepoint defends against any caller that forwards a model. + Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix + was unreachable because nothing wired it into the live CLI. + Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so + oracle gets honest model=None end-to-end. + +The classes below pin each layer at the right altitude: +- TestOrphanRemoval — proves cli/eval.py is gone and stays gone. +- TestEvalCreateRouting — proves `bench eval create` dispatches to + cli/main.py (the file that PR #173 missed). +- TestEffectiveModel — unit tests for the helper. +- TestOracleYamlLoaders — Job.from_yaml(oracle config) → model is None. +- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle` + and assert no API-key validation error. +""" + +from __future__ import annotations + +import importlib +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from typer.testing import CliRunner + + +class TestEvalModuleNotWiredIntoCLI: + """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI. + + The live `bench eval create` dispatches to cli/main.py:eval_create. + cli/eval.py is a standalone module (with its own resolver logic and tests) + but must not be imported by the CLI entry-point code — doing so is what + caused PR #173 to land its fix in dead code. + """ + + def test_cli_main_does_not_import_cli_eval(self): + """cli/main.py must not import from cli/eval — they are separate.""" + main_py = ( + Path(__file__).resolve().parent.parent + / "src" / "benchflow" / "cli" / "main.py" + ) + text = main_py.read_text() + assert "from benchflow.cli.eval" not in text + assert "import benchflow.cli.eval" not in text + + +class TestEvalCreateRouting: + """`bench eval create` must dispatch to cli/main.py:eval_create. + + The pyproject.toml `bench`/`benchflow` scripts both resolve to + `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in + `benchflow.cli.eval` that no entry point ever loaded — these tests + pin the routing so that mistake can't happen again. + """ + + def test_entry_point_app_is_cli_main(self): + from benchflow.cli.main import app + + assert app.info.name == "benchflow" + + def test_eval_create_callback_lives_in_cli_main(self): + from benchflow.cli.main import eval_app + + create_cmds = [c for c in eval_app.registered_commands if c.name == "create"] + assert len(create_cmds) == 1 + assert create_cmds[0].callback.__module__ == "benchflow.cli.main" + + def test_bench_eval_create_help_resolves(self): + """Smoke test: `bench eval create --help` reaches a real callback.""" + from benchflow.cli.main import app + + result = CliRunner().invoke(app, ["eval", "create", "--help"]) + assert result.exit_code == 0 + assert "tasks-dir" in result.stdout or "task" in result.stdout.lower() + + +class TestEffectiveModel: + """The helper introduced in Layer 3 — single source of truth for the rule + "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL".""" + + def test_oracle_with_no_model_returns_none(self): + from benchflow.job import effective_model + + assert effective_model("oracle", None) is None + + def test_oracle_ignores_explicit_model(self): + """Even if a caller forwards a model for oracle, the helper drops it.""" + from benchflow.job import effective_model + + assert effective_model("oracle", "claude-haiku-4-5-20251001") is None + + def test_non_oracle_with_no_model_returns_default(self): + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL + + def test_non_oracle_explicit_model_passes_through(self): + from benchflow.job import effective_model + + assert effective_model("codex-acp", "gpt-5") == "gpt-5" + + def test_non_oracle_empty_model_falls_back_to_default(self): + """Empty string == "no model" — Harbor YAML can produce this shape.""" + from benchflow.job import DEFAULT_MODEL, effective_model + + assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL + + +class TestOracleYamlLoaders: + """YAML configs for oracle must produce JobConfig.model is None. + + Both loader paths (_from_native_yaml, _from_harbor_yaml) previously + coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3 + routes them through effective_model() so oracle drops the default. + """ + + def _make_task(self, tmp_path: Path) -> Path: + tasks = tmp_path / "tasks" / "task-a" + tasks.mkdir(parents=True) + (tasks / "task.toml").write_text('schema_version = "1.1"\n') + return tmp_path / "tasks" + + def test_native_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: oracle\n") + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_harbor_yaml_oracle_no_model(self, tmp_path: Path): + from benchflow.job import Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text( + "agents:\n" + " - name: oracle\n" + "datasets:\n" + " - path: tasks\n" + ) + job = Job.from_yaml(config) + assert job._config.agent == "oracle" + assert job._config.model is None + + def test_native_yaml_non_oracle_keeps_default_when_omitted( + self, tmp_path: Path + ): + """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL.""" + from benchflow.job import DEFAULT_MODEL, Job + + self._make_task(tmp_path) + config = tmp_path / "config.yaml" + config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n") + job = Job.from_yaml(config) + assert job._config.model == DEFAULT_MODEL + + +class TestEvalCreateOracleCLI: + """End-to-end: `bench eval create -a oracle` must not trip API key validation. + + This is the user-visible bug the chokepoint test guards against at the + unit level. Here we call the live handler (cli/main.py:eval_create) + directly — invoking via Typer's CliRunner triggers asyncio.run() + internally, which leaves no current event loop and breaks pre-existing + tests in the suite that use the deprecated asyncio.get_event_loop() + pattern. Calling the function directly still exercises the full + CLI → effective_model → TrialConfig → resolve_agent_env path the bug + originally lived in. + """ + + def _make_task(self, tmp_path: Path) -> Path: + task = tmp_path / "task" + task.mkdir() + (task / "task.toml").write_text('schema_version = "1.1"\n') + (task / "instruction.md").write_text("solve\n") + return task + + def _strip_api_keys(self, monkeypatch): + for k in ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "CLAUDE_CODE_OAUTH_TOKEN", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + "LLM_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + + def test_oracle_single_task_no_api_key_no_error( + self, tmp_path: Path, monkeypatch + ): + """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError.""" + import asyncio + + from benchflow.cli.main import eval_create + from benchflow.models import RunResult + + # The CLI handler internally calls asyncio.run(), which leaves no + # current event loop. Pre-existing tests in the suite use the + # deprecated asyncio.get_event_loop() and break in that state, so + # restore a fresh loop after the test (teardown via finally below). + + self._strip_api_keys(monkeypatch) + task = self._make_task(tmp_path) + captured: dict = {} + + async def fake_create(config): + captured["config"] = config + # Exercise the real chokepoint with the config the CLI built — + # that's the specific call site the bug manifested at. + from benchflow._agent_env import resolve_agent_env + + captured["agent_env"] = resolve_agent_env( + config.primary_agent, config.primary_model, config.agent_env + ) + trial = type("FakeTrial", (), {})() + trial.run = AsyncMock( + return_value=RunResult( + task_name="task", + agent_name="oracle", + rewards={"reward": 1.0}, + n_tool_calls=0, + ) + ) + return trial + + try: + with patch("benchflow.trial.Trial.create", new=fake_create): + eval_create( + config_file=None, + tasks_dir=task, + agent="oracle", + model=None, + environment="docker", + concurrency=1, + jobs_dir=str(tmp_path / "jobs"), + sandbox_user="agent", + skills_dir=None, + ) + finally: + asyncio.set_event_loop(asyncio.new_event_loop()) + + cfg = captured["config"] + # Layer 3: oracle never receives a model, even when CLI defaults exist. + assert cfg.primary_agent == "oracle" + assert cfg.primary_model is None + # Layer 1: chokepoint did not inject provider env or raise. + assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"] diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py index f699334..d831fe7 100644 --- a/tests/test_resolve_env_helpers.py +++ b/tests/test_resolve_env_helpers.py @@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path): self._patch_expanduser(monkeypatch, tmp_path) result = self._resolve(agent="openclaw", agent_env={}) assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result + + +class TestResolveAgentEnvOracle: + """Oracle runs solve.sh and never calls an LLM — must skip model-related env. + + Regression for the PR #173 follow-up: commit 360c460 removed the + `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers + would pass model=None for oracle. But cli/main.py:eval_create (the live + `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle + reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY + validation — breaking offline oracle runs that have no API key set. + """ + + def _patch_expanduser(self, monkeypatch, tmp_path): + orig = Path.expanduser + + def fake(self): + s = str(self) + if s.startswith("~"): + return tmp_path / s[2:] + return orig(self) + + monkeypatch.setattr(Path, "expanduser", fake) + + def test_oracle_with_default_model_does_not_validate_api_key( + self, monkeypatch, tmp_path + ): + """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise.""" + for k in ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_API_KEY", + ): + monkeypatch.delenv(k, raising=False) + self._patch_expanduser(monkeypatch, tmp_path) + + result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {}) + + # Provider env never resolved — oracle never calls an LLM. + assert "BENCHFLOW_PROVIDER_MODEL" not in result + assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result