Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/benchflow/_agent_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ def resolve_agent_env(
"""Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
agent_env = dict(agent_env or {})
auto_inherit_env(agent_env)
if model:
# Oracle runs solve.sh and never calls an LLM — model env vars and
# API-key validation are skipped even if a caller forwards a model.
if model and agent != "oracle":
inject_vertex_credentials(agent_env, model)
resolve_provider_env(agent_env, model, agent)
# Validate required API key for the chosen model
Expand Down
22 changes: 12 additions & 10 deletions src/benchflow/cli/eval.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
"""`bf eval` — the benchflow eval-runner command group.

The future-facing entry point for running evaluations. Anthropic-style shape:
resource creation, one command, return the result or a job-id.
NOTE: This module is **not wired into the live CLI**. The active
``bench eval create`` command dispatches to ``cli/main.py:eval_create``.
This file is kept as the future-facing design for the eval sub-command
and must not be imported by ``cli/main.py`` (see
``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``).

Design shape — Anthropic-style resource creation:

bf eval create <task-ref> [flags]
One-shot eval — creates an Agent + Environment + Trajectory under
Expand All @@ -14,9 +19,6 @@

bf eval list Show recent eval runs (reads the jobs/ dir)
bf eval retrieve ID Look up a specific trajectory by trial name

Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run`
stays around as a deprecated alias for one release.
"""

from __future__ import annotations
Expand All @@ -29,7 +31,7 @@
from rich.console import Console
from rich.table import Table

from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model

console = Console()

Expand Down Expand Up @@ -228,12 +230,12 @@ def _run_single(
from benchflow.sdk import SDK

sdk = SDK()
effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
eff_model = _effective_model(agent, model)
result = asyncio.run(
sdk.run(
task_path=task_dir,
agent=agent,
model=effective_model,
model=eff_model,
environment=environment,
prompts=cast("list[str | None] | None", prompt),
agent_env=agent_env,
Expand Down Expand Up @@ -269,10 +271,10 @@ def _run_batch(
) -> None:
from benchflow.job import Job, JobConfig, RetryConfig

effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
eff_model = _effective_model(agent, model)
config = JobConfig(
agent=agent,
model=effective_model,
model=eff_model,
environment=environment,
concurrency=concurrency,
retry=RetryConfig(max_retries=max_retries),
Expand Down
17 changes: 9 additions & 8 deletions src/benchflow/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from rich.console import Console
from rich.table import Table

from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
from benchflow.job import DEFAULT_AGENT, effective_model

console = Console()

Expand Down Expand Up @@ -167,7 +167,7 @@ def job(
jobs_dir=jobs_dir,
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=effective_model(agent, model),
environment=environment,
concurrency=concurrency,
retry=RetryConfig(max_retries=max_retries),
Expand Down Expand Up @@ -343,7 +343,7 @@ def eval(
jobs_dir=jobs_dir,
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=effective_model(agent, model),
environment=environment,
concurrency=concurrency,
skills_dir=effective_skills,
Expand Down Expand Up @@ -754,19 +754,20 @@ def eval_create(
f"({result.score:.1%})[/bold], errors={result.errored}"
)
elif tasks_dir:
eff_model = effective_model(agent, model)
# Smart detection: if tasks_dir has task.toml, it's a single task
if (tasks_dir / "task.toml").exists():
from benchflow.trial import Trial, TrialConfig, Scene

config = TrialConfig(
task_path=tasks_dir,
scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL,
scenes=[Scene.single(agent=agent, model=eff_model,
skills_dir=str(skills_dir) if skills_dir else None)],
environment=environment,
sandbox_user=sandbox_user,
jobs_dir=jobs_dir,
agent=agent,
model=model or DEFAULT_MODEL,
model=eff_model,
skills_dir=str(skills_dir) if skills_dir else None,
)

Expand All @@ -777,7 +778,7 @@ async def _run():
run_result = asyncio.run(_run())
reward = (run_result.rewards or {}).get("reward")
console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}")
console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})")
console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})")
console.print(f"[bold]Reward:[/bold] {reward}")
console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}")
if run_result.error:
Expand All @@ -789,7 +790,7 @@ async def _run():
jobs_dir=jobs_dir,
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=eff_model,
environment=environment,
concurrency=concurrency,
sandbox_user=sandbox_user,
Expand Down Expand Up @@ -915,7 +916,7 @@ def train_create(
jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}",
config=JobConfig(
agent=agent,
model=model or DEFAULT_MODEL,
model=effective_model(agent, model),
environment=environment,
concurrency=concurrency,
),
Expand Down
22 changes: 18 additions & 4 deletions src/benchflow/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float:
DEFAULT_MODEL = "claude-haiku-4-5-20251001"


def effective_model(agent: str, model: str | None) -> str | None:
    """Resolve which model an agent should actually run with.

    The oracle agent executes solve.sh directly and never talks to an LLM,
    so it resolves to no model at all rather than a default — keeping
    persisted configs and result summaries honest (model=null instead of a
    fabricated default). Every other agent falls back to ``DEFAULT_MODEL``
    when the caller did not supply one.
    """
    if agent != "oracle":
        return model or DEFAULT_MODEL
    # Oracle: deliberately no model, even if one was passed in.
    return None


@dataclass
class JobConfig:
"""Configuration for a benchmark job."""

agent: str = DEFAULT_AGENT
model: str | None = DEFAULT_MODEL
model: str | None = None
environment: str = "docker"
concurrency: int = 4
prompts: list[str | None] | None = None
Expand Down Expand Up @@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
sandbox_user = raw.get("sandbox_user", "agent")
sandbox_locked_paths = raw.get("sandbox_locked_paths")

agent_name = raw.get("agent", DEFAULT_AGENT)
config = JobConfig(
agent=raw.get("agent", DEFAULT_AGENT),
model=raw.get("model", DEFAULT_MODEL),
agent=agent_name,
model=effective_model(agent_name, raw.get("model")),
environment=raw.get("environment", "docker"),
concurrency=raw.get("concurrency", 4),
prompts=prompts,
Expand All @@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
agent_name = agent_cfg.get("name", DEFAULT_AGENT)

# Model — keep provider prefix intact for downstream resolution
model = agent_cfg.get("model_name", "") or DEFAULT_MODEL
model = effective_model(agent_name, agent_cfg.get("model_name") or None)

# Environment
env_cfg = raw.get("environment", {})
Expand Down
10 changes: 5 additions & 5 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Tests for `bf eval` — the new eval-runner CLI.
"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module.

The create command takes a positional task reference and routes to either
a single SDK.run or a batch Job based on whether the reference is a single
task dir or a directory of task dirs. We don't exercise the actual runner
here (that's the parity tests' job); we only verify the resolver.
NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active
``bench eval create`` lives in ``cli/main.py``). These tests cover the
task-reference resolver (``_resolve_task_ref``) which will be used once
the module is promoted to the live entry point.
"""

from __future__ import annotations
Expand Down
Loading
Loading