diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py index 0b13fd8..a92c0be 100644 --- a/src/benchflow/cli/eval.py +++ b/src/benchflow/cli/eval.py @@ -228,11 +228,12 @@ def _run_single( from benchflow.sdk import SDK sdk = SDK() + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) result = asyncio.run( sdk.run( task_path=task_dir, agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, prompts=cast("list[str | None] | None", prompt), agent_env=agent_env, @@ -268,9 +269,10 @@ def _run_batch( ) -> None: from benchflow.job import Job, JobConfig, RetryConfig + effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL) config = JobConfig( agent=agent, - model=model or DEFAULT_MODEL, + model=effective_model, environment=environment, concurrency=concurrency, retry=RetryConfig(max_retries=max_retries), diff --git a/src/benchflow/job.py b/src/benchflow/job.py index 6d8fe9b..95414fe 100644 --- a/src/benchflow/job.py +++ b/src/benchflow/job.py @@ -146,7 +146,7 @@ class JobConfig: """Configuration for a benchmark job.""" agent: str = DEFAULT_AGENT - model: str = DEFAULT_MODEL + model: str | None = DEFAULT_MODEL environment: str = "docker" concurrency: int = 4 prompts: list[str | None] | None = None