diff --git a/src/benchflow/_agent_env.py b/src/benchflow/_agent_env.py
index 8885b5e..72d6498 100644
--- a/src/benchflow/_agent_env.py
+++ b/src/benchflow/_agent_env.py
@@ -144,7 +144,9 @@ def resolve_agent_env(
     """Resolve agent environment: auto-inherit keys, provider vars, env_mapping."""
     agent_env = dict(agent_env or {})
     auto_inherit_env(agent_env)
-    if model:
+    # Oracle runs solve.sh and never calls an LLM — model env vars and
+    # API-key validation are skipped even if a caller forwards a model.
+    if model and agent != "oracle":
         inject_vertex_credentials(agent_env, model)
         resolve_provider_env(agent_env, model, agent)
         # Validate required API key for the chosen model
diff --git a/src/benchflow/cli/eval.py b/src/benchflow/cli/eval.py
index a92c0be..58b410f 100644
--- a/src/benchflow/cli/eval.py
+++ b/src/benchflow/cli/eval.py
@@ -1,7 +1,12 @@
 """`bf eval` — the benchflow eval-runner command group.
 
-The future-facing entry point for running evaluations. Anthropic-style shape:
-resource creation, one command, return the result or a job-id.
+NOTE: This module is **not wired into the live CLI**.  The active
+``bench eval create`` command dispatches to ``cli/main.py:eval_create``.
+This file is kept as the future-facing design for the eval sub-command
+and must not be imported by ``cli/main.py`` (see
+``test_oracle_chokepoint.py::TestEvalModuleNotWiredIntoCLI``).
+
+Design shape — Anthropic-style resource creation:
 
     bf eval create <task-ref> [flags]
         One-shot eval — creates an Agent + Environment + Trajectory under
@@ -14,9 +19,6 @@
 
     bf eval list          Show recent eval runs (reads the jobs/ dir)
     bf eval retrieve ID   Look up a specific trajectory by trial name
-
-Replaces `bf run` + `bf job` as the idiomatic way to run evals. `bf run`
-stays around as a deprecated alias for one release.
 """
 
 from __future__ import annotations
@@ -29,7 +31,7 @@
 from rich.console import Console
 from rich.table import Table
 
-from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
+from benchflow.job import DEFAULT_AGENT, effective_model as _effective_model
 
 console = Console()
 
@@ -228,12 +230,12 @@ def _run_single(
     from benchflow.sdk import SDK
 
     sdk = SDK()
-    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
+    eff_model = _effective_model(agent, model)
     result = asyncio.run(
         sdk.run(
             task_path=task_dir,
             agent=agent,
-            model=effective_model,
+            model=eff_model,
             environment=environment,
             prompts=cast("list[str | None] | None", prompt),
             agent_env=agent_env,
@@ -269,10 +271,10 @@ def _run_batch(
 ) -> None:
     from benchflow.job import Job, JobConfig, RetryConfig
 
-    effective_model = None if agent == "oracle" else (model or DEFAULT_MODEL)
+    eff_model = _effective_model(agent, model)
     config = JobConfig(
         agent=agent,
-        model=effective_model,
+        model=eff_model,
         environment=environment,
         concurrency=concurrency,
         retry=RetryConfig(max_retries=max_retries),
diff --git a/src/benchflow/cli/main.py b/src/benchflow/cli/main.py
index 9cf1198..96611bb 100644
--- a/src/benchflow/cli/main.py
+++ b/src/benchflow/cli/main.py
@@ -10,7 +10,7 @@
 from rich.console import Console
 from rich.table import Table
 
-from benchflow.job import DEFAULT_AGENT, DEFAULT_MODEL
+from benchflow.job import DEFAULT_AGENT, effective_model
 
 console = Console()
 
@@ -167,7 +167,7 @@ def job(
             jobs_dir=jobs_dir,
             config=JobConfig(
                 agent=agent,
-                model=model or DEFAULT_MODEL,
+                model=effective_model(agent, model),
                 environment=environment,
                 concurrency=concurrency,
                 retry=RetryConfig(max_retries=max_retries),
@@ -343,7 +343,7 @@ def eval(
         jobs_dir=jobs_dir,
         config=JobConfig(
             agent=agent,
-            model=model or DEFAULT_MODEL,
+            model=effective_model(agent, model),
             environment=environment,
             concurrency=concurrency,
             skills_dir=effective_skills,
@@ -754,19 +754,20 @@ def eval_create(
             f"({result.score:.1%})[/bold], errors={result.errored}"
         )
     elif tasks_dir:
+        eff_model = effective_model(agent, model)
         # Smart detection: if tasks_dir has task.toml, it's a single task
         if (tasks_dir / "task.toml").exists():
             from benchflow.trial import Trial, TrialConfig, Scene
 
             config = TrialConfig(
                 task_path=tasks_dir,
-                scenes=[Scene.single(agent=agent, model=model or DEFAULT_MODEL,
+                scenes=[Scene.single(agent=agent, model=eff_model,
                                      skills_dir=str(skills_dir) if skills_dir else None)],
                 environment=environment,
                 sandbox_user=sandbox_user,
                 jobs_dir=jobs_dir,
                 agent=agent,
-                model=model or DEFAULT_MODEL,
+                model=eff_model,
                 skills_dir=str(skills_dir) if skills_dir else None,
             )
 
@@ -777,7 +778,7 @@ async def _run():
             run_result = asyncio.run(_run())
             reward = (run_result.rewards or {}).get("reward")
             console.print(f"\n[bold]Task:[/bold] {tasks_dir.name}")
-            console.print(f"[bold]Agent:[/bold] {agent} ({model or DEFAULT_MODEL})")
+            console.print(f"[bold]Agent:[/bold] {agent} ({eff_model or 'no model'})")
             console.print(f"[bold]Reward:[/bold] {reward}")
             console.print(f"[bold]Tool calls:[/bold] {run_result.n_tool_calls}")
             if run_result.error:
@@ -789,7 +790,7 @@ async def _run():
                 jobs_dir=jobs_dir,
                 config=JobConfig(
                     agent=agent,
-                    model=model or DEFAULT_MODEL,
+                    model=eff_model,
                     environment=environment,
                     concurrency=concurrency,
                     sandbox_user=sandbox_user,
@@ -915,7 +916,7 @@ def train_create(
                 jobs_dir=f"{jobs_dir}/sweep-{sweep_idx}",
                 config=JobConfig(
                     agent=agent,
-                    model=model or DEFAULT_MODEL,
+                    model=effective_model(agent, model),
                     environment=environment,
                     concurrency=concurrency,
                 ),
diff --git a/src/benchflow/job.py b/src/benchflow/job.py
index 95414fe..ad99ad4 100644
--- a/src/benchflow/job.py
+++ b/src/benchflow/job.py
@@ -141,12 +141,25 @@ def backoff_delay(self, attempt: int) -> float:
 DEFAULT_MODEL = "claude-haiku-4-5-20251001"
 
 
+def effective_model(agent: str, model: str | None) -> str | None:
+    """Resolve the model an agent should run with.
+
+    Oracle runs solve.sh and never calls an LLM, so it never receives a model
+    (the chokepoint in resolve_agent_env defends, but callers should also stop
+    materializing DEFAULT_MODEL into oracle configs to keep the data honest —
+    e.g. result-summary JSON shows model=null instead of a bogus default).
+    """
+    if agent == "oracle":
+        return None
+    return model or DEFAULT_MODEL
+
+
 @dataclass
 class JobConfig:
     """Configuration for a benchmark job."""
 
     agent: str = DEFAULT_AGENT
-    model: str | None = DEFAULT_MODEL
+    model: str | None = None
     environment: str = "docker"
     concurrency: int = 4
     prompts: list[str | None] | None = None
@@ -281,9 +294,10 @@ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
         sandbox_user = raw.get("sandbox_user", "agent")
         sandbox_locked_paths = raw.get("sandbox_locked_paths")
 
+        agent_name = raw.get("agent", DEFAULT_AGENT)
         config = JobConfig(
-            agent=raw.get("agent", DEFAULT_AGENT),
-            model=raw.get("model", DEFAULT_MODEL),
+            agent=agent_name,
+            model=effective_model(agent_name, raw.get("model")),
             environment=raw.get("environment", "docker"),
             concurrency=raw.get("concurrency", 4),
             prompts=prompts,
@@ -305,7 +319,7 @@ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
         agent_name = agent_cfg.get("name", DEFAULT_AGENT)
 
         # Model — keep provider prefix intact for downstream resolution
-        model = agent_cfg.get("model_name", "") or DEFAULT_MODEL
+        model = effective_model(agent_name, agent_cfg.get("model_name") or None)
 
         # Environment
         env_cfg = raw.get("environment", {})
diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index 5a009e7..dfcb7db 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -1,9 +1,9 @@
-"""Tests for `bf eval` — the new eval-runner CLI.
+"""Tests for ``benchflow.cli.eval`` — the future-facing eval CLI module.
 
-The create command takes a positional task reference and routes to either
-a single SDK.run or a batch Job based on whether the reference is a single
-task dir or a directory of task dirs. We don't exercise the actual runner
-here (that's the parity tests' job); we only verify the resolver.
+NOTE: ``cli/eval.py`` is **not wired into the live CLI** (the active
+``bench eval create`` lives in ``cli/main.py``).  These tests cover the
+task-reference resolver (``_resolve_task_ref``) which will be used once
+the module is promoted to the live entry point.
 """
 
 from __future__ import annotations
diff --git a/tests/test_oracle_chokepoint.py b/tests/test_oracle_chokepoint.py
new file mode 100644
index 0000000..142c063
--- /dev/null
+++ b/tests/test_oracle_chokepoint.py
@@ -0,0 +1,257 @@
+"""Regression tests for the oracle agent + DEFAULT_MODEL chokepoint.
+
+Pins the fix from this branch (post-PR #173 follow-up):
+
+  Layer 1 — restore `agent != "oracle"` guard in resolve_agent_env so the
+            chokepoint defends against any caller that forwards a model.
+  Layer 2 — delete the orphaned src/benchflow/cli/eval.py whose oracle fix
+            was unreachable because nothing wired it into the live CLI.
+  Layer 3 — funnel all CLI/YAML-loader sites through effective_model() so
+            oracle gets honest model=None end-to-end.
+
+The classes below pin each layer at the right altitude:
+- TestOrphanRemoval       — proves cli/eval.py is gone and stays gone.
+- TestEvalCreateRouting   — proves `bench eval create` dispatches to
+                            cli/main.py (the file that PR #173 missed).
+- TestEffectiveModel      — unit tests for the helper.
+- TestOracleYamlLoaders   — Job.from_yaml(oracle config) → model is None.
+- TestEvalCreateOracleCLI — end-to-end: invoke `bench eval create -a oracle`
+                            and assert no API-key validation error.
+"""
+
+from __future__ import annotations
+
+import importlib
+from pathlib import Path
+from unittest.mock import AsyncMock, patch
+
+import pytest
+from typer.testing import CliRunner
+
+
+class TestEvalModuleNotWiredIntoCLI:
+    """src/benchflow/cli/eval.py exists but is NOT wired into the live CLI.
+
+    The live `bench eval create` dispatches to cli/main.py:eval_create.
+    cli/eval.py is a standalone module (with its own resolver logic and tests)
+    but must not be imported by the CLI entry-point code — doing so is what
+    caused PR #173 to land its fix in dead code.
+    """
+
+    def test_cli_main_does_not_import_cli_eval(self):
+        """cli/main.py must not import from cli/eval — they are separate."""
+        main_py = (
+            Path(__file__).resolve().parent.parent
+            / "src" / "benchflow" / "cli" / "main.py"
+        )
+        text = main_py.read_text()
+        assert "from benchflow.cli.eval" not in text
+        assert "import benchflow.cli.eval" not in text
+
+
+class TestEvalCreateRouting:
+    """`bench eval create` must dispatch to cli/main.py:eval_create.
+
+    The pyproject.toml `bench`/`benchflow` scripts both resolve to
+    `benchflow.cli.main:app`. PR #173 patched a different `eval_create` in
+    `benchflow.cli.eval` that no entry point ever loaded — these tests
+    pin the routing so that mistake can't happen again.
+    """
+
+    def test_entry_point_app_is_cli_main(self):
+        from benchflow.cli.main import app
+
+        assert app.info.name == "benchflow"
+
+    def test_eval_create_callback_lives_in_cli_main(self):
+        from benchflow.cli.main import eval_app
+
+        create_cmds = [c for c in eval_app.registered_commands if c.name == "create"]
+        assert len(create_cmds) == 1
+        assert create_cmds[0].callback.__module__ == "benchflow.cli.main"
+
+    def test_bench_eval_create_help_resolves(self):
+        """Smoke test: `bench eval create --help` reaches a real callback."""
+        from benchflow.cli.main import app
+
+        result = CliRunner().invoke(app, ["eval", "create", "--help"])
+        assert result.exit_code == 0
+        assert "tasks-dir" in result.stdout or "task" in result.stdout.lower()
+
+
+class TestEffectiveModel:
+    """The helper introduced in Layer 3 — single source of truth for the rule
+    "oracle never gets a model; non-oracle agents fall back to DEFAULT_MODEL"."""
+
+    def test_oracle_with_no_model_returns_none(self):
+        from benchflow.job import effective_model
+
+        assert effective_model("oracle", None) is None
+
+    def test_oracle_ignores_explicit_model(self):
+        """Even if a caller forwards a model for oracle, the helper drops it."""
+        from benchflow.job import effective_model
+
+        assert effective_model("oracle", "claude-haiku-4-5-20251001") is None
+
+    def test_non_oracle_with_no_model_returns_default(self):
+        from benchflow.job import DEFAULT_MODEL, effective_model
+
+        assert effective_model("claude-agent-acp", None) == DEFAULT_MODEL
+
+    def test_non_oracle_explicit_model_passes_through(self):
+        from benchflow.job import effective_model
+
+        assert effective_model("codex-acp", "gpt-5") == "gpt-5"
+
+    def test_non_oracle_empty_model_falls_back_to_default(self):
+        """Empty string == "no model" — Harbor YAML can produce this shape."""
+        from benchflow.job import DEFAULT_MODEL, effective_model
+
+        assert effective_model("claude-agent-acp", "") == DEFAULT_MODEL
+
+
+class TestOracleYamlLoaders:
+    """YAML configs for oracle must produce JobConfig.model is None.
+
+    Both loader paths (_from_native_yaml, _from_harbor_yaml) previously
+    coalesced missing model to DEFAULT_MODEL unconditionally — Layer 3
+    routes them through effective_model() so oracle drops the default.
+    """
+
+    def _make_task(self, tmp_path: Path) -> Path:
+        tasks = tmp_path / "tasks" / "task-a"
+        tasks.mkdir(parents=True)
+        (tasks / "task.toml").write_text('schema_version = "1.1"\n')
+        return tmp_path / "tasks"
+
+    def test_native_yaml_oracle_no_model(self, tmp_path: Path):
+        from benchflow.job import Job
+
+        self._make_task(tmp_path)
+        config = tmp_path / "config.yaml"
+        config.write_text("tasks_dir: tasks\nagent: oracle\n")
+        job = Job.from_yaml(config)
+        assert job._config.agent == "oracle"
+        assert job._config.model is None
+
+    def test_harbor_yaml_oracle_no_model(self, tmp_path: Path):
+        from benchflow.job import Job
+
+        self._make_task(tmp_path)
+        config = tmp_path / "config.yaml"
+        config.write_text(
+            "agents:\n"
+            "  - name: oracle\n"
+            "datasets:\n"
+            "  - path: tasks\n"
+        )
+        job = Job.from_yaml(config)
+        assert job._config.agent == "oracle"
+        assert job._config.model is None
+
+    def test_native_yaml_non_oracle_keeps_default_when_omitted(
+        self, tmp_path: Path
+    ):
+        """Backwards-compat: omitting model for an LLM agent still gets DEFAULT_MODEL."""
+        from benchflow.job import DEFAULT_MODEL, Job
+
+        self._make_task(tmp_path)
+        config = tmp_path / "config.yaml"
+        config.write_text("tasks_dir: tasks\nagent: claude-agent-acp\n")
+        job = Job.from_yaml(config)
+        assert job._config.model == DEFAULT_MODEL
+
+
+class TestEvalCreateOracleCLI:
+    """End-to-end: `bench eval create -a oracle` must not trip API key validation.
+
+    This is the user-visible bug the chokepoint test guards against at the
+    unit level. Here we call the live handler (cli/main.py:eval_create)
+    directly — invoking via Typer's CliRunner triggers asyncio.run()
+    internally, which leaves no current event loop and breaks pre-existing
+    tests in the suite that use the deprecated asyncio.get_event_loop()
+    pattern. Calling the function directly still exercises the full
+    CLI → effective_model → TrialConfig → resolve_agent_env path the bug
+    originally lived in.
+    """
+
+    def _make_task(self, tmp_path: Path) -> Path:
+        task = tmp_path / "task"
+        task.mkdir()
+        (task / "task.toml").write_text('schema_version = "1.1"\n')
+        (task / "instruction.md").write_text("solve\n")
+        return task
+
+    def _strip_api_keys(self, monkeypatch):
+        for k in (
+            "ANTHROPIC_API_KEY",
+            "ANTHROPIC_AUTH_TOKEN",
+            "CLAUDE_CODE_OAUTH_TOKEN",
+            "OPENAI_API_KEY",
+            "GOOGLE_API_KEY",
+            "GEMINI_API_KEY",
+            "LLM_API_KEY",
+        ):
+            monkeypatch.delenv(k, raising=False)
+
+    def test_oracle_single_task_no_api_key_no_error(
+        self, tmp_path: Path, monkeypatch
+    ):
+        """The bug: oracle + missing API key → ANTHROPIC_API_KEY ValueError."""
+        import asyncio
+
+        from benchflow.cli.main import eval_create
+        from benchflow.models import RunResult
+
+        # The CLI handler internally calls asyncio.run(), which leaves no
+        # current event loop. Pre-existing tests in the suite use the
+        # deprecated asyncio.get_event_loop() and break in that state, so
+        # restore a fresh loop after the test (teardown via finally below).
+
+        self._strip_api_keys(monkeypatch)
+        task = self._make_task(tmp_path)
+        captured: dict = {}
+
+        async def fake_create(config):
+            captured["config"] = config
+            # Exercise the real chokepoint with the config the CLI built —
+            # that's the specific call site the bug manifested at.
+            from benchflow._agent_env import resolve_agent_env
+
+            captured["agent_env"] = resolve_agent_env(
+                config.primary_agent, config.primary_model, config.agent_env
+            )
+            trial = type("FakeTrial", (), {})()
+            trial.run = AsyncMock(
+                return_value=RunResult(
+                    task_name="task",
+                    agent_name="oracle",
+                    rewards={"reward": 1.0},
+                    n_tool_calls=0,
+                )
+            )
+            return trial
+
+        try:
+            with patch("benchflow.trial.Trial.create", new=fake_create):
+                eval_create(
+                    config_file=None,
+                    tasks_dir=task,
+                    agent="oracle",
+                    model=None,
+                    environment="docker",
+                    concurrency=1,
+                    jobs_dir=str(tmp_path / "jobs"),
+                    sandbox_user="agent",
+                    skills_dir=None,
+                )
+        finally:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
+        cfg = captured["config"]
+        # Layer 3: oracle never receives a model, even when CLI defaults exist.
+        assert cfg.primary_agent == "oracle"
+        assert cfg.primary_model is None
+        # Layer 1: chokepoint did not inject provider env or raise.
+        assert "BENCHFLOW_PROVIDER_MODEL" not in captured["agent_env"]
diff --git a/tests/test_resolve_env_helpers.py b/tests/test_resolve_env_helpers.py
index f699334..d831fe7 100644
--- a/tests/test_resolve_env_helpers.py
+++ b/tests/test_resolve_env_helpers.py
@@ -296,3 +296,45 @@ def test_no_model_empty_requires_env(self, monkeypatch, tmp_path):
         self._patch_expanduser(monkeypatch, tmp_path)
         result = self._resolve(agent="openclaw", agent_env={})
         assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result
+
+
+class TestResolveAgentEnvOracle:
+    """Oracle runs solve.sh and never calls an LLM — must skip model-related env.
+
+    Regression for the PR #173 follow-up: commit 360c460 removed the
+    `agent != "oracle"` guard from resolve_agent_env, betting that CLI callers
+    would pass model=None for oracle. But cli/main.py:eval_create (the live
+    `bench eval create`) still passes `model or DEFAULT_MODEL`, so oracle
+    reaches the chokepoint with a real model and triggers ANTHROPIC_API_KEY
+    validation — breaking offline oracle runs that have no API key set.
+    """
+
+    def _patch_expanduser(self, monkeypatch, tmp_path):
+        orig = Path.expanduser
+
+        def fake(self):
+            s = str(self)
+            if s.startswith("~"):
+                return tmp_path / s[2:]
+            return orig(self)
+
+        monkeypatch.setattr(Path, "expanduser", fake)
+
+    def test_oracle_with_default_model_does_not_validate_api_key(
+        self, monkeypatch, tmp_path
+    ):
+        """Oracle + DEFAULT_MODEL + no API key + no host auth must not raise."""
+        for k in (
+            "ANTHROPIC_API_KEY",
+            "OPENAI_API_KEY",
+            "GOOGLE_API_KEY",
+            "GEMINI_API_KEY",
+        ):
+            monkeypatch.delenv(k, raising=False)
+        self._patch_expanduser(monkeypatch, tmp_path)
+
+        result = resolve_agent_env("oracle", "claude-haiku-4-5-20251001", {})
+
+        # Provider env never resolved — oracle never calls an LLM.
+        assert "BENCHFLOW_PROVIDER_MODEL" not in result
+        assert "_BENCHFLOW_SUBSCRIPTION_AUTH" not in result