37 changes: 35 additions & 2 deletions README.md
@@ -28,10 +28,14 @@ Create a `.env` file in the project root (or export these in your shell):
```bash
ANTHROPIC_API_KEY=<your-anthropic-api-key> # if using anthropic models
OPENAI_API_KEY=<your-openai-api-key> # if using openai models
LOCAL_LLM_BASE_URL=http://localhost:8000 # shared fallback for local model prefixes
LOCAL_LLM_API_KEY=<optional-local-api-key> # optional shared local API key
HF_TOKEN=<your-hugging-face-token>
GITHUB_TOKEN=<github-personal-access-token>
```
If no `HF_TOKEN` is set, the CLI will prompt you to paste one on first launch. To get a GITHUB_TOKEN follow the tutorial [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token).
If no `HF_TOKEN` is set, the CLI will prompt you to paste one on first launch
unless you start on a local model. To get a GITHUB_TOKEN, follow the tutorial
[here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token).

### Usage

@@ -52,12 +56,41 @@ ml-intern "fine-tune llama on my dataset"
```bash
ml-intern --model anthropic/claude-opus-4-7 "your prompt" # requires ANTHROPIC_API_KEY
ml-intern --model openai/gpt-5.5 "your prompt" # requires OPENAI_API_KEY
ml-intern --model ollama/llama3.1:8b "your prompt"
ml-intern --model vllm/meta-llama/Llama-3.1-8B-Instruct "your prompt"
ml-intern --max-iterations 100 "your prompt"
ml-intern --no-stream "your prompt"
```

Run `ml-intern` then `/model` to see the full list of suggested model ids
(Claude, GPT, and HF-router models like MiniMax, Kimi, GLM, DeepSeek).
(Claude, GPT, HF-router models like MiniMax, Kimi, GLM, DeepSeek, and local
model prefixes).

**Local models:**

Local model support uses OpenAI-compatible HTTP endpoints through LiteLLM. The
agent does not load model weights directly from disk; start your inference
server first, then select it with a provider-specific model prefix:

```bash
ml-intern --model ollama/llama3.1:8b "your prompt"
ml-intern --model vllm/meta-llama/Llama-3.1-8B-Instruct "your prompt"
```

Inside interactive mode, switch with `/model`:

```text
/model ollama/llama3.1:8b
/model lm_studio/google/gemma-3-4b
/model llamacpp/llama-3.1-8b-instruct
```

Supported local prefixes are `ollama/`, `vllm/`, `lm_studio/`, and
`llamacpp/`. Set `LOCAL_LLM_BASE_URL` and optional `LOCAL_LLM_API_KEY` to use
one shared local endpoint, or override a specific provider with its matching
`*_BASE_URL` / `*_API_KEY` variable, such as `OLLAMA_BASE_URL` or
`VLLM_API_KEY`. Provider-specific variables take precedence over the shared
local variables. Base URLs may include or omit `/v1`.
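
For example, a minimal `.env` sketch (the ports shown are the usual local-server defaults and may differ on your machine) that sends `ollama/` models to an Ollama server while every other local prefix falls back to the shared endpoint:

```bash
LOCAL_LLM_BASE_URL=http://localhost:8000    # shared fallback, e.g. a vLLM server; trailing /v1 is optional
OLLAMA_BASE_URL=http://localhost:11434      # provider-specific override, wins for ollama/ models
```

With this setup, `vllm/...` and `llamacpp/...` ids use the shared URL, while `ollama/...` ids go to port 11434.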

## Sharing Traces

62 changes: 62 additions & 0 deletions agent/core/llm_params.py
@@ -5,7 +5,17 @@
creating circular imports.
"""

import os

from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token
from agent.core.local_models import (
LOCAL_MODEL_API_KEY_DEFAULT,
LOCAL_MODEL_API_KEY_ENV,
LOCAL_MODEL_BASE_URL_ENV,
is_reserved_local_model_id,
local_model_name,
local_model_provider,
)


def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
@@ -96,6 +106,46 @@ class UnsupportedEffortError(ValueError):
"""


def _local_api_base(base_url: str) -> str:
base = base_url.strip().rstrip("/")
if base.endswith("/v1"):
return base
return f"{base}/v1"


def _resolve_local_model_params(
model_name: str,
reasoning_effort: str | None = None,
strict: bool = False,
) -> dict:
if reasoning_effort and strict:
raise UnsupportedEffortError(
"Local OpenAI-compatible endpoints don't accept reasoning_effort"
)

local_name = local_model_name(model_name)
if local_name is None:
raise ValueError(f"Unsupported local model id: {model_name}")

provider = local_model_provider(model_name)
assert provider is not None
raw_base = (
os.environ.get(provider["base_url_env"])
or os.environ.get(LOCAL_MODEL_BASE_URL_ENV)
or provider["base_url_default"]
)
api_key = (
os.environ.get(provider["api_key_env"])
or os.environ.get(LOCAL_MODEL_API_KEY_ENV)
or LOCAL_MODEL_API_KEY_DEFAULT
)
return {
"model": f"openai/{local_name}",
"api_base": _local_api_base(raw_base),
"api_key": api_key,
}
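
# Worked example (sketch; illustrative model id, no *_BASE_URL / *_API_KEY env vars set):
#   _resolve_local_model_params("ollama/llama3.1:8b") ->
#       {"model": "openai/llama3.1:8b",
#        "api_base": "http://localhost:11434/v1",   # provider default, normalized to end in /v1
#        "api_key": "sk-local-no-key-required"}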


def _resolve_llm_params(
model_name: str,
session_hf_token: str | None = None,
@@ -121,6 +171,12 @@ def _resolve_llm_params(
• ``openai/<model>`` — ``reasoning_effort`` forwarded as a top-level
kwarg (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.

• ``ollama/<model>``, ``vllm/<model>``, ``lm_studio/<model>``, and
``llamacpp/<model>`` — local OpenAI-compatible endpoints. The id prefix
selects a configurable localhost base URL, and the model suffix is sent
to LiteLLM as ``openai/<model>``. These endpoints don't receive
``reasoning_effort``.

• Anything else is treated as a HuggingFace router id. We hit the
auto-routing OpenAI-compatible endpoint at
``https://router.huggingface.co/v1``. The id can be bare or carry an
@@ -187,6 +243,12 @@ def _resolve_llm_params(
params["reasoning_effort"] = reasoning_effort
return params

if is_reserved_local_model_id(model_name):
raise ValueError(f"Unsupported local model id: {model_name}")

if local_model_provider(model_name) is not None:
return _resolve_local_model_params(model_name, reasoning_effort, strict)

hf_model = model_name.removeprefix("huggingface/")
api_key = _resolve_hf_router_token(session_hf_token)
params = {
59 changes: 59 additions & 0 deletions agent/core/local_models.py
@@ -0,0 +1,59 @@
"""Helpers for CLI local OpenAI-compatible model ids."""

LOCAL_MODEL_PROVIDERS: dict[str, dict[str, str]] = {
"ollama/": {
"base_url_env": "OLLAMA_BASE_URL",
"base_url_default": "http://localhost:11434",
"api_key_env": "OLLAMA_API_KEY",
},
"vllm/": {
"base_url_env": "VLLM_BASE_URL",
"base_url_default": "http://localhost:8000",
"api_key_env": "VLLM_API_KEY",
},
"lm_studio/": {
"base_url_env": "LMSTUDIO_BASE_URL",
"base_url_default": "http://127.0.0.1:1234",
"api_key_env": "LMSTUDIO_API_KEY",
},
"llamacpp/": {
"base_url_env": "LLAMACPP_BASE_URL",
"base_url_default": "http://localhost:8080",
"api_key_env": "LLAMACPP_API_KEY",
},
}

LOCAL_MODEL_PREFIXES = tuple(LOCAL_MODEL_PROVIDERS)
RESERVED_LOCAL_MODEL_PREFIXES = ("openai-compat/",)
LOCAL_MODEL_BASE_URL_ENV = "LOCAL_LLM_BASE_URL"
LOCAL_MODEL_API_KEY_ENV = "LOCAL_LLM_API_KEY"
LOCAL_MODEL_API_KEY_DEFAULT = "sk-local-no-key-required"


def local_model_provider(model_id: str) -> dict[str, str] | None:
"""Return provider config for a local model id, if it uses a local prefix."""
for prefix, config in LOCAL_MODEL_PROVIDERS.items():
if model_id.startswith(prefix):
return config
return None


def local_model_name(model_id: str) -> str | None:
"""Return the backend model name with the local provider prefix removed."""
for prefix in LOCAL_MODEL_PREFIXES:
if model_id.startswith(prefix):
name = model_id[len(prefix) :]
return name or None
return None


def is_local_model_id(model_id: str) -> bool:
"""Return True for non-empty, whitespace-free local model ids."""
if not model_id or any(char.isspace() for char in model_id):
return False
return local_model_name(model_id) is not None


def is_reserved_local_model_id(model_id: str) -> bool:
"""Return True for local-style prefixes intentionally not supported."""
return model_id.startswith(RESERVED_LOCAL_MODEL_PREFIXES)
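
Taken together, a quick sketch of how these helpers classify model ids, given the prefix table above (the model names are illustrative placeholders, not a supported-model list):

```python
from agent.core.local_models import (
    is_local_model_id,
    is_reserved_local_model_id,
    local_model_name,
    local_model_provider,
)

# Prefixed ids are recognized and split into a provider config and a backend name.
assert is_local_model_id("ollama/llama3.1:8b")
assert local_model_name("ollama/llama3.1:8b") == "llama3.1:8b"

vllm = local_model_provider("vllm/meta-llama/Llama-3.1-8B-Instruct")
assert vllm is not None and vllm["base_url_env"] == "VLLM_BASE_URL"

# Non-local and reserved ids fall through to the other resolution paths.
assert local_model_provider("openai/gpt-5.5") is None
assert not is_local_model_id("ollama/")  # empty name after the prefix
assert is_reserved_local_model_id("openai-compat/some-model")
```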
66 changes: 60 additions & 6 deletions agent/core/model_switcher.py
@@ -15,7 +15,17 @@

from __future__ import annotations

import asyncio

from litellm import acompletion

from agent.core.effort_probe import ProbeInconclusive, probe_effort
from agent.core.llm_params import _resolve_llm_params
from agent.core.local_models import (
LOCAL_MODEL_PREFIXES,
is_local_model_id,
is_reserved_local_model_id,
)


# Suggested models shown by `/model` (not a gate). Users can paste any HF
@@ -40,6 +50,8 @@


_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
_DIRECT_PREFIXES = ("anthropic/", "openai/", *LOCAL_MODEL_PREFIXES)
_LOCAL_PROBE_TIMEOUT = 15.0


def is_valid_model_id(model_id: str) -> bool:
@@ -48,13 +60,22 @@ def is_valid_model_id(model_id: str) -> bool:
Accepts:
• anthropic/<model>
• openai/<model>
• ollama/<model>, vllm/<model>, lm_studio/<model>, llamacpp/<model>
• <org>/<model>[:<tag>] (HF router; tag = provider or policy)
• huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)

Actual availability is verified against the HF router catalog on
switch, and by the provider on the probe's ping call.
"""
if not model_id or "/" not in model_id:
if not model_id:
return False
if is_local_model_id(model_id):
return True
if is_reserved_local_model_id(model_id):
return False
if any(model_id.startswith(prefix) for prefix in LOCAL_MODEL_PREFIXES):
return False
if "/" not in model_id:
return False
head = model_id.split(":", 1)[0]
parts = head.split("/")
@@ -70,7 +91,7 @@ def _print_hf_routing_info(model_id: str, console) -> bool:
Anthropic / OpenAI ids return ``True`` without printing anything —
the probe below covers "does this model exist".
"""
if model_id.startswith(("anthropic/", "openai/")):
if model_id.startswith(_DIRECT_PREFIXES):
return True

from agent.core import hf_router_catalog as cat
@@ -141,7 +162,9 @@ def print_model_listing(config, console) -> None:
console.print(
"\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
"Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.\n"
"Use 'ollama/<model>', 'vllm/<model>', 'lm_studio/<model>', or "
"'llamacpp/<model>' for local OpenAI-compatible endpoints.[/dim]"
)


Expand All @@ -151,7 +174,21 @@ def print_invalid_id(arg: str, console) -> None:
"[dim]Expected:\n"
" • <org>/<model>[:tag] (HF router — paste from huggingface.co)\n"
" • anthropic/<model>\n"
" • openai/<model>[/dim]"
" • openai/<model>\n"
" • ollama/<model> | vllm/<model> | lm_studio/<model> | llamacpp/<model>[/dim]"
)


async def _probe_local_model(model_id: str) -> None:
params = _resolve_llm_params(model_id)
await asyncio.wait_for(
acompletion(
messages=[{"role": "user", "content": "ping"}],
max_tokens=1,
stream=False,
**params,
),
timeout=_LOCAL_PROBE_TIMEOUT,
)


@@ -173,9 +210,26 @@ async def probe_and_switch_model(
* ✗ hard error (auth, model-not-found, quota) — we reject the switch
and keep the current model so the user isn't stranded

Transient errors (5xx, timeout) complete the switch with a yellow
warning; the next real call re-surfaces the error if it's persistent.
For non-local models, transient errors (5xx, timeout) complete the switch
with a yellow warning; the next real call re-surfaces the error if it's
persistent. Local models reject every probe error, including timeouts, and
keep the current model.
"""
if is_local_model_id(model_id):
console.print(f"[dim]checking local model {model_id}...[/dim]")
try:
await _probe_local_model(model_id)
except Exception as e:
console.print(f"[bold red]Switch failed:[/bold red] {e}")
console.print(f"[dim]Keeping current model: {config.model_name}[/dim]")
return

_commit_switch(model_id, config, session, effective=None, cache=True)
console.print(
f"[green]Model switched to {model_id}[/green] [dim](effort: off)[/dim]"
)
return

preference = config.reasoning_effort
if not _print_hf_routing_info(model_id, console):
return
27 changes: 15 additions & 12 deletions agent/main.py
@@ -25,6 +25,7 @@
from agent.core.agent_loop import submission_loop
from agent.core import model_switcher
from agent.core.hf_tokens import resolve_hf_token
from agent.core.local_models import is_local_model_id
from agent.core.session import OpType
from agent.core.tools import ToolRouter
from agent.messaging.gateway import NotificationGateway
@@ -967,15 +968,15 @@ async def main(model: str | None = None):
# Create prompt session for input (needed early for token prompt)
prompt_session = PromptSession()

# HF token — required, prompt if missing
hf_token = resolve_hf_token()
if not hf_token:
hf_token = await _prompt_and_save_hf_token(prompt_session)

config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
if model:
config.model_name = model

# HF token — required for Hub-backed models/tools, but not for local LLMs.
hf_token = resolve_hf_token()
if not hf_token and not is_local_model_id(config.model_name):
hf_token = await _prompt_and_save_hf_token(prompt_session)

# Resolve username for banner
hf_user = _get_hf_user(hf_token)

@@ -1198,25 +1199,27 @@ async def headless_main(
logging.basicConfig(level=logging.WARNING)
_configure_runtime_logging()

config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
config.yolo_mode = True # Auto-approve everything in headless mode

if model:
config.model_name = model

hf_token = resolve_hf_token()
if not hf_token:
if not hf_token and not is_local_model_id(config.model_name):
print(
"ERROR: No HF token found. Set HF_TOKEN or run `huggingface-cli login`.",
file=sys.stderr,
)
sys.exit(1)

print("HF token loaded", file=sys.stderr)
if hf_token:
print("HF token loaded", file=sys.stderr)

config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
config.yolo_mode = True # Auto-approve everything in headless mode
notification_gateway = NotificationGateway(config.messaging)
await notification_gateway.start()
hf_user = _get_hf_user(hf_token)

if model:
config.model_name = model

if max_iterations is not None:
config.max_iterations = max_iterations
