78 changes: 78 additions & 0 deletions README.md
@@ -106,6 +106,84 @@ JSON file:
}
```

### Model Selection

In the CLI, run `/model` to list suggested models and see the active model:

```text
/model
```

Switch models by passing the model id:

```text
/model moonshotai/Kimi-K2.6
/model bedrock/us.anthropic.claude-opus-4-6-v1
```

You can also choose a model at startup:

```bash
ml-intern --model moonshotai/Kimi-K2.6 "your prompt"
```

### Local Models

Local model support uses OpenAI-compatible HTTP endpoints through LiteLLM. The agent does not load model weights directly from disk; a local inference server must already be running.

Supported local model id prefixes:

| Prefix | Default endpoint | Example |
| --- | --- | --- |
| `ollama/` | `http://localhost:11434/v1` | `ollama/llama3.1` |
| `vllm/` | `http://localhost:8000/v1` | `vllm/Qwen3.5-2B` |
| `llamacpp/` | `http://localhost:8001/v1` | `llamacpp/unsloth/Qwen3.5-2B` |
| `local://` | `${LOCAL_LLM_BASE_URL}/v1` | `local://my-model` |
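
Any of these endpoints can be sanity-checked before switching, and this is also how you discover the model ids to use after a prefix. A minimal sketch, assuming the `requests` package and a server already running on the Ollama default port (swap the base URL for vLLM or llama.cpp):

```python
import requests

# List the models served by a local OpenAI-compatible endpoint.
# The same call works for Ollama, vLLM, and llama.cpp; only the port differs.
base_url = "http://localhost:11434"  # Ollama default from the table above
resp = requests.get(f"{base_url}/v1/models", timeout=5)
resp.raise_for_status()
for model in resp.json().get("data", []):
    print(model["id"])  # use as e.g. `/model ollama/<id>`
```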

Override endpoints with environment variables (the `/v1` path suffix is appended automatically):

```bash
OLLAMA_BASE_URL=http://localhost:11434
VLLM_BASE_URL=http://localhost:8000
LLAMACPP_BASE_URL=http://localhost:8001
LOCAL_LLM_BASE_URL=http://localhost:8000
```

Keep these endpoint variables server-controlled. Do not expose them as user-editable web/API inputs; they determine where the backend sends LLM traffic.

For example, with Ollama (start `ollama serve` first if the server is not already running; both `ollama pull` and the agent talk to it):

```bash
ollama serve &   # skip if Ollama already runs as a service
ollama pull llama3.1
ml-intern
```

Then switch inside the CLI:

```text
/model ollama/llama3.1
```

For llama.cpp, start an OpenAI-compatible server first (for example llama.cpp's `llama-server`), then point the agent at it if you are not using the default port:

```bash
export LLAMACPP_BASE_URL=http://localhost:8080
ml-intern
```

```text
/model llamacpp/<model-id-from-/v1/models>
```

For the web UI/API, enable local model selection:

```bash
ENABLE_LOCAL_MODELS=true
```

When `ENABLE_LOCAL_MODELS` is set to a truthy value (`1`, `true`, `yes`, or `on`), the backend exposes local model presets and accepts custom local model ids with the prefixes above. The web model menu also shows a custom local model path field, so you can enter values like `ollama/qwen2.5-coder` or `local://my-model`.
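
As a sketch of what the flag changes, using the helpers this PR adds in `backend/model_catalog.py` (shown further down; this assumes `backend` is importable as a package and that `ml_intern.local_models.is_local_model_id` matches the documented prefixes):

```python
import os

from backend.model_catalog import get_available_models, is_valid_model_id

os.environ["ENABLE_LOCAL_MODELS"] = "true"  # read at call time, not import time
local_ids = [m["id"] for m in get_available_models() if m["provider"] == "local"]
# ['ollama/llama3.1', 'vllm/Qwen3.5-2B', 'llamacpp/unsloth/Qwen3.5-2B']

# Custom ids with a local prefix also validate, not just the presets:
assert is_valid_model_id("ollama/qwen2.5-coder")
```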

## Architecture

### Component Overview
59 changes: 59 additions & 0 deletions agent/core/llm_params.py
@@ -5,8 +5,13 @@
creating circular imports.
"""

import logging
import os

from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token

logger = logging.getLogger(__name__)


def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
"""Backward-compatible private wrapper used by tests and older imports."""
@@ -79,6 +84,7 @@ def _widened(model: str) -> bool:
_ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
_OPENAI_EFFORTS = {"minimal", "low", "medium", "high", "xhigh"}
_HF_EFFORTS = {"low", "medium", "high"}
_LOCAL_DEFAULT_API_KEY = "sk-no-key-required"


class UnsupportedEffortError(ValueError):
@@ -89,6 +95,19 @@ class UnsupportedEffortError(ValueError):
"""


def _raise_for_local_effort(reasoning_effort: str | None, strict: bool) -> None:
if not reasoning_effort:
return
message = "Local OpenAI-compatible endpoints don't accept reasoning_effort"
if strict:
raise UnsupportedEffortError(message)
logger.warning(
"%s; dropping reasoning_effort=%r for this local model call",
message,
reasoning_effort,
)


def _resolve_llm_params(
model_name: str,
session_hf_token: str | None = None,
@@ -180,6 +199,46 @@ def _resolve_llm_params(
params["reasoning_effort"] = reasoning_effort
return params

if model_name.startswith("ollama/"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("/", 1)[1]
api_base = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("OLLAMA_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

if model_name.startswith("vllm/"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("/", 1)[1]
api_base = os.environ.get("VLLM_BASE_URL", "http://localhost:8000")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("VLLM_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

if model_name.startswith("llamacpp/"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("/", 1)[1]
api_base = os.environ.get("LLAMACPP_BASE_URL", "http://localhost:8001")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("LLAMACPP_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

if model_name.startswith("local://"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("://", 1)[1]
api_base = os.environ.get("LOCAL_LLM_BASE_URL", "http://localhost:8000")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("LOCAL_LLM_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

hf_model = model_name.removeprefix("huggingface/")
api_key = _resolve_hf_router_token(session_hf_token)
params = {
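
For review context, a quick sketch of what the new branches resolve to with a default environment (assuming the elided `reasoning_effort` and `strict` parameters default to `None` and `False`):

```python
from agent.core.llm_params import _resolve_llm_params

# The ollama/ branch, with no OLLAMA_BASE_URL override:
params = _resolve_llm_params("ollama/llama3.1")
assert params == {
    "model": "openai/llama3.1",  # routed via LiteLLM's OpenAI-compatible provider
    "api_base": "http://localhost:11434/v1",
    "api_key": "sk-no-key-required",  # _LOCAL_DEFAULT_API_KEY placeholder
}
```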
14 changes: 11 additions & 3 deletions agent/core/model_switcher.py
@@ -16,6 +16,7 @@
from __future__ import annotations

from agent.core.effort_probe import ProbeInconclusive, probe_effort
from ml_intern.local_models import LOCAL_MODEL_PREFIXES, is_local_model_id


# Suggested models shown by `/model` (not a gate). Users can paste any HF
@@ -44,13 +45,18 @@ def is_valid_model_id(model_id: str) -> bool:
Accepts:
• anthropic/<model>
• openai/<model>
• ollama/<model>, vllm/<model>, llamacpp/<model>, local://<model>
• <org>/<model>[:<tag>] (HF router; tag = provider or policy)
• huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)

Actual availability is verified against the HF router catalog on
switch, and by the provider on the probe's ping call.
"""
if not model_id or "/" not in model_id:
if not model_id:
return False
if is_local_model_id(model_id):
return True
if "/" not in model_id:
return False
head = model_id.split(":", 1)[0]
parts = head.split("/")
@@ -66,7 +72,7 @@ def _print_hf_routing_info(model_id: str, console) -> bool:
Anthropic / OpenAI ids return ``True`` without printing anything —
the probe below covers "does this model exist".
"""
if model_id.startswith(("anthropic/", "openai/")):
if model_id.startswith(("anthropic/", "openai/", *LOCAL_MODEL_PREFIXES)):
return True

from agent.core import hf_router_catalog as cat
@@ -139,7 +145,9 @@ def print_model_listing(config, console) -> None:
console.print(
"\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
"Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.\n"
"Use 'ollama/<model>', 'vllm/<model>', 'llamacpp/<model>', or 'local://<model>' "
"for local OpenAI-compatible endpoints.[/dim]"
)


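
Illustrating the widened `is_valid_model_id` (assuming `is_local_model_id` matches the documented prefixes; the anthropic/ case follows the docstring above):

```python
from agent.core.model_switcher import is_valid_model_id

assert is_valid_model_id("ollama/llama3.1")      # local prefix, accepted outright
assert is_valid_model_id("local://my-model")     # local:// scheme, accepted
assert is_valid_model_id("anthropic/claude-opus-4-6")  # direct API id
assert not is_valid_model_id("")                 # empty id rejected
assert not is_valid_model_id("just-a-name")      # no slash and not local
```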
95 changes: 95 additions & 0 deletions backend/model_catalog.py
@@ -0,0 +1,95 @@
"""Model catalog and validation helpers for agent API routes."""

import os
from typing import Any

from ml_intern.local_models import is_local_model_id


def local_models_enabled() -> bool:
return os.environ.get("ENABLE_LOCAL_MODELS", "false").lower() in {
"1",
"true",
"yes",
"on",
}


def get_available_models() -> list[dict[str, Any]]:
models: list[dict[str, Any]] = [
{
"id": "moonshotai/Kimi-K2.6",
"label": "Kimi K2.6",
"provider": "huggingface",
"tier": "free",
"recommended": True,
},
{
"id": "bedrock/us.anthropic.claude-opus-4-6-v1",
"label": "Claude Opus 4.6",
"provider": "anthropic",
"tier": "pro",
"recommended": True,
},
{
"id": "MiniMaxAI/MiniMax-M2.7",
"label": "MiniMax M2.7",
"provider": "huggingface",
"tier": "free",
},
{
"id": "zai-org/GLM-5.1",
"label": "GLM 5.1",
"provider": "huggingface",
"tier": "free",
},
]

if local_models_enabled():
models.extend(
[
{
"id": "ollama/llama3.1",
"label": "Llama 3.1 (Ollama)",
"provider": "local",
"tier": "free",
},
{
"id": "vllm/Qwen3.5-2B",
"label": "Qwen3.5-2B (vLLM)",
"provider": "local",
"tier": "free",
},
{
"id": "llamacpp/unsloth/Qwen3.5-2B",
"label": "Qwen3.5-2B (llama.cpp)",
"provider": "local",
"tier": "free",
"recommended": True,
},
]
)

return models


def available_model_ids() -> set[str]:
return {m["id"] for m in get_available_models()}


def is_custom_local_model_id(model_id: str) -> bool:
if not local_models_enabled():
return False
if not isinstance(model_id, str):
return False
if not model_id or any(char.isspace() for char in model_id):
return False
return is_local_model_id(model_id)


def is_valid_model_id(model_id: str) -> bool:
return model_id in available_model_ids() or is_custom_local_model_id(model_id)


def is_anthropic_model(model_id: str) -> bool:
return model_id.startswith(("anthropic/", "bedrock/")) and "anthropic" in model_id
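
A few illustrative calls against these helpers (again assuming `is_local_model_id` matches the documented prefixes; the non-Anthropic Bedrock id is a hypothetical example):

```python
import os

from backend.model_catalog import is_anthropic_model, is_custom_local_model_id

# Custom local ids are honoured only when the feature flag is on:
os.environ["ENABLE_LOCAL_MODELS"] = "false"
assert not is_custom_local_model_id("ollama/llama3.1")

os.environ["ENABLE_LOCAL_MODELS"] = "true"
assert is_custom_local_model_id("local://my-model")
assert not is_custom_local_model_id("ollama/has space")  # whitespace rejected

# Bedrock-hosted Anthropic ids count as Anthropic; other Bedrock models do not:
assert is_anthropic_model("bedrock/us.anthropic.claude-opus-4-6-v1")
assert not is_anthropic_model("bedrock/us.amazon.nova-pro-v1")
```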