78 changes: 78 additions & 0 deletions README.md
@@ -106,6 +106,84 @@ JSON file:
}
```

### Model Selection

In the CLI, run `/model` to list suggested models and see the active model:

```text
/model
```

Switch models by passing the model id:

```text
/model moonshotai/Kimi-K2.6
/model bedrock/us.anthropic.claude-opus-4-6-v1
```

You can also choose a model at startup:

```bash
ml-intern --model moonshotai/Kimi-K2.6 "your prompt"
```

### Local Models

Local model support uses OpenAI-compatible HTTP endpoints through LiteLLM. The agent does not load model weights directly from disk; a local inference server must already be running.

Supported local model id prefixes:

| Prefix | Default endpoint | Example |
| --- | --- | --- |
| `ollama/` | `http://localhost:11434/v1` | `ollama/llama3.1` |
| `vllm/` | `http://localhost:8000/v1` | `vllm/Qwen3.5-2B` |
| `llamacpp/` | `http://localhost:8001/v1` | `llamacpp/unsloth/Qwen3.5-2B` |
| `local://` | `${LOCAL_LLM_BASE_URL}/v1` | `local://my-model` |
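
Any of these endpoints can be sanity-checked before switching, and this is also how you discover the model ids to use after a prefix. A minimal sketch, assuming the `requests` package and a server already running on the Ollama default port (swap the base URL for vLLM or llama.cpp):

```python
import requests

# List the models served by a local OpenAI-compatible endpoint.
# The same call works for Ollama, vLLM, and llama.cpp; only the port differs.
base_url = "http://localhost:11434"  # Ollama default from the table above
resp = requests.get(f"{base_url}/v1/models", timeout=5)
resp.raise_for_status()
for model in resp.json().get("data", []):
    print(model["id"])  # use as e.g. `/model ollama/<id>`
```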

Override endpoints with environment variables (the `/v1` path suffix is appended automatically):

```bash
OLLAMA_BASE_URL=http://localhost:11434
VLLM_BASE_URL=http://localhost:8000
LLAMACPP_BASE_URL=http://localhost:8001
LOCAL_LLM_BASE_URL=http://localhost:8000
```

Keep these endpoint variables server-controlled. Do not expose them as user-editable web/API inputs; they determine where the backend sends LLM traffic.

For example, with Ollama (start `ollama serve` first if the server is not already running; both `ollama pull` and the agent talk to it):

```bash
ollama serve &   # skip if Ollama already runs as a service
ollama pull llama3.1
ml-intern
```

Then switch inside the CLI:

```text
/model ollama/llama3.1
```

For llama.cpp, start an OpenAI-compatible server first (for example llama.cpp's `llama-server`), then point the agent at it if you are not using the default port:

```bash
export LLAMACPP_BASE_URL=http://localhost:8080
ml-intern
```

```text
/model llamacpp/<model-id-from-/v1/models>
```

For the web UI/API, enable local model selection:

```bash
ENABLE_LOCAL_MODELS=true
```

When `ENABLE_LOCAL_MODELS` is set to a truthy value (`1`, `true`, `yes`, or `on`), the backend exposes local model presets and accepts custom local model ids with the prefixes above. The web model menu also shows a custom local model path field, so you can enter values like `ollama/qwen2.5-coder` or `local://my-model`.
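
As a sketch of what the flag changes, using the helpers this PR adds in `backend/model_catalog.py` (shown further down; this assumes `backend` is importable as a package and that `ml_intern.local_models.is_local_model_id` matches the documented prefixes):

```python
import os

from backend.model_catalog import get_available_models, is_valid_model_id

os.environ["ENABLE_LOCAL_MODELS"] = "true"  # read at call time, not import time
local_ids = [m["id"] for m in get_available_models() if m["provider"] == "local"]
# ['ollama/llama3.1', 'vllm/Qwen3.5-2B', 'llamacpp/unsloth/Qwen3.5-2B']

# Custom ids with a local prefix also validate, not just the presets:
assert is_valid_model_id("ollama/qwen2.5-coder")
```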

## Architecture

### Component Overview
59 changes: 59 additions & 0 deletions agent/core/llm_params.py
@@ -5,8 +5,13 @@
creating circular imports.
"""

import logging
import os

from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token

logger = logging.getLogger(__name__)


def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
"""Backward-compatible private wrapper used by tests and older imports."""
@@ -79,6 +84,7 @@ def _widened(model: str) -> bool:
_ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
_OPENAI_EFFORTS = {"minimal", "low", "medium", "high", "xhigh"}
_HF_EFFORTS = {"low", "medium", "high"}
_LOCAL_DEFAULT_API_KEY = "sk-no-key-required"


class UnsupportedEffortError(ValueError):
@@ -89,6 +95,19 @@ class UnsupportedEffortError(ValueError):
"""


def _raise_for_local_effort(reasoning_effort: str | None, strict: bool) -> None:
if not reasoning_effort:
return
message = "Local OpenAI-compatible endpoints don't accept reasoning_effort"
if strict:
raise UnsupportedEffortError(message)
logger.warning(
"%s; dropping reasoning_effort=%r for this local model call",
message,
reasoning_effort,
)


def _resolve_llm_params(
model_name: str,
session_hf_token: str | None = None,
@@ -180,6 +199,46 @@ def _resolve_llm_params(
params["reasoning_effort"] = reasoning_effort
return params

if model_name.startswith("ollama/"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("/", 1)[1]
api_base = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("OLLAMA_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

if model_name.startswith("vllm/"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("/", 1)[1]
api_base = os.environ.get("VLLM_BASE_URL", "http://localhost:8000")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("VLLM_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

if model_name.startswith("llamacpp/"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("/", 1)[1]
api_base = os.environ.get("LLAMACPP_BASE_URL", "http://localhost:8001")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("LLAMACPP_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

if model_name.startswith("local://"):
_raise_for_local_effort(reasoning_effort, strict)
local_model = model_name.split("://", 1)[1]
api_base = os.environ.get("LOCAL_LLM_BASE_URL", "http://localhost:8000")
return {
"model": f"openai/{local_model}",
"api_base": f"{api_base.rstrip('/')}/v1",
"api_key": os.environ.get("LOCAL_LLM_API_KEY", _LOCAL_DEFAULT_API_KEY),
}

hf_model = model_name.removeprefix("huggingface/")
api_key = _resolve_hf_router_token(session_hf_token)
params = {
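
For review context, a quick sketch of what the new branches resolve to with a default environment (assuming the elided `reasoning_effort` and `strict` parameters default to `None` and `False`):

```python
from agent.core.llm_params import _resolve_llm_params

# The ollama/ branch, with no OLLAMA_BASE_URL override:
params = _resolve_llm_params("ollama/llama3.1")
assert params == {
    "model": "openai/llama3.1",  # routed via LiteLLM's OpenAI-compatible provider
    "api_base": "http://localhost:11434/v1",
    "api_key": "sk-no-key-required",  # _LOCAL_DEFAULT_API_KEY placeholder
}
```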
14 changes: 11 additions & 3 deletions agent/core/model_switcher.py
@@ -16,6 +16,7 @@
from __future__ import annotations

from agent.core.effort_probe import ProbeInconclusive, probe_effort
from ml_intern.local_models import LOCAL_MODEL_PREFIXES, is_local_model_id


# Suggested models shown by `/model` (not a gate). Users can paste any HF
@@ -44,13 +45,18 @@ def is_valid_model_id(model_id: str) -> bool:
Accepts:
• anthropic/<model>
• openai/<model>
• ollama/<model>, vllm/<model>, llamacpp/<model>, local://<model>
• <org>/<model>[:<tag>] (HF router; tag = provider or policy)
• huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)

Actual availability is verified against the HF router catalog on
switch, and by the provider on the probe's ping call.
"""
if not model_id or "/" not in model_id:
if not model_id:
return False
if is_local_model_id(model_id):
return True
if "/" not in model_id:
return False
head = model_id.split(":", 1)[0]
parts = head.split("/")
@@ -66,7 +72,7 @@ def _print_hf_routing_info(model_id: str, console) -> bool:
Anthropic / OpenAI ids return ``True`` without printing anything —
the probe below covers "does this model exist".
"""
if model_id.startswith(("anthropic/", "openai/")):
if model_id.startswith(("anthropic/", "openai/", *LOCAL_MODEL_PREFIXES)):
return True

from agent.core import hf_router_catalog as cat
@@ -139,7 +145,9 @@ def print_model_listing(config, console) -> None:
console.print(
"\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
"Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.\n"
"Use 'ollama/<model>', 'vllm/<model>', 'llamacpp/<model>', or 'local://<model>' "
"for local OpenAI-compatible endpoints.[/dim]"
)


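
Illustrating the widened `is_valid_model_id` (assuming `is_local_model_id` matches the documented prefixes; the anthropic/ case follows the docstring above):

```python
from agent.core.model_switcher import is_valid_model_id

assert is_valid_model_id("ollama/llama3.1")      # local prefix, accepted outright
assert is_valid_model_id("local://my-model")     # local:// scheme, accepted
assert is_valid_model_id("anthropic/claude-opus-4-6")  # direct API id
assert not is_valid_model_id("")                 # empty id rejected
assert not is_valid_model_id("just-a-name")      # no slash and not local
```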
95 changes: 95 additions & 0 deletions backend/model_catalog.py
@@ -0,0 +1,95 @@
"""Model catalog and validation helpers for agent API routes."""

import os
from typing import Any

from ml_intern.local_models import is_local_model_id


def local_models_enabled() -> bool:
return os.environ.get("ENABLE_LOCAL_MODELS", "false").lower() in {
"1",
"true",
"yes",
"on",
}


def get_available_models() -> list[dict[str, Any]]:
models: list[dict[str, Any]] = [
{
"id": "moonshotai/Kimi-K2.6",
"label": "Kimi K2.6",
"provider": "huggingface",
"tier": "free",
"recommended": True,
},
{
"id": "bedrock/us.anthropic.claude-opus-4-6-v1",
"label": "Claude Opus 4.6",
"provider": "anthropic",
"tier": "pro",
"recommended": True,
},
{
"id": "MiniMaxAI/MiniMax-M2.7",
"label": "MiniMax M2.7",
"provider": "huggingface",
"tier": "free",
},
{
"id": "zai-org/GLM-5.1",
"label": "GLM 5.1",
"provider": "huggingface",
"tier": "free",
},
]

if local_models_enabled():
models.extend(
[
{
"id": "ollama/llama3.1",
"label": "Llama 3.1 (Ollama)",
"provider": "local",
"tier": "free",
},
{
"id": "vllm/Qwen3.5-2B",
"label": "Qwen3.5-2B (vLLM)",
"provider": "local",
"tier": "free",
},
{
"id": "llamacpp/unsloth/Qwen3.5-2B",
"label": "Qwen3.5-2B (llama.cpp)",
"provider": "local",
"tier": "free",
"recommended": True,
},
]
)

return models


def available_model_ids() -> set[str]:
return {m["id"] for m in get_available_models()}


def is_custom_local_model_id(model_id: str) -> bool:
if not local_models_enabled():
return False
if not isinstance(model_id, str):
return False
if not model_id or any(char.isspace() for char in model_id):
return False
return is_local_model_id(model_id)


def is_valid_model_id(model_id: str) -> bool:
return model_id in available_model_ids() or is_custom_local_model_id(model_id)


def is_anthropic_model(model_id: str) -> bool:
return model_id.startswith(("anthropic/", "bedrock/")) and "anthropic" in model_id
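
A few illustrative calls against these helpers (again assuming `is_local_model_id` matches the documented prefixes; the non-Anthropic Bedrock id is a hypothetical example):

```python
import os

from backend.model_catalog import is_anthropic_model, is_custom_local_model_id

# Custom local ids are honoured only when the feature flag is on:
os.environ["ENABLE_LOCAL_MODELS"] = "false"
assert not is_custom_local_model_id("ollama/llama3.1")

os.environ["ENABLE_LOCAL_MODELS"] = "true"
assert is_custom_local_model_id("local://my-model")
assert not is_custom_local_model_id("ollama/has space")  # whitespace rejected

# Bedrock-hosted Anthropic ids count as Anthropic; other Bedrock models do not:
assert is_anthropic_model("bedrock/us.anthropic.claude-opus-4-6-v1")
assert not is_anthropic_model("bedrock/us.amazon.nova-pro-v1")
```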