From 945854ec503b8d70b6a9b8d58500fcdd2f3d4cbb Mon Sep 17 00:00:00 2001
From: Aryan Kumar <156166681+aryan5v@users.noreply.github.com>
Date: Tue, 28 Apr 2026 21:43:15 -0700
Subject: [PATCH] Add TokenRouter model support

---
 README.md                                  |  8 +++++
 agent/core/agent_loop.py                   |  1 +
 agent/core/llm_params.py                   | 41 ++++++++++++++++++++++
 agent/core/model_switcher.py               | 24 ++++++++++++-----
 backend/routes/agent.py                    | 18 +++++++---
 frontend/src/components/Chat/ChatInput.tsx |  7 ++++
 tests/unit/test_llm_params.py              | 37 +++++++++++++++++++
 7 files changed, 122 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 8a6c1ccd..bfecfcb8 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ Create a `.env` file in the project root (or export these in your shell):
 ```bash
 ANTHROPIC_API_KEY= # if using anthropic models
 OPENAI_API_KEY= # if using openai models
+TOKENROUTER_API_KEY= # if using tokenrouter models
 HF_TOKEN=
 GITHUB_TOKEN=
 ```
@@ -52,10 +53,17 @@ ml-intern "fine-tune llama on my dataset"
 ```bash
 ml-intern --model anthropic/claude-opus-4-6 "your prompt"
 ml-intern --model openai/gpt-5.5 "your prompt"
+ml-intern --model tokenrouter/auto:balance "your prompt"
 ml-intern --max-iterations 100 "your prompt"
 ml-intern --no-stream "your prompt"
 ```
 
+TokenRouter models are served through TokenRouter's OpenAI-compatible API and
+use the `tokenrouter/` prefix. ML Intern sends the suffix as the TokenRouter
+model id, so examples include `tokenrouter/auto:balance`,
+`tokenrouter/auto:fast`, and `tokenrouter/openai:gpt-4o`. Set
+`TOKENROUTER_API_BASE` only if you need to override the default
+`https://api.tokenrouter.io/v1` endpoint.
+
 ## Supported Gateways
 
 ML Intern currently supports one-way notification gateways from CLI sessions.
diff --git a/agent/core/agent_loop.py b/agent/core/agent_loop.py
index 8b7a4572..8eb5c2a3 100644
--- a/agent/core/agent_loop.py
+++ b/agent/core/agent_loop.py
@@ -310,6 +310,7 @@ def _friendly_error_message(error: Exception) -> str | None:
         "To fix this, set the API key for your model provider:\n"
         "  • Anthropic: export ANTHROPIC_API_KEY=sk-...\n"
         "  • OpenAI: export OPENAI_API_KEY=sk-...\n"
+        "  • TokenRouter: export TOKENROUTER_API_KEY=tr_...\n"
         "  • HF Router: export HF_TOKEN=hf_...\n\n"
         "You can also add it to a .env file in the project root.\n"
         "To switch models, use the /model command."
diff --git a/agent/core/llm_params.py b/agent/core/llm_params.py
index 880886b3..321cfc62 100644
--- a/agent/core/llm_params.py
+++ b/agent/core/llm_params.py
@@ -5,6 +5,8 @@
 creating circular imports.
 """
 
+import os
+
 from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token
 
 
@@ -13,6 +15,23 @@ def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
     return resolve_hf_router_token(session_hf_token)
 
 
+def _clean_env_value(value: str | None) -> str | None:
+    if value is None:
+        return None
+    return value.replace("\r", "").replace("\n", "").strip() or None
+
+
+def _resolve_tokenrouter_api_key() -> str | None:
+    return _clean_env_value(os.environ.get("TOKENROUTER_API_KEY"))
+
+
+def _resolve_tokenrouter_api_base() -> str:
+    return (
+        _clean_env_value(os.environ.get("TOKENROUTER_API_BASE"))
+        or "https://api.tokenrouter.io/v1"
+    ).rstrip("/")
+
+
 def _patch_litellm_effort_validation() -> None:
     """Neuter LiteLLM 1.83's hardcoded effort-level validation.
 
@@ -72,6 +91,7 @@ def _widened(model: str) -> bool:
 # Effort levels accepted on the wire.
 # Anthropic (4.6+): low | medium | high | xhigh | max (output_config.effort)
 # OpenAI direct: minimal | low | medium | high | xhigh (reasoning_effort top-level)
+# TokenRouter: no effort parameter documented on the OpenAI-compatible chat endpoint
 # HF router: low | medium | high (extra_body.reasoning_effort)
 #
 # We validate *shape* here and let the probe cascade walk down on rejection;
@@ -114,6 +134,15 @@ def _resolve_llm_params(
       • ``openai/`` — ``reasoning_effort`` forwarded as a top-level kwarg
         (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.
 
+      • ``tokenrouter/`` — TokenRouter's OpenAI-compatible endpoint at
+        ``https://api.tokenrouter.io/v1``. The prefix is stripped and the
+        remainder is sent through LiteLLM's OpenAI adapter, so ids like
+        ``tokenrouter/auto:balance`` and ``tokenrouter/openai:gpt-4o`` work.
+        Set ``TOKENROUTER_API_KEY`` for authentication and optionally
+        ``TOKENROUTER_API_BASE`` for a custom compatible endpoint. TokenRouter's
+        chat-completions compatibility docs do not list ``reasoning_effort``, so
+        we omit it instead of forwarding a provider-specific parameter.
+
       • Anything else is treated as a HuggingFace router id. We hit the
         auto-routing OpenAI-compatible endpoint at
         ``https://router.huggingface.co/v1``. The id can be bare or carry an
@@ -180,6 +209,18 @@ def _resolve_llm_params(
         params["reasoning_effort"] = reasoning_effort
         return params
 
+    if model_name.startswith("tokenrouter/"):
+        if reasoning_effort and strict:
+            raise UnsupportedEffortError(
+                f"TokenRouter doesn't accept effort={reasoning_effort!r}"
+            )
+        tokenrouter_model = model_name.removeprefix("tokenrouter/")
+        return {
+            "model": f"openai/{tokenrouter_model}",
+            "api_base": _resolve_tokenrouter_api_base(),
+            "api_key": _resolve_tokenrouter_api_key(),
+        }
+
     hf_model = model_name.removeprefix("huggingface/")
     api_key = _resolve_hf_router_token(session_hf_token)
     params = {
diff --git a/agent/core/model_switcher.py b/agent/core/model_switcher.py
index 63c0f40c..bf2da8d6 100644
--- a/agent/core/model_switcher.py
+++ b/agent/core/model_switcher.py
@@ -19,13 +19,14 @@
 
 # Suggested models shown by `/model` (not a gate). Users can paste any HF
-# model id (e.g. "MiniMaxAI/MiniMax-M2.7") or an `anthropic/` / `openai/`
-# prefix for direct API access. For HF ids, append ":fastest" /
-# ":cheapest" / ":preferred" / ":<provider>" to override the default
-# routing policy (auto = fastest with failover).
+# model id (e.g. "MiniMaxAI/MiniMax-M2.7") or an `anthropic/`, `openai/`,
+# `bedrock/`, or `tokenrouter/` prefix for direct API access. For HF ids,
+# append ":fastest" / ":cheapest" / ":preferred" / ":<provider>" to override
+# the default routing policy (auto = fastest with failover).
 SUGGESTED_MODELS = [
     {"id": "openai/gpt-5.5", "label": "GPT-5.5"},
     {"id": "openai/gpt-5.4", "label": "GPT-5.4"},
+    {"id": "tokenrouter/auto:balance", "label": "TokenRouter Auto Balance"},
     {"id": "anthropic/claude-opus-4-7", "label": "Claude Opus 4.7"},
     {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
     {"id": "bedrock/us.anthropic.claude-opus-4-6-v1", "label": "Claude Opus 4.6 via Bedrock"},
@@ -44,6 +45,8 @@ def is_valid_model_id(model_id: str) -> bool:
     Accepts:
       • anthropic/<model>
       • openai/<model>
+      • bedrock/<model>
+      • tokenrouter/<model>
       • <org>/<model>[:<tag>]              (HF router; tag = provider or policy)
       • huggingface/<org>/<model>[:<tag>]  (same, accepts legacy prefix)
@@ -63,10 +66,10 @@ def _print_hf_routing_info(model_id: str, console) -> bool:
     proceed with the switch, ``False`` to indicate a hard problem the user
     should notice before we fire the effort probe.
 
-    Anthropic / OpenAI ids return ``True`` without printing anything —
-    the probe below covers "does this model exist".
+    Anthropic / OpenAI / Bedrock / TokenRouter ids return ``True`` without
+    printing anything — the probe below covers "does this model exist".
     """
-    if model_id.startswith(("anthropic/", "openai/")):
+    if model_id.startswith(("anthropic/", "openai/", "bedrock/", "tokenrouter/")):
         return True
 
     from agent.core import hf_router_catalog as cat
@@ -139,7 +142,8 @@ def print_model_listing(config, console) -> None:
     console.print(
         "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
         "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
-        "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
+        "Use 'anthropic/<model>', 'openai/<model>', 'bedrock/<model>', or "
+        "'tokenrouter/<model>' for direct API access.[/dim]"
     )
 
 
@@ -149,7 +153,9 @@ def print_invalid_id(arg: str, console) -> None:
         "[dim]Expected:\n"
         "  • <org>/<model>[:tag]  (HF router — paste from huggingface.co)\n"
         "  • anthropic/<model>\n"
-        "  • openai/<model>[/dim]"
+        "  • openai/<model>\n"
+        "  • bedrock/<model>\n"
+        "  • tokenrouter/<model>[/dim]"
     )
diff --git a/backend/routes/agent.py b/backend/routes/agent.py
index 96830568..9d37dba5 100644
--- a/backend/routes/agent.py
+++ b/backend/routes/agent.py
@@ -56,6 +56,12 @@
         "tier": "pro",
         "recommended": True,
     },
+    {
+        "id": "tokenrouter/auto:balance",
+        "label": "TokenRouter Auto Balance",
+        "provider": "tokenrouter",
+        "tier": "external",
+    },
     {
         "id": "MiniMaxAI/MiniMax-M2.7",
         "label": "MiniMax M2.7",
@@ -72,16 +78,18 @@
 
 
 def _is_anthropic_model(model_id: str) -> bool:
-    return "anthropic" in model_id
+    return model_id.startswith("anthropic/") or (
+        model_id.startswith("bedrock/") and "anthropic" in model_id
+    )
 
 
 async def _require_hf_for_anthropic(request: Request, model_id: str) -> None:
     """403 if a non-``huggingface``-org user tries to select an Anthropic model.
 
-    Anthropic models are billed to the Space's ``ANTHROPIC_API_KEY``; every
-    other model in ``AVAILABLE_MODELS`` is routed through HF Router and
-    billed via ``X-HF-Bill-To``. The gate only fires for Anthropic so
-    non-HF users can still freely switch between the free models.
+    Anthropic models are billed to the Space's ``ANTHROPIC_API_KEY``; HF Router
+    models are billed via ``X-HF-Bill-To``, and TokenRouter models use the
+    server's ``TOKENROUTER_API_KEY``. The gate fires only for Anthropic-billed
+    models (direct or via Bedrock), so non-HF users can still switch freely.
 
     Pattern: https://github.com/huggingface/ml-intern/pull/63
     """
diff --git a/frontend/src/components/Chat/ChatInput.tsx b/frontend/src/components/Chat/ChatInput.tsx
index 58e253c1..c4c2e568 100644
--- a/frontend/src/components/Chat/ChatInput.tsx
+++ b/frontend/src/components/Chat/ChatInput.tsx
@@ -42,6 +42,13 @@ const MODEL_OPTIONS: ModelOption[] = [
     avatarUrl: 'https://huggingface.co/api/avatars/Anthropic',
     recommended: true,
   },
+  {
+    id: 'tokenrouter-auto-balance',
+    name: 'TokenRouter Auto',
+    description: 'TokenRouter',
+    modelPath: 'tokenrouter/auto:balance',
+    avatarUrl: 'https://www.google.com/s2/favicons?domain=tokenrouter.io&sz=64',
+  },
   {
     id: 'minimax-m2.7',
     name: 'MiniMax M2.7',
diff --git a/tests/unit/test_llm_params.py b/tests/unit/test_llm_params.py
index 5234461a..ec9cbf6b 100644
--- a/tests/unit/test_llm_params.py
+++ b/tests/unit/test_llm_params.py
@@ -30,6 +30,43 @@ def test_openai_max_effort_is_still_rejected():
         raise AssertionError("Expected UnsupportedEffortError for max effort")
 
 
+def test_tokenrouter_params_use_openai_compatible_endpoint(monkeypatch):
+    monkeypatch.setenv("TOKENROUTER_API_KEY", " tokenrouter-key ")
+    monkeypatch.delenv("TOKENROUTER_API_BASE", raising=False)
+
+    params = _resolve_llm_params("tokenrouter/auto:balance")
+
+    assert params == {
+        "model": "openai/auto:balance",
+        "api_base": "https://api.tokenrouter.io/v1",
+        "api_key": "tokenrouter-key",
+    }
+
+
+def test_tokenrouter_api_base_can_be_overridden(monkeypatch):
+    monkeypatch.setenv("TOKENROUTER_API_KEY", "tokenrouter-key")
+    monkeypatch.setenv("TOKENROUTER_API_BASE", " https://proxy.example.test/v1/ ")
+
+    params = _resolve_llm_params("tokenrouter/openai:gpt-4o")
+
+    assert params["model"] == "openai/openai:gpt-4o"
+    assert params["api_base"] == "https://proxy.example.test/v1"
+    assert params["api_key"] == "tokenrouter-key"
+
+
+def test_tokenrouter_effort_is_rejected_in_strict_probe_mode():
+    try:
+        _resolve_llm_params(
+            "tokenrouter/auto:balance",
+            reasoning_effort="high",
+            strict=True,
+        )
+    except UnsupportedEffortError as exc:
+        assert "TokenRouter doesn't accept effort='high'" in str(exc)
+    else:
+        raise AssertionError("Expected UnsupportedEffortError for TokenRouter effort")
+
+
 def test_hf_router_token_prefers_inference_token(monkeypatch):
     monkeypatch.setenv("INFERENCE_TOKEN", " inference-token ")
     monkeypatch.setenv("HF_TOKEN", "hf-token")
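
---

For reference, a minimal end-to-end sketch (not part of the patch) of how the resolved TokenRouter parameters feed LiteLLM's OpenAI adapter. It assumes `litellm` is installed and `TOKENROUTER_API_KEY` is exported; the model id and prompt are illustrative:

```python
# Sketch: resolve TokenRouter params, then call the endpoint through LiteLLM.
import litellm

from agent.core.llm_params import _resolve_llm_params

# The "tokenrouter/" prefix is stripped; the rest is the TokenRouter model id.
params = _resolve_llm_params("tokenrouter/auto:balance")
# params == {"model": "openai/auto:balance",
#            "api_base": "https://api.tokenrouter.io/v1",
#            "api_key": <cleaned TOKENROUTER_API_KEY>}

response = litellm.completion(
    messages=[{"role": "user", "content": "Say hello."}],
    **params,  # model, api_base, api_key
)
print(response.choices[0].message.content)
```

Note that `reasoning_effort` is intentionally absent from the returned dict: in strict probe mode the resolver raises `UnsupportedEffortError`, and otherwise it silently omits the parameter, matching the behaviour the tests above pin down.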