From 3666d82c715f76f951aa24a1523ed21e69f61f2f Mon Sep 17 00:00:00 2001
From: realityinspector <info@timepointai.com>
Date: Thu, 27 Nov 2025 08:59:11 -0800
Subject: [PATCH 1/4] chore: clean slate for v2.0 refactor

This is a fresh start for TIMEPOINT Flash v2.0.

The v1.0 codebase has been archived:
- Branch: archive/v1-legacy
- Tag: v1.0.0-legacy

v2.0 will implement:
- Clean provider abstraction (Google + OpenRouter)
- Mirascope for unified LLM interface
- Synthetic time system with temporal navigation
- Batteries-included CLI
- Test-driven development
- Production-ready FastAPI server

See REFACTOR.md on the archive/v1-legacy branch for the complete plan.

https://claude.com/claude-code

From d097e0dff426b7985a3158af4124616e49190920 Mon Sep 17 00:00:00 2001
From: realityinspector <info@timepointai.com>
Date: Fri, 13 Mar 2026 10:48:33 -0600
Subject: [PATCH 2/4] security: remove private repo refs and Railway URLs from
 public docs

- Replace Railway auto-domain URLs with public custom domain (flash.timepointai.com) in FIRST-TIME-SETUP.md
- Replace Railway URL placeholders with generic domain examples in DEPLOY.md, IOS_INTEGRATION.md, config.py
- Remove private repo names (timepoint-flash-deploy, timepoint-billing) from DEPLOY.md and IOS_INTEGRATION.md
- Remove private repo names/links from Timepoint Suite table in README.md
---
 README.md                |  8 ++++----
 app/config.py            |  2 +-
 docs/DEPLOY.md           |  8 ++++----
 docs/FIRST-TIME-SETUP.md | 22 +++++++++++-----------
 docs/IOS_INTEGRATION.md  |  4 ++--
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 9206ed3..8789128 100644
--- a/README.md
+++ b/README.md
@@ -307,10 +307,10 @@ Open-source engines for temporal AI. Render the past. Simulate the future. Score
 | **SNAG Bench** | Open Source | timepoint-snag-bench | Quality Certifier — measures Causal Resolution across renderings |
 | **Proteus** | Open Source | proteus | Settlement Layer — prediction markets that validate Rendered Futures |
 | **TDF** | Open Source | timepoint-tdf | Data Format — JSON-LD interchange across all services |
-| **Web App** | Private | timepoint-web-app | Browser client at app.timepointai.com |
-| **iPhone App** | Private | timepoint-iphone-app | iOS client — Synthetic Time Travel on mobile |
-| **Billing** | Private | timepoint-billing | Payment processing — Apple IAP + Stripe |
-| **Landing** | Private | timepoint-landing | Marketing site at timepointai.com |
+| **Web App** | — | — | Browser client at app.timepointai.com |
+| **iPhone App** | — | — | iOS client — Synthetic Time Travel on mobile |
+| **Billing** | — | — | Payment processing — Apple IAP + Stripe |
+| **Landing** | — | — | Marketing site at timepointai.com |
 
 ---
 
diff --git a/app/config.py b/app/config.py
index c36255c..593e57f 100644
--- a/app/config.py
+++ b/app/config.py
@@ -444,7 +444,7 @@ class Settings(BaseSettings):
     )
     CORS_ORIGINS: str = Field(
         default="",
-        description="Comma-separated additional CORS origins (e.g. https://your-app.up.railway.app)",
+        description="Comma-separated additional CORS origins (e.g. https://your-domain.example.com)",
     )
 
     # Share URL
diff --git a/docs/DEPLOY.md b/docs/DEPLOY.md
index aa042ab..81591bf 100644
--- a/docs/DEPLOY.md
+++ b/docs/DEPLOY.md
@@ -63,14 +63,14 @@ Railway auto-detects the `Dockerfile` and deploys with PostgreSQL, health checks
 ### Verify
 
 ```bash
-curl https://your-app.up.railway.app/health
+curl https://your-domain.example.com/health
 # → {"status":"healthy","version":"2.4.0","database":true,"providers":{"google":true,"openrouter":true}}
 ```
 
 ### Generate a Scene
 
 ```bash
-curl -X POST https://your-app.up.railway.app/api/v1/timepoints/generate/sync \
+curl -X POST https://your-domain.example.com/api/v1/timepoints/generate/sync \
   -H "Content-Type: application/json" \
   -d '{"query": "Alan Turing breaks Enigma at Bletchley Park Hut 8, winter 1941", "preset": "balanced", "generate_image": true}'
 ```
@@ -80,7 +80,7 @@ curl -X POST https://your-app.up.railway.app/api/v1/timepoints/generate/sync \
 Run the smoke test workflow manually from GitHub Actions:
 
 ```bash
-gh workflow run smoke.yml -f target_url=https://your-app.up.railway.app
+gh workflow run smoke.yml -f target_url=https://your-domain.example.com
 ```
 
 ---
@@ -150,7 +150,7 @@ Set `CORS_ENABLED=false` when Flash is internal-only and never called from brows
 
 The open-source app ships with `NoOpBilling` — all credit checks pass and access is unlimited. The `BillingProvider` protocol in `app/services/billing.py` provides hooks for custom billing integrations.
 
-The deployed version (`timepoint-flash-deploy`) uses a separate billing microservice (`timepoint-billing`) that handles Apple IAP and Stripe payments as its own Railway service with its own PostgreSQL database. The main app proxies billing requests and exposes an internal credits API for the billing service to grant/spend credits after purchases.
+The deployed version uses a separate billing microservice that handles Apple IAP and Stripe payments as its own service with its own PostgreSQL database. The main app proxies billing requests and exposes an internal credits API for the billing service to grant/spend credits after purchases.
 
 ---
 
diff --git a/docs/FIRST-TIME-SETUP.md b/docs/FIRST-TIME-SETUP.md
index 5444453..b6e6195 100644
--- a/docs/FIRST-TIME-SETUP.md
+++ b/docs/FIRST-TIME-SETUP.md
@@ -2,7 +2,7 @@
 
 You're an AI agent with access to a running TIMEPOINT Flash server. This guide shows you how to use it.
 
-**Base URL:** `https://timepoint-flash-deploy-production.up.railway.app`
+**Base URL:** `https://flash.timepointai.com`
 
 No authentication is required (AUTH_ENABLED=false). All endpoints are open-access.
 
@@ -11,7 +11,7 @@ No authentication is required (AUTH_ENABLED=false). All endpoints are open-acces
 ## 1. Check the Server
 
 ```bash
-curl https://timepoint-flash-deploy-production.up.railway.app/health
+curl https://flash.timepointai.com/health
 ```
 
 Expected:
@@ -30,7 +30,7 @@ The core operation. Give it a historical moment, get back characters, dialog, re
 ### Synchronous (simplest)
 
 ```bash
-curl -X POST https://timepoint-flash-deploy-production.up.railway.app/api/v1/timepoints/generate/sync \
+curl -X POST https://flash.timepointai.com/api/v1/timepoints/generate/sync \
   -H "Content-Type: application/json" \
   -d '{
     "query": "Alan Turing breaks Enigma at Bletchley Park Hut 8, winter 1941",
@@ -44,7 +44,7 @@ This blocks for 30-120 seconds and returns the complete scene.
 ### Streaming (recommended for UIs)
 
 ```bash
-curl -X POST https://timepoint-flash-deploy-production.up.railway.app/api/v1/timepoints/generate/stream \
+curl -X POST https://flash.timepointai.com/api/v1/timepoints/generate/stream \
   -H "Content-Type: application/json" \
   -d '{
     "query": "Oppenheimer watches the Trinity test, 5:29 AM July 16 1945",
@@ -64,7 +64,7 @@ data: {"event": "done", "progress": 100, "data": {"timepoint_id": "abc123", ...}
 ### Background (fire and forget)
 
 ```bash
-curl -X POST https://timepoint-flash-deploy-production.up.railway.app/api/v1/timepoints/generate \
+curl -X POST https://flash.timepointai.com/api/v1/timepoints/generate \
   -H "Content-Type: application/json" \
   -d '{"query": "Gavrilo Princip at Schiller Deli Sarajevo June 28 1914", "preset": "balanced"}'
 ```
@@ -96,7 +96,7 @@ Returns immediately with a timepoint ID. Poll `GET /api/v1/timepoints/{id}` unti
 ## 3. Retrieve a Scene
 
 ```bash
-curl "https://timepoint-flash-deploy-production.up.railway.app/api/v1/timepoints/{id}?full=true&include_image=true"
+curl "https://flash.timepointai.com/api/v1/timepoints/{id}?full=true&include_image=true"
 ```
 
 - `full=true` — include scene, characters, dialog, relationships
@@ -129,7 +129,7 @@ When `AUTH_ENABLED=true`, interaction endpoints require a Bearer JWT and deduct
 After generating a scene, chat with any character in it:
 
 ```bash
-curl -X POST https://timepoint-flash-deploy-production.up.railway.app/api/v1/interactions/{timepoint_id}/chat \
+curl -X POST https://flash.timepointai.com/api/v1/interactions/{timepoint_id}/chat \
   -H "Content-Type: application/json" \
   -d '{
     "character": "Oppenheimer",
@@ -171,14 +171,14 @@ Jump forward or backward from any scene. The new scene preserves characters and
 
 **Jump forward:**
 ```bash
-curl -X POST https://timepoint-flash-deploy-production.up.railway.app/api/v1/temporal/{timepoint_id}/next \
+curl -X POST https://flash.timepointai.com/api/v1/temporal/{timepoint_id}/next \
   -H "Content-Type: application/json" \
   -d '{"units": 1, "unit": "hour"}'
 ```
 
 **Jump backward:**
 ```bash
-curl -X POST https://timepoint-flash-deploy-production.up.railway.app/api/v1/temporal/{timepoint_id}/prior \
+curl -X POST https://flash.timepointai.com/api/v1/temporal/{timepoint_id}/prior \
   -H "Content-Type: application/json" \
   -d '{"units": 30, "unit": "minute"}'
 ```
@@ -205,7 +205,7 @@ GET /api/v1/timepoints?visibility=public
 GET /api/v1/timepoints?visibility=private   # owner only (requires auth)
 
 # Set a scene to private
-curl -X PATCH https://timepoint-flash-deploy-production.up.railway.app/api/v1/timepoints/{id}/visibility \
+curl -X PATCH https://flash.timepointai.com/api/v1/timepoints/{id}/visibility \
   -H "Content-Type: application/json" \
   -d '{"visibility": "private"}'
 
@@ -269,7 +269,7 @@ All errors return `{"detail": "Error message"}`.
 
 For complete endpoint documentation including auth, credits, and eval: [API.md](API.md)
 
-OpenAPI schema available at: `https://timepoint-flash-deploy-production.up.railway.app/openapi.json`
+OpenAPI schema available at: `https://flash.timepointai.com/openapi.json`
 
 ---
 
diff --git a/docs/IOS_INTEGRATION.md b/docs/IOS_INTEGRATION.md
index dbd68cb..89d1183 100644
--- a/docs/IOS_INTEGRATION.md
+++ b/docs/IOS_INTEGRATION.md
@@ -134,7 +134,7 @@ When receiving a 402:
 
 ## 4. Endpoint Map for iOS MVP
 
-All endpoints are under `/api/v1`. Prefix with your Railway base URL (e.g. `https://your-app.up.railway.app`).
+All endpoints are under `/api/v1`. Prefix with your deployment base URL (e.g. `https://your-domain.example.com`).
 
 ### Auth (requires `AUTH_ENABLED=true`)
 
@@ -362,7 +362,7 @@ The admin grant endpoint (`POST /credits/admin/grant`) now accepts an optional `
 
 ## 13. Billing Hooks
 
-The open-source app includes a `BillingProvider` protocol (`app/services/billing.py`) with a default `NoOpBilling` implementation (unlimited access). The deployed version (`timepoint-flash-deploy`) uses a separate billing microservice that handles Apple IAP and Stripe payments, proxying billing requests through the main app.
+The open-source app includes a `BillingProvider` protocol (`app/services/billing.py`) with a default `NoOpBilling` implementation (unlimited access). The deployed version uses a separate billing microservice that handles Apple IAP and Stripe payments, proxying billing requests through the main app.
 
 The billing hooks provide:
 - `check_credits(user_id, cost)` — called before credit-consuming operations

From b0cf30d919d8a216d950271874af09a9189bd17e Mon Sep 17 00:00:00 2001
From: realityinspector <info@timepointai.com>
Date: Sat, 14 Mar 2026 12:39:35 -0600
Subject: [PATCH 3/4] feat: add NVIDIA Nemotron and NousResearch Hermes model
 families (#19)

---
 app/api/v1/models.py           |  30 ++++++
 app/api/v1/timepoints.py       |  10 +-
 app/config.py                  |  17 ++++
 app/core/model_capabilities.py | 167 +++++++++++++++++++++++++++++++++
 app/core/model_policy.py       |  10 +-
 5 files changed, 227 insertions(+), 7 deletions(-)

diff --git a/app/api/v1/models.py b/app/api/v1/models.py
index 4ae5ec7..14480fa 100644
--- a/app/api/v1/models.py
+++ b/app/api/v1/models.py
@@ -141,6 +141,36 @@ def get_configured_models() -> list[ModelInfo]:
                 capabilities=["image_generation"],
                 pricing={"prompt": 0.00012, "completion": 0.0},
             ),
+            # NVIDIA Nemotron
+            ModelInfo(
+                id="nvidia/llama-3.3-nemotron-super-49b-v1.5",
+                name="Nemotron Super 49B v1.5",
+                provider="openrouter",
+                capabilities=["text"],
+                context_length=131072,
+            ),
+            ModelInfo(
+                id="nvidia/llama-3.1-nemotron-70b-instruct",
+                name="Nemotron 70B Instruct",
+                provider="openrouter",
+                capabilities=["text"],
+                context_length=131072,
+            ),
+            # NousResearch Hermes
+            ModelInfo(
+                id="nousresearch/hermes-4-70b",
+                name="Hermes 4 70B",
+                provider="openrouter",
+                capabilities=["text"],
+                context_length=131072,
+            ),
+            ModelInfo(
+                id="nousresearch/hermes-3-llama-3.1-405b",
+                name="Hermes 3 405B",
+                provider="openrouter",
+                capabilities=["text"],
+                context_length=131072,
+            ),
         ])
 
     return models
diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py
index fd40669..db9d2fb 100644
--- a/app/api/v1/timepoints.py
+++ b/app/api/v1/timepoints.py
@@ -234,10 +234,14 @@ def _get_permissive_text_model() -> str:
         preference = [
             "meta-llama/llama-4-scout-17b-16e-instruct",
             "meta-llama/llama-4-maverick-17b-128e-instruct",
-            "deepseek/deepseek-chat-v3-0324",       # Fast chat model
-            "qwen/qwen3-30b-a3b",                   # Fast MoE model
+            "nvidia/llama-3.3-nemotron-super-49b-v1.5",  # Nemotron Super 49B, dense (NAS-derived from Llama 3.3 70B)
+            "nousresearch/hermes-4-70b",             # Hermes 4 70B, strong reasoning
+            "deepseek/deepseek-chat-v3-0324",        # Fast chat model
+            "qwen/qwen3-30b-a3b",                    # Fast MoE model
+            "nvidia/nemotron-3-nano-30b-a3b",        # Nemotron Nano, very fast
+            "nousresearch/hermes-3-llama-3.1-70b",   # Hermes 3 70B fallback
             "mistralai/mistral-small-3.2-24b-instruct",
-            "qwen/qwen3-235b-a22b",                 # Large but non-thinking
+            "qwen/qwen3-235b-a22b",                  # Large but non-thinking
             "deepseek/deepseek-r1-0528",             # Thinking model — slow, last resort
         ]
         for model_id in preference:
diff --git a/app/config.py b/app/config.py
index 593e57f..b05f1d5 100644
--- a/app/config.py
+++ b/app/config.py
@@ -104,6 +104,23 @@ class VerifiedModels:
         "google/gemini-2.0-flash-001",        # Fast, handles JSON well
         "google/gemini-2.0-flash-001:free",   # Free tier (rate limited)
         "google/gemini-3-flash-preview",      # Latest thinking model, agentic workflows
+        # NVIDIA Nemotron family
+        "nvidia/llama-3.1-nemotron-70b-instruct",
+        "nvidia/llama-3.3-nemotron-super-49b-v1.5",
+        "nvidia/nemotron-3-nano-30b-a3b",
+        "nvidia/nemotron-3-nano-30b-a3b:free",
+        "nvidia/nemotron-3-super-120b-a12b:free",
+        "nvidia/nemotron-nano-12b-v2-vl",
+        "nvidia/nemotron-nano-12b-v2-vl:free",
+        "nvidia/nemotron-nano-9b-v2",
+        "nvidia/nemotron-nano-9b-v2:free",
+        # NousResearch Hermes family
+        "nousresearch/hermes-2-pro-llama-3-8b",
+        "nousresearch/hermes-3-llama-3.1-405b",
+        "nousresearch/hermes-3-llama-3.1-405b:free",
+        "nousresearch/hermes-3-llama-3.1-70b",
+        "nousresearch/hermes-4-405b",
+        "nousresearch/hermes-4-70b",
     ]
 
     # Fallback chains - ordered by preference
diff --git a/app/core/model_capabilities.py b/app/core/model_capabilities.py
index 6719121..628cb69 100644
--- a/app/core/model_capabilities.py
+++ b/app/core/model_capabilities.py
@@ -404,6 +404,173 @@ class TextModelConfig:
         max_output_tokens=16384,
         notes="GPT-4o Mini via OpenRouter",
     ),
+    # NVIDIA Nemotron family (via OpenRouter)
+    "nvidia/llama-3.1-nemotron-70b-instruct": TextModelConfig(
+        model_id="nvidia/llama-3.1-nemotron-70b-instruct",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron 70B instruct, strong reasoning",
+    ),
+    "nvidia/llama-3.3-nemotron-super-49b-v1.5": TextModelConfig(
+        model_id="nvidia/llama-3.3-nemotron-super-49b-v1.5",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron Super 49B v1.5, efficient MoE",
+    ),
+    "nvidia/nemotron-3-nano-30b-a3b": TextModelConfig(
+        model_id="nvidia/nemotron-3-nano-30b-a3b",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron 3 Nano 30B, fast MoE (3B active)",
+    ),
+    "nvidia/nemotron-3-nano-30b-a3b:free": TextModelConfig(
+        model_id="nvidia/nemotron-3-nano-30b-a3b:free",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron 3 Nano 30B free tier",
+    ),
+    "nvidia/nemotron-3-super-120b-a12b:free": TextModelConfig(
+        model_id="nvidia/nemotron-3-super-120b-a12b:free",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron 3 Super 120B free tier, large MoE (12B active)",
+    ),
+    "nvidia/nemotron-nano-12b-v2-vl": TextModelConfig(
+        model_id="nvidia/nemotron-nano-12b-v2-vl",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron Nano 12B v2 with vision",
+    ),
+    "nvidia/nemotron-nano-12b-v2-vl:free": TextModelConfig(
+        model_id="nvidia/nemotron-nano-12b-v2-vl:free",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron Nano 12B v2 with vision, free tier",
+    ),
+    "nvidia/nemotron-nano-9b-v2": TextModelConfig(
+        model_id="nvidia/nemotron-nano-9b-v2",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron Nano 9B v2, compact and fast",
+    ),
+    "nvidia/nemotron-nano-9b-v2:free": TextModelConfig(
+        model_id="nvidia/nemotron-nano-9b-v2:free",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Nemotron Nano 9B v2 free tier",
+    ),
+    # NousResearch Hermes family (via OpenRouter)
+    "nousresearch/hermes-2-pro-llama-3-8b": TextModelConfig(
+        model_id="nousresearch/hermes-2-pro-llama-3-8b",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Hermes 2 Pro 8B, compact function-calling model",
+    ),
+    "nousresearch/hermes-3-llama-3.1-405b": TextModelConfig(
+        model_id="nousresearch/hermes-3-llama-3.1-405b",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Hermes 3 405B, flagship open-weight model",
+    ),
+    "nousresearch/hermes-3-llama-3.1-405b:free": TextModelConfig(
+        model_id="nousresearch/hermes-3-llama-3.1-405b:free",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Hermes 3 405B free tier",
+    ),
+    "nousresearch/hermes-3-llama-3.1-70b": TextModelConfig(
+        model_id="nousresearch/hermes-3-llama-3.1-70b",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Hermes 3 70B, strong general-purpose model",
+    ),
+    "nousresearch/hermes-4-405b": TextModelConfig(
+        model_id="nousresearch/hermes-4-405b",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Hermes 4 405B, latest flagship",
+    ),
+    "nousresearch/hermes-4-70b": TextModelConfig(
+        model_id="nousresearch/hermes-4-70b",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=8192,
+        notes="Hermes 4 70B, strong reasoning",
+    ),
 }
 
 # Default config for unknown models (conservative - assume JSON mode works)
diff --git a/app/core/model_policy.py b/app/core/model_policy.py
index ff8145c..74679ca 100644
--- a/app/core/model_policy.py
+++ b/app/core/model_policy.py
@@ -13,7 +13,8 @@
     "microsoft/",           # Phi family
     "google/gemma",         # Gemma open-weight
     "allenai/",
-    "nvidia/",
+    "nvidia/",              # Nemotron family
+    "nousresearch/",        # Hermes family (open-weight)
     "black-forest-labs/",   # FLUX open-weight image models
 )
 
@@ -21,7 +22,7 @@
 GOOGLE_MODEL_PREFIXES = ("gemini", "imagen", "flux-schnell")
 
 # Prefixes routed through OpenRouter (may be restricted or permissive)
-OPENROUTER_PREFIXES = ("meta-llama/", "anthropic/", "mistralai/", "openai/", "deepseek/", "qwen/", "microsoft/", "black-forest-labs/")
+OPENROUTER_PREFIXES = ("meta-llama/", "anthropic/", "mistralai/", "openai/", "deepseek/", "qwen/", "microsoft/", "nvidia/", "nousresearch/", "black-forest-labs/")
 
 
 def derive_model_provider(model_id: str | None) -> str:
@@ -47,8 +48,9 @@ def is_model_permissive(model_id: str | None) -> bool:
 def derive_model_permissiveness(model_id: str | None) -> str:
     """Derive distillation licensing permissiveness from a model ID.
 
-    Open-weight models (Llama, DeepSeek, Qwen, Mistral, Phi, Gemma) are
-    'permissive' — safe for distillation and derivative works.
+    Open-weight models (Llama, DeepSeek, Qwen, Mistral, Phi, Gemma,
+    Nemotron, Hermes) are 'permissive' — safe for distillation and
+    derivative works.
     Frontier models (Google Gemini, Anthropic, OpenAI) are 'restricted'.
     """
     if not model_id:

From 0390230ad92a7d879cca5a19f7ffe562e29bfaeb Mon Sep 17 00:00:00 2001
From: realityinspector <info@allonething.xyz>
Date: Sun, 15 Mar 2026 20:37:30 -0600
Subject: [PATCH 4/4] feat: add free distillable mode with Hunter Alpha, Healer
 Alpha, Nemotron Super

---
 app/config.py                  | 18 ++++++++++++++++++
 app/core/llm_router.py         | 11 ++++++++++-
 app/core/model_capabilities.py | 23 +++++++++++++++++++++++
 app/core/model_policy.py       |  3 ++-
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/app/config.py b/app/config.py
index b05f1d5..27c289a 100644
--- a/app/config.py
+++ b/app/config.py
@@ -54,12 +54,14 @@ class QualityPreset(str, Enum):
     - HYPER: Fastest speed with Gemini 2.0 Flash via OpenRouter
     - BALANCED: Default balance of quality and speed
     - GEMINI3: Latest Gemini 3 Flash Preview via OpenRouter (thinking model)
+    - FREE_DISTILLABLE: Free distillable models — $0 cost, outputs usable for training/distillation
     """
 
     HD = "hd"
     HYPER = "hyper"
     BALANCED = "balanced"
     GEMINI3 = "gemini3"
+    FREE_DISTILLABLE = "free_distillable"
 
 
 class Environment(str, Enum):
@@ -121,6 +123,9 @@ class VerifiedModels:
         "nousresearch/hermes-3-llama-3.1-70b",
         "nousresearch/hermes-4-405b",
         "nousresearch/hermes-4-70b",
+        # OpenRouter free distillable models
+        "openrouter/hunter-alpha",
+        "openrouter/healer-alpha",
     ]
 
     # Fallback chains - ordered by preference
@@ -245,6 +250,18 @@ def is_verified_or_available(cls, model: str, provider: "ProviderType") -> bool:
         "thinking_level": "medium",  # Gemini 3 supports configurable thinking
         "image_supported": True,
     },
+    QualityPreset.FREE_DISTILLABLE: {
+        "name": "Free Distillable",
+        "description": "Free models with distillation rights — $0 cost, text-only (no image gen)",
+        "text_model": "openrouter/hunter-alpha",
+        "judge_model": "openrouter/healer-alpha",
+        "image_model": None,  # No free distillable image models available yet
+        "image_provider": None,
+        "text_provider": ProviderType.OPENROUTER,
+        "max_tokens": 4096,
+        "thinking_level": None,
+        "image_supported": False,  # Text-only mode
+    },
 }
 
 
@@ -255,6 +272,7 @@ def is_verified_or_available(cls, model: str, provider: "ProviderType") -> bool:
     QualityPreset.BALANCED: ParallelismMode.NORMAL, # Default behavior
     QualityPreset.HYPER: ParallelismMode.MAX,       # Speed focus, maximum parallelism
     QualityPreset.GEMINI3: ParallelismMode.AGGRESSIVE,  # Thinking model, moderate parallelism
+    QualityPreset.FREE_DISTILLABLE: ParallelismMode.SEQUENTIAL,  # Free models need sequential execution
 }
 
 # Provider rate limits (requests per minute and safe concurrent calls)
diff --git a/app/core/llm_router.py b/app/core/llm_router.py
index 9ad4f2c..5c52d8e 100644
--- a/app/core/llm_router.py
+++ b/app/core/llm_router.py
@@ -134,6 +134,13 @@ class ModelTier(str, Enum):
 }
 
 
+# Known free distillable models (free despite having no ":free" suffix); keep IDs lowercase — is_free_model() matches against the lowercased model ID
+FREE_DISTILLABLE_MODELS = {
+    "openrouter/hunter-alpha",
+    "openrouter/healer-alpha",
+}
+
+
 def is_free_model(model_id: str) -> bool:
     """Check if a model is a free tier model on OpenRouter.
 
@@ -146,7 +153,9 @@ def is_free_model(model_id: str) -> bool:
     if not model_id:
         return False
     model_lower = model_id.lower()
-    return ":free" in model_lower or "/free" in model_lower
+    if ":free" in model_lower or "/free" in model_lower:
+        return True
+    return model_lower in FREE_DISTILLABLE_MODELS
 
 
 class LLMRouter:
diff --git a/app/core/model_capabilities.py b/app/core/model_capabilities.py
index 628cb69..3fb1cc2 100644
--- a/app/core/model_capabilities.py
+++ b/app/core/model_capabilities.py
@@ -571,6 +571,29 @@ class TextModelConfig:
         max_output_tokens=8192,
         notes="Hermes 4 70B, strong reasoning",
     ),
+    # OpenRouter free distillable models
+    "openrouter/hunter-alpha": TextModelConfig(
+        model_id="openrouter/hunter-alpha",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=32000,
+        notes="Free distillable, 1M context, text+image input",
+    ),
+    "openrouter/healer-alpha": TextModelConfig(
+        model_id="openrouter/healer-alpha",
+        provider="openrouter",
+        supports_json_schema=False,
+        supports_json_mode=True,
+        supports_function_calling=True,
+        supports_streaming=True,
+        supports_extended_thinking=False,
+        max_output_tokens=32000,
+        notes="Free distillable, 262K context, multimodal input",
+    ),
 }
 
 # Default config for unknown models (conservative - assume JSON mode works)
diff --git a/app/core/model_policy.py b/app/core/model_policy.py
index 74679ca..4135983 100644
--- a/app/core/model_policy.py
+++ b/app/core/model_policy.py
@@ -16,13 +16,14 @@
     "nvidia/",              # Nemotron family
     "nousresearch/",        # Hermes family (open-weight)
     "black-forest-labs/",   # FLUX open-weight image models
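+    "openrouter/",          # OpenRouter free distillable models (Hunter, Healer). NOTE(review): prefix presumably also matches every other openrouter/* ID (e.g. openrouter/auto) — confirm that is intended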
 )
 
 # Google-native model prefixes (always restricted)
 GOOGLE_MODEL_PREFIXES = ("gemini", "imagen", "flux-schnell")
 
 # Prefixes routed through OpenRouter (may be restricted or permissive)
-OPENROUTER_PREFIXES = ("meta-llama/", "anthropic/", "mistralai/", "openai/", "deepseek/", "qwen/", "microsoft/", "nvidia/", "nousresearch/", "black-forest-labs/")
+OPENROUTER_PREFIXES = ("meta-llama/", "anthropic/", "mistralai/", "openai/", "deepseek/", "qwen/", "microsoft/", "nvidia/", "nousresearch/", "black-forest-labs/", "openrouter/")
 
 
 def derive_model_provider(model_id: str | None) -> str: