From a096eefc652bb3e6953981a99dbbf66fd8ab3e39 Mon Sep 17 00:00:00 2001
From: realityinspector <mcdonald.sean@gmail.com>
Date: Wed, 11 Mar 2026 19:00:37 -0600
Subject: [PATCH 1/3] fix: dramatically reduce permissive mode latency (10min
 -> ~2min)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Default max_tokens=2048 when model_policy=permissive and no preset
- Use batch dialog (1 LLM call) instead of sequential (7 calls)
- Skip critique loop in permissive mode (saves 1-8 calls)
- Block Google fallback in permissive mode (correctness: stay Google-free)
- Reorder model preference to prioritize fast non-thinking models
  (DeepSeek R1 moved to last resort — its 30-60s/call caused timeouts)
---
 app/api/v1/timepoints.py | 12 +++++-----
 app/core/llm_router.py   | 27 +++++++++++++++++++++--
 app/core/pipeline.py     | 47 ++++++++++++++++++++++++++++++++++++++--
 3 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py
index 424b7ca..59bbe40 100644
--- a/app/api/v1/timepoints.py
+++ b/app/api/v1/timepoints.py
@@ -226,15 +226,17 @@ def _get_permissive_text_model() -> str:
         if registry.model_count == 0:
             return _DEFAULT_PERMISSIVE_TEXT_MODEL
 
-        # Walk preference list; return first that's in the live registry
+        # Walk preference list; return first that's in the live registry.
+        # Prioritize fast non-thinking models — DeepSeek R1 is a thinking
+        # model that takes 30-60s per call and causes pipeline timeouts.
         preference = [
             "meta-llama/llama-4-scout-17b-16e-instruct",
             "meta-llama/llama-4-maverick-17b-128e-instruct",
-            "deepseek/deepseek-r1-0528",
-            "deepseek/deepseek-chat-v3-0324",
-            "qwen/qwen3-235b-a22b",
-            "qwen/qwen3-30b-a3b",
+            "deepseek/deepseek-chat-v3-0324",       # Fast chat model
+            "qwen/qwen3-30b-a3b",                   # Fast MoE model
             "mistralai/mistral-small-3.2-24b-instruct",
+            "qwen/qwen3-235b-a22b",                 # Large but non-thinking
+            "deepseek/deepseek-r1-0528",             # Thinking model — slow, last resort
         ]
         for model_id in preference:
             if registry.is_model_available(model_id):
diff --git a/app/core/llm_router.py b/app/core/llm_router.py
index 686ea33..3bb5ac7 100644
--- a/app/core/llm_router.py
+++ b/app/core/llm_router.py
@@ -195,6 +195,7 @@ def __init__(
         preset: QualityPreset | None = None,
         text_model: str | None = None,
         image_model: str | None = None,
+        model_policy: str | None = None,
     ) -> None:
         """Initialize LLM router.
 
@@ -203,12 +204,14 @@ def __init__(
             preset: Quality preset (HD, HYPER, BALANCED). Overrides config models.
             text_model: Custom text model override (overrides preset).
             image_model: Custom image model override (overrides preset).
+            model_policy: Model policy (e.g. "permissive" blocks Google fallback).
         """
         settings = get_settings()
         self.preset = preset
         self._preset_config = PRESET_CONFIGS.get(preset) if preset else None
         self._custom_text_model = text_model
         self._custom_image_model = image_model
+        self._model_policy = model_policy
 
         # Build config from settings if not provided
         if config is None:
@@ -624,7 +627,15 @@ async def call(
                     logger.warning(f"Paid model fallback also failed: {e2}")
 
             # Try Google provider as ultimate fallback using verified model
-            if ProviderType.GOOGLE in self.providers and self.config.primary != ProviderType.GOOGLE:
+            # (blocked in permissive mode — must stay Google-free)
+            is_permissive = bool(
+                self._model_policy and self._model_policy.lower() == "permissive"
+            )
+            if (
+                ProviderType.GOOGLE in self.providers
+                and self.config.primary != ProviderType.GOOGLE
+                and not is_permissive
+            ):
                 logger.info("Falling back to Google provider with verified model")
                 try:
                     provider = self._get_provider(ProviderType.GOOGLE)
@@ -635,6 +646,8 @@ async def call(
                     )
                 except ProviderError as e3:
                     logger.warning(f"Google provider fallback failed: {e3}")
+            elif is_permissive:
+                logger.info("Skipping Google fallback: model_policy=permissive")
 
             # All fallbacks exhausted
             raise ProviderError(
@@ -749,7 +762,15 @@ async def call_structured(
                     logger.warning(f"Paid model fallback also failed: {e2}")
 
             # Try Google provider as ultimate fallback using verified model
-            if ProviderType.GOOGLE in self.providers and self.config.primary != ProviderType.GOOGLE:
+            # (blocked in permissive mode — must stay Google-free)
+            is_permissive = bool(
+                self._model_policy and self._model_policy.lower() == "permissive"
+            )
+            if (
+                ProviderType.GOOGLE in self.providers
+                and self.config.primary != ProviderType.GOOGLE
+                and not is_permissive
+            ):
                 logger.info("Falling back to Google provider with verified model")
                 try:
                     provider = self._get_provider(ProviderType.GOOGLE)
@@ -761,6 +782,8 @@ async def call_structured(
                     )
                 except ProviderError as e3:
                     logger.warning(f"Google provider fallback failed: {e3}")
+            elif is_permissive:
+                logger.info("Skipping Google fallback: model_policy=permissive")
 
             # All fallbacks exhausted
             raise ProviderError(
diff --git a/app/core/pipeline.py b/app/core/pipeline.py
index f2de686..e8fa6ae 100644
--- a/app/core/pipeline.py
+++ b/app/core/pipeline.py
@@ -291,7 +291,20 @@ def __init__(
         self._text_model = text_model
         self._image_model = image_model
         self._model_policy = model_policy
-        self._llm_params: dict[str, Any] = llm_params or {}
+
+        # Build effective llm_params: apply permissive speed defaults
+        # when no preset is set and caller hasn't specified max_tokens.
+        effective_params = dict(llm_params or {})
+        if (
+            model_policy
+            and model_policy.lower() == "permissive"
+            and preset is None
+            and "max_tokens" not in effective_params
+        ):
+            effective_params["max_tokens"] = 2048
+            logger.info("model_policy=permissive: defaulting max_tokens=2048 for speed")
+
+        self._llm_params: dict[str, Any] = effective_params
         self._max_parallelism_override = max_parallelism
         self._max_parallelism: int | None = None  # Set during execution planning
         self._semaphore: asyncio.Semaphore | None = None
@@ -324,6 +337,7 @@ def router(self) -> LLMRouter:
                 preset=self._preset,
                 text_model=self._text_model,
                 image_model=self._image_model,
+                model_policy=self._model_policy,
             )
         return self._router
 
@@ -341,7 +355,15 @@ def _init_agents(self) -> None:
         self._char_id_agent = CharacterIdentificationAgent(router=router)
         self._char_bio_agent = CharacterBioAgent(router=router)
         self._moment_agent = MomentAgent(router=router)
-        self._dialog_agent = DialogAgent(router=router)
+        # Permissive mode: use batch dialog (1 LLM call) instead of
+        # sequential roleplay (7 calls) to cut latency dramatically.
+        is_permissive = bool(
+            self._model_policy and self._model_policy.lower() == "permissive"
+        )
+        self._dialog_agent = DialogAgent(
+            router=router,
+            use_sequential=not is_permissive,
+        )
         self._camera_agent = CameraAgent(router=router)
         self._graph_agent = GraphAgent(router=router)
         self._image_prompt_agent = ImagePromptAgent(router=router)
@@ -1390,6 +1412,27 @@ async def _step_dialog(self, state: PipelineState) -> PipelineState:
             state.dialog_data = result.content
 
             # === CRITIQUE LOOP (one pass max) ===
+            # Skip critique in permissive mode — saves 1-8 LLM calls and the
+            # batch dialog output is already quality-constrained by the prompt.
+            is_permissive = bool(
+                self._model_policy and self._model_policy.lower() == "permissive"
+            )
+            if is_permissive:
+                logger.info("Skipping dialog critique: model_policy=permissive (speed mode)")
+                state.step_results.append(
+                    StepResult(
+                        step=step,
+                        success=result.success,
+                        data=state.dialog_data,
+                        error=result.error,
+                        latency_ms=result.latency_ms,
+                        model_used=result.model_used,
+                    )
+                )
+                if state.dialog_data:
+                    logger.debug(f"Dialog: {len(state.dialog_data.lines)} lines")
+                return state
+
             critique_input = CritiqueInput(
                 step_name="dialog",
                 output_json=result.content.model_dump_json(),

From 8c69f5de4065bc9ffdda5d25d65393767c7a3a7c Mon Sep 17 00:00:00 2001
From: realityinspector <mcdonald.sean@gmail.com>
Date: Wed, 11 Mar 2026 19:26:22 -0600
Subject: [PATCH 2/3] refactor: remove Pollinations; route images via Google
 native or OpenRouter

Remove Pollinations.ai as image provider. All image generation now
routes through OpenRouter or Google native. Permissive mode uses
OpenRouter for images (Google-free). Simplifies fallback chain from
3-tier (Google -> OpenRouter -> Pollinations) to 2-tier (Google ->
OpenRouter).
---
 app/api/v1/timepoints.py         |  20 +++--
 app/core/llm_router.py           | 127 +++++--------------------------
 app/core/model_policy.py         |   4 -
 docs/API.md                      |  20 +++--
 docs/DOWNSTREAM_MODEL_CONTROL.md |   4 +-
 5 files changed, 42 insertions(+), 133 deletions(-)

diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py
index 59bbe40..5823c23 100644
--- a/app/api/v1/timepoints.py
+++ b/app/api/v1/timepoints.py
@@ -173,8 +173,12 @@ class GenerateRequest(BaseModel):
     )
     image_model: str | None = Field(
         default=None,
-        description="Custom image model override (e.g., 'google/imagen-3'). Overrides preset.",
-        examples=["google/imagen-3", "black-forest-labs/flux-1.1-pro"],
+        description=(
+            "Image model ID. OpenRouter format ('org/model') or Google native "
+            "(gemini-2.5-flash-image, gemini-3-pro-image-preview)."
+        ),
+        examples=["gemini-2.5-flash-image", "gemini-3-pro-image-preview",
+                   "google/gemini-2.5-flash-image-preview"],
     )
     write_blob: bool = Field(
         default=False,
@@ -210,12 +214,10 @@ class GenerateRequest(BaseModel):
     )
 
 
-# Default permissive models — used when model_policy="permissive" and no
-# explicit text_model/image_model is provided.  These must be open-weight
-# models available on OpenRouter (or Pollinations for images).
+# Default permissive text model — used when model_policy="permissive" and no
+# explicit text_model is provided.  Must be an open-weight model on OpenRouter.
+# Image model is resolved at runtime via get_image_fallback_model().
 _DEFAULT_PERMISSIVE_TEXT_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
-_DEFAULT_PERMISSIVE_IMAGE_MODEL = "pollinations"  # Free, open, always available
-
 
 def _get_permissive_text_model() -> str:
     """Pick the best available permissive text model from the registry."""
@@ -267,7 +269,9 @@ def resolve_model_policy(
             text_model = _get_permissive_text_model()
             logger.info("model_policy=permissive → text_model=%s", text_model)
         if not image_model:
-            image_model = _DEFAULT_PERMISSIVE_IMAGE_MODEL
+            # Use the best available OpenRouter image model (no Pollinations)
+            from app.core.llm_router import get_image_fallback_model
+            image_model = get_image_fallback_model()
             logger.info("model_policy=permissive → image_model=%s", image_model)
 
     return text_model, image_model
diff --git a/app/core/llm_router.py b/app/core/llm_router.py
index 3bb5ac7..64dcb7f 100644
--- a/app/core/llm_router.py
+++ b/app/core/llm_router.py
@@ -27,15 +27,11 @@
 """
 
 import asyncio
-import base64
 import logging
-import time
 from collections.abc import AsyncIterator
 from enum import Enum
 from typing import Any, TypeVar
-from urllib.parse import quote
 
-import httpx
 from pydantic import BaseModel
 
 from app.config import (
@@ -97,12 +93,6 @@ def get_image_fallback_model() -> str:
         pass
     return _IMAGE_FALLBACK_DEFAULT
 
-# Pollinations.ai - Ultimate free fallback for image generation
-# No API key required, always available, decent quality
-# NOTE: URL changed from image.pollinations.ai to gen.pollinations.ai in early 2026
-POLLINATIONS_URL = "https://gen.pollinations.ai/image/{prompt}"
-POLLINATIONS_TIMEOUT = 60.0  # Image generation can take time
-
 # Rate limit retry settings
 MAX_RETRIES = 5
 INITIAL_BACKOFF = 2.0  # seconds
@@ -809,73 +799,6 @@ async def call_structured(
 
             raise
 
-    async def _generate_image_pollinations(
-        self,
-        prompt: str,
-        **kwargs: Any,
-    ) -> LLMResponse[str]:
-        """Generate image using Pollinations.ai (free, no API key required).
-
-        This is the ultimate fallback for image generation. Pollinations.ai
-        provides free image generation with no API key, no rate limits,
-        and decent quality using Stable Diffusion models.
-
-        Args:
-            prompt: The image generation prompt.
-            **kwargs: Additional parameters (currently unused).
-
-        Returns:
-            LLMResponse containing base64-encoded image.
-
-        Raises:
-            ProviderError: If the request fails.
-        """
-        start_time = time.perf_counter()
-
-        # URL-encode the prompt for safe embedding in URL
-        encoded_prompt = quote(prompt, safe="")
-        url = POLLINATIONS_URL.format(prompt=encoded_prompt)
-
-        # Add parameters for better quality
-        # nologo=true removes watermark, width/height for resolution, model=flux for best quality
-        url += "?nologo=true&width=1024&height=1024&model=flux"
-
-        logger.info(f"Pollinations.ai fallback: generating image for prompt (first 50 chars): {prompt[:50]}...")
-
-        try:
-            async with httpx.AsyncClient(timeout=POLLINATIONS_TIMEOUT) as client:
-                response = await client.get(url)
-
-                if response.status_code != 200:
-                    raise ProviderError(
-                        message=f"Pollinations.ai returned status {response.status_code}",
-                        provider=ProviderType.OPENROUTER,  # Use OPENROUTER as proxy
-                        status_code=response.status_code,
-                        retryable=response.status_code >= 500,
-                    )
-
-                # Response is raw image bytes (JPEG)
-                image_bytes = response.content
-                image_b64 = base64.b64encode(image_bytes).decode("utf-8")
-
-                latency_ms = int((time.perf_counter() - start_time) * 1000)
-                logger.info(f"Pollinations.ai image generated successfully in {latency_ms}ms")
-
-                return LLMResponse(
-                    content=image_b64,
-                    model="pollinations-ai",
-                    provider=ProviderType.OPENROUTER,  # Use OPENROUTER as proxy type
-                    latency_ms=latency_ms,
-                )
-
-        except httpx.HTTPError as e:
-            logger.error(f"Pollinations.ai request failed: {e}")
-            raise ProviderError(
-                message=f"Pollinations.ai request failed: {e}",
-                provider=ProviderType.OPENROUTER,
-                retryable=True,
-            ) from e
-
     async def _generate_image_with_retry(
         self,
         provider: LLMProvider,
@@ -995,16 +918,15 @@ async def generate_image(
         Raises:
             ProviderError: If image generation fails after all retries and fallbacks.
         """
-        # Direct Pollinations path — when caller explicitly requests it
-        # (e.g. model_policy="permissive" sets image_model="pollinations")
-        image_model_id = self._get_model_for_capability(ModelCapability.IMAGE, self.config.primary)
-        if image_model_id and "pollinations" in image_model_id.lower():
-            logger.info("Image model is Pollinations — using direct Pollinations path")
-            return await self._generate_image_pollinations(prompt)
-
         # Determine provider for image generation
-        # Prefer preset's image_provider, then Google native, then fallback
-        if self._preset_config and "image_provider" in self._preset_config:
+        # Permissive: OpenRouter first; else preset's image_provider, Google native, OpenRouter
+        is_permissive = bool(
+            self._model_policy and self._model_policy.lower() == "permissive"
+        )
+        if is_permissive and ProviderType.OPENROUTER in self.providers:
+            # Permissive mode: always use OpenRouter for images (Google-free)
+            image_provider = ProviderType.OPENROUTER
+        elif self._preset_config and "image_provider" in self._preset_config:
             image_provider = self._preset_config["image_provider"]
         elif ProviderType.GOOGLE in self.providers:
             image_provider = ProviderType.GOOGLE
@@ -1033,20 +955,15 @@ async def generate_image(
             should_fallback = (
                 image_provider != ProviderType.OPENROUTER
                 and ProviderType.OPENROUTER in self.providers
+                and not is_permissive  # Already on OpenRouter in permissive mode
             )
 
             if not should_fallback:
-                # No OpenRouter fallback, but try Pollinations.ai as ultimate fallback
-                logger.info("No OpenRouter configured, falling back to Pollinations.ai")
-                try:
-                    return await self._generate_image_pollinations(prompt)
-                except ProviderError as e2:
-                    logger.error(f"Pollinations.ai fallback failed: {e2}")
-                    raise ProviderError(
-                        message=f"Image generation failed. Primary: {e}, Pollinations: {e2}",
-                        provider=image_provider,
-                        retryable=False,
-                    ) from e
+                raise ProviderError(
+                    message=f"Image generation failed: {e}",
+                    provider=image_provider,
+                    retryable=False,
+                ) from e
 
             # Log appropriately based on error type
             image_fallback = get_image_fallback_model()
@@ -1076,17 +993,11 @@ async def generate_image(
                 )
             except (RateLimitError, ProviderError) as e2:
                 logger.warning(f"OpenRouter image fallback also failed: {e2}")
-                # Try Pollinations.ai as ultimate free fallback
-                logger.info("Falling back to Pollinations.ai (free, no API key required)")
-                try:
-                    return await self._generate_image_pollinations(prompt)
-                except ProviderError as e3:
-                    logger.error(f"Pollinations.ai fallback also failed: {e3}")
-                    raise ProviderError(
-                        message=f"All image providers failed. Primary: {e}, OpenRouter: {e2}, Pollinations: {e3}",
-                        provider=image_provider,
-                        retryable=False,
-                    ) from e
+                raise ProviderError(
+                    message=f"All image providers failed. Primary: {e}, OpenRouter: {e2}",
+                    provider=image_provider,
+                    retryable=False,
+                ) from e
 
     async def analyze_image(
         self,
diff --git a/app/core/model_policy.py b/app/core/model_policy.py
index 0e853ef..f832652 100644
--- a/app/core/model_policy.py
+++ b/app/core/model_policy.py
@@ -32,8 +32,6 @@ def derive_model_provider(model_id: str | None) -> str:
         return "google"
     if any(lower.startswith(p) for p in OPENROUTER_PREFIXES):
         return "openrouter"
-    if "pollinations" in lower:
-        return "pollinations"
     return "google"
 
 
@@ -49,6 +47,4 @@ def derive_model_permissiveness(model_id: str | None) -> str:
     lower = model_id.lower()
     if any(lower.startswith(p) for p in PERMISSIVE_PREFIXES):
         return "permissive"
-    if "pollinations" in lower:
-        return "permissive"  # Pollinations uses open models
     return "restricted"
diff --git a/docs/API.md b/docs/API.md
index 63870f7..bc892c7 100644
--- a/docs/API.md
+++ b/docs/API.md
@@ -72,11 +72,11 @@ Use only open-weight, distillable models — zero Google API calls:
   "model_policy": "permissive"
 }
 ```
-Text routes to DeepSeek/Llama/Qwen via OpenRouter, images route to Pollinations, and Google grounding is skipped. Response metadata reflects the actual models used:
+Text routes to DeepSeek/Llama/Qwen via OpenRouter, images route to OpenRouter (Flux/Gemini), and Google grounding is skipped. Response metadata reflects the actual models used:
 ```json
 {
-  "text_model_used": "deepseek/deepseek-r1-0528",
-  "image_model_used": "pollinations",
+  "text_model_used": "deepseek/deepseek-chat-v3-0324",
+  "image_model_used": "google/gemini-2.5-flash-image-preview",
   "model_provider": "openrouter",
   "model_permissiveness": "permissive"
 }
@@ -93,7 +93,7 @@ Text routes to DeepSeek/Llama/Qwen via OpenRouter, images route to Pollinations,
   "generate_image": true
 }
 ```
-This uses the specified Qwen model for text, Pollinations for images (from permissive policy), and skips Google grounding.
+This uses the specified Qwen model for text, OpenRouter for images (from permissive policy), and skips Google grounding.
 
 ---
 
@@ -192,7 +192,7 @@ Generate a scene with real-time progress updates via Server-Sent Events.
 | generate_image | boolean | No | Generate AI image (default: false) |
 | preset | string | No | Quality preset: `hd`, `hyper`, `balanced` (default), `gemini3` |
 | text_model | string | No | Text model ID — OpenRouter format (`org/model`) or Google native (`gemini-*`). Overrides preset. |
-| image_model | string | No | Image model ID — `pollinations` for free open-source, or Google native. Overrides preset. |
+| image_model | string | No | Image model ID — OpenRouter format (`org/model`) or Google native. Overrides preset. |
 | model_policy | string | No | `"permissive"` — selects only open-weight models (Llama, DeepSeek, Qwen) and skips Google-dependent steps. Fully Google-free. Works alongside explicit model overrides. |
 | llm_params | object | No | Fine-grained LLM parameters applied to all pipeline agents. See **LLM Parameters** below. |
 | visibility | string | No | `public` (default) or `private` — controls who can see full data |
@@ -1069,18 +1069,16 @@ When Google API quota is exhausted or rate-limited:
 
 ### Image Generation
 
-Image generation uses a resilient 3-tier fallback chain:
+Image generation uses a 2-tier fallback chain:
 
 | Priority | Provider | Details |
 |----------|----------|---------|
 | 1 | **Google Imagen** | Native API, highest quality. Quota exhaustion = instant fallback. |
-| 2 | **OpenRouter Flux** | Via `/chat/completions` with `modalities: ["image", "text"]` |
-| 3 | **Pollinations.ai** | Free, no API key required. Ultimate fallback, never fails. |
+| 2 | **OpenRouter** | Via `/chat/completions` with `modalities: ["image", "text"]`. Best available model auto-selected. |
 
 **Behavior:**
-- Quota exhaustion on Google = immediate fallback (no retries wasted)
-- OpenRouter failure = fallback to Pollinations.ai
-- Pollinations.ai = always succeeds (free API, no rate limits)
+- Quota exhaustion on Google = immediate fallback to OpenRouter (no retries wasted)
+- In permissive mode, images route directly to OpenRouter (Google-free)
 - Scene completes with image from whichever provider succeeds
 
 ---
diff --git a/docs/DOWNSTREAM_MODEL_CONTROL.md b/docs/DOWNSTREAM_MODEL_CONTROL.md
index 73147db..a6168b9 100644
--- a/docs/DOWNSTREAM_MODEL_CONTROL.md
+++ b/docs/DOWNSTREAM_MODEL_CONTROL.md
@@ -2,7 +2,7 @@
 
 **For teams building on TIMEPOINT Flash (Web App, iPhone App, Clockchain, Billing, Enterprise integrations)**
 
-TIMEPOINT Flash now supports full downstream control of model selection and generation hyperparameters on every generation request. Downstream apps can set `model_policy: "permissive"` to route all 14 pipeline agents through open-weight models (DeepSeek R1, Llama, Qwen, Mistral) via OpenRouter with Pollinations for images — making the entire pipeline fully Google-free with zero Google API calls, including grounding. Apps can also specify exact models by name using `text_model` and `image_model` (any OpenRouter-compatible model ID like `qwen/qwen3-235b-a22b` or Google native like `gemini-2.5-flash`), and these explicit overrides take priority over `model_policy`, which in turn takes priority over `preset`. In addition, the new `llm_params` object provides fine-grained control over generation hyperparameters — temperature, max_tokens, top_p, top_k, frequency/presence/repetition penalties, stop sequences, thinking level, and system prompt injection (prefix/suffix) — all applied uniformly across every agent in the pipeline. Request-level `llm_params` override each agent's built-in defaults, so setting `temperature: 0.3` overrides the scene agent's default of 0.7, the dialog agent's default of 0.85, etc. All of these controls are composable: you can combine `model_policy`, explicit models, `preset`, and `llm_params` in the same request.
+TIMEPOINT Flash now supports full downstream control of model selection and generation hyperparameters on every generation request. Downstream apps can set `model_policy: "permissive"` to route all 14 pipeline agents through open-weight models (DeepSeek, Llama, Qwen, Mistral) via OpenRouter for both text and images — making the entire pipeline fully Google-free with zero Google API calls, including grounding. Apps can also specify exact models by name using `text_model` and `image_model` (any OpenRouter-compatible model ID like `qwen/qwen3-235b-a22b` or Google native like `gemini-2.5-flash`), and these explicit overrides take priority over `model_policy`, which in turn takes priority over `preset`. In addition, the new `llm_params` object provides fine-grained control over generation hyperparameters — temperature, max_tokens, top_p, top_k, frequency/presence/repetition penalties, stop sequences, thinking level, and system prompt injection (prefix/suffix) — all applied uniformly across every agent in the pipeline. Request-level `llm_params` override each agent's built-in defaults, so setting `temperature: 0.3` overrides the scene agent's default of 0.7, the dialog agent's default of 0.85, etc. All of these controls are composable: you can combine `model_policy`, explicit models, `preset`, and `llm_params` in the same request.
 
 ## Request Parameters
 
@@ -12,7 +12,7 @@ TIMEPOINT Flash now supports full downstream control of model selection and gene
 | `generate_image` | boolean | No | Generate AI image (default: false) |
 | `preset` | string | No | Quality preset: `hyper`, `balanced` (default), `hd`, `gemini3` |
 | `text_model` | string | No | Text model ID — OpenRouter format (`org/model`) or Google native (`gemini-*`). Overrides preset. |
-| `image_model` | string | No | Image model ID — `pollinations` for free, or Google native. Overrides preset. |
+| `image_model` | string | No | Image model ID — OpenRouter format (`org/model`) or Google native. Overrides preset. |
 | `model_policy` | string | No | `"permissive"` for open-weight only, Google-free generation. |
 | `llm_params` | object | No | Fine-grained LLM hyperparameters (see table below). |
 | `visibility` | string | No | `public` (default) or `private` |

From f5d35a770f244fb1f509684f9e90e2f8109f22ba Mon Sep 17 00:00:00 2001
From: realityinspector <mcdonald.sean@gmail.com>
Date: Wed, 11 Mar 2026 19:32:08 -0600
Subject: [PATCH 3/3] =?UTF-8?q?fix:=20enforce=20permissive=20policy=20?=
 =?UTF-8?q?=E2=80=94=20reject=20proprietary=20models,=20use=20open-weight?=
 =?UTF-8?q?=20image=20models?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Validate explicit text_model/image_model against PERMISSIVE_PREFIXES
  when model_policy=permissive. Returns 422 for proprietary models
  (OpenAI, Anthropic, Google Gemini).
- Add is_model_permissive() helper for reuse across validation and
  registry filtering.
- Add permissive_only param to get_best_image_model() and
  get_image_fallback_model() — skips Gemini/proprietary image models.
- Permissive image model selection now prefers open-weight models from
  the OpenRouter registry instead of defaulting to Gemini.
---
 app/api/v1/timepoints.py   | 31 +++++++++++++++++++++++++++++--
 app/core/llm_router.py     | 10 +++++++---
 app/core/model_policy.py   | 13 +++++++++----
 app/core/model_registry.py | 29 +++++++++++++++++++++++------
 4 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py
index 5823c23..2acfc8f 100644
--- a/app/api/v1/timepoints.py
+++ b/app/api/v1/timepoints.py
@@ -258,20 +258,47 @@ def resolve_model_policy(
       2. model_policy="permissive"  → auto-select open-weight models
       3. None → let preset / settings defaults handle it
 
+    When model_policy="permissive", explicit models are validated against
+    the PERMISSIVE_PREFIXES allowlist. Proprietary models (OpenAI, Anthropic,
+    Google Gemini) are rejected with 422.
+
     Returns:
         (text_model, image_model) to pass to the pipeline.
+
+    Raises:
+        HTTPException: 422 if explicit models violate permissive policy.
     """
+    from app.core.model_policy import is_model_permissive
+
     text_model = request.text_model
     image_model = request.image_model
 
     if request.model_policy and request.model_policy.lower() == "permissive":
+        # Validate explicit models against permissive allowlist
+        if text_model and not is_model_permissive(text_model):
+            raise HTTPException(
+                status_code=422,
+                detail=(
+                    f"model_policy='permissive' requires open-weight models. "
+                    f"'{text_model}' is proprietary. Use models from: "
+                    f"meta-llama/, deepseek/, qwen/, mistralai/, microsoft/, google/gemma, allenai/, nvidia/"
+                ),
+            )
+        if image_model and not is_model_permissive(image_model):
+            raise HTTPException(
+                status_code=422,
+                detail=(
+                    f"model_policy='permissive' requires open-weight models. "
+                    f"'{image_model}' is proprietary."
+                ),
+            )
+
         if not text_model:
             text_model = _get_permissive_text_model()
             logger.info("model_policy=permissive → text_model=%s", text_model)
         if not image_model:
-            # Use the best available OpenRouter image model (no Pollinations)
             from app.core.llm_router import get_image_fallback_model
-            image_model = get_image_fallback_model()
+            image_model = get_image_fallback_model(permissive_only=True)
             logger.info("model_policy=permissive → image_model=%s", image_model)
 
     return text_model, image_model
diff --git a/app/core/llm_router.py b/app/core/llm_router.py
index 64dcb7f..14d8b5b 100644
--- a/app/core/llm_router.py
+++ b/app/core/llm_router.py
@@ -81,12 +81,16 @@ def get_paid_fallback_model() -> str:
     return _PAID_FALLBACK_DEFAULT
 
 
-def get_image_fallback_model() -> str:
-    """Get the best image fallback model, consulting the registry first."""
+def get_image_fallback_model(permissive_only: bool = False) -> str:
+    """Get the best image fallback model, consulting the registry first.
+
+    Args:
+        permissive_only: If True, only return open-weight image models.
+    """
     try:
         from app.core.model_registry import OpenRouterModelRegistry
         registry = OpenRouterModelRegistry.get_instance()
-        best = registry.get_best_image_model()
+        best = registry.get_best_image_model(permissive_only=permissive_only)
         if best:
             return best
     except Exception:
diff --git a/app/core/model_policy.py b/app/core/model_policy.py
index f832652..b088a06 100644
--- a/app/core/model_policy.py
+++ b/app/core/model_policy.py
@@ -35,6 +35,14 @@ def derive_model_provider(model_id: str | None) -> str:
     return "google"
 
 
+def is_model_permissive(model_id: str | None) -> bool:
+    """Check if a model ID is open-weight / permissively licensed."""
+    if not model_id:
+        return False
+    lower = model_id.lower()
+    return any(lower.startswith(p) for p in PERMISSIVE_PREFIXES)
+
+
 def derive_model_permissiveness(model_id: str | None) -> str:
     """Derive distillation licensing permissiveness from a model ID.
 
@@ -44,7 +52,4 @@ def derive_model_permissiveness(model_id: str | None) -> str:
     """
     if not model_id:
         return "unknown"
-    lower = model_id.lower()
-    if any(lower.startswith(p) for p in PERMISSIVE_PREFIXES):
-        return "permissive"
-    return "restricted"
+    return "permissive" if is_model_permissive(model_id) else "restricted"
diff --git a/app/core/model_registry.py b/app/core/model_registry.py
index 60753db..eb19271 100644
--- a/app/core/model_registry.py
+++ b/app/core/model_registry.py
@@ -166,15 +166,21 @@ def get_best_text_model(self, prefer_free: bool = False) -> str | None:
         candidates.sort(key=lambda x: x[1], reverse=True)
         return candidates[0][0]
 
-    def get_best_image_model(self) -> str | None:
+    def get_best_image_model(self, permissive_only: bool = False) -> str | None:
         """Find the best available image generation model.
 
         Heuristic: filter to models with "image" in output_modalities,
-        prefer gemini models.
+        prefer gemini models (unless permissive_only is set).
+
+        Args:
+            permissive_only: If True, only return open-weight models
+                (no Google Gemini, OpenAI, Anthropic).
 
         Returns:
             Model ID or None if cache is empty.
         """
+        from app.core.model_policy import is_model_permissive
+
         if not self._models:
             return None
 
@@ -187,18 +193,29 @@ def get_best_image_model(self) -> str | None:
             if not isinstance(output_mods, list) or "image" not in output_mods:
                 continue
 
+            if permissive_only and not is_model_permissive(model_id):
+                continue
+
             ctx = info.get("context_length", 0)
             if "gemini" in model_id.lower():
                 gemini_candidates.append((model_id, ctx))
             else:
                 other_candidates.append((model_id, ctx))
 
-        # Prefer gemini models, sorted by context_length desc
-        if gemini_candidates:
-            gemini_candidates.sort(key=lambda x: x[1], reverse=True)
-            return gemini_candidates[0][0]
+        if not permissive_only:
+            # Prefer gemini models, sorted by context_length desc
+            if gemini_candidates:
+                gemini_candidates.sort(key=lambda x: x[1], reverse=True)
+                return gemini_candidates[0][0]
+
         if other_candidates:
             other_candidates.sort(key=lambda x: x[1], reverse=True)
             return other_candidates[0][0]
 
+        # Last resort; can only match when permissive_only is set, and every
+        # candidate here already passed the is_model_permissive filter above.
+        if gemini_candidates:
+            gemini_candidates.sort(key=lambda x: x[1], reverse=True)
+            return gemini_candidates[0][0]
+
         return None