From a096eefc652bb3e6953981a99dbbf66fd8ab3e39 Mon Sep 17 00:00:00 2001 From: realityinspector Date: Wed, 11 Mar 2026 19:00:37 -0600 Subject: [PATCH 1/3] fix: dramatically reduce permissive mode latency (10min -> ~2min) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Default max_tokens=2048 when model_policy=permissive and no preset - Use batch dialog (1 LLM call) instead of sequential (7 calls) - Skip critique loop in permissive mode (saves 1-8 calls) - Block Google fallback in permissive mode (correctness: stay Google-free) - Reorder model preference to prioritize fast non-thinking models (DeepSeek R1 moved to last resort — its 30-60s/call caused timeouts) --- app/api/v1/timepoints.py | 12 +++++----- app/core/llm_router.py | 27 +++++++++++++++++++++-- app/core/pipeline.py | 47 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 9 deletions(-) diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py index 424b7ca..59bbe40 100644 --- a/app/api/v1/timepoints.py +++ b/app/api/v1/timepoints.py @@ -226,15 +226,17 @@ def _get_permissive_text_model() -> str: if registry.model_count == 0: return _DEFAULT_PERMISSIVE_TEXT_MODEL - # Walk preference list; return first that's in the live registry + # Walk preference list; return first that's in the live registry. + # Prioritize fast non-thinking models — DeepSeek R1 is a thinking + # model that takes 30-60s per call and causes pipeline timeouts. preference = [ "meta-llama/llama-4-scout-17b-16e-instruct", "meta-llama/llama-4-maverick-17b-128e-instruct", - "deepseek/deepseek-r1-0528", - "deepseek/deepseek-chat-v3-0324", - "qwen/qwen3-235b-a22b", - "qwen/qwen3-30b-a3b", + "deepseek/deepseek-chat-v3-0324", # Fast chat model + "qwen/qwen3-30b-a3b", # Fast MoE model "mistralai/mistral-small-3.2-24b-instruct", + "qwen/qwen3-235b-a22b", # Large but non-thinking + "deepseek/deepseek-r1-0528", # Thinking model — slow, last resort ] for model_id in preference: if registry.is_model_available(model_id): diff --git a/app/core/llm_router.py b/app/core/llm_router.py index 686ea33..3bb5ac7 100644 --- a/app/core/llm_router.py +++ b/app/core/llm_router.py @@ -195,6 +195,7 @@ def __init__( preset: QualityPreset | None = None, text_model: str | None = None, image_model: str | None = None, + model_policy: str | None = None, ) -> None: """Initialize LLM router. @@ -203,12 +204,14 @@ def __init__( preset: Quality preset (HD, HYPER, BALANCED). Overrides config models. text_model: Custom text model override (overrides preset). image_model: Custom image model override (overrides preset). + model_policy: Model policy (e.g. "permissive" blocks Google fallback). """ settings = get_settings() self.preset = preset self._preset_config = PRESET_CONFIGS.get(preset) if preset else None self._custom_text_model = text_model self._custom_image_model = image_model + self._model_policy = model_policy # Build config from settings if not provided if config is None: @@ -624,7 +627,15 @@ async def call( logger.warning(f"Paid model fallback also failed: {e2}") # Try Google provider as ultimate fallback using verified model - if ProviderType.GOOGLE in self.providers and self.config.primary != ProviderType.GOOGLE: + # (blocked in permissive mode — must stay Google-free) + is_permissive = bool( + self._model_policy and self._model_policy.lower() == "permissive" + ) + if ( + ProviderType.GOOGLE in self.providers + and self.config.primary != ProviderType.GOOGLE + and not is_permissive + ): logger.info("Falling back to Google provider with verified model") try: provider = self._get_provider(ProviderType.GOOGLE) @@ -635,6 +646,8 @@ async def call( ) except ProviderError as e3: logger.warning(f"Google provider fallback failed: {e3}") + elif is_permissive: + logger.info("Skipping Google fallback: model_policy=permissive") # All fallbacks exhausted raise ProviderError( @@ -749,7 +762,15 @@ async def call_structured( logger.warning(f"Paid model fallback also failed: {e2}") # Try Google provider as ultimate fallback using verified model - if ProviderType.GOOGLE in self.providers and self.config.primary != ProviderType.GOOGLE: + # (blocked in permissive mode — must stay Google-free) + is_permissive = bool( + self._model_policy and self._model_policy.lower() == "permissive" + ) + if ( + ProviderType.GOOGLE in self.providers + and self.config.primary != ProviderType.GOOGLE + and not is_permissive + ): logger.info("Falling back to Google provider with verified model") try: provider = self._get_provider(ProviderType.GOOGLE) @@ -761,6 +782,8 @@ async def call_structured( ) except ProviderError as e3: logger.warning(f"Google provider fallback failed: {e3}") + elif is_permissive: + logger.info("Skipping Google fallback: model_policy=permissive") # All fallbacks exhausted raise ProviderError( diff --git a/app/core/pipeline.py b/app/core/pipeline.py index f2de686..e8fa6ae 100644 --- a/app/core/pipeline.py +++ b/app/core/pipeline.py @@ -291,7 +291,20 @@ def __init__( self._text_model = text_model self._image_model = image_model self._model_policy = model_policy - self._llm_params: dict[str, Any] = llm_params or {} + + # Build effective llm_params: apply permissive speed defaults + # when no preset is set and caller hasn't specified max_tokens. + effective_params = dict(llm_params or {}) + if ( + model_policy + and model_policy.lower() == "permissive" + and preset is None + and "max_tokens" not in effective_params + ): + effective_params["max_tokens"] = 2048 + logger.info("model_policy=permissive: defaulting max_tokens=2048 for speed") + + self._llm_params: dict[str, Any] = effective_params self._max_parallelism_override = max_parallelism self._max_parallelism: int | None = None # Set during execution planning self._semaphore: asyncio.Semaphore | None = None @@ -324,6 +337,7 @@ def router(self) -> LLMRouter: preset=self._preset, text_model=self._text_model, image_model=self._image_model, + model_policy=self._model_policy, ) return self._router @@ -341,7 +355,15 @@ def _init_agents(self) -> None: self._char_id_agent = CharacterIdentificationAgent(router=router) self._char_bio_agent = CharacterBioAgent(router=router) self._moment_agent = MomentAgent(router=router) - self._dialog_agent = DialogAgent(router=router) + # Permissive mode: use batch dialog (1 LLM call) instead of + # sequential roleplay (7 calls) to cut latency dramatically. + is_permissive = bool( + self._model_policy and self._model_policy.lower() == "permissive" + ) + self._dialog_agent = DialogAgent( + router=router, + use_sequential=not is_permissive, + ) self._camera_agent = CameraAgent(router=router) self._graph_agent = GraphAgent(router=router) self._image_prompt_agent = ImagePromptAgent(router=router) @@ -1390,6 +1412,27 @@ async def _step_dialog(self, state: PipelineState) -> PipelineState: state.dialog_data = result.content # === CRITIQUE LOOP (one pass max) === + # Skip critique in permissive mode — saves 1-8 LLM calls and the + # batch dialog output is already quality-constrained by the prompt. + is_permissive = bool( + self._model_policy and self._model_policy.lower() == "permissive" + ) + if is_permissive: + logger.info("Skipping dialog critique: model_policy=permissive (speed mode)") + state.step_results.append( + StepResult( + step=step, + success=result.success, + data=state.dialog_data, + error=result.error, + latency_ms=result.latency_ms, + model_used=result.model_used, + ) + ) + if state.dialog_data: + logger.debug(f"Dialog: {len(state.dialog_data.lines)} lines") + return state + critique_input = CritiqueInput( step_name="dialog", output_json=result.content.model_dump_json(), From 8c69f5de4065bc9ffdda5d25d65393767c7a3a7c Mon Sep 17 00:00:00 2001 From: realityinspector Date: Wed, 11 Mar 2026 19:26:22 -0600 Subject: [PATCH 2/3] refactor: remove Pollinations, use OpenRouter for all image generation Remove Pollinations.ai as image provider. All image generation now routes through OpenRouter or Google native. Permissive mode uses OpenRouter for images (Google-free). Simplifies fallback chain from 3-tier (Google -> OpenRouter -> Pollinations) to 2-tier (Google -> OpenRouter). --- app/api/v1/timepoints.py | 20 +++-- app/core/llm_router.py | 127 +++++-------------------------- app/core/model_policy.py | 4 - docs/API.md | 20 +++-- docs/DOWNSTREAM_MODEL_CONTROL.md | 4 +- 5 files changed, 42 insertions(+), 133 deletions(-) diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py index 59bbe40..5823c23 100644 --- a/app/api/v1/timepoints.py +++ b/app/api/v1/timepoints.py @@ -173,8 +173,12 @@ class GenerateRequest(BaseModel): ) image_model: str | None = Field( default=None, - description="Custom image model override (e.g., 'google/imagen-3'). Overrides preset.", - examples=["google/imagen-3", "black-forest-labs/flux-1.1-pro"], + description=( + "Image model ID. OpenRouter format ('org/model') or Google native " + "(gemini-2.5-flash-image, gemini-3-pro-image-preview)." + ), + examples=["gemini-2.5-flash-image", "gemini-3-pro-image-preview", + "google/gemini-2.5-flash-image-preview"], ) write_blob: bool = Field( default=False, @@ -210,12 +214,10 @@ class GenerateRequest(BaseModel): ) -# Default permissive models — used when model_policy="permissive" and no -# explicit text_model/image_model is provided. These must be open-weight -# models available on OpenRouter (or Pollinations for images). +# Default permissive text model — used when model_policy="permissive" and no +# explicit text_model is provided. Must be an open-weight model on OpenRouter. +# Image model is resolved at runtime via get_image_fallback_model(). _DEFAULT_PERMISSIVE_TEXT_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct" -_DEFAULT_PERMISSIVE_IMAGE_MODEL = "pollinations" # Free, open, always available - def _get_permissive_text_model() -> str: """Pick the best available permissive text model from the registry.""" @@ -267,7 +269,9 @@ def resolve_model_policy( text_model = _get_permissive_text_model() logger.info("model_policy=permissive → text_model=%s", text_model) if not image_model: - image_model = _DEFAULT_PERMISSIVE_IMAGE_MODEL + # Use the best available OpenRouter image model (no Pollinations) + from app.core.llm_router import get_image_fallback_model + image_model = get_image_fallback_model() logger.info("model_policy=permissive → image_model=%s", image_model) return text_model, image_model diff --git a/app/core/llm_router.py b/app/core/llm_router.py index 3bb5ac7..64dcb7f 100644 --- a/app/core/llm_router.py +++ b/app/core/llm_router.py @@ -27,15 +27,11 @@ """ import asyncio -import base64 import logging -import time from collections.abc import AsyncIterator from enum import Enum from typing import Any, TypeVar -from urllib.parse import quote -import httpx from pydantic import BaseModel from app.config import ( @@ -97,12 +93,6 @@ def get_image_fallback_model() -> str: pass return _IMAGE_FALLBACK_DEFAULT -# Pollinations.ai - Ultimate free fallback for image generation -# No API key required, always available, decent quality -# NOTE: URL changed from image.pollinations.ai to gen.pollinations.ai in early 2026 -POLLINATIONS_URL = "https://gen.pollinations.ai/image/{prompt}" -POLLINATIONS_TIMEOUT = 60.0 # Image generation can take time - # Rate limit retry settings MAX_RETRIES = 5 INITIAL_BACKOFF = 2.0 # seconds @@ -809,73 +799,6 @@ async def call_structured( raise - async def _generate_image_pollinations( - self, - prompt: str, - **kwargs: Any, - ) -> LLMResponse[str]: - """Generate image using Pollinations.ai (free, no API key required). - - This is the ultimate fallback for image generation. Pollinations.ai - provides free image generation with no API key, no rate limits, - and decent quality using Stable Diffusion models. - - Args: - prompt: The image generation prompt. - **kwargs: Additional parameters (currently unused). - - Returns: - LLMResponse containing base64-encoded image. - - Raises: - ProviderError: If the request fails. - """ - start_time = time.perf_counter() - - # URL-encode the prompt for safe embedding in URL - encoded_prompt = quote(prompt, safe="") - url = POLLINATIONS_URL.format(prompt=encoded_prompt) - - # Add parameters for better quality - # nologo=true removes watermark, width/height for resolution, model=flux for best quality - url += "?nologo=true&width=1024&height=1024&model=flux" - - logger.info(f"Pollinations.ai fallback: generating image for prompt (first 50 chars): {prompt[:50]}...") - - try: - async with httpx.AsyncClient(timeout=POLLINATIONS_TIMEOUT) as client: - response = await client.get(url) - - if response.status_code != 200: - raise ProviderError( - message=f"Pollinations.ai returned status {response.status_code}", - provider=ProviderType.OPENROUTER, # Use OPENROUTER as proxy - status_code=response.status_code, - retryable=response.status_code >= 500, - ) - - # Response is raw image bytes (JPEG) - image_bytes = response.content - image_b64 = base64.b64encode(image_bytes).decode("utf-8") - - latency_ms = int((time.perf_counter() - start_time) * 1000) - logger.info(f"Pollinations.ai image generated successfully in {latency_ms}ms") - - return LLMResponse( - content=image_b64, - model="pollinations-ai", - provider=ProviderType.OPENROUTER, # Use OPENROUTER as proxy type - latency_ms=latency_ms, - ) - - except httpx.HTTPError as e: - logger.error(f"Pollinations.ai request failed: {e}") - raise ProviderError( - message=f"Pollinations.ai request failed: {e}", - provider=ProviderType.OPENROUTER, - retryable=True, - ) from e - async def _generate_image_with_retry( self, provider: LLMProvider, @@ -995,16 +918,15 @@ async def generate_image( Raises: ProviderError: If image generation fails after all retries and fallbacks. """ - # Direct Pollinations path — when caller explicitly requests it - # (e.g. model_policy="permissive" sets image_model="pollinations") - image_model_id = self._get_model_for_capability(ModelCapability.IMAGE, self.config.primary) - if image_model_id and "pollinations" in image_model_id.lower(): - logger.info("Image model is Pollinations — using direct Pollinations path") - return await self._generate_image_pollinations(prompt) - # Determine provider for image generation - # Prefer preset's image_provider, then Google native, then fallback - if self._preset_config and "image_provider" in self._preset_config: + # Prefer preset's image_provider, then Google native, then OpenRouter + is_permissive = bool( + self._model_policy and self._model_policy.lower() == "permissive" + ) + if is_permissive and ProviderType.OPENROUTER in self.providers: + # Permissive mode: always use OpenRouter for images (Google-free) + image_provider = ProviderType.OPENROUTER + elif self._preset_config and "image_provider" in self._preset_config: image_provider = self._preset_config["image_provider"] elif ProviderType.GOOGLE in self.providers: image_provider = ProviderType.GOOGLE @@ -1033,20 +955,15 @@ async def generate_image( should_fallback = ( image_provider != ProviderType.OPENROUTER and ProviderType.OPENROUTER in self.providers + and not is_permissive # Already on OpenRouter in permissive mode ) if not should_fallback: - # No OpenRouter fallback, but try Pollinations.ai as ultimate fallback - logger.info("No OpenRouter configured, falling back to Pollinations.ai") - try: - return await self._generate_image_pollinations(prompt) - except ProviderError as e2: - logger.error(f"Pollinations.ai fallback failed: {e2}") - raise ProviderError( - message=f"Image generation failed. Primary: {e}, Pollinations: {e2}", - provider=image_provider, - retryable=False, - ) from e + raise ProviderError( + message=f"Image generation failed: {e}", + provider=image_provider, + retryable=False, + ) from e # Log appropriately based on error type image_fallback = get_image_fallback_model() @@ -1076,17 +993,11 @@ async def generate_image( ) except (RateLimitError, ProviderError) as e2: logger.warning(f"OpenRouter image fallback also failed: {e2}") - # Try Pollinations.ai as ultimate free fallback - logger.info("Falling back to Pollinations.ai (free, no API key required)") - try: - return await self._generate_image_pollinations(prompt) - except ProviderError as e3: - logger.error(f"Pollinations.ai fallback also failed: {e3}") - raise ProviderError( - message=f"All image providers failed. Primary: {e}, OpenRouter: {e2}, Pollinations: {e3}", - provider=image_provider, - retryable=False, - ) from e + raise ProviderError( + message=f"All image providers failed. Primary: {e}, OpenRouter: {e2}", + provider=image_provider, + retryable=False, + ) from e async def analyze_image( self, diff --git a/app/core/model_policy.py b/app/core/model_policy.py index 0e853ef..f832652 100644 --- a/app/core/model_policy.py +++ b/app/core/model_policy.py @@ -32,8 +32,6 @@ def derive_model_provider(model_id: str | None) -> str: return "google" if any(lower.startswith(p) for p in OPENROUTER_PREFIXES): return "openrouter" - if "pollinations" in lower: - return "pollinations" return "google" @@ -49,6 +47,4 @@ def derive_model_permissiveness(model_id: str | None) -> str: lower = model_id.lower() if any(lower.startswith(p) for p in PERMISSIVE_PREFIXES): return "permissive" - if "pollinations" in lower: - return "permissive" # Pollinations uses open models return "restricted" diff --git a/docs/API.md b/docs/API.md index 63870f7..bc892c7 100644 --- a/docs/API.md +++ b/docs/API.md @@ -72,11 +72,11 @@ Use only open-weight, distillable models — zero Google API calls: "model_policy": "permissive" } ``` -Text routes to DeepSeek/Llama/Qwen via OpenRouter, images route to Pollinations, and Google grounding is skipped. Response metadata reflects the actual models used: +Text routes to DeepSeek/Llama/Qwen via OpenRouter, images route to OpenRouter (Flux/Gemini), and Google grounding is skipped. Response metadata reflects the actual models used: ```json { - "text_model_used": "deepseek/deepseek-r1-0528", - "image_model_used": "pollinations", + "text_model_used": "deepseek/deepseek-chat-v3-0324", + "image_model_used": "google/gemini-2.5-flash-image-preview", "model_provider": "openrouter", "model_permissiveness": "permissive" } @@ -93,7 +93,7 @@ Text routes to DeepSeek/Llama/Qwen via OpenRouter, images route to Pollinations, "generate_image": true } ``` -This uses the specified Qwen model for text, Pollinations for images (from permissive policy), and skips Google grounding. +This uses the specified Qwen model for text, OpenRouter for images (from permissive policy), and skips Google grounding. --- @@ -192,7 +192,7 @@ Generate a scene with real-time progress updates via Server-Sent Events. | generate_image | boolean | No | Generate AI image (default: false) | | preset | string | No | Quality preset: `hd`, `hyper`, `balanced` (default), `gemini3` | | text_model | string | No | Text model ID — OpenRouter format (`org/model`) or Google native (`gemini-*`). Overrides preset. | -| image_model | string | No | Image model ID — `pollinations` for free open-source, or Google native. Overrides preset. | +| image_model | string | No | Image model ID — OpenRouter format (`org/model`) or Google native. Overrides preset. | | model_policy | string | No | `"permissive"` — selects only open-weight models (Llama, DeepSeek, Qwen) and skips Google-dependent steps. Fully Google-free. Works alongside explicit model overrides. | | llm_params | object | No | Fine-grained LLM parameters applied to all pipeline agents. See **LLM Parameters** below. | | visibility | string | No | `public` (default) or `private` — controls who can see full data | @@ -1069,18 +1069,16 @@ When Google API quota is exhausted or rate-limited: ### Image Generation -Image generation uses a resilient 3-tier fallback chain: +Image generation uses a 2-tier fallback chain: | Priority | Provider | Details | |----------|----------|---------| | 1 | **Google Imagen** | Native API, highest quality. Quota exhaustion = instant fallback. | -| 2 | **OpenRouter Flux** | Via `/chat/completions` with `modalities: ["image", "text"]` | -| 3 | **Pollinations.ai** | Free, no API key required. Ultimate fallback, never fails. | +| 2 | **OpenRouter** | Via `/chat/completions` with `modalities: ["image", "text"]`. Best available model auto-selected. | **Behavior:** -- Quota exhaustion on Google = immediate fallback (no retries wasted) -- OpenRouter failure = fallback to Pollinations.ai -- Pollinations.ai = always succeeds (free API, no rate limits) +- Quota exhaustion on Google = immediate fallback to OpenRouter (no retries wasted) +- In permissive mode, images route directly to OpenRouter (Google-free) - Scene completes with image from whichever provider succeeds --- diff --git a/docs/DOWNSTREAM_MODEL_CONTROL.md b/docs/DOWNSTREAM_MODEL_CONTROL.md index 73147db..a6168b9 100644 --- a/docs/DOWNSTREAM_MODEL_CONTROL.md +++ b/docs/DOWNSTREAM_MODEL_CONTROL.md @@ -2,7 +2,7 @@ **For teams building on TIMEPOINT Flash (Web App, iPhone App, Clockchain, Billing, Enterprise integrations)** -TIMEPOINT Flash now supports full downstream control of model selection and generation hyperparameters on every generation request. Downstream apps can set `model_policy: "permissive"` to route all 14 pipeline agents through open-weight models (DeepSeek R1, Llama, Qwen, Mistral) via OpenRouter with Pollinations for images — making the entire pipeline fully Google-free with zero Google API calls, including grounding. Apps can also specify exact models by name using `text_model` and `image_model` (any OpenRouter-compatible model ID like `qwen/qwen3-235b-a22b` or Google native like `gemini-2.5-flash`), and these explicit overrides take priority over `model_policy`, which in turn takes priority over `preset`. In addition, the new `llm_params` object provides fine-grained control over generation hyperparameters — temperature, max_tokens, top_p, top_k, frequency/presence/repetition penalties, stop sequences, thinking level, and system prompt injection (prefix/suffix) — all applied uniformly across every agent in the pipeline. Request-level `llm_params` override each agent's built-in defaults, so setting `temperature: 0.3` overrides the scene agent's default of 0.7, the dialog agent's default of 0.85, etc. All of these controls are composable: you can combine `model_policy`, explicit models, `preset`, and `llm_params` in the same request. +TIMEPOINT Flash now supports full downstream control of model selection and generation hyperparameters on every generation request. Downstream apps can set `model_policy: "permissive"` to route all 14 pipeline agents through open-weight models (DeepSeek, Llama, Qwen, Mistral) via OpenRouter for both text and images — making the entire pipeline fully Google-free with zero Google API calls, including grounding. Apps can also specify exact models by name using `text_model` and `image_model` (any OpenRouter-compatible model ID like `qwen/qwen3-235b-a22b` or Google native like `gemini-2.5-flash`), and these explicit overrides take priority over `model_policy`, which in turn takes priority over `preset`. In addition, the new `llm_params` object provides fine-grained control over generation hyperparameters — temperature, max_tokens, top_p, top_k, frequency/presence/repetition penalties, stop sequences, thinking level, and system prompt injection (prefix/suffix) — all applied uniformly across every agent in the pipeline. Request-level `llm_params` override each agent's built-in defaults, so setting `temperature: 0.3` overrides the scene agent's default of 0.7, the dialog agent's default of 0.85, etc. All of these controls are composable: you can combine `model_policy`, explicit models, `preset`, and `llm_params` in the same request. ## Request Parameters @@ -12,7 +12,7 @@ TIMEPOINT Flash now supports full downstream control of model selection and gene | `generate_image` | boolean | No | Generate AI image (default: false) | | `preset` | string | No | Quality preset: `hyper`, `balanced` (default), `hd`, `gemini3` | | `text_model` | string | No | Text model ID — OpenRouter format (`org/model`) or Google native (`gemini-*`). Overrides preset. | -| `image_model` | string | No | Image model ID — `pollinations` for free, or Google native. Overrides preset. | +| `image_model` | string | No | Image model ID — OpenRouter format (`org/model`) or Google native. Overrides preset. | | `model_policy` | string | No | `"permissive"` for open-weight only, Google-free generation. | | `llm_params` | object | No | Fine-grained LLM hyperparameters (see table below). | | `visibility` | string | No | `public` (default) or `private` | From f5d35a770f244fb1f509684f9e90e2f8109f22ba Mon Sep 17 00:00:00 2001 From: realityinspector Date: Wed, 11 Mar 2026 19:32:08 -0600 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20enforce=20permissive=20policy=20?= =?UTF-8?q?=E2=80=94=20reject=20proprietary=20models,=20use=20open-weight?= =?UTF-8?q?=20image=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Validate explicit text_model/image_model against PERMISSIVE_PREFIXES when model_policy=permissive. Returns 422 for proprietary models (OpenAI, Anthropic, Google Gemini). - Add is_model_permissive() helper for reuse across validation and registry filtering. - Add permissive_only param to get_best_image_model() and get_image_fallback_model() — skips Gemini/proprietary image models. - Permissive image model selection now prefers open-weight models from the OpenRouter registry instead of defaulting to Gemini. --- app/api/v1/timepoints.py | 31 +++++++++++++++++++++++++++++-- app/core/llm_router.py | 10 +++++++--- app/core/model_policy.py | 13 +++++++++---- app/core/model_registry.py | 29 +++++++++++++++++++++++------ 4 files changed, 68 insertions(+), 15 deletions(-) diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py index 5823c23..2acfc8f 100644 --- a/app/api/v1/timepoints.py +++ b/app/api/v1/timepoints.py @@ -258,20 +258,47 @@ def resolve_model_policy( 2. model_policy="permissive" → auto-select open-weight models 3. None → let preset / settings defaults handle it + When model_policy="permissive", explicit models are validated against + the PERMISSIVE_PREFIXES allowlist. Proprietary models (OpenAI, Anthropic, + Google Gemini) are rejected with 422. + Returns: (text_model, image_model) to pass to the pipeline. + + Raises: + HTTPException: 422 if explicit models violate permissive policy. """ + from app.core.model_policy import is_model_permissive + text_model = request.text_model image_model = request.image_model if request.model_policy and request.model_policy.lower() == "permissive": + # Validate explicit models against permissive allowlist + if text_model and not is_model_permissive(text_model): + raise HTTPException( + status_code=422, + detail=( + f"model_policy='permissive' requires open-weight models. " + f"'{text_model}' is proprietary. Use models from: " + f"meta-llama/, deepseek/, qwen/, mistralai/, microsoft/, google/gemma, allenai/, nvidia/" + ), + ) + if image_model and not is_model_permissive(image_model): + raise HTTPException( + status_code=422, + detail=( + f"model_policy='permissive' requires open-weight models. " + f"'{image_model}' is proprietary." + ), + ) + if not text_model: text_model = _get_permissive_text_model() logger.info("model_policy=permissive → text_model=%s", text_model) if not image_model: - # Use the best available OpenRouter image model (no Pollinations) from app.core.llm_router import get_image_fallback_model - image_model = get_image_fallback_model() + image_model = get_image_fallback_model(permissive_only=True) logger.info("model_policy=permissive → image_model=%s", image_model) return text_model, image_model diff --git a/app/core/llm_router.py b/app/core/llm_router.py index 64dcb7f..14d8b5b 100644 --- a/app/core/llm_router.py +++ b/app/core/llm_router.py @@ -81,12 +81,16 @@ def get_paid_fallback_model() -> str: return _PAID_FALLBACK_DEFAULT -def get_image_fallback_model() -> str: - """Get the best image fallback model, consulting the registry first.""" +def get_image_fallback_model(permissive_only: bool = False) -> str: + """Get the best image fallback model, consulting the registry first. + + Args: + permissive_only: If True, only return open-weight image models. + """ try: from app.core.model_registry import OpenRouterModelRegistry registry = OpenRouterModelRegistry.get_instance() - best = registry.get_best_image_model() + best = registry.get_best_image_model(permissive_only=permissive_only) if best: return best except Exception: diff --git a/app/core/model_policy.py b/app/core/model_policy.py index f832652..b088a06 100644 --- a/app/core/model_policy.py +++ b/app/core/model_policy.py @@ -35,6 +35,14 @@ def derive_model_provider(model_id: str | None) -> str: return "google" +def is_model_permissive(model_id: str | None) -> bool: + """Check if a model ID is open-weight / permissively licensed.""" + if not model_id: + return False + lower = model_id.lower() + return any(lower.startswith(p) for p in PERMISSIVE_PREFIXES) + + def derive_model_permissiveness(model_id: str | None) -> str: """Derive distillation licensing permissiveness from a model ID. @@ -44,7 +52,4 @@ def derive_model_permissiveness(model_id: str | None) -> str: """ if not model_id: return "unknown" - lower = model_id.lower() - if any(lower.startswith(p) for p in PERMISSIVE_PREFIXES): - return "permissive" - return "restricted" + return "permissive" if is_model_permissive(model_id) else "restricted" diff --git a/app/core/model_registry.py b/app/core/model_registry.py index 60753db..eb19271 100644 --- a/app/core/model_registry.py +++ b/app/core/model_registry.py @@ -166,15 +166,21 @@ def get_best_text_model(self, prefer_free: bool = False) -> str | None: candidates.sort(key=lambda x: x[1], reverse=True) return candidates[0][0] - def get_best_image_model(self) -> str | None: + def get_best_image_model(self, permissive_only: bool = False) -> str | None: """Find the best available image generation model. Heuristic: filter to models with "image" in output_modalities, - prefer gemini models. + prefer gemini models (unless permissive_only is set). + + Args: + permissive_only: If True, only return open-weight models + (no Google, OpenAI, Anthropic). Returns: Model ID or None if cache is empty. """ + from app.core.model_policy import is_model_permissive + if not self._models: return None @@ -187,18 +193,29 @@ def get_best_image_model(self) -> str | None: if not isinstance(output_mods, list) or "image" not in output_mods: continue + if permissive_only and not is_model_permissive(model_id): + continue + ctx = info.get("context_length", 0) if "gemini" in model_id.lower(): gemini_candidates.append((model_id, ctx)) else: other_candidates.append((model_id, ctx)) - # Prefer gemini models, sorted by context_length desc - if gemini_candidates: - gemini_candidates.sort(key=lambda x: x[1], reverse=True) - return gemini_candidates[0][0] + if not permissive_only: + # Prefer gemini models, sorted by context_length desc + if gemini_candidates: + gemini_candidates.sort(key=lambda x: x[1], reverse=True) + return gemini_candidates[0][0] + if other_candidates: other_candidates.sort(key=lambda x: x[1], reverse=True) return other_candidates[0][0] + # Fallback: if permissive_only but no permissive image models found, + # return gemini via OpenRouter (still better than nothing) + if gemini_candidates: + gemini_candidates.sort(key=lambda x: x[1], reverse=True) + return gemini_candidates[0][0] + return None