diff --git a/app/agents/image_prompt_optimizer.py b/app/agents/image_prompt_optimizer.py index 283862d..3127d96 100644 --- a/app/agents/image_prompt_optimizer.py +++ b/app/agents/image_prompt_optimizer.py @@ -100,7 +100,7 @@ class ImagePromptOptimizerOutput(BaseModel): description="Elements removed to reduce complexity" ) - issues_found: list[PromptIssue] = Field( + issues_found: list[PromptIssue | str] = Field( default_factory=list, description="Quality issues detected and addressed" ) @@ -282,7 +282,7 @@ async def run( # Log significant issues critical_issues = [ i for i in result.content.issues_found - if i.severity == "critical" + if isinstance(i, PromptIssue) and i.severity == "critical" ] if critical_issues: logger.warning( diff --git a/app/api/v1/timepoints.py b/app/api/v1/timepoints.py index 2acfc8f..fd40669 100644 --- a/app/api/v1/timepoints.py +++ b/app/api/v1/timepoints.py @@ -1063,6 +1063,8 @@ async def generate_timepoint_sync( resp.request_context = request.request_context return resp + except HTTPException: + raise # Let validation errors (e.g. 
422 from model policy) pass through except Exception as e: logger.error(f"Sync generation failed: {e}") raise HTTPException(status_code=500, detail=str(e)) from e diff --git a/app/core/llm_router.py b/app/core/llm_router.py index 14d8b5b..9ad4f2c 100644 --- a/app/core/llm_router.py +++ b/app/core/llm_router.py @@ -66,6 +66,8 @@ # Static fallback defaults (used when model registry has no data) _PAID_FALLBACK_DEFAULT = VerifiedModels.OPENROUTER_TEXT[0] # google/gemini-2.0-flash-001 _IMAGE_FALLBACK_DEFAULT = "google/gemini-2.5-flash-image-preview" +# FLUX for permissive mode — fully open-weight image generation via OpenRouter +_IMAGE_FALLBACK_PERMISSIVE = "black-forest-labs/flux.2-pro" def get_paid_fallback_model() -> str: @@ -95,6 +97,9 @@ def get_image_fallback_model(permissive_only: bool = False) -> str: return best except Exception: pass + # Permissive mode: registry lookup failed, so fall back to FLUX via OpenRouter to preserve the Google-free guarantee + if permissive_only: + return _IMAGE_FALLBACK_PERMISSIVE return _IMAGE_FALLBACK_DEFAULT # Rate limit retry settings diff --git a/app/core/model_policy.py b/app/core/model_policy.py index b088a06..ff8145c 100644 --- a/app/core/model_policy.py +++ b/app/core/model_policy.py @@ -9,18 +9,19 @@ "meta-llama/", "deepseek/", "qwen/", - "mistralai/", # Mistral open-weight models (Apache 2.0) - "microsoft/", # Phi family - "google/gemma", # Gemma open-weight + "mistralai/", # Mistral open-weight models (Apache 2.0) + "microsoft/", # Phi family + "google/gemma", # Gemma open-weight "allenai/", "nvidia/", + "black-forest-labs/", # FLUX open-weight image models ) # Google-native model prefixes (always restricted) GOOGLE_MODEL_PREFIXES = ("gemini", "imagen", "flux-schnell") # Prefixes routed through OpenRouter (may be restricted or permissive) -OPENROUTER_PREFIXES = ("meta-llama/", "anthropic/", "mistralai/", "openai/", "deepseek/", "qwen/", "microsoft/") +OPENROUTER_PREFIXES = ("meta-llama/", "anthropic/", "mistralai/", "openai/", "deepseek/", "qwen/", 
"microsoft/", "black-forest-labs/") def derive_model_provider(model_id: str | None) -> str: diff --git a/app/core/providers/openrouter.py b/app/core/providers/openrouter.py index c138067..0b0eb64 100644 --- a/app/core/providers/openrouter.py +++ b/app/core/providers/openrouter.py @@ -391,7 +391,8 @@ async def generate_image( "content": f"Generate an image: {prompt}", } ], - "modalities": ["image", "text"], # Request image output + # Image-only models (FLUX) need ["image"]; multimodal models need both + "modalities": ["image"] if "flux" in model.lower() else ["image", "text"], } try: diff --git a/docs/DOWNSTREAM_MODEL_CONTROL.md b/docs/DOWNSTREAM_MODEL_CONTROL.md index a6168b9..d630cdc 100644 --- a/docs/DOWNSTREAM_MODEL_CONTROL.md +++ b/docs/DOWNSTREAM_MODEL_CONTROL.md @@ -2,7 +2,7 @@ **For teams building on TIMEPOINT Flash (Web App, iPhone App, Clockchain, Billing, Enterprise integrations)** -TIMEPOINT Flash now supports full downstream control of model selection and generation hyperparameters on every generation request. Downstream apps can set `model_policy: "permissive"` to route all 14 pipeline agents through open-weight models (DeepSeek, Llama, Qwen, Mistral) via OpenRouter for both text and images — making the entire pipeline fully Google-free with zero Google API calls, including grounding. Apps can also specify exact models by name using `text_model` and `image_model` (any OpenRouter-compatible model ID like `qwen/qwen3-235b-a22b` or Google native like `gemini-2.5-flash`), and these explicit overrides take priority over `model_policy`, which in turn takes priority over `preset`. In addition, the new `llm_params` object provides fine-grained control over generation hyperparameters — temperature, max_tokens, top_p, top_k, frequency/presence/repetition penalties, stop sequences, thinking level, and system prompt injection (prefix/suffix) — all applied uniformly across every agent in the pipeline. 
Request-level `llm_params` override each agent's built-in defaults, so setting `temperature: 0.3` overrides the scene agent's default of 0.7, the dialog agent's default of 0.85, etc. All of these controls are composable: you can combine `model_policy`, explicit models, `preset`, and `llm_params` in the same request. +TIMEPOINT Flash now supports full downstream control of model selection and generation hyperparameters on every generation request. Downstream apps can set `model_policy: "permissive"` to route all 14 pipeline agents through open-weight models (DeepSeek, Llama, Qwen, Mistral) via OpenRouter for text, and FLUX (Black Forest Labs) for images — making the entire pipeline fully Google-free with zero Google API calls, including grounding. Apps can also specify exact models by name using `text_model` and `image_model` (any OpenRouter-compatible model ID like `qwen/qwen3-235b-a22b` or Google native like `gemini-2.5-flash`), and these explicit overrides take priority over `model_policy`, which in turn takes priority over `preset`. In addition, the new `llm_params` object provides fine-grained control over generation hyperparameters — temperature, max_tokens, top_p, top_k, frequency/presence/repetition penalties, stop sequences, thinking level, and system prompt injection (prefix/suffix) — all applied uniformly across every agent in the pipeline. Request-level `llm_params` override each agent's built-in defaults, so setting `temperature: 0.3` overrides the scene agent's default of 0.7, the dialog agent's default of 0.85, etc. All of these controls are composable: you can combine `model_policy`, explicit models, `preset`, and `llm_params` in the same request. 
## Request Parameters diff --git a/docs/DOWNSTREAM_NOTICE_2026-03-11.md b/docs/DOWNSTREAM_NOTICE_2026-03-11.md new file mode 100644 index 0000000..5e21c17 --- /dev/null +++ b/docs/DOWNSTREAM_NOTICE_2026-03-11.md @@ -0,0 +1,105 @@ +# DOWNSTREAM NOTICE — 2026-03-11 + +**Breaking changes to `model_policy: "permissive"` and image generation** + +Affects: Web App, iPhone App, Clockchain, Billing, Enterprise integrations + +--- + +## What changed + +### 1. Permissive mode now enforces open-weight models (BREAKING) + +Previously, `model_policy: "permissive"` was advisory — it labeled model provenance in the response but did not block proprietary models. **It now enforces.** + +If you send `model_policy: "permissive"` with an explicit `text_model` or `image_model` that is proprietary, the request will be rejected with **HTTP 422**: + +```json +{ + "detail": "model_policy='permissive' requires open-weight models. 'openai/gpt-4o' is proprietary. Use models from: meta-llama/, deepseek/, qwen/, mistralai/, microsoft/, google/gemma, allenai/, nvidia/" +} +``` + +**Action required:** If you pass explicit model IDs alongside `model_policy: "permissive"`, ensure they are from the open-weight allowlist: + +| Prefix | Examples | +|--------|----------| +| `meta-llama/` | `meta-llama/llama-4-scout-17b-16e-instruct` | +| `deepseek/` | `deepseek/deepseek-chat-v3-0324` | +| `qwen/` | `qwen/qwen3-235b-a22b`, `qwen/qwen3-30b-a3b` | +| `mistralai/` | `mistralai/mistral-small-3.2-24b-instruct` | +| `microsoft/` | Phi family | +| `google/gemma` | Gemma open-weight models only (not Gemini) | +| `allenai/` | OLMo family | +| `nvidia/` | Nemotron family | +| `black-forest-labs/` | FLUX image models (`flux.2-pro`, `flux.2-max`, `flux.2-flex`) | + +If you omit `text_model` / `image_model` with permissive mode, Flash auto-selects the best available open-weight model from the registry. No action needed in that case. + +### 2. 
Pollinations removed — all images via OpenRouter (BREAKING if you used `image_model: "pollinations"`) + +Pollinations.ai has been removed entirely as an image provider. The image fallback chain is now: + +- **Before:** Google → OpenRouter → Pollinations (3-tier) +- **After:** Google → OpenRouter (2-tier) + +If you were passing `image_model: "pollinations"` in requests, this will now fall through to OpenRouter image models. Remove any explicit `"pollinations"` references. + +### 3. Permissive mode is dramatically faster + +Generation with `model_policy: "permissive"` was timing out at 600s due to DeepSeek R1 (a thinking model, 30-60s per call) being selected early in the fallback chain. Fixed: + +- **Model preference reordered:** Fast chat models first (Llama 4 Scout, DeepSeek Chat V3, Qwen3-30B). DeepSeek R1 is now last resort. +- **Dialog batched:** Permissive mode uses batch dialog (1 LLM call) instead of sequential (7 calls). +- **Critique loop skipped:** Permissive mode skips the dialog critique/refinement pass. +- **Default max_tokens=2048:** Applied when no preset is specified, preventing unbounded generation. + +Expected latency: ~2 minutes (down from 10+ minutes / timeout). + +### 4. Google fallback blocked in permissive mode + +The LLM router previously fell back to Google Gemini when OpenRouter calls failed, even in permissive mode. This violated the Google-free guarantee. The router now skips Google fallback entirely when `model_policy: "permissive"`. 
+ +--- + +## Migration checklist + +- [ ] **Search for `"pollinations"` in your codebase** — remove any explicit references as an `image_model` value +- [ ] **If you send explicit models with `model_policy: "permissive"`** — verify they match the allowlist prefixes above, or handle 422 responses +- [ ] **If you had long timeouts for permissive mode** — you can likely reduce them (2-3 minutes is sufficient now) +- [ ] **No action needed if** you use `model_policy: "permissive"` without explicit model overrides — auto-selection handles everything + +## Request examples + +**Simplest permissive request (recommended):** +```json +{ + "query": "The signing of the Magna Carta, 1215", + "generate_image": true, + "model_policy": "permissive" +} +``` + +**Permissive with explicit open-weight model:** +```json +{ + "query": "Apollo 11 Moon Landing, 1969", + "model_policy": "permissive", + "text_model": "qwen/qwen3-235b-a22b", + "generate_image": true +} +``` + +**This will now fail (422):** +```json +{ + "query": "D-Day, 1944", + "model_policy": "permissive", + "text_model": "openai/gpt-4o" +} +``` + +--- + +*Deployed to production: 2026-03-11* +*PR (open-source): https://github.com/timepointai/timepoint-flash/pull/16*