From 633121f543439d4a5dc18a5918dceba94b3ef089 Mon Sep 17 00:00:00 2001
From: zy6p
Date: Tue, 14 Apr 2026 13:41:00 +0800
Subject: [PATCH] Fix Gemini media prompt sanitization

---
 src/api/routes.py | 77 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/src/api/routes.py b/src/api/routes.py
index 2b20b099..7d41ec92 100644
--- a/src/api/routes.py
+++ b/src/api/routes.py
@@ -28,6 +28,31 @@
 MARKDOWN_IMAGE_RE = re.compile(r"!\[.*?\]\((.*?)\)")
 HTML_VIDEO_RE = re.compile(r"<video[^>]+src=['\"](.*?)['\"]", re.IGNORECASE)
 DATA_URL_RE = re.compile(r"^data:(?P<mime>[^;]+);base64,(?P<data>.+)$", re.DOTALL)
+MEDIA_PROMPT_TOOL_BLOCK_RE = re.compile(r"<tools>.*?</tools>", re.IGNORECASE | re.DOTALL)
+MEDIA_SYSTEM_INSTRUCTION_MARKERS = (
+    "<tools>",
+    "</tools>",
+    "function calling ai model",
+    "function signatures",
+    "\"$schema\"",
+    "\"additionalproperties\"",
+)
+MEDIA_PROMPT_PREAMBLE_PATTERNS = (
+    re.compile(r"^you are a function calling ai model\.?$", re.IGNORECASE),
+    re.compile(
+        r"^you are provided with function signatures within .* xml tags\.?$",
+        re.IGNORECASE,
+    ),
+    re.compile(
+        r"^you may call one or more functions to assist with the user query\.?$",
+        re.IGNORECASE,
+    ),
+    re.compile(
+        r"^don't make assumptions about what values to plug into functions\.?$",
+        re.IGNORECASE,
+    ),
+    re.compile(r"^here are the available tools:.*$", re.IGNORECASE),
+)
 GEMINI_STATUS_MAP = {
     400: "INVALID_ARGUMENT",
     401: "UNAUTHENTICATED",
@@ -225,6 +250,40 @@ def _extract_text_from_gemini_content(content: Optional[GeminiContent]) -> str:
     return "\n".join(part for part in text_parts if part).strip()
 
 
+def _should_ignore_media_system_instruction(system_instruction: str) -> bool:
+    """Drop agent/tool scaffolding before sending media prompts upstream."""
+    if not system_instruction:
+        return False
+
+    normalized = system_instruction.lower()
+    if len(system_instruction) > 1200:
+        return True
+
+    return any(marker in normalized for marker in MEDIA_SYSTEM_INSTRUCTION_MARKERS)
+
+
+def _sanitize_media_prompt(prompt: str) -> str:
+    """Strip agent/tool scaffolding that image/video models cannot use."""
+    if not prompt:
+        return ""
+
+    sanitized = MEDIA_PROMPT_TOOL_BLOCK_RE.sub(" ", prompt.strip())
+    cleaned_lines: List[str] = []
+    for raw_line in sanitized.splitlines():
+        line = raw_line.strip()
+        if not line:
+            if cleaned_lines and cleaned_lines[-1] != "":
+                cleaned_lines.append("")
+            continue
+        if any(pattern.fullmatch(line) for pattern in MEDIA_PROMPT_PREAMBLE_PATTERNS):
+            continue
+        cleaned_lines.append(line)
+
+    sanitized = "\n".join(cleaned_lines).strip()
+    sanitized = re.sub(r"\n{3,}", "\n\n", sanitized)
+    return sanitized.strip()
+
+
 async def _extract_prompt_and_images_from_openai_messages(
     messages: List[ChatMessage],
 ) -> tuple[str, List[bytes]]:
@@ -382,13 +441,27 @@ async def _normalize_gemini_request(
     model: str,
     request: GeminiGenerateContentRequest,
 ) -> NormalizedGenerationRequest:
+    resolved_model = _resolve_request_model(model, request)
     prompt, images = await _extract_prompt_and_images_from_gemini_contents(request.contents)
     system_instruction = _extract_text_from_gemini_content(request.systemInstruction)
+    model_config = MODEL_CONFIG.get(resolved_model)
+    media_model = bool(model_config and model_config.get("type") in {"image", "video"})
+
+    if media_model:
+        prompt = _sanitize_media_prompt(prompt)
+
     if system_instruction:
-        prompt = f"{system_instruction}\n\n{prompt}".strip()
+        if media_model and _should_ignore_media_system_instruction(system_instruction):
+            debug_logger.log_warning(
+                f"[GEMINI] 忽略媒体模型的 systemInstruction: model={resolved_model}, len={len(system_instruction)}"
+            )
+        else:
+            if media_model:
+                system_instruction = _sanitize_media_prompt(system_instruction)
+            prompt = f"{system_instruction}\n\n{prompt}".strip()
 
     return NormalizedGenerationRequest(
-        model=_resolve_request_model(model, request),
+        model=resolved_model,
         prompt=prompt,
         images=images,
     )