From 7f03c9e41f1aa36793a7894d0a66937de6441436 Mon Sep 17 00:00:00 2001
From: YGYOOO <594487125@qq.com>
Date: Tue, 5 May 2026 19:57:42 +0800
Subject: [PATCH] feat: enhance image generation support for Google AI Studio

- Updated environment configuration to allow image generation models to use Google's native API.
- Modified README files to reflect changes in model configuration and usage instructions.
- Implemented logic in image generation functions to handle requests for both OpenAI and Google-native providers.
---
 .env.example                                  |  13 +-
 README.md                                     |   9 +-
 README_EN.md                                  |   9 +-
 .../character/src/models/gemini-flash-img.mjs | 117 ++++++++++---
 .../map/src/models/gemini-flash-img.mjs       | 157 +++++++++++++-----
 5 files changed, 238 insertions(+), 67 deletions(-)

diff --git a/.env.example b/.env.example
index 2ff4145..dd1b1f0 100644
--- a/.env.example
+++ b/.env.example
@@ -1,11 +1,11 @@
 # ============================================================
 # WorldX 环境配置 / Environment Configuration
 # ============================================================
-# 所有模型均使用 OpenAI 兼容协议（chat/completions）。
-# 你可以使用 OpenRouter、Google AI Studio、或任何兼容平台。
+# 除绘图模型可使用 Google 原生图片接口外，其余模型使用 OpenAI 兼容协议（chat/completions）。
+# 你可以使用 OpenRouter、Google AI Studio、或任何兼容平台；绘图模型见 IMAGE_GEN_PROVIDER。
 #
-# All models use the OpenAI-compatible protocol (chat/completions).
-# You can use OpenRouter, Google AI Studio, or any compatible platform.
+# All models use the OpenAI-compatible chat/completions protocol, except that Image Generation can also use Google's native image API.
+# You can use OpenRouter, Google AI Studio, or any compatible platform; see IMAGE_GEN_PROVIDER for image generation.
 # ============================================================
 
 # --- 编排引擎 / Orchestrator ---
@@ -18,6 +18,9 @@ ORCHESTRATOR_MODEL=
 # --- 绘图模型 / Image Generation ---
 # 用于生成地图和角色立绘，建议使用gemini-3.1-flash-image-preview或更强文生图模型。
 # Used for map and character art. Recommend gemini-3.1-flash-image-preview or a stronger text-to-image model.
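+# 可选：openai-compatible（默认）或 google-native。Google AI Studio 图片生成请使用 google-native；留空且 IMAGE_GEN_BASE_URL 指向 generativelanguage.googleapis.com 时会自动使用 google-native。
+# Optional: openai-compatible (default) or google-native. Use google-native for Google AI Studio image generation; if left empty while IMAGE_GEN_BASE_URL points at generativelanguage.googleapis.com, google-native is selected automatically.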
+IMAGE_GEN_PROVIDER=
 IMAGE_GEN_BASE_URL=
 IMAGE_GEN_API_KEY=
 IMAGE_GEN_MODEL=
@@ -53,6 +56,7 @@ SIMULATION_MODEL=
 # ORCHESTRATOR_MODEL=google/gemini-2.5-pro-preview
 #
 # IMAGE_GEN_BASE_URL=https://openrouter.ai/api/v1
+# IMAGE_GEN_PROVIDER=openai-compatible
 # IMAGE_GEN_API_KEY=sk-or-v1-xxxx
 # IMAGE_GEN_MODEL=google/gemini-3.1-flash-image-preview
 #
@@ -73,6 +77,7 @@ SIMULATION_MODEL=
 # ORCHESTRATOR_API_KEY=AIzaSy...
 # ORCHESTRATOR_MODEL=gemini-2.5-pro-preview
 #
+# IMAGE_GEN_PROVIDER=google-native
 # IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai
 # IMAGE_GEN_API_KEY=AIzaSy...
 # IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview
diff --git a/README.md b/README.md
index 7fb9374..a3bbd9b 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ npm run create -- "赛博朋克风格的深夜拉面馆，黑客和仿生人在
 
 ## 模型配置
 
-WorldX 使用 **4 个模型角色**，各自独立配置。全部采用 OpenAI 兼容的 `chat/completions` 协议 —— 任何兼容平台均可使用。
+WorldX 使用 **4 个模型角色**，各自独立配置。除绘图模型支持 Google AI Studio 原生图片接口外，其余角色均采用 OpenAI 兼容的 `chat/completions` 协议。
 
 
 | 角色       | 环境变量前缀          | 用途             | 推荐模型                                      |
@@ -119,7 +119,7 @@ WorldX 使用 **4 个模型角色**，各自独立配置。全部采用 OpenAI 
 | **世界驱动** | `SIMULATION_`   | 驱动运行时角色行为      | 任意模型，便宜的就行（如 `gemini-2.5-flash`）          |
 
 
-每个角色需要 3 个环境变量：
+每个角色通常需要 3 个环境变量：
 
 ```env
 {ROLE}_BASE_URL=https://openrouter.ai/api/v1    # API 地址
@@ -127,6 +127,8 @@ WorldX 使用 **4 个模型角色**，各自独立配置。全部采用 OpenAI 
 {ROLE}_MODEL=google/gemini-3.1-pro-preview       # 模型标识
 ```
 
+绘图模型可额外设置 `IMAGE_GEN_PROVIDER`，可选值为 `openai-compatible`（默认，适合 OpenRouter）或 `google-native`（适合 Google AI Studio 图片生成）；留空且 `IMAGE_GEN_BASE_URL` 指向 `generativelanguage.googleapis.com` 时会自动使用 `google-native`。
+
 ### 平台配置示例
 
 <details>
@@ -140,6 +142,7 @@ ORCHESTRATOR_API_KEY=sk-or-v1-xxxx
 ORCHESTRATOR_MODEL=google/gemini-3.1-pro-preview
 
 IMAGE_GEN_BASE_URL=https://openrouter.ai/api/v1
+IMAGE_GEN_PROVIDER=openai-compatible
 IMAGE_GEN_API_KEY=sk-or-v1-xxxx
 IMAGE_GEN_MODEL=google/gemini-3.1-flash-image-preview
 
@@ -165,6 +168,7 @@ ORCHESTRATOR_API_KEY=AIzaSy...
 ORCHESTRATOR_MODEL=gemini-3.1-pro-preview
 
 IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai
+IMAGE_GEN_PROVIDER=google-native
 IMAGE_GEN_API_KEY=AIzaSy...
 IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview
 
@@ -192,6 +196,7 @@ ORCHESTRATOR_MODEL=gemini-3.1-pro-preview
 
 # 美术生成 — Google AI Studio
 IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai
+IMAGE_GEN_PROVIDER=google-native
 IMAGE_GEN_API_KEY=AIzaSy...
 IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview
 
diff --git a/README_EN.md b/README_EN.md
index 3ef4031..4eb0679 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -95,7 +95,7 @@ npm run create -- "A cyberpunk noodle shop where hackers and androids share rumo
 
 ## Model Configuration
 
-WorldX uses **4 model roles**, each configurable independently. All use the OpenAI-compatible `chat/completions` protocol — any compatible platform works.
+WorldX uses **4 model roles**, each configurable independently. All roles use the OpenAI-compatible `chat/completions` protocol except Image Gen, which can also use Google AI Studio's native image API.
 
 | Role | Env Prefix | What It Does | Recommended |
 |------|-----------|-------------|-------------|
@@ -104,7 +104,7 @@ WorldX uses **4 model roles**, each configurable independently. All use the Open
 | **Vision** | `VISION_` | Reviews map quality, locates regions/elements | Strong multimodal model (e.g. `gemini-3.1-pro-preview`) |
 | **Simulation** | `SIMULATION_` | Drives runtime character behavior | Any model — cheaper is fine (e.g. `gemini-2.5-flash`) |
 
-Each role needs 3 env vars:
+Each role usually needs 3 env vars:
 
 ```env
 {ROLE}_BASE_URL=https://openrouter.ai/api/v1    # API base URL
@@ -112,6 +112,8 @@ Each role needs 3 env vars:
 {ROLE}_MODEL=google/gemini-3.1-pro-preview       # Model identifier
 ```
 
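+Image Gen can additionally set `IMAGE_GEN_PROVIDER` to `openai-compatible` (default, for OpenRouter) or `google-native` (for Google AI Studio image generation). If it is left empty and `IMAGE_GEN_BASE_URL` points at `generativelanguage.googleapis.com`, `google-native` is selected automatically.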
+
 ### Platform Examples
 
 <details>
@@ -125,6 +127,7 @@ ORCHESTRATOR_API_KEY=sk-or-v1-xxxx
 ORCHESTRATOR_MODEL=google/gemini-3.1-pro-preview
 
 IMAGE_GEN_BASE_URL=https://openrouter.ai/api/v1
+IMAGE_GEN_PROVIDER=openai-compatible
 IMAGE_GEN_API_KEY=sk-or-v1-xxxx
 IMAGE_GEN_MODEL=google/gemini-3.1-flash-image-preview
 
@@ -150,6 +153,7 @@ ORCHESTRATOR_API_KEY=AIzaSy...
 ORCHESTRATOR_MODEL=gemini-3.1-pro-preview
 
 IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai
+IMAGE_GEN_PROVIDER=google-native
 IMAGE_GEN_API_KEY=AIzaSy...
 IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview
 
@@ -177,6 +181,7 @@ ORCHESTRATOR_MODEL=gemini-3.1-pro-preview
 
 # Art generation — Google AI Studio
 IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai
+IMAGE_GEN_PROVIDER=google-native
 IMAGE_GEN_API_KEY=AIzaSy...
 IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview
 
diff --git a/generators/character/src/models/gemini-flash-img.mjs b/generators/character/src/models/gemini-flash-img.mjs
index fb743b6..4b53749 100644
--- a/generators/character/src/models/gemini-flash-img.mjs
+++ b/generators/character/src/models/gemini-flash-img.mjs
@@ -7,8 +7,51 @@ const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1";
 const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview";
 const MODEL = process.env.IMAGE_GEN_MODEL || DEFAULT_MODEL;
 const BASE_URL = process.env.IMAGE_GEN_BASE_URL || DEFAULT_BASE_URL;
+const PROVIDER = (process.env.IMAGE_GEN_PROVIDER || "").trim().toLowerCase();
 const REQUEST_TIMEOUT_MS = parseInt(process.env.IMAGE_GEN_TIMEOUT_MS || "180000", 10);
 
+// True when image requests should go to Google's native generateContent API.
+// An explicit IMAGE_GEN_PROVIDER wins; when it is empty, the base URL host decides.
+function useGoogleNativeProvider() {
+  if (PROVIDER) return ["google-native", "google"].includes(PROVIDER);
+  return BASE_URL.includes("generativelanguage.googleapis.com");
+}
+
+// Native base URL: trailing slashes and an OpenAI-compat "/openai" suffix removed.
+function getGoogleNativeBaseUrl() {
+  const suffix = "/openai";
+  const base = BASE_URL.replace(/\/+$/, "");
+  return base.endsWith(suffix) ? base.slice(0, base.length - suffix.length) : base;
+}
+
+// Model id without a leading "google/" and then without a leading "models/".
+function getGoogleNativeModel() {
+  return [/^google\//, /^models\//].reduce((id, prefix) => id.replace(prefix, ""), MODEL);
+}
+
+// Native generateContent endpoint. The API key is NOT placed in the URL (URLs leak into logs).
+function buildGoogleNativeUrl() {
+  return `${getGoogleNativeBaseUrl()}/models/${encodeURIComponent(getGoogleNativeModel())}:generateContent`;
+}
+
+// Request body: one user turn made of `parts`, asking for text and image output.
+function buildGoogleNativeBody(parts) {
+  return {
+    contents: [{ role: "user", parts }],
+    generationConfig: { responseModalities: ["TEXT", "IMAGE"] },
+  };
+}
+
+// POST `parts` to the native API; authenticates with the x-goog-api-key header.
+async function postGoogleNativeImage(parts, { apiKey, signal }) {
+  return fetch(buildGoogleNativeUrl(), {
+    method: "POST",
+    headers: { "Content-Type": "application/json", "x-goog-api-key": apiKey },
+    body: JSON.stringify(buildGoogleNativeBody(parts)),
+    signal,
+  });
+}
+
 /**
  * Image editing: send reference image + text instruction -> new image.
  * @param {string} text  - generation instruction
@@ -23,31 +66,42 @@ export async function editImage(text, imageBuffer, { imageSize = "1K" } = {}) {
   const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
 
   try {
-    const res = await fetch(`${BASE_URL}/chat/completions`, {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${API_KEY}`,
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        model: MODEL,
-        messages: [
+    const res = useGoogleNativeProvider()
+      ? await postGoogleNativeImage(
+          [
+            { text },
+            { inlineData: { mimeType: "image/png", data: base64 } },
+          ],
           {
-            role: "user",
-            content: [
-              { type: "text", text },
+            apiKey: API_KEY,
+            signal: controller.signal,
+          },
+        )
+      : await fetch(`${BASE_URL}/chat/completions`, {
+          method: "POST",
+          headers: {
+            Authorization: `Bearer ${API_KEY}`,
+            "Content-Type": "application/json",
+          },
+          body: JSON.stringify({
+            model: MODEL,
+            messages: [
               {
-                type: "image_url",
-                image_url: { url: `data:image/png;base64,${base64}` },
+                role: "user",
+                content: [
+                  { type: "text", text },
+                  {
+                    type: "image_url",
+                    image_url: { url: `data:image/png;base64,${base64}` },
+                  },
+                ],
               },
             ],
-          },
-        ],
-        modalities: ["image", "text"],
-        image_config: { image_size: imageSize },
-      }),
-      signal: controller.signal,
-    });
+            modalities: ["image", "text"],
+            image_config: { image_size: imageSize },
+          }),
+          signal: controller.signal,
+        });
 
     if (!res.ok) {
       const err = await res.text();
@@ -55,7 +109,9 @@ export async function editImage(text, imageBuffer, { imageSize = "1K" } = {}) {
     }
 
     const data = await res.json();
-    return extractImageBuffer(data);
+    return useGoogleNativeProvider()
+      ? extractGoogleNativeImageBuffer(data)
+      : extractImageBuffer(data);
   } catch (e) {
     if (e.name === "AbortError") {
       throw new Error(`Image Gen Edit request timed out after ${REQUEST_TIMEOUT_MS / 1000}s`);
@@ -93,3 +149,20 @@ function extractImageBuffer(data) {
 
   throw new Error("No image found in Image Gen response");
 }
+
+// Find the first part carrying inline image data (camelCase or snake_case key)
+// and return it base64-decoded; throws when the response holds no image.
+function extractGoogleNativeImageBuffer(data) {
+  const candidateList = Array.isArray(data.candidates) ? data.candidates : [];
+  for (const { content } of candidateList) {
+    const partList = Array.isArray(content?.parts) ? content.parts : [];
+    for (const part of partList) {
+      const encoded = (part.inlineData || part.inline_data)?.data;
+      if (encoded) {
+        return Buffer.from(encoded, "base64");
+      }
+    }
+  }
+
+  throw new Error("No image found in Google native Image Gen response");
+}
diff --git a/generators/map/src/models/gemini-flash-img.mjs b/generators/map/src/models/gemini-flash-img.mjs
index 8c76d7a..b852c8e 100644
--- a/generators/map/src/models/gemini-flash-img.mjs
+++ b/generators/map/src/models/gemini-flash-img.mjs
@@ -10,6 +10,7 @@ const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1";
 const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview";
 const MODEL = process.env.IMAGE_GEN_MODEL || DEFAULT_MODEL;
 const BASE_URL = process.env.IMAGE_GEN_BASE_URL || DEFAULT_BASE_URL;
+const PROVIDER = (process.env.IMAGE_GEN_PROVIDER || "").trim().toLowerCase();
 const DEFAULT_REQUEST_TIMEOUT_MS = parseInt(process.env.IMAGE_GEN_TIMEOUT_MS || "180000", 10);
 const MAX_CONSECUTIVE_FAILURES = 2;
 
@@ -42,6 +43,48 @@ function resolveRequestTimeoutMs(requestTimeoutMs, timeoutEnvKey) {
   return DEFAULT_REQUEST_TIMEOUT_MS;
 }
 
+// True when image requests should go to Google's native generateContent API.
+// An explicit IMAGE_GEN_PROVIDER wins; when it is empty, the base URL host decides.
+function useGoogleNativeProvider() {
+  if (PROVIDER) return ["google-native", "google"].includes(PROVIDER);
+  return BASE_URL.includes("generativelanguage.googleapis.com");
+}
+
+// Native base URL: trailing slashes and an OpenAI-compat "/openai" suffix removed.
+function getGoogleNativeBaseUrl() {
+  const suffix = "/openai";
+  const base = BASE_URL.replace(/\/+$/, "");
+  return base.endsWith(suffix) ? base.slice(0, base.length - suffix.length) : base;
+}
+
+// Model id without a leading "google/" and then without a leading "models/".
+function getGoogleNativeModel() {
+  return [/^google\//, /^models\//].reduce((id, prefix) => id.replace(prefix, ""), MODEL);
+}
+
+// Native generateContent endpoint. The API key is NOT placed in the URL (URLs leak into logs).
+function buildGoogleNativeUrl() {
+  return `${getGoogleNativeBaseUrl()}/models/${encodeURIComponent(getGoogleNativeModel())}:generateContent`;
+}
+
+// Request body: one user turn made of `parts`, asking for text and image output.
+function buildGoogleNativeBody(parts) {
+  return {
+    contents: [{ role: "user", parts }],
+    generationConfig: { responseModalities: ["TEXT", "IMAGE"] },
+  };
+}
+
+// POST `parts` to the native API; authenticates with the x-goog-api-key header.
+async function postGoogleNativeImage(parts, { apiKey, signal }) {
+  return fetch(buildGoogleNativeUrl(), {
+    method: "POST",
+    headers: { "Content-Type": "application/json", "x-goog-api-key": apiKey },
+    body: JSON.stringify(buildGoogleNativeBody(parts)),
+    signal,
+  });
+}
+
 /**
  * Text-to-image generation.
  * @returns {Buffer} PNG image buffer
@@ -59,20 +102,28 @@ export async function generateImage(
     const timer = setTimeout(() => controller.abort(), timeoutMs);
 
     try {
-      const res = await fetch(`${BASE_URL}/chat/completions`, {
-        method: "POST",
-        headers: {
-          Authorization: `Bearer ${API_KEY}`,
-          "Content-Type": "application/json",
-        },
-        body: JSON.stringify({
-          model: MODEL,
-          messages: [{ role: "user", content: prompt }],
-          modalities: ["image", "text"],
-          image_config: { aspect_ratio: aspectRatio, image_size: imageSize },
-        }),
-        signal: controller.signal,
-      });
+      const nativePrompt = useGoogleNativeProvider()
+        ? `${prompt}\n\nGenerate the image in ${aspectRatio} aspect ratio.`
+        : prompt;
+      const res = useGoogleNativeProvider()
+        ? await postGoogleNativeImage([{ text: nativePrompt }], {
+            apiKey: API_KEY,
+            signal: controller.signal,
+          })
+        : await fetch(`${BASE_URL}/chat/completions`, {
+            method: "POST",
+            headers: {
+              Authorization: `Bearer ${API_KEY}`,
+              "Content-Type": "application/json",
+            },
+            body: JSON.stringify({
+              model: MODEL,
+              messages: [{ role: "user", content: prompt }],
+              modalities: ["image", "text"],
+              image_config: { aspect_ratio: aspectRatio, image_size: imageSize },
+            }),
+            signal: controller.signal,
+          });
 
       if (!res.ok) {
         const err = await res.text();
@@ -82,7 +133,9 @@ export async function generateImage(
       }
 
       const data = await res.json();
-      const buf = extractImageBuffer(data);
+      const buf = useGoogleNativeProvider()
+        ? extractGoogleNativeImageBuffer(data)
+        : extractImageBuffer(data);
       logModelImageResponse(logStep, MODEL, "(returned to caller)", buf.length);
       return buf;
     } catch (e) {
@@ -116,31 +169,42 @@ export async function editImage(text, imageBuffer, { imageSize = "2K", logStep =
     const timer = setTimeout(() => controller.abort(), timeoutMs);
 
     try {
-      const res = await fetch(`${BASE_URL}/chat/completions`, {
-        method: "POST",
-        headers: {
-          Authorization: `Bearer ${API_KEY}`,
-          "Content-Type": "application/json",
-        },
-        body: JSON.stringify({
-          model: MODEL,
-          messages: [
+      const res = useGoogleNativeProvider()
+        ? await postGoogleNativeImage(
+            [
+              { text },
+              { inlineData: { mimeType: "image/png", data: base64 } },
+            ],
             {
-              role: "user",
-              content: [
-                { type: "text", text },
+              apiKey: API_KEY,
+              signal: controller.signal,
+            },
+          )
+        : await fetch(`${BASE_URL}/chat/completions`, {
+            method: "POST",
+            headers: {
+              Authorization: `Bearer ${API_KEY}`,
+              "Content-Type": "application/json",
+            },
+            body: JSON.stringify({
+              model: MODEL,
+              messages: [
                 {
-                  type: "image_url",
-                  image_url: { url: `data:image/png;base64,${base64}` },
+                  role: "user",
+                  content: [
+                    { type: "text", text },
+                    {
+                      type: "image_url",
+                      image_url: { url: `data:image/png;base64,${base64}` },
+                    },
+                  ],
                 },
               ],
-            },
-          ],
-          modalities: ["image", "text"],
-          image_config: { image_size: imageSize },
-        }),
-        signal: controller.signal,
-      });
+              modalities: ["image", "text"],
+              image_config: { image_size: imageSize },
+            }),
+            signal: controller.signal,
+          });
 
       if (!res.ok) {
         const err = await res.text();
@@ -150,7 +214,9 @@ export async function editImage(text, imageBuffer, { imageSize = "2K", logStep =
       }
 
       const data = await res.json();
-      const buf = extractImageBuffer(data);
+      const buf = useGoogleNativeProvider()
+        ? extractGoogleNativeImageBuffer(data)
+        : extractImageBuffer(data);
       logModelImageResponse(logStep, MODEL, "(returned to caller)", buf.length);
       return buf;
     } catch (e) {
@@ -193,3 +259,20 @@ function extractImageBuffer(data) {
 
   throw new Error("No image found in Image Gen response");
 }
+
+// Find the first part carrying inline image data (camelCase or snake_case key)
+// and return it base64-decoded; throws when the response holds no image.
+function extractGoogleNativeImageBuffer(data) {
+  const candidateList = Array.isArray(data.candidates) ? data.candidates : [];
+  for (const { content } of candidateList) {
+    const partList = Array.isArray(content?.parts) ? content.parts : [];
+    for (const part of partList) {
+      const encoded = (part.inlineData || part.inline_data)?.data;
+      if (encoded) {
+        return Buffer.from(encoded, "base64");
+      }
+    }
+  }
+
+  throw new Error("No image found in Google native Image Gen response");
+}