From 7f03c9e41f1aa36793a7894d0a66937de6441436 Mon Sep 17 00:00:00 2001 From: YGYOOO <594487125@qq.com> Date: Tue, 5 May 2026 19:57:42 +0800 Subject: [PATCH] feat: enhance image generation support for Google AI Studio - Updated environment configuration to allow image generation models to use Google's native API. - Modified README files to reflect changes in model configuration and usage instructions. - Implemented logic in image generation functions to handle requests for both OpenAI and Google-native providers. --- .env.example | 13 +- README.md | 9 +- README_EN.md | 9 +- .../character/src/models/gemini-flash-img.mjs | 117 ++++++++++--- .../map/src/models/gemini-flash-img.mjs | 157 +++++++++++++----- 5 files changed, 238 insertions(+), 67 deletions(-) diff --git a/.env.example b/.env.example index 2ff4145..dd1b1f0 100644 --- a/.env.example +++ b/.env.example @@ -1,11 +1,11 @@ # ============================================================ # WorldX 环境配置 / Environment Configuration # ============================================================ -# 所有模型均使用 OpenAI 兼容协议(chat/completions)。 -# 你可以使用 OpenRouter、Google AI Studio、或任何兼容平台。 +# 除绘图模型可使用 Google 原生图片接口外,其余模型使用 OpenAI 兼容协议(chat/completions)。 +# 你可以使用 OpenRouter、Google AI Studio、或任何兼容平台;绘图模型见 IMAGE_GEN_PROVIDER。 # -# All models use the OpenAI-compatible protocol (chat/completions). -# You can use OpenRouter, Google AI Studio, or any compatible platform. +# All models use OpenAI-compatible chat/completions, except Image Generation can use Google's native image API. +# You can use OpenRouter, Google AI Studio, or any compatible platform; see IMAGE_GEN_PROVIDER for image generation. # ============================================================ # --- 编排引擎 / Orchestrator --- @@ -18,6 +18,9 @@ ORCHESTRATOR_MODEL= # --- 绘图模型 / Image Generation --- # 用于生成地图和角色立绘,建议使用gemini-3.1-flash-image-preview或更强文生图模型。 # Used for map and character art. Recommend gemini-3.1-flash-image-preview or a stronger text-to-image model. 
+# 可选:openai-compatible 或 google-native。未设置时默认为 openai-compatible;但若 IMAGE_GEN_BASE_URL 指向 generativelanguage.googleapis.com,则自动使用 google-native。Google AI Studio 图片生成请使用 google-native。 +# Optional: openai-compatible or google-native. When unset, defaults to openai-compatible, unless IMAGE_GEN_BASE_URL points at generativelanguage.googleapis.com, in which case google-native is auto-selected. Use google-native for Google AI Studio image generation. +IMAGE_GEN_PROVIDER= IMAGE_GEN_BASE_URL= IMAGE_GEN_API_KEY= IMAGE_GEN_MODEL= @@ -53,6 +56,7 @@ SIMULATION_MODEL= # ORCHESTRATOR_MODEL=google/gemini-2.5-pro-preview # # IMAGE_GEN_BASE_URL=https://openrouter.ai/api/v1 +# IMAGE_GEN_PROVIDER=openai-compatible # IMAGE_GEN_API_KEY=sk-or-v1-xxxx # IMAGE_GEN_MODEL=google/gemini-3.1-flash-image-preview # @@ -73,6 +77,7 @@ SIMULATION_MODEL= # ORCHESTRATOR_API_KEY=AIzaSy... # ORCHESTRATOR_MODEL=gemini-2.5-pro-preview # +# IMAGE_GEN_PROVIDER=google-native # IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai # IMAGE_GEN_API_KEY=AIzaSy... # IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview diff --git a/README.md b/README.md index 7fb9374..a3bbd9b 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ npm run create -- "赛博朋克风格的深夜拉面馆,黑客和仿生人在 ## 模型配置 -WorldX 使用 **4 个模型角色**,各自独立配置。全部采用 OpenAI 兼容的 `chat/completions` 协议 —— 任何兼容平台均可使用。 +WorldX 使用 **4 个模型角色**,各自独立配置。除绘图模型支持 Google AI Studio 原生图片接口外,其余角色均采用 OpenAI 兼容的 `chat/completions` 协议。 | 角色 | 环境变量前缀 | 用途 | 推荐模型 | @@ -119,7 +119,7 @@ WorldX 使用 **4 个模型角色**,各自独立配置。全部采用 OpenAI | **世界驱动** | `SIMULATION_` | 驱动运行时角色行为 | 任意模型,便宜的就行(如 `gemini-2.5-flash`) | -每个角色需要 3 个环境变量: +每个角色通常需要 3 个环境变量: ```env {ROLE}_BASE_URL=https://openrouter.ai/api/v1 # API 地址 @@ -127,6 +127,8 @@ WorldX 使用 **4 个模型角色**,各自独立配置。全部采用 OpenAI {ROLE}_MODEL=google/gemini-3.1-pro-preview # 模型标识 ``` +绘图模型可额外设置 `IMAGE_GEN_PROVIDER`,可选 `openai-compatible`(适合 OpenRouter)或 `google-native`(适合 Google AI Studio 图片生成)。未设置时默认为 `openai-compatible`;但若 `IMAGE_GEN_BASE_URL` 指向 `generativelanguage.googleapis.com`,则自动使用 `google-native`。 + ### 平台配置示例
@@ -140,6 +142,7 @@ ORCHESTRATOR_API_KEY=sk-or-v1-xxxx ORCHESTRATOR_MODEL=google/gemini-3.1-pro-preview IMAGE_GEN_BASE_URL=https://openrouter.ai/api/v1 +IMAGE_GEN_PROVIDER=openai-compatible IMAGE_GEN_API_KEY=sk-or-v1-xxxx IMAGE_GEN_MODEL=google/gemini-3.1-flash-image-preview @@ -165,6 +168,7 @@ ORCHESTRATOR_API_KEY=AIzaSy... ORCHESTRATOR_MODEL=gemini-3.1-pro-preview IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai +IMAGE_GEN_PROVIDER=google-native IMAGE_GEN_API_KEY=AIzaSy... IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview @@ -192,6 +196,7 @@ ORCHESTRATOR_MODEL=gemini-3.1-pro-preview # 美术生成 — Google AI Studio IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai +IMAGE_GEN_PROVIDER=google-native IMAGE_GEN_API_KEY=AIzaSy... IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview diff --git a/README_EN.md b/README_EN.md index 3ef4031..4eb0679 100644 --- a/README_EN.md +++ b/README_EN.md @@ -95,7 +95,7 @@ npm run create -- "A cyberpunk noodle shop where hackers and androids share rumo ## Model Configuration -WorldX uses **4 model roles**, each configurable independently. All use the OpenAI-compatible `chat/completions` protocol — any compatible platform works. +WorldX uses **4 model roles**, each configurable independently. All roles use the OpenAI-compatible `chat/completions` protocol except Image Gen, which can also use Google AI Studio's native image API. | Role | Env Prefix | What It Does | Recommended | |------|-----------|-------------|-------------| @@ -104,7 +104,7 @@ WorldX uses **4 model roles**, each configurable independently. All use the Open | **Vision** | `VISION_` | Reviews map quality, locates regions/elements | Strong multimodal model (e.g. `gemini-3.1-pro-preview`) | | **Simulation** | `SIMULATION_` | Drives runtime character behavior | Any model — cheaper is fine (e.g. 
`gemini-2.5-flash`) | -Each role needs 3 env vars: +Each role usually needs 3 env vars: ```env {ROLE}_BASE_URL=https://openrouter.ai/api/v1 # API base URL @@ -112,6 +112,8 @@ Each role needs 3 env vars: {ROLE}_MODEL=google/gemini-3.1-pro-preview # Model identifier ``` +Image Gen additionally accepts `IMAGE_GEN_PROVIDER`, which can be `openai-compatible` (for OpenRouter) or `google-native` (for Google AI Studio image generation). When unset, it defaults to `openai-compatible`, unless `IMAGE_GEN_BASE_URL` points at `generativelanguage.googleapis.com`, in which case `google-native` is selected automatically. + ### Platform Examples
@@ -125,6 +127,7 @@ ORCHESTRATOR_API_KEY=sk-or-v1-xxxx ORCHESTRATOR_MODEL=google/gemini-3.1-pro-preview IMAGE_GEN_BASE_URL=https://openrouter.ai/api/v1 +IMAGE_GEN_PROVIDER=openai-compatible IMAGE_GEN_API_KEY=sk-or-v1-xxxx IMAGE_GEN_MODEL=google/gemini-3.1-flash-image-preview @@ -150,6 +153,7 @@ ORCHESTRATOR_API_KEY=AIzaSy... ORCHESTRATOR_MODEL=gemini-3.1-pro-preview IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai +IMAGE_GEN_PROVIDER=google-native IMAGE_GEN_API_KEY=AIzaSy... IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview @@ -177,6 +181,7 @@ ORCHESTRATOR_MODEL=gemini-3.1-pro-preview # Art generation — Google AI Studio IMAGE_GEN_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai +IMAGE_GEN_PROVIDER=google-native IMAGE_GEN_API_KEY=AIzaSy... IMAGE_GEN_MODEL=gemini-3.1-flash-image-preview diff --git a/generators/character/src/models/gemini-flash-img.mjs b/generators/character/src/models/gemini-flash-img.mjs index fb743b6..4b53749 100644 --- a/generators/character/src/models/gemini-flash-img.mjs +++ b/generators/character/src/models/gemini-flash-img.mjs @@ -7,8 +7,51 @@ const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"; const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview"; const MODEL = process.env.IMAGE_GEN_MODEL || DEFAULT_MODEL; const BASE_URL = process.env.IMAGE_GEN_BASE_URL || DEFAULT_BASE_URL; +const PROVIDER = (process.env.IMAGE_GEN_PROVIDER || "").trim().toLowerCase(); const REQUEST_TIMEOUT_MS = parseInt(process.env.IMAGE_GEN_TIMEOUT_MS || "180000", 10); +function useGoogleNativeProvider() { + return ( + PROVIDER === "google-native" || + PROVIDER === "google" || + (!PROVIDER && BASE_URL.includes("generativelanguage.googleapis.com")) + ); +} + +function getGoogleNativeBaseUrl() { + const trimmed = BASE_URL.replace(/\/+$/, ""); + return trimmed.endsWith("/openai") + ? 
trimmed.slice(0, -"/openai".length) + : trimmed; +} + +function getGoogleNativeModel() { + return MODEL.replace(/^google\//, "").replace(/^models\//, ""); +} + +function buildGoogleNativeUrl(apiKey) { + const model = encodeURIComponent(getGoogleNativeModel()); + return `${getGoogleNativeBaseUrl()}/models/${model}:generateContent?key=${encodeURIComponent(apiKey)}`; +} + +function buildGoogleNativeBody(parts) { + return { + contents: [{ role: "user", parts }], + generationConfig: { + responseModalities: ["TEXT", "IMAGE"], + }, + }; +} + +async function postGoogleNativeImage(parts, { apiKey, signal }) { + return fetch(buildGoogleNativeUrl(apiKey), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(buildGoogleNativeBody(parts)), + signal, + }); +} + /** * Image editing: send reference image + text instruction -> new image. * @param {string} text - generation instruction @@ -23,31 +66,42 @@ export async function editImage(text, imageBuffer, { imageSize = "1K" } = {}) { const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS); try { - const res = await fetch(`${BASE_URL}/chat/completions`, { - method: "POST", - headers: { - Authorization: `Bearer ${API_KEY}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: MODEL, - messages: [ + const res = useGoogleNativeProvider() + ? 
await postGoogleNativeImage( + [ + { text }, + { inlineData: { mimeType: "image/png", data: base64 } }, + ], { - role: "user", - content: [ - { type: "text", text }, + apiKey: API_KEY, + signal: controller.signal, + }, + ) + : await fetch(`${BASE_URL}/chat/completions`, { + method: "POST", + headers: { + Authorization: `Bearer ${API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: MODEL, + messages: [ { - type: "image_url", - image_url: { url: `data:image/png;base64,${base64}` }, + role: "user", + content: [ + { type: "text", text }, + { + type: "image_url", + image_url: { url: `data:image/png;base64,${base64}` }, + }, + ], }, ], - }, - ], - modalities: ["image", "text"], - image_config: { image_size: imageSize }, - }), - signal: controller.signal, - }); + modalities: ["image", "text"], + image_config: { image_size: imageSize }, + }), + signal: controller.signal, + }); if (!res.ok) { const err = await res.text(); @@ -55,7 +109,9 @@ export async function editImage(text, imageBuffer, { imageSize = "1K" } = {}) { } const data = await res.json(); - return extractImageBuffer(data); + return useGoogleNativeProvider() + ? extractGoogleNativeImageBuffer(data) + : extractImageBuffer(data); } catch (e) { if (e.name === "AbortError") { throw new Error(`Image Gen Edit request timed out after ${REQUEST_TIMEOUT_MS / 1000}s`); @@ -93,3 +149,20 @@ function extractImageBuffer(data) { throw new Error("No image found in Image Gen response"); } + +function extractGoogleNativeImageBuffer(data) { + const candidates = Array.isArray(data.candidates) ? data.candidates : []; + for (const candidate of candidates) { + const parts = Array.isArray(candidate.content?.parts) + ? 
candidate.content.parts + : []; + for (const part of parts) { + const inlineData = part.inlineData || part.inline_data; + if (inlineData?.data) { + return Buffer.from(inlineData.data, "base64"); + } + } + } + + throw new Error("No image found in Google native Image Gen response"); +} diff --git a/generators/map/src/models/gemini-flash-img.mjs b/generators/map/src/models/gemini-flash-img.mjs index 8c76d7a..b852c8e 100644 --- a/generators/map/src/models/gemini-flash-img.mjs +++ b/generators/map/src/models/gemini-flash-img.mjs @@ -10,6 +10,7 @@ const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"; const DEFAULT_MODEL = "google/gemini-3.1-flash-image-preview"; const MODEL = process.env.IMAGE_GEN_MODEL || DEFAULT_MODEL; const BASE_URL = process.env.IMAGE_GEN_BASE_URL || DEFAULT_BASE_URL; +const PROVIDER = (process.env.IMAGE_GEN_PROVIDER || "").trim().toLowerCase(); const DEFAULT_REQUEST_TIMEOUT_MS = parseInt(process.env.IMAGE_GEN_TIMEOUT_MS || "180000", 10); const MAX_CONSECUTIVE_FAILURES = 2; @@ -42,6 +43,48 @@ function resolveRequestTimeoutMs(requestTimeoutMs, timeoutEnvKey) { return DEFAULT_REQUEST_TIMEOUT_MS; } +function useGoogleNativeProvider() { + return ( + PROVIDER === "google-native" || + PROVIDER === "google" || + (!PROVIDER && BASE_URL.includes("generativelanguage.googleapis.com")) + ); +} + +function getGoogleNativeBaseUrl() { + const trimmed = BASE_URL.replace(/\/+$/, ""); + return trimmed.endsWith("/openai") + ? 
trimmed.slice(0, -"/openai".length) + : trimmed; +} + +function getGoogleNativeModel() { + return MODEL.replace(/^google\//, "").replace(/^models\//, ""); +} + +function buildGoogleNativeUrl(apiKey) { + const model = encodeURIComponent(getGoogleNativeModel()); + return `${getGoogleNativeBaseUrl()}/models/${model}:generateContent?key=${encodeURIComponent(apiKey)}`; +} + +function buildGoogleNativeBody(parts) { + return { + contents: [{ role: "user", parts }], + generationConfig: { + responseModalities: ["TEXT", "IMAGE"], + }, + }; +} + +async function postGoogleNativeImage(parts, { apiKey, signal }) { + return fetch(buildGoogleNativeUrl(apiKey), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(buildGoogleNativeBody(parts)), + signal, + }); +} + /** * Text-to-image generation. * @returns {Buffer} PNG image buffer @@ -59,20 +102,28 @@ export async function generateImage( const timer = setTimeout(() => controller.abort(), timeoutMs); try { - const res = await fetch(`${BASE_URL}/chat/completions`, { - method: "POST", - headers: { - Authorization: `Bearer ${API_KEY}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: MODEL, - messages: [{ role: "user", content: prompt }], - modalities: ["image", "text"], - image_config: { aspect_ratio: aspectRatio, image_size: imageSize }, - }), - signal: controller.signal, - }); + const nativePrompt = useGoogleNativeProvider() + ? `${prompt}\n\nGenerate the image in ${aspectRatio} aspect ratio.` + : prompt; + const res = useGoogleNativeProvider() + ? 
await postGoogleNativeImage([{ text: nativePrompt }], { + apiKey: API_KEY, + signal: controller.signal, + }) + : await fetch(`${BASE_URL}/chat/completions`, { + method: "POST", + headers: { + Authorization: `Bearer ${API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: MODEL, + messages: [{ role: "user", content: prompt }], + modalities: ["image", "text"], + image_config: { aspect_ratio: aspectRatio, image_size: imageSize }, + }), + signal: controller.signal, + }); if (!res.ok) { const err = await res.text(); @@ -82,7 +133,9 @@ export async function generateImage( } const data = await res.json(); - const buf = extractImageBuffer(data); + const buf = useGoogleNativeProvider() + ? extractGoogleNativeImageBuffer(data) + : extractImageBuffer(data); logModelImageResponse(logStep, MODEL, "(returned to caller)", buf.length); return buf; } catch (e) { @@ -116,31 +169,42 @@ export async function editImage(text, imageBuffer, { imageSize = "2K", logStep = const timer = setTimeout(() => controller.abort(), timeoutMs); try { - const res = await fetch(`${BASE_URL}/chat/completions`, { - method: "POST", - headers: { - Authorization: `Bearer ${API_KEY}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: MODEL, - messages: [ + const res = useGoogleNativeProvider() + ? 
await postGoogleNativeImage( + [ + { text }, + { inlineData: { mimeType: "image/png", data: base64 } }, + ], { - role: "user", - content: [ - { type: "text", text }, + apiKey: API_KEY, + signal: controller.signal, + }, + ) + : await fetch(`${BASE_URL}/chat/completions`, { + method: "POST", + headers: { + Authorization: `Bearer ${API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: MODEL, + messages: [ { - type: "image_url", - image_url: { url: `data:image/png;base64,${base64}` }, + role: "user", + content: [ + { type: "text", text }, + { + type: "image_url", + image_url: { url: `data:image/png;base64,${base64}` }, + }, + ], }, ], - }, - ], - modalities: ["image", "text"], - image_config: { image_size: imageSize }, - }), - signal: controller.signal, - }); + modalities: ["image", "text"], + image_config: { image_size: imageSize }, + }), + signal: controller.signal, + }); if (!res.ok) { const err = await res.text(); @@ -150,7 +214,9 @@ export async function editImage(text, imageBuffer, { imageSize = "2K", logStep = } const data = await res.json(); - const buf = extractImageBuffer(data); + const buf = useGoogleNativeProvider() + ? extractGoogleNativeImageBuffer(data) + : extractImageBuffer(data); logModelImageResponse(logStep, MODEL, "(returned to caller)", buf.length); return buf; } catch (e) { @@ -193,3 +259,20 @@ function extractImageBuffer(data) { throw new Error("No image found in Image Gen response"); } + +function extractGoogleNativeImageBuffer(data) { + const candidates = Array.isArray(data.candidates) ? data.candidates : []; + for (const candidate of candidates) { + const parts = Array.isArray(candidate.content?.parts) + ? candidate.content.parts + : []; + for (const part of parts) { + const inlineData = part.inlineData || part.inline_data; + if (inlineData?.data) { + return Buffer.from(inlineData.data, "base64"); + } + } + } + + throw new Error("No image found in Google native Image Gen response"); +}