diff --git a/src-api/src/extensions/agent/codeany/index.ts b/src-api/src/extensions/agent/codeany/index.ts
index 99b0231..4aa45a4 100644
--- a/src-api/src/extensions/agent/codeany/index.ts
+++ b/src-api/src/extensions/agent/codeany/index.ts
@@ -385,33 +385,39 @@ export class CodeAnyAgent extends BaseAgent {
       ? { enabled: true, image: options.sandbox.image, apiEndpoint: options.sandbox.apiEndpoint || SANDBOX_API_URL }
       : undefined;
 
-    // Handle image attachments
-    let imageInstruction = '';
+    // Build image content blocks for multimodal input
+    const imageContentBlocks: Array<{ type: 'image'; source: { type: 'base64'; media_type: string; data: string } }> = [];
     if (options?.images && options.images.length > 0) {
-      const imagePaths = await saveImagesToDisk(options.images, sessionCwd);
-      if (imagePaths.length > 0) {
-        imageInstruction = `
-## MANDATORY IMAGE ANALYSIS - DO THIS FIRST
-
-The user has attached ${imagePaths.length} image file(s):
-${imagePaths.map((p, i) => `${i + 1}. ${p}`).join('\n')}
-
-**YOUR FIRST ACTION MUST BE:** Use the Read tool to view each image file listed above.
-
-**CRITICAL:** DO NOT respond until you have READ and SEEN the actual image content.
-
----
-User's request (answer this AFTER reading the images):
-`;
+      for (const image of options.images) {
+        try {
+          let base64Data = image.data;
+          if (base64Data.includes(',')) {
+            base64Data = base64Data.split(',')[1];
+          }
+          const mediaType = image.mimeType || 'image/png';
+          imageContentBlocks.push({
+            type: 'image',
+            source: { type: 'base64', media_type: mediaType, data: base64Data },
+          });
+          logger.info(`[CodeAny] Added image as multimodal content: ${mediaType}, ${Math.round(base64Data.length / 1024)}KB base64`);
+        } catch (error) {
+          logger.error(`[CodeAny] Failed to process image for multimodal:`, error);
+        }
       }
     }
 
     const conversationContext = this.formatConversationHistory(options?.conversation);
     const languageInstruction = buildLanguageInstruction(options?.language, prompt);
-
-    const enhancedPrompt = imageInstruction
-      ? imageInstruction + languageInstruction + prompt + '\n\n' + getWorkspaceInstruction(sessionCwd, sandboxOpts) + conversationContext
-      : getWorkspaceInstruction(sessionCwd, sandboxOpts) + conversationContext + languageInstruction + prompt;
+    const textPrompt = getWorkspaceInstruction(sessionCwd, sandboxOpts) + conversationContext + languageInstruction + prompt;
+
+    // Multimodal content array if images present, plain string otherwise
+    let finalPrompt: string | any[];
+    if (imageContentBlocks.length > 0) {
+      finalPrompt = [...imageContentBlocks, { type: 'text', text: textPrompt }];
+      logger.info(`[CodeAny] Using multimodal prompt with ${imageContentBlocks.length} image(s)`);
+    } else {
+      finalPrompt = textPrompt;
+    }
 
     // Load MCP servers
     const userMcpServers = await loadMcpServers(options?.mcpConfig as McpConfig | undefined);
@@ -429,10 +435,10 @@ User's request (answer this AFTER reading the images):
     logger.info(`[CodeAny ${session.id}] ========== AGENT START ==========`);
     logger.info(`[CodeAny ${session.id}] Model: ${this.config.model || '(default)'}`);
     logger.info(`[CodeAny ${session.id}] Custom API: ${this.isUsingCustomApi()}`);
-    logger.info(`[CodeAny ${session.id}] Prompt length: ${enhancedPrompt.length} chars`);
+    logger.info(`[CodeAny ${session.id}] Prompt: ${typeof finalPrompt === 'string' ? `${finalPrompt.length} chars` : 'multimodal content blocks'}`);
 
     try {
-      for await (const message of query({ prompt: enhancedPrompt, options: sdkOpts })) {
+      for await (const message of query({ prompt: finalPrompt, options: sdkOpts })) {
         if (session.abortController.signal.aborted) break;
         yield* this.processMessage(message, session.id, sentTextHashes, sentToolIds);
       }
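
Note (not part of the patch): a minimal standalone sketch of the content-block construction this diff introduces, for reviewers. The `ImageInput` interface, the `ImageBlock` alias, and the `toImageBlocks` helper name are illustrative assumptions, not names from the codebase; the patch itself builds the blocks inline and passes the resulting array straight to `query`.

// Sketch only: mirrors the block shape used in the diff above.
interface ImageInput {
  data: string;      // base64 payload, possibly carrying a 'data:image/png;base64,' prefix
  mimeType?: string; // defaults to 'image/png', matching the patch
}

type ImageBlock = {
  type: 'image';
  source: { type: 'base64'; media_type: string; data: string };
};

function toImageBlocks(images: ImageInput[]): ImageBlock[] {
  return images.map((image) => {
    // Strip a data-URL prefix if present, keeping only the base64 payload.
    const base64Data = image.data.includes(',') ? image.data.split(',')[1] : image.data;
    return {
      type: 'image',
      source: {
        type: 'base64',
        media_type: image.mimeType || 'image/png',
        data: base64Data,
      },
    };
  });
}

// Usage mirroring the patch: image blocks first, then a single text block.
// const finalPrompt = [...toImageBlocks(options.images), { type: 'text', text: textPrompt }];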