diff --git a/src/multimodal/image-loader.ts b/src/multimodal/image-loader.ts
new file mode 100644
index 0000000..d729581
--- /dev/null
+++ b/src/multimodal/image-loader.ts
@@ -0,0 +1,131 @@
+/**
+ * Image Loader — Vision LLM Integration
+ *
+ * Analyzes images via OpenAI Vision API (or compatible),
+ * extracting descriptions, tags, and entities.
+ */
+
+import { getLLMApiKey, getLLMBaseUrl, getLLMModel } from '../config.js';
+import { isLLMEnabled } from '../llm/provider.js';
+
+// ── Types ────────────────────────────────────────────────────────────
+
+export interface ImageInput {
+  /** Base64-encoded image data */
+  base64: string;
+  /** Image MIME type (default: image/png) */
+  mimeType?: string;
+  /** Original filename */
+  filename?: string;
+  /** Custom analysis prompt */
+  prompt?: string;
+}
+
+export interface ImageAnalysisResult {
+  /** Natural language description of the image */
+  description: string;
+  /** Relevant tags/categories */
+  tags: string[];
+  /** Key entities/concepts depicted */
+  entities: string[];
+}
+
+// ── Internal Vision LLM Call ─────────────────────────────────────────
+
+async function callVisionLLM(
+  systemPrompt: string,
+  imageBase64: string,
+  mimeType: string,
+): Promise<string> {
+  const apiKey = getLLMApiKey();
+  if (!apiKey) {
+    throw new Error('No LLM API key configured for image analysis.');
+  }
+
+  let baseUrl = getLLMBaseUrl('https://api.openai.com/v1').replace(/\/+$/, '');
+  if (!baseUrl.endsWith('/v1')) baseUrl += '/v1';
+  const model = getLLMModel('gpt-4o');
+
+  const response = await fetch(`${baseUrl}/chat/completions`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${apiKey}`,
+    },
+    body: JSON.stringify({
+      model,
+      messages: [{
+        role: 'user',
+        content: [
+          { type: 'text', text: systemPrompt },
+          {
+            type: 'image_url',
+            image_url: { url: `data:${mimeType};base64,${imageBase64}` },
+          },
+        ],
+      }],
+      temperature: 0.1,
+      max_tokens: 1024,
+    }),
+    signal: AbortSignal.timeout(30_000),
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text().catch(() => 'unknown error');
+    throw new Error(`Vision LLM error (${response.status}): ${errorText}`);
+  }
+
+  const data = await response.json() as {
+    choices: Array<{ message: { content: string } }>;
+  };
+
+  return data.choices[0]?.message?.content ?? '';
+}
+
+// ── Public API ───────────────────────────────────────────────────────
+
+const DEFAULT_PROMPT =
+  'Analyze this image. Return ONLY a JSON object with this exact format: ' +
+  '{"description": "detailed description", "tags": ["tag1", "tag2"], "entities": ["entity1", "entity2"]}';
+
+/**
+ * Analyze an image using Vision LLM.
+ *
+ * @throws Error if LLM not configured.
+ */
+export async function analyzeImage(input: ImageInput): Promise<ImageAnalysisResult> {
+  if (!isLLMEnabled()) {
+    throw new Error(
+      'LLM not configured for image analysis. ' +
+      'Set MEMORIX_LLM_API_KEY or OPENAI_API_KEY.',
+    );
+  }
+
+  const mimeType = input.mimeType ?? 'image/png';
+  const prompt = input.prompt ?? DEFAULT_PROMPT;
+
+  const response = await callVisionLLM(prompt, input.base64, mimeType);
+
+  // Try to parse structured JSON response
+  try {
+    // Extract JSON from response (may be wrapped in markdown code block)
+    const jsonMatch = response.match(/\{[\s\S]*\}/);
+    if (jsonMatch) {
+      const parsed = JSON.parse(jsonMatch[0]);
+      return {
+        description: parsed.description ?? response,
+        tags: Array.isArray(parsed.tags) ? parsed.tags : [],
+        entities: Array.isArray(parsed.entities) ? parsed.entities : [],
+      };
+    }
+  } catch {
+    // JSON parse failed — fall through to text extraction
+  }
+
+  // Fallback: treat entire response as description
+  return {
+    description: response,
+    tags: [],
+    entities: [],
+  };
+}
diff --git a/src/server.ts b/src/server.ts
index 69162fd..a281679 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -3055,6 +3055,55 @@ export async function createMemorixServer(
     },
   );
 
+  server.registerTool(
+    'memorix_ingest_image',
+    {
+      title: 'Ingest Image',
+      description:
+        'Analyze an image via Vision LLM and store the analysis as a memory observation. ' +
+        'Returns description, tags, and entities extracted from the image.',
+      inputSchema: {
+        base64: z.string().describe('Base64-encoded image data'),
+        mimeType: z.string().optional().describe('Image MIME type (e.g. image/png, image/jpeg)'),
+        filename: z.string().optional().describe('Original filename'),
+        prompt: z.string().optional().describe('Custom analysis prompt'),
+      },
+    },
+    async (args) => {
+      try {
+        const { analyzeImage } = await import('./multimodal/image-loader.js');
+        const analysis = await analyzeImage(args);
+        const entityName = args.filename?.replace(/\.[^.]+$/, '') ?? `image-${Date.now()}`;
+        markInternalWrite();
+        const { observation } = await storeObservation({
+          entityName,
+          type: 'discovery',
+          title: `Image analysis: ${entityName}`,
+          narrative: analysis.description,
+          concepts: analysis.tags,
+          facts: analysis.entities,
+          projectId: project.id,
+        });
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `\uD83D\uDDBC\uFE0F Image analyzed\n` +
+              `Observation #${observation.id}\n` +
+              `Tags: ${analysis.tags.join(', ') || 'none'}\n` +
+              `Preview: ${analysis.description.slice(0, 300)}${analysis.description.length > 300 ? '\u2026' : ''}`,
+          }],
+        };
+      } catch (err: unknown) {
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `\u274C Image ingestion failed: ${err instanceof Error ? err.message : String(err)}`,
+          }],
+          isError: true,
+        };
+      }
+    },
+  );
   // Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked.
   // Sync advisory scan and file watcher are non-essential for tool functionality.
   const deferredInit = async () => {
diff --git a/tests/multimodal/image-loader.test.ts b/tests/multimodal/image-loader.test.ts
new file mode 100644
index 0000000..ae56351
--- /dev/null
+++ b/tests/multimodal/image-loader.test.ts
@@ -0,0 +1,142 @@
+import { describe, it, expect, afterEach, beforeEach } from 'bun:test';
+import { analyzeImage } from '../../src/multimodal/image-loader.js';
+import { resetConfigCache } from '../../src/config.js';
+import { setLLMConfig } from '../../src/llm/provider.js';
+
+describe('image-loader', () => {
+  const originalFetch = globalThis.fetch;
+
+  beforeEach(() => {
+    resetConfigCache();
+    setLLMConfig(null);
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+    delete process.env.OPENAI_API_KEY;
+    delete process.env.MEMORIX_LLM_API_KEY;
+    delete process.env.MEMORIX_API_KEY;
+    setLLMConfig(null);
+    resetConfigCache();
+  });
+
+  it('analyzes image with structured JSON response', async () => {
+    process.env.OPENAI_API_KEY = 'test-key';
+    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
+    globalThis.fetch = (async () => {
+      return new Response(JSON.stringify({
+        choices: [{
+          message: {
+            content: '{"description":"A cat sitting on a windowsill","tags":["animal","cat","indoor"],"entities":["cat","windowsill"]}',
+          },
+        }],
+      }), { status: 200 });
+    }) as typeof fetch;
+
+    const result = await analyzeImage({ base64: 'dGVzdA==', mimeType: 'image/png' });
+    expect(result.description).toBe('A cat sitting on a windowsill');
+    expect(result.tags).toContain('cat');
+    expect(result.tags).toContain('animal');
+    expect(result.entities).toContain('cat');
+  });
+
+  it('falls back to text when JSON parse fails', async () => {
+    process.env.OPENAI_API_KEY = 'test-key';
+    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
+    globalThis.fetch = (async () => {
+      return new Response(JSON.stringify({
+        choices: [{
+          message: { content: 'This is a beautiful mountain landscape with snow-capped peaks.' },
+        }],
+      }), { status: 200 });
+    }) as typeof fetch;
+
+    const result = await analyzeImage({ base64: 'dGVzdA==' });
+    expect(result.description).toBe('This is a beautiful mountain landscape with snow-capped peaks.');
+    expect(result.tags).toEqual([]);
+    expect(result.entities).toEqual([]);
+  });
+
+  it('handles JSON wrapped in markdown code block', async () => {
+    process.env.OPENAI_API_KEY = 'test-key';
+    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
+    globalThis.fetch = (async () => {
+      return new Response(JSON.stringify({
+        choices: [{
+          message: {
+            content: '```json\n{"description":"A diagram","tags":["diagram"],"entities":["flowchart"]}\n```',
+          },
+        }],
+      }), { status: 200 });
+    }) as typeof fetch;
+
+    const result = await analyzeImage({ base64: 'dGVzdA==' });
+    expect(result.description).toBe('A diagram');
+    expect(result.tags).toContain('diagram');
+  });
+
+  it('throws when LLM not configured', async () => {
+    setLLMConfig(null);
+    delete process.env.OPENAI_API_KEY;
+    delete process.env.MEMORIX_LLM_API_KEY;
+    delete process.env.MEMORIX_API_KEY;
+    delete process.env.ANTHROPIC_API_KEY;
+    delete process.env.OPENROUTER_API_KEY;
+
+    await expect(
+      analyzeImage({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('LLM not configured');
+  });
+
+  it('passes custom prompt to Vision LLM', async () => {
+    process.env.OPENAI_API_KEY = 'test-key';
+    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
+    let sentBody: string = '';
+    globalThis.fetch = (async (_url: any, opts: any) => {
+      sentBody = typeof opts?.body === 'string' ? opts.body : '';
+      return new Response(JSON.stringify({
+        choices: [{ message: { content: '{"description":"Custom analysis","tags":[],"entities":[]}' } }],
+      }), { status: 200 });
+    }) as typeof fetch;
+
+    await analyzeImage({
+      base64: 'dGVzdA==',
+      prompt: 'Extract text from this screenshot',
+    });
+
+    expect(sentBody).toContain('Extract text from this screenshot');
+  });
+
+  it('sends correct Vision API format', async () => {
+    process.env.OPENAI_API_KEY = 'test-key';
+    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
+    let parsedBody: any = null;
+    globalThis.fetch = (async (_url: any, opts: any) => {
+      parsedBody = JSON.parse(opts?.body ?? '{}');
+      return new Response(JSON.stringify({
+        choices: [{ message: { content: '{"description":"test","tags":[],"entities":[]}' } }],
+      }), { status: 200 });
+    }) as typeof fetch;
+
+    await analyzeImage({ base64: 'aW1hZ2VkYXRh', mimeType: 'image/jpeg' });
+
+    expect(parsedBody).toBeTruthy();
+    const content = parsedBody.messages[0].content;
+    expect(content).toBeInstanceOf(Array);
+    expect(content[0].type).toBe('text');
+    expect(content[1].type).toBe('image_url');
+    expect(content[1].image_url.url).toContain('data:image/jpeg;base64,');
+  });
+
+  it('handles Vision LLM API errors', async () => {
+    process.env.OPENAI_API_KEY = 'test-key';
+    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
+    globalThis.fetch = (async () => {
+      return new Response('Model not found', { status: 404 });
+    }) as typeof fetch;
+
+    await expect(
+      analyzeImage({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('Vision LLM error (404)');
+  });
+});