Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions src/multimodal/image-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/**
* Image Loader — Vision LLM Integration
*
* Analyzes images via OpenAI Vision API (or compatible),
* extracting descriptions, tags, and entities.
*/

import { getLLMApiKey, getLLMBaseUrl, getLLMModel } from '../config.js';
import { isLLMEnabled } from '../llm/provider.js';

// ── Types ────────────────────────────────────────────────────────────

export interface ImageInput {
  /**
   * Base64-encoded image data.
   * Raw payload only — presumably without a `data:` URL prefix, since the
   * loader builds the data URL itself (TODO confirm with callers).
   */
  base64: string;
  /** Image MIME type used in the data URL (default: image/png) */
  mimeType?: string;
  /** Original filename, if known (used by callers for naming only) */
  filename?: string;
  /** Custom analysis prompt; overrides the default JSON-extraction prompt */
  prompt?: string;
}

export interface ImageAnalysisResult {
  /**
   * Natural language description of the image.
   * Falls back to the model's full raw reply when structured JSON
   * could not be parsed from the response.
   */
  description: string;
  /** Relevant tags/categories (empty when the reply was unstructured) */
  tags: string[];
  /** Key entities/concepts depicted (empty when the reply was unstructured) */
  entities: string[];
}

// ── Internal Vision LLM Call ─────────────────────────────────────────

async function callVisionLLM(
systemPrompt: string,
imageBase64: string,
mimeType: string,
): Promise<string> {
const apiKey = getLLMApiKey();
if (!apiKey) {
throw new Error('No LLM API key configured for image analysis.');
}

let baseUrl = getLLMBaseUrl('https://api.openai.com/v1').replace(/\/+$/, '');
if (!baseUrl.endsWith('/v1')) baseUrl += '/v1';
const model = getLLMModel('gpt-4o');

const response = await fetch(`${baseUrl}/chat/completions`, {
Comment on lines +45 to +49
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Route vision calls by configured LLM provider

isLLMEnabled() allows this path when the runtime is configured for non-OpenAI providers, but callVisionLLM hardcodes OpenAI defaults (https://api.openai.com/v1, gpt-4o) and always posts to /chat/completions. In environments that auto-detect anthropic from ANTHROPIC_API_KEY, image ingestion will consistently fail against the wrong endpoint/model even though LLM mode is reported as enabled. This should use provider-aware config/routing (or the existing provider abstraction) so the request format matches the active provider.

Useful? React with 👍 / 👎.

method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${apiKey}`,
},
body: JSON.stringify({
model,
messages: [{
role: 'user',
content: [
{ type: 'text', text: systemPrompt },
{
type: 'image_url',
image_url: { url: `data:${mimeType};base64,${imageBase64}` },
},
],
}],
temperature: 0.1,
max_tokens: 1024,
}),
signal: AbortSignal.timeout(30_000),
});

if (!response.ok) {
const errorText = await response.text().catch(() => 'unknown error');
throw new Error(`Vision LLM error (${response.status}): ${errorText}`);
}

const data = await response.json() as {
choices: Array<{ message: { content: string } }>;
};

return data.choices[0]?.message?.content ?? '';
}

// ── Public API ───────────────────────────────────────────────────────

/** Default instruction asking the model for a strict-JSON analysis payload. */
const DEFAULT_PROMPT =
  'Analyze this image. Return ONLY a JSON object with this exact format: ' +
  '{"description": "detailed description", "tags": ["tag1", "tag2"], "entities": ["entity1", "entity2"]}';

/**
 * Analyze an image using Vision LLM.
 *
 * Sends the image to the vision endpoint and attempts to parse the
 * model's reply as a JSON object; when no parseable JSON is found, the
 * entire reply becomes the description and tags/entities stay empty.
 *
 * @throws Error if LLM not configured.
 */
export async function analyzeImage(input: ImageInput): Promise<ImageAnalysisResult> {
  if (!isLLMEnabled()) {
    throw new Error(
      'LLM not configured for image analysis. ' +
      'Set MEMORIX_LLM_API_KEY or OPENAI_API_KEY.',
    );
  }

  const reply = await callVisionLLM(
    input.prompt ?? DEFAULT_PROMPT,
    input.base64,
    input.mimeType ?? 'image/png',
  );

  // The model may wrap its JSON in a markdown code fence; grab the
  // outermost {...} span and try to parse just that.
  const jsonMatch = reply.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    try {
      const parsed = JSON.parse(jsonMatch[0]);
      return {
        description: parsed.description ?? reply,
        tags: Array.isArray(parsed.tags) ? parsed.tags : [],
        entities: Array.isArray(parsed.entities) ? parsed.entities : [],
      };
    } catch {
      // Not valid JSON after all — fall through to the plain-text path.
    }
  }

  // Plain-text fallback: the whole reply is the description.
  return { description: reply, tags: [], entities: [] };
}
49 changes: 49 additions & 0 deletions src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3055,6 +3055,55 @@ export async function createMemorixServer(
},
);

// Tool: analyze an image with the vision LLM and persist the result as an
// observation on an entity named after the image file.
server.registerTool(
  'memorix_ingest_image',
  {
    title: 'Ingest Image',
    description:
      'Analyze an image via Vision LLM and store the analysis as a memory observation. ' +
      'Returns description, tags, and entities extracted from the image.',
    inputSchema: {
      base64: z.string().describe('Base64-encoded image data'),
      mimeType: z.string().optional().describe('Image MIME type (e.g. image/png, image/jpeg)'),
      filename: z.string().optional().describe('Original filename'),
      prompt: z.string().optional().describe('Custom analysis prompt'),
    },
  },
  async (args) => {
    try {
      // Lazy import keeps vision-LLM code off the server startup path.
      const { analyzeImage } = await import('./multimodal/image-loader.js');
      const analysis = await analyzeImage(args);
      // Entity name = filename without its extension. Guard against names
      // that collapse to '' (e.g. '.env' or an empty filename) by falling
      // back to a timestamp-based name — previously those produced an
      // empty entityName.
      const stem = args.filename?.replace(/\.[^.]+$/, '');
      const entityName = stem || `image-${Date.now()}`;
      markInternalWrite();
      const { observation } = await storeObservation({
        entityName,
        type: 'discovery',
        title: `Image analysis: ${entityName}`,
        narrative: analysis.description,
        concepts: analysis.tags,
        facts: analysis.entities,
        projectId: project.id,
      });
      return {
        content: [{
          type: 'text' as const,
          text: `\uD83D\uDDBC\uFE0F Image analyzed\n` +
            `Observation #${observation.id}\n` +
            `Tags: ${analysis.tags.join(', ') || 'none'}\n` +
            `Preview: ${analysis.description.slice(0, 300)}${analysis.description.length > 300 ? '\u2026' : ''}`,
        }],
      };
    } catch (err: unknown) {
      // Surface the failure as a tool error rather than crashing the server.
      return {
        content: [{
          type: 'text' as const,
          text: `\u274C Image ingestion failed: ${err instanceof Error ? err.message : String(err)}`,
        }],
        isError: true,
      };
    }
  },
);
// Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked.
// Sync advisory scan and file watcher are non-essential for tool functionality.
const deferredInit = async () => {
Expand Down
142 changes: 142 additions & 0 deletions tests/multimodal/image-loader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { describe, it, expect, afterEach, beforeEach } from 'bun:test';

Check failure on line 1 in tests/multimodal/image-loader.test.ts

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, 20)

tests/multimodal/image-loader.test.ts

Error: Cannot find package 'bun:test' imported from '/home/runner/work/memorix/memorix/tests/multimodal/image-loader.test.ts' ❯ tests/multimodal/image-loader.test.ts:1:1 ⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯ Serialized Error: { code: 'ERR_MODULE_NOT_FOUND' } Caused by: Caused by: Error: Failed to load url bun:test (resolved id: bun:test). Does the file exist? ❯ loadAndTransform node_modules/vite/dist/node/chunks/dep-D4NMHUTW.js:35729:17

Check failure on line 1 in tests/multimodal/image-loader.test.ts

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, 22)

tests/multimodal/image-loader.test.ts

Error: Cannot find package 'bun:test' imported from '/home/runner/work/memorix/memorix/tests/multimodal/image-loader.test.ts' ❯ tests/multimodal/image-loader.test.ts:1:1 ⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯ Serialized Error: { code: 'ERR_MODULE_NOT_FOUND' } Caused by: Caused by: Error: Failed to load url bun:test (resolved id: bun:test). Does the file exist? ❯ loadAndTransform node_modules/vite/dist/node/chunks/dep-D4NMHUTW.js:35729:17
import { analyzeImage } from '../../src/multimodal/image-loader.js';
import { resetConfigCache } from '../../src/config.js';
import { setLLMConfig } from '../../src/llm/provider.js';

describe('image-loader', () => {
  // Keep a handle on the real fetch so each test's mock can be undone.
  const originalFetch = globalThis.fetch;

  beforeEach(() => {
    // Start each test from a clean config/LLM state.
    resetConfigCache();
    setLLMConfig(null);
  });

  afterEach(() => {
    // Undo all global mutations made by individual tests: the fetch mock,
    // any API-key env vars set in test bodies, and the LLM config.
    globalThis.fetch = originalFetch;
    delete process.env.OPENAI_API_KEY;
    delete process.env.MEMORIX_LLM_API_KEY;
    delete process.env.MEMORIX_API_KEY;
    setLLMConfig(null);
    resetConfigCache();
  });

  // Happy path: the model returns bare JSON, which is parsed into
  // description/tags/entities.
  it('analyzes image with structured JSON response', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({
        choices: [{
          message: {
            content: '{"description":"A cat sitting on a windowsill","tags":["animal","cat","indoor"],"entities":["cat","windowsill"]}',
          },
        }],
      }), { status: 200 });
    }) as typeof fetch;

    const result = await analyzeImage({ base64: 'dGVzdA==', mimeType: 'image/png' });
    expect(result.description).toBe('A cat sitting on a windowsill');
    expect(result.tags).toContain('cat');
    expect(result.tags).toContain('animal');
    expect(result.entities).toContain('cat');
  });

  // When the reply contains no parseable JSON, the whole reply becomes the
  // description and tags/entities are empty.
  it('falls back to text when JSON parse fails', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({
        choices: [{
          message: { content: 'This is a beautiful mountain landscape with snow-capped peaks.' },
        }],
      }), { status: 200 });
    }) as typeof fetch;

    const result = await analyzeImage({ base64: 'dGVzdA==' });
    expect(result.description).toBe('This is a beautiful mountain landscape with snow-capped peaks.');
    expect(result.tags).toEqual([]);
    expect(result.entities).toEqual([]);
  });

  // The loader must strip a ```json fence and parse the embedded object.
  it('handles JSON wrapped in markdown code block', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({
        choices: [{
          message: {
            content: '```json\n{"description":"A diagram","tags":["diagram"],"entities":["flowchart"]}\n```',
          },
        }],
      }), { status: 200 });
    }) as typeof fetch;

    const result = await analyzeImage({ base64: 'dGVzdA==' });
    expect(result.description).toBe('A diagram');
    expect(result.tags).toContain('diagram');
  });

  // With no config and no provider env vars at all, analyzeImage must
  // reject before making any network call.
  it('throws when LLM not configured', async () => {
    setLLMConfig(null);
    delete process.env.OPENAI_API_KEY;
    delete process.env.MEMORIX_LLM_API_KEY;
    delete process.env.MEMORIX_API_KEY;
    delete process.env.ANTHROPIC_API_KEY;
    delete process.env.OPENROUTER_API_KEY;

    await expect(
      analyzeImage({ base64: 'dGVzdA==' }),
    ).rejects.toThrow('LLM not configured');
  });

  // A caller-supplied prompt must replace the default prompt in the
  // outgoing request body.
  it('passes custom prompt to Vision LLM', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    let sentBody: string = '';
    globalThis.fetch = (async (_url: any, opts: any) => {
      // Capture the raw request body for inspection after the call.
      sentBody = typeof opts?.body === 'string' ? opts.body : '';
      return new Response(JSON.stringify({
        choices: [{ message: { content: '{"description":"Custom analysis","tags":[],"entities":[]}' } }],
      }), { status: 200 });
    }) as typeof fetch;

    await analyzeImage({
      base64: 'dGVzdA==',
      prompt: 'Extract text from this screenshot',
    });

    expect(sentBody).toContain('Extract text from this screenshot');
  });

  // The request must follow the OpenAI vision message shape: a content
  // array with a text part followed by an image_url part carrying a
  // data URL built from the supplied MIME type.
  it('sends correct Vision API format', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    let parsedBody: any = null;
    globalThis.fetch = (async (_url: any, opts: any) => {
      parsedBody = JSON.parse(opts?.body ?? '{}');
      return new Response(JSON.stringify({
        choices: [{ message: { content: '{"description":"test","tags":[],"entities":[]}' } }],
      }), { status: 200 });
    }) as typeof fetch;

    await analyzeImage({ base64: 'aW1hZ2VkYXRh', mimeType: 'image/jpeg' });

    expect(parsedBody).toBeTruthy();
    const content = parsedBody.messages[0].content;
    expect(content).toBeInstanceOf(Array);
    expect(content[0].type).toBe('text');
    expect(content[1].type).toBe('image_url');
    expect(content[1].image_url.url).toContain('data:image/jpeg;base64,');
  });

  // Non-2xx responses must surface as a descriptive error including the
  // HTTP status code.
  it('handles Vision LLM API errors', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response('Model not found', { status: 404 });
    }) as typeof fetch;

    await expect(
      analyzeImage({ base64: 'dGVzdA==' }),
    ).rejects.toThrow('Vision LLM error (404)');
  });
});
Loading