feat: image ingestion via Vision LLM #32

Open · RaviTharuma wants to merge 1 commit into AVIDS2:main from RaviTharuma:feature/memorix-ybj-image-ingestion

+322 −0
src/multimodal/image-loader.ts (new file, +131)

```ts
/**
 * Image Loader — Vision LLM Integration
 *
 * Analyzes images via OpenAI Vision API (or compatible),
 * extracting descriptions, tags, and entities.
 */

import { getLLMApiKey, getLLMBaseUrl, getLLMModel } from '../config.js';
import { isLLMEnabled } from '../llm/provider.js';

// ── Types ──────────────────────────────────────────────────────────

export interface ImageInput {
  /** Base64-encoded image data */
  base64: string;
  /** Image MIME type (default: image/png) */
  mimeType?: string;
  /** Original filename */
  filename?: string;
  /** Custom analysis prompt */
  prompt?: string;
}

export interface ImageAnalysisResult {
  /** Natural language description of the image */
  description: string;
  /** Relevant tags/categories */
  tags: string[];
  /** Key entities/concepts depicted */
  entities: string[];
}

// ── Internal Vision LLM Call ───────────────────────────────────────

async function callVisionLLM(
  systemPrompt: string,
  imageBase64: string,
  mimeType: string,
): Promise<string> {
  const apiKey = getLLMApiKey();
  if (!apiKey) {
    throw new Error('No LLM API key configured for image analysis.');
  }

  let baseUrl = getLLMBaseUrl('https://api.openai.com/v1').replace(/\/+$/, '');
  if (!baseUrl.endsWith('/v1')) baseUrl += '/v1';
  const model = getLLMModel('gpt-4o');

  const response = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      model,
      messages: [{
        role: 'user',
        content: [
          { type: 'text', text: systemPrompt },
          {
            type: 'image_url',
            image_url: { url: `data:${mimeType};base64,${imageBase64}` },
          },
        ],
      }],
      temperature: 0.1,
      max_tokens: 1024,
    }),
    signal: AbortSignal.timeout(30_000),
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => 'unknown error');
    throw new Error(`Vision LLM error (${response.status}): ${errorText}`);
  }

  const data = await response.json() as {
    choices: Array<{ message: { content: string } }>;
  };

  return data.choices[0]?.message?.content ?? '';
}

// ── Public API ─────────────────────────────────────────────────────

const DEFAULT_PROMPT =
  'Analyze this image. Return ONLY a JSON object with this exact format: ' +
  '{"description": "detailed description", "tags": ["tag1", "tag2"], "entities": ["entity1", "entity2"]}';

/**
 * Analyze an image using Vision LLM.
 *
 * @throws Error if LLM not configured.
 */
export async function analyzeImage(input: ImageInput): Promise<ImageAnalysisResult> {
  if (!isLLMEnabled()) {
    throw new Error(
      'LLM not configured for image analysis. ' +
      'Set MEMORIX_LLM_API_KEY or OPENAI_API_KEY.',
    );
  }

  const mimeType = input.mimeType ?? 'image/png';
  const prompt = input.prompt ?? DEFAULT_PROMPT;

  const response = await callVisionLLM(prompt, input.base64, mimeType);

  // Try to parse structured JSON response
  try {
    // Extract JSON from response (may be wrapped in markdown code block)
    const jsonMatch = response.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      const parsed = JSON.parse(jsonMatch[0]);
      return {
        description: parsed.description ?? response,
        tags: Array.isArray(parsed.tags) ? parsed.tags : [],
        entities: Array.isArray(parsed.entities) ? parsed.entities : [],
      };
    }
  } catch {
    // JSON parse failed — fall through to text extraction
  }

  // Fallback: treat entire response as description
  return {
    description: response,
    tags: [],
    entities: [],
  };
}
```
tests/multimodal/image-loader.test.ts (new file, +142)

Check failure on line 1 in tests/multimodal/image-loader.test.ts

````ts
import { describe, it, expect, afterEach, beforeEach } from 'bun:test';

import { analyzeImage } from '../../src/multimodal/image-loader.js';
import { resetConfigCache } from '../../src/config.js';
import { setLLMConfig } from '../../src/llm/provider.js';

describe('image-loader', () => {
  const originalFetch = globalThis.fetch;

  beforeEach(() => {
    resetConfigCache();
    setLLMConfig(null);
  });

  afterEach(() => {
    globalThis.fetch = originalFetch;
    delete process.env.OPENAI_API_KEY;
    delete process.env.MEMORIX_LLM_API_KEY;
    delete process.env.MEMORIX_API_KEY;
    setLLMConfig(null);
    resetConfigCache();
  });

  it('analyzes image with structured JSON response', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({
        choices: [{
          message: {
            content: '{"description":"A cat sitting on a windowsill","tags":["animal","cat","indoor"],"entities":["cat","windowsill"]}',
          },
        }],
      }), { status: 200 });
    }) as typeof fetch;

    const result = await analyzeImage({ base64: 'dGVzdA==', mimeType: 'image/png' });
    expect(result.description).toBe('A cat sitting on a windowsill');
    expect(result.tags).toContain('cat');
    expect(result.tags).toContain('animal');
    expect(result.entities).toContain('cat');
  });

  it('falls back to text when JSON parse fails', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({
        choices: [{
          message: { content: 'This is a beautiful mountain landscape with snow-capped peaks.' },
        }],
      }), { status: 200 });
    }) as typeof fetch;

    const result = await analyzeImage({ base64: 'dGVzdA==' });
    expect(result.description).toBe('This is a beautiful mountain landscape with snow-capped peaks.');
    expect(result.tags).toEqual([]);
    expect(result.entities).toEqual([]);
  });

  it('handles JSON wrapped in markdown code block', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({
        choices: [{
          message: {
            content: '```json\n{"description":"A diagram","tags":["diagram"],"entities":["flowchart"]}\n```',
          },
        }],
      }), { status: 200 });
    }) as typeof fetch;

    const result = await analyzeImage({ base64: 'dGVzdA==' });
    expect(result.description).toBe('A diagram');
    expect(result.tags).toContain('diagram');
  });

  it('throws when LLM not configured', async () => {
    setLLMConfig(null);
    delete process.env.OPENAI_API_KEY;
    delete process.env.MEMORIX_LLM_API_KEY;
    delete process.env.MEMORIX_API_KEY;
    delete process.env.ANTHROPIC_API_KEY;
    delete process.env.OPENROUTER_API_KEY;

    await expect(
      analyzeImage({ base64: 'dGVzdA==' }),
    ).rejects.toThrow('LLM not configured');
  });

  it('passes custom prompt to Vision LLM', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    let sentBody: string = '';
    globalThis.fetch = (async (_url: any, opts: any) => {
      sentBody = typeof opts?.body === 'string' ? opts.body : '';
      return new Response(JSON.stringify({
        choices: [{ message: { content: '{"description":"Custom analysis","tags":[],"entities":[]}' } }],
      }), { status: 200 });
    }) as typeof fetch;

    await analyzeImage({
      base64: 'dGVzdA==',
      prompt: 'Extract text from this screenshot',
    });

    expect(sentBody).toContain('Extract text from this screenshot');
  });

  it('sends correct Vision API format', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    let parsedBody: any = null;
    globalThis.fetch = (async (_url: any, opts: any) => {
      parsedBody = JSON.parse(opts?.body ?? '{}');
      return new Response(JSON.stringify({
        choices: [{ message: { content: '{"description":"test","tags":[],"entities":[]}' } }],
      }), { status: 200 });
    }) as typeof fetch;

    await analyzeImage({ base64: 'aW1hZ2VkYXRh', mimeType: 'image/jpeg' });

    expect(parsedBody).toBeTruthy();
    const content = parsedBody.messages[0].content;
    expect(content).toBeInstanceOf(Array);
    expect(content[0].type).toBe('text');
    expect(content[1].type).toBe('image_url');
    expect(content[1].image_url.url).toContain('data:image/jpeg;base64,');
  });

  it('handles Vision LLM API errors', async () => {
    process.env.OPENAI_API_KEY = 'test-key';
    setLLMConfig({ provider: 'openai', apiKey: 'test-key', model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' });
    globalThis.fetch = (async () => {
      return new Response('Model not found', { status: 404 });
    }) as typeof fetch;

    await expect(
      analyzeImage({ base64: 'dGVzdA==' }),
    ).rejects.toThrow('Vision LLM error (404)');
  });
});
````
`isLLMEnabled()` allows this path when the runtime is configured for non-OpenAI providers, but `callVisionLLM` hardcodes OpenAI defaults (`https://api.openai.com/v1`, `gpt-4o`) and always posts to `/chat/completions`. In environments that auto-detect `anthropic` from `ANTHROPIC_API_KEY`, image ingestion will consistently fail against the wrong endpoint/model even though LLM mode is reported as enabled. This should use provider-aware config/routing (or the existing provider abstraction) so the request format matches the active provider.
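To make the reviewer's point concrete: the OpenAI and Anthropic vision payloads are structurally different, so a single hardcoded request shape cannot serve both. The sketch below is one possible way to branch on provider, not the project's actual abstraction; `buildVisionRequest`, `Provider`, and `VisionRequest` are illustrative names, and the payload shapes follow the public OpenAI chat-completions and Anthropic Messages APIs:

```typescript
type Provider = 'openai' | 'anthropic';

interface VisionRequest {
  url: string;
  headers: Record<string, string>;
  body: string;
}

// Build a provider-specific vision request. Anthropic takes base64 image
// blocks via /v1/messages with an x-api-key header; OpenAI (and compatible
// gateways) take data-URL image_url parts via /chat/completions.
function buildVisionRequest(
  provider: Provider,
  apiKey: string,
  model: string,
  prompt: string,
  imageBase64: string,
  mimeType: string,
): VisionRequest {
  if (provider === 'anthropic') {
    return {
      url: 'https://api.anthropic.com/v1/messages',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': apiKey,
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify({
        model,
        max_tokens: 1024,
        messages: [{
          role: 'user',
          content: [
            { type: 'text', text: prompt },
            {
              type: 'image',
              source: { type: 'base64', media_type: mimeType, data: imageBase64 },
            },
          ],
        }],
      }),
    };
  }
  // OpenAI-compatible default, matching the PR's current payload.
  return {
    url: 'https://api.openai.com/v1/chat/completions',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      model,
      max_tokens: 1024,
      messages: [{
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          { type: 'image_url', image_url: { url: `data:${mimeType};base64,${imageBase64}` } },
        ],
      }],
    }),
  };
}
```

In the PR's codebase, the same effect would more likely be achieved by routing through the existing provider abstraction in `src/llm/provider.js` rather than a standalone builder like this.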