-
Notifications
You must be signed in to change notification settings - Fork 28
feat: audio ingestion via Whisper API #31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| /** | ||
| * Audio Loader — Whisper API Integration | ||
| * | ||
| * Transcribes audio files via OpenAI Whisper or Groq Whisper API, | ||
| * then stores the transcript as a Memorix observation. | ||
| * | ||
| * Supports: mp3, wav, m4a, webm, mp4, ogg, flac | ||
| * Providers: OpenAI (whisper-1), Groq (whisper-large-v3) | ||
| */ | ||
|
|
||
| import { getLLMApiKey } from '../config.js'; | ||
|
|
||
| // ── Types ──────────────────────────────────────────────────────────── | ||
|
|
||
/** Input payload for a single audio transcription request. */
export interface AudioInput {
  /** Base64-encoded audio data (decoded to a Buffer before upload). */
  base64: string;
  /** Audio MIME type (default: audio/mp3). */
  mimeType?: string;
  /** Original filename; also used to name the stored observation (extension stripped). */
  filename?: string;
  /** ISO language code passed to Whisper as a transcription hint. */
  language?: string;
  /** Whisper provider: openai or groq (falls back to MEMORIX_AUDIO_PROVIDER, then openai). */
  provider?: 'openai' | 'groq';
}
|
|
||
/** Result returned by {@link transcribeAudio}. */
export interface TranscriptionResult {
  /** Transcribed text. */
  text: string;
  /** Audio duration in seconds, when the API reports it. */
  duration?: number;
  /** Detected language, when the API reports it. */
  language?: string;
  /** Provider used for this transcription. */
  provider: string;
}
|
|
||
| // ── Provider Config ────────────────────────────────────────────────── | ||
|
|
||
// Supported Whisper backends. Both expose an OpenAI-compatible
// `/audio/transcriptions` endpoint; only the base URL and model differ.
const PROVIDERS = {
  openai: {
    baseUrl: 'https://api.openai.com/v1',
    model: 'whisper-1',
  },
  groq: {
    // Groq serves its OpenAI-compatible API under the /openai/v1 prefix.
    baseUrl: 'https://api.groq.com/openai/v1',
    model: 'whisper-large-v3',
  },
} as const;
|
|
||
| // ── Core Functions ─────────────────────────────────────────────────── | ||
|
|
||
| /** | ||
| * Transcribe audio via Whisper API. | ||
| * | ||
| * @throws Error if no API key configured or API returns error. | ||
| */ | ||
| export async function transcribeAudio(input: AudioInput): Promise<TranscriptionResult> { | ||
| const apiKey = getLLMApiKey(); | ||
| if (!apiKey) { | ||
| throw new Error( | ||
| 'No API key configured for audio transcription. ' + | ||
| 'Set MEMORIX_LLM_API_KEY, MEMORIX_API_KEY, or OPENAI_API_KEY.', | ||
| ); | ||
| } | ||
|
|
||
| const providerName = input.provider | ||
| ?? (process.env.MEMORIX_AUDIO_PROVIDER as 'openai' | 'groq' | undefined) | ||
| ?? 'openai'; | ||
| const config = PROVIDERS[providerName] ?? PROVIDERS.openai; | ||
|
|
||
| // Build multipart form | ||
| const audioBuffer = Buffer.from(input.base64, 'base64'); | ||
| const blob = new Blob([audioBuffer], { type: input.mimeType ?? 'audio/mp3' }); | ||
| const form = new FormData(); | ||
| form.append('file', blob, input.filename ?? 'audio.mp3'); | ||
| form.append('model', config.model); | ||
| form.append('response_format', 'json'); | ||
| if (input.language) { | ||
| form.append('language', input.language); | ||
| } | ||
|
|
||
| const response = await fetch(`${config.baseUrl}/audio/transcriptions`, { | ||
| method: 'POST', | ||
| headers: { 'Authorization': `Bearer ${apiKey}` }, | ||
| body: form, | ||
| signal: AbortSignal.timeout(120_000), // 2 min timeout for large files | ||
| }); | ||
|
|
||
| if (!response.ok) { | ||
| const errorText = await response.text().catch(() => 'unknown error'); | ||
| throw new Error(`Whisper API error (${response.status}): ${errorText}`); | ||
| } | ||
|
|
||
| const data = await response.json() as { | ||
| text: string; | ||
| duration?: number; | ||
| language?: string; | ||
| }; | ||
|
|
||
| return { | ||
| text: data.text, | ||
| duration: data.duration, | ||
| language: data.language, | ||
| provider: providerName, | ||
| }; | ||
| } | ||
|
|
||
| /** | ||
| * Transcribe audio and store as a Memorix observation. | ||
| */ | ||
| export async function ingestAudio( | ||
| input: AudioInput, | ||
| storeFn: (obs: { | ||
| entityName: string; | ||
| type: string; | ||
| title: string; | ||
| narrative: string; | ||
| concepts: string[]; | ||
| projectId: string; | ||
| }) => Promise<{ observation: { id: number }; upserted: boolean }>, | ||
| projectId: string, | ||
| ): Promise<{ observationId: number; text: string; duration?: number }> { | ||
| const result = await transcribeAudio(input); | ||
|
|
||
| const entityName = input.filename | ||
| ? input.filename.replace(/\.[^.]+$/, '') | ||
| : `audio-${Date.now()}`; | ||
|
|
||
| const { observation } = await storeFn({ | ||
| entityName, | ||
| type: 'discovery', | ||
| title: `Audio transcript: ${entityName}`, | ||
| narrative: result.text, | ||
| concepts: ['audio', 'transcript', ...(result.language ? [result.language] : [])], | ||
| projectId, | ||
| }); | ||
|
|
||
| return { | ||
| observationId: observation.id, | ||
| text: result.text, | ||
| duration: result.duration, | ||
| }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| /** | ||
| * Multimodal Ingestion — Unified Entry Point | ||
| * | ||
| * Re-exports all multimodal loaders for convenient access. | ||
| */ | ||
|
|
||
| export { | ||
| transcribeAudio, | ||
| ingestAudio, | ||
| type AudioInput, | ||
| type TranscriptionResult, | ||
| } from './audio-loader.js'; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,173 @@ | ||
| import { describe, it, expect, afterEach, beforeEach } from 'bun:test'; | ||
|
Check failure on line 1 in tests/multimodal/audio-loader.test.ts
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The repository’s test command runs Vitest and includes Useful? React with 👍 / 👎. |
||
| import { transcribeAudio, ingestAudio } from '../../src/multimodal/audio-loader.js'; | ||
| import { resetConfigCache } from '../../src/config.js'; | ||
|
|
||
| describe('audio-loader', () => { | ||
| const originalFetch = globalThis.fetch; | ||
|
|
||
| beforeEach(() => { | ||
| resetConfigCache(); | ||
| }); | ||
|
|
||
| afterEach(() => { | ||
| globalThis.fetch = originalFetch; | ||
| // Clean up env vars | ||
| delete process.env.OPENAI_API_KEY; | ||
| delete process.env.MEMORIX_LLM_API_KEY; | ||
| delete process.env.MEMORIX_API_KEY; | ||
| delete process.env.MEMORIX_AUDIO_PROVIDER; | ||
| resetConfigCache(); | ||
| }); | ||
|
|
||
| it('calls OpenAI Whisper endpoint by default', async () => { | ||
| let calledUrl = ''; | ||
| let calledHeaders: Record<string, string> = {}; | ||
| globalThis.fetch = (async (url: any, opts: any) => { | ||
| calledUrl = String(url); | ||
| calledHeaders = opts?.headers ?? {}; | ||
| return new Response(JSON.stringify({ text: 'hello world', duration: 5.2 }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key-123'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('fake audio data').toString('base64'), | ||
| }); | ||
|
|
||
| expect(calledUrl).toContain('api.openai.com'); | ||
| expect(calledUrl).toContain('/audio/transcriptions'); | ||
| expect(calledHeaders['Authorization']).toBe('Bearer test-key-123'); | ||
| expect(result.text).toBe('hello world'); | ||
| expect(result.duration).toBe(5.2); | ||
| expect(result.provider).toBe('openai'); | ||
| }); | ||
|
|
||
| it('calls Groq endpoint when provider=groq', async () => { | ||
| let calledUrl = ''; | ||
| globalThis.fetch = (async (url: any) => { | ||
| calledUrl = String(url); | ||
| return new Response(JSON.stringify({ text: 'groq result' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('fake').toString('base64'), | ||
| provider: 'groq', | ||
| }); | ||
|
|
||
| expect(calledUrl).toContain('api.groq.com'); | ||
| expect(result.provider).toBe('groq'); | ||
| }); | ||
|
|
||
| it('uses MEMORIX_AUDIO_PROVIDER env var', async () => { | ||
| let calledUrl = ''; | ||
| globalThis.fetch = (async (url: any) => { | ||
| calledUrl = String(url); | ||
| return new Response(JSON.stringify({ text: 'env result' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| process.env.MEMORIX_AUDIO_PROVIDER = 'groq'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('fake').toString('base64'), | ||
| }); | ||
|
|
||
| expect(calledUrl).toContain('api.groq.com'); | ||
| expect(result.provider).toBe('groq'); | ||
| }); | ||
|
|
||
| it('throws when no API key configured', async () => { | ||
| // Ensure no API keys are set | ||
| delete process.env.OPENAI_API_KEY; | ||
| delete process.env.MEMORIX_LLM_API_KEY; | ||
| delete process.env.MEMORIX_API_KEY; | ||
| delete process.env.ANTHROPIC_API_KEY; | ||
| delete process.env.OPENROUTER_API_KEY; | ||
|
|
||
| await expect( | ||
| transcribeAudio({ base64: 'dGVzdA==' }), | ||
| ).rejects.toThrow('No API key configured'); | ||
| }); | ||
|
|
||
| it('throws on API error response', async () => { | ||
| globalThis.fetch = (async () => { | ||
| return new Response('Rate limit exceeded', { status: 429 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| await expect( | ||
| transcribeAudio({ base64: Buffer.from('audio').toString('base64') }), | ||
| ).rejects.toThrow('Whisper API error (429)'); | ||
| }); | ||
|
|
||
| it('passes language parameter', async () => { | ||
| let formData: FormData | null = null; | ||
| globalThis.fetch = (async (_url: any, opts: any) => { | ||
| formData = opts?.body; | ||
| return new Response(JSON.stringify({ text: 'bonjour', language: 'fr' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('french audio').toString('base64'), | ||
| language: 'fr', | ||
| }); | ||
|
|
||
| expect(result.text).toBe('bonjour'); | ||
| expect(result.language).toBe('fr'); | ||
| // FormData should have language field | ||
| expect(formData).toBeTruthy(); | ||
| }); | ||
|
|
||
| it('ingestAudio stores observation with correct fields', async () => { | ||
| globalThis.fetch = (async () => { | ||
| return new Response(JSON.stringify({ text: 'transcribed content', duration: 30 }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
|
|
||
| let storedObs: Record<string, unknown> | null = null; | ||
| const storeFn = async (obs: Record<string, unknown>) => { | ||
| storedObs = obs; | ||
| return { observation: { id: 42 }, upserted: false }; | ||
| }; | ||
|
|
||
| const result = await ingestAudio( | ||
| { base64: Buffer.from('audio data').toString('base64'), filename: 'meeting-notes.mp3' }, | ||
| storeFn as any, | ||
| 'project-123', | ||
| ); | ||
|
|
||
| expect(result.observationId).toBe(42); | ||
| expect(result.text).toBe('transcribed content'); | ||
| expect(result.duration).toBe(30); | ||
| expect(storedObs).toBeTruthy(); | ||
| expect(storedObs!.entityName).toBe('meeting-notes'); | ||
| expect(storedObs!.type).toBe('discovery'); | ||
| expect(storedObs!.projectId).toBe('project-123'); | ||
| expect((storedObs!.concepts as string[])).toContain('audio'); | ||
| expect((storedObs!.concepts as string[])).toContain('transcript'); | ||
| }); | ||
|
|
||
| it('ingestAudio uses timestamp for unnamed files', async () => { | ||
| globalThis.fetch = (async () => { | ||
| return new Response(JSON.stringify({ text: 'text' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
|
|
||
| let storedObs: Record<string, unknown> | null = null; | ||
| const storeFn = async (obs: Record<string, unknown>) => { | ||
| storedObs = obs; | ||
| return { observation: { id: 1 }, upserted: false }; | ||
| }; | ||
|
|
||
| await ingestAudio( | ||
| { base64: Buffer.from('data').toString('base64') }, | ||
| storeFn as any, | ||
| 'proj', | ||
| ); | ||
|
|
||
| expect(storedObs).toBeTruthy(); | ||
| expect((storedObs!.entityName as string)).toMatch(/^audio-\d+$/); | ||
| }); | ||
| }); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`markInternalWrite()` is called before `ingestAudio()`, but audio ingestion performs a remote transcription that can run up to 120 seconds; the hot-reload skip window is only 10 seconds, so long transcriptions can expire the skip window before `storeObservation` writes. In that case the file watcher will treat this internal write as external and trigger unnecessary reload/reindex work, which can cause avoidable contention during ingest-heavy workflows. Useful? React with 👍 / 👎.