diff --git a/src/multimodal/audio-loader.ts b/src/multimodal/audio-loader.ts
new file mode 100644
index 0000000..0c5f1ba
--- /dev/null
+++ b/src/multimodal/audio-loader.ts
@@ -0,0 +1,145 @@
+/**
+ * Audio Loader — Whisper API Integration
+ *
+ * Transcribes audio files via OpenAI Whisper or Groq Whisper API,
+ * then stores the transcript as a Memorix observation.
+ *
+ * Supports: mp3, wav, m4a, webm, mp4, ogg, flac
+ * Providers: OpenAI (whisper-1), Groq (whisper-large-v3)
+ */
+
+import { getLLMApiKey } from '../config.js';
+
+// ── Types ────────────────────────────────────────────────────────────
+
+export interface AudioInput {
+  /** Base64-encoded audio data */
+  base64: string;
+  /** Audio MIME type (default: audio/mp3) */
+  mimeType?: string;
+  /** Original filename */
+  filename?: string;
+  /** ISO language code for transcription hint */
+  language?: string;
+  /** Whisper provider: openai or groq */
+  provider?: 'openai' | 'groq';
+}
+
+export interface TranscriptionResult {
+  /** Transcribed text */
+  text: string;
+  /** Audio duration in seconds */
+  duration?: number;
+  /** Detected language */
+  language?: string;
+  /** Provider used */
+  provider: string;
+}
+
+// ── Provider Config ──────────────────────────────────────────────────
+
+const PROVIDERS = {
+  openai: {
+    baseUrl: 'https://api.openai.com/v1',
+    model: 'whisper-1',
+  },
+  groq: {
+    baseUrl: 'https://api.groq.com/openai/v1',
+    model: 'whisper-large-v3',
+  },
+} as const;
+
+// ── Core Functions ───────────────────────────────────────────────────
+
+/**
+ * Transcribe audio via Whisper API.
+ *
+ * @throws Error if no API key configured or API returns error.
+ */
+export async function transcribeAudio(input: AudioInput): Promise<TranscriptionResult> {
+  const apiKey = getLLMApiKey();
+  if (!apiKey) {
+    throw new Error(
+      'No API key configured for audio transcription. ' +
+        'Set MEMORIX_LLM_API_KEY, MEMORIX_API_KEY, or OPENAI_API_KEY.',
+    );
+  }
+
+  const providerName = input.provider
+    ?? (process.env.MEMORIX_AUDIO_PROVIDER as 'openai' | 'groq' | undefined)
+    ?? 'openai';
+  const config = PROVIDERS[providerName] ?? PROVIDERS.openai;
+
+  // Build multipart form
+  const audioBuffer = Buffer.from(input.base64, 'base64');
+  const blob = new Blob([audioBuffer], { type: input.mimeType ?? 'audio/mp3' });
+  const form = new FormData();
+  form.append('file', blob, input.filename ?? 'audio.mp3');
+  form.append('model', config.model);
+  form.append('response_format', 'json');
+  if (input.language) {
+    form.append('language', input.language);
+  }
+
+  const response = await fetch(`${config.baseUrl}/audio/transcriptions`, {
+    method: 'POST',
+    headers: { 'Authorization': `Bearer ${apiKey}` },
+    body: form,
+    signal: AbortSignal.timeout(120_000), // 2 min timeout for large files
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text().catch(() => 'unknown error');
+    throw new Error(`Whisper API error (${response.status}): ${errorText}`);
+  }
+
+  const data = await response.json() as {
+    text: string;
+    duration?: number;
+    language?: string;
+  };
+
+  return {
+    text: data.text,
+    duration: data.duration,
+    language: data.language,
+    provider: providerName,
+  };
+}
+
+/**
+ * Transcribe audio and store as a Memorix observation.
+ */
+export async function ingestAudio(
+  input: AudioInput,
+  storeFn: (obs: {
+    entityName: string;
+    type: string;
+    title: string;
+    narrative: string;
+    concepts: string[];
+    projectId: string;
+  }) => Promise<{ observation: { id: number }; upserted: boolean }>,
+  projectId: string,
+): Promise<{ observationId: number; text: string; duration?: number }> {
+  const result = await transcribeAudio(input);
+
+  const entityName = input.filename
+    ? input.filename.replace(/\.[^.]+$/, '')
+    : `audio-${Date.now()}`;
+
+  const { observation } = await storeFn({
+    entityName,
+    type: 'discovery',
+    title: `Audio transcript: ${entityName}`,
+    narrative: result.text,
+    concepts: ['audio', 'transcript', ...(result.language ? [result.language] : [])],
+    projectId,
+  });
+
+  return {
+    observationId: observation.id,
+    text: result.text,
+    duration: result.duration,
+  };
+}
diff --git a/src/multimodal/index.ts b/src/multimodal/index.ts
new file mode 100644
index 0000000..823b7fc
--- /dev/null
+++ b/src/multimodal/index.ts
@@ -0,0 +1,12 @@
+/**
+ * Multimodal Ingestion — Unified Entry Point
+ *
+ * Re-exports all multimodal loaders for convenient access.
+ */
+
+export {
+  transcribeAudio,
+  ingestAudio,
+  type AudioInput,
+  type TranscriptionResult,
+} from './audio-loader.js';
diff --git a/src/server.ts b/src/server.ts
index 69162fd..8b78565 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -3055,6 +3055,51 @@ export async function createMemorixServer(
     },
   );
 
+  // ── Multimodal Ingestion Tools ─────────────────────────────────────
+
+  server.registerTool(
+    'memorix_ingest_audio',
+    {
+      title: 'Ingest Audio',
+      description:
+        'Transcribe audio via Whisper API (OpenAI or Groq) and store the transcript as a memory observation. ' +
+        'Supports mp3, wav, m4a, webm, ogg, flac formats.',
+      inputSchema: {
+        base64: z.string().describe('Base64-encoded audio data'),
+        mimeType: z.string().optional().describe('Audio MIME type (e.g. audio/mp3)'),
+        filename: z.string().optional().describe('Original filename'),
+        language: z.string().optional().describe('ISO language code for transcription hint'),
+        provider: z.enum(['openai', 'groq']).optional().describe('Whisper provider (default: openai)'),
+      },
+    },
+    async (args) => {
+      try {
+        const { ingestAudio } = await import('./multimodal/index.js');
+        markInternalWrite();
+        const result = await ingestAudio(
+          args,
+          (obs) => storeObservation(obs),
+          project.id,
+        );
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `🎤 Audio transcribed (${result.duration ? result.duration.toFixed(1) + 's' : 'unknown duration'})\n` +
+              `Observation #${result.observationId}\n` +
+              `Preview: ${result.text.slice(0, 300)}${result.text.length > 300 ? '…' : ''}`,
+          }],
+        };
+      } catch (err: unknown) {
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `❌ Audio ingestion failed: ${err instanceof Error ? err.message : String(err)}`,
+          }],
+          isError: true,
+        };
+      }
+    },
+  );
   // Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked.
   // Sync advisory scan and file watcher are non-essential for tool functionality.
   const deferredInit = async () => {
diff --git a/tests/multimodal/audio-loader.test.ts b/tests/multimodal/audio-loader.test.ts
new file mode 100644
index 0000000..ce98f68
--- /dev/null
+++ b/tests/multimodal/audio-loader.test.ts
@@ -0,0 +1,173 @@
+import { describe, it, expect, afterEach, beforeEach } from 'bun:test';
+import { transcribeAudio, ingestAudio } from '../../src/multimodal/audio-loader.js';
+import { resetConfigCache } from '../../src/config.js';
+
+describe('audio-loader', () => {
+  const originalFetch = globalThis.fetch;
+
+  beforeEach(() => {
+    resetConfigCache();
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+    // Clean up env vars
+    delete process.env.OPENAI_API_KEY;
+    delete process.env.MEMORIX_LLM_API_KEY;
+    delete process.env.MEMORIX_API_KEY;
+    delete process.env.MEMORIX_AUDIO_PROVIDER;
+    resetConfigCache();
+  });
+
+  it('calls OpenAI Whisper endpoint by default', async () => {
+    let calledUrl = '';
+    let calledHeaders: Record<string, string> = {};
+    globalThis.fetch = (async (url: any, opts: any) => {
+      calledUrl = String(url);
+      calledHeaders = opts?.headers ?? {};
+      return new Response(JSON.stringify({ text: 'hello world', duration: 5.2 }), { status: 200 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key-123';
+    const result = await transcribeAudio({
+      base64: Buffer.from('fake audio data').toString('base64'),
+    });
+
+    expect(calledUrl).toContain('api.openai.com');
+    expect(calledUrl).toContain('/audio/transcriptions');
+    expect(calledHeaders['Authorization']).toBe('Bearer test-key-123');
+    expect(result.text).toBe('hello world');
+    expect(result.duration).toBe(5.2);
+    expect(result.provider).toBe('openai');
+  });
+
+  it('calls Groq endpoint when provider=groq', async () => {
+    let calledUrl = '';
+    globalThis.fetch = (async (url: any) => {
+      calledUrl = String(url);
+      return new Response(JSON.stringify({ text: 'groq result' }), { status: 200 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key';
+    const result = await transcribeAudio({
+      base64: Buffer.from('fake').toString('base64'),
+      provider: 'groq',
+    });
+
+    expect(calledUrl).toContain('api.groq.com');
+    expect(result.provider).toBe('groq');
+  });
+
+  it('uses MEMORIX_AUDIO_PROVIDER env var', async () => {
+    let calledUrl = '';
+    globalThis.fetch = (async (url: any) => {
+      calledUrl = String(url);
+      return new Response(JSON.stringify({ text: 'env result' }), { status: 200 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key';
+    process.env.MEMORIX_AUDIO_PROVIDER = 'groq';
+    const result = await transcribeAudio({
+      base64: Buffer.from('fake').toString('base64'),
+    });
+
+    expect(calledUrl).toContain('api.groq.com');
+    expect(result.provider).toBe('groq');
+  });
+
+  it('throws when no API key configured', async () => {
+    // Ensure no API keys are set
+    delete process.env.OPENAI_API_KEY;
+    delete process.env.MEMORIX_LLM_API_KEY;
+    delete process.env.MEMORIX_API_KEY;
+    delete process.env.ANTHROPIC_API_KEY;
+    delete process.env.OPENROUTER_API_KEY;
+
+    await expect(
+      transcribeAudio({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('No API key configured');
+  });
+
+  it('throws on API error response', async () => {
+    globalThis.fetch = (async () => {
+      return new Response('Rate limit exceeded', { status: 429 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key';
+    await expect(
+      transcribeAudio({ base64: Buffer.from('audio').toString('base64') }),
+    ).rejects.toThrow('Whisper API error (429)');
+  });
+
+  it('passes language parameter', async () => {
+    let formData: FormData | null = null;
+    globalThis.fetch = (async (_url: any, opts: any) => {
+      formData = opts?.body;
+      return new Response(JSON.stringify({ text: 'bonjour', language: 'fr' }), { status: 200 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key';
+    const result = await transcribeAudio({
+      base64: Buffer.from('french audio').toString('base64'),
+      language: 'fr',
+    });
+
+    expect(result.text).toBe('bonjour');
+    expect(result.language).toBe('fr');
+    // FormData should have language field
+    expect(formData).toBeTruthy();
+  });
+
+  it('ingestAudio stores observation with correct fields', async () => {
+    globalThis.fetch = (async () => {
+      return new Response(JSON.stringify({ text: 'transcribed content', duration: 30 }), { status: 200 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key';
+
+    let storedObs: Record<string, unknown> | null = null;
+    const storeFn = async (obs: Record<string, unknown>) => {
+      storedObs = obs;
+      return { observation: { id: 42 }, upserted: false };
+    };
+
+    const result = await ingestAudio(
+      { base64: Buffer.from('audio data').toString('base64'), filename: 'meeting-notes.mp3' },
+      storeFn as any,
+      'project-123',
+    );
+
+    expect(result.observationId).toBe(42);
+    expect(result.text).toBe('transcribed content');
+    expect(result.duration).toBe(30);
+    expect(storedObs).toBeTruthy();
+    expect(storedObs!.entityName).toBe('meeting-notes');
+    expect(storedObs!.type).toBe('discovery');
+    expect(storedObs!.projectId).toBe('project-123');
+    expect((storedObs!.concepts as string[])).toContain('audio');
+    expect((storedObs!.concepts as string[])).toContain('transcript');
+  });
+
+  it('ingestAudio uses timestamp for unnamed files', async () => {
+    globalThis.fetch = (async () => {
+      return new Response(JSON.stringify({ text: 'text' }), { status: 200 });
+    }) as typeof fetch;
+
+    process.env.OPENAI_API_KEY = 'test-key';
+
+    let storedObs: Record<string, unknown> | null = null;
+    const storeFn = async (obs: Record<string, unknown>) => {
+      storedObs = obs;
+      return { observation: { id: 1 }, upserted: false };
+    };
+
+    await ingestAudio(
+      { base64: Buffer.from('data').toString('base64') },
+      storeFn as any,
+      'proj',
+    );
+
+    expect(storedObs).toBeTruthy();
+    expect((storedObs!.entityName as string)).toMatch(/^audio-\d+$/);
+  });
+});