-
Notifications
You must be signed in to change notification settings - Fork 28
feat: audio ingestion via Whisper API #31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| /** | ||
| * Audio Loader — Whisper API Integration | ||
| * | ||
| * Transcribes audio files via OpenAI Whisper or Groq Whisper API, | ||
| * then stores the transcript as a Memorix observation. | ||
| * | ||
| * Supports: mp3, wav, m4a, webm, mp4, ogg, flac | ||
| * Providers: OpenAI (whisper-1), Groq (whisper-large-v3) | ||
| */ | ||
|
|
||
| import { getLLMApiKey } from '../config.js'; | ||
|
|
||
| // ── Types ──────────────────────────────────────────────────────────── | ||
|
|
||
/** Input payload for a single audio transcription request. */
export interface AudioInput {
  /** Base64-encoded audio data (decoded to a Buffer before upload). */
  base64: string;
  /** Audio MIME type (default: audio/mp3). */
  mimeType?: string;
  /** Original filename; also used to name the stored observation (extension stripped). */
  filename?: string;
  /** ISO language code passed to Whisper as a transcription hint. */
  language?: string;
  /** Whisper provider: openai or groq (falls back to MEMORIX_AUDIO_PROVIDER, then openai). */
  provider?: 'openai' | 'groq';
}
|
|
||
/** Result returned by {@link transcribeAudio}. */
export interface TranscriptionResult {
  /** Transcribed text. */
  text: string;
  /** Audio duration in seconds, when the API reports it. */
  duration?: number;
  /** Detected language, when the API reports it. */
  language?: string;
  /** Provider used for this transcription. */
  provider: string;
}
|
|
||
| // ── Provider Config ────────────────────────────────────────────────── | ||
|
|
||
// Supported Whisper backends. Both expose an OpenAI-compatible
// `/audio/transcriptions` endpoint; only the base URL and model differ.
const PROVIDERS = {
  openai: {
    baseUrl: 'https://api.openai.com/v1',
    model: 'whisper-1',
  },
  groq: {
    // Groq serves its OpenAI-compatible API under the /openai/v1 prefix.
    baseUrl: 'https://api.groq.com/openai/v1',
    model: 'whisper-large-v3',
  },
} as const;
|
|
||
| // ── Core Functions ─────────────────────────────────────────────────── | ||
|
|
||
| /** | ||
| * Transcribe audio via Whisper API. | ||
| * | ||
| * @throws Error if no API key configured or API returns error. | ||
| */ | ||
| export async function transcribeAudio(input: AudioInput): Promise<TranscriptionResult> { | ||
| const apiKey = getLLMApiKey(); | ||
| if (!apiKey) { | ||
| throw new Error( | ||
| 'No API key configured for audio transcription. ' + | ||
| 'Set MEMORIX_LLM_API_KEY, MEMORIX_API_KEY, or OPENAI_API_KEY.', | ||
| ); | ||
| } | ||
|
|
||
| const providerName = input.provider | ||
| ?? (process.env.MEMORIX_AUDIO_PROVIDER as 'openai' | 'groq' | undefined) | ||
| ?? 'openai'; | ||
| const config = PROVIDERS[providerName] ?? PROVIDERS.openai; | ||
|
|
||
| // Build multipart form | ||
| const audioBuffer = Buffer.from(input.base64, 'base64'); | ||
| const blob = new Blob([audioBuffer], { type: input.mimeType ?? 'audio/mp3' }); | ||
| const form = new FormData(); | ||
| form.append('file', blob, input.filename ?? 'audio.mp3'); | ||
| form.append('model', config.model); | ||
| form.append('response_format', 'json'); | ||
| if (input.language) { | ||
| form.append('language', input.language); | ||
| } | ||
|
|
||
| const response = await fetch(`${config.baseUrl}/audio/transcriptions`, { | ||
| method: 'POST', | ||
| headers: { 'Authorization': `Bearer ${apiKey}` }, | ||
| body: form, | ||
| signal: AbortSignal.timeout(120_000), // 2 min timeout for large files | ||
| }); | ||
|
|
||
| if (!response.ok) { | ||
| const errorText = await response.text().catch(() => 'unknown error'); | ||
| throw new Error(`Whisper API error (${response.status}): ${errorText}`); | ||
| } | ||
|
|
||
| const data = await response.json() as { | ||
| text: string; | ||
| duration?: number; | ||
| language?: string; | ||
| }; | ||
|
|
||
| return { | ||
| text: data.text, | ||
| duration: data.duration, | ||
| language: data.language, | ||
| provider: providerName, | ||
| }; | ||
| } | ||
|
|
||
| /** | ||
| * Transcribe audio and store as a Memorix observation. | ||
| */ | ||
| export async function ingestAudio( | ||
| input: AudioInput, | ||
| storeFn: (obs: { | ||
| entityName: string; | ||
| type: string; | ||
| title: string; | ||
| narrative: string; | ||
| concepts: string[]; | ||
| projectId: string; | ||
| }) => Promise<{ observation: { id: number }; upserted: boolean }>, | ||
| projectId: string, | ||
| ): Promise<{ observationId: number; text: string; duration?: number }> { | ||
| const result = await transcribeAudio(input); | ||
|
|
||
| const entityName = input.filename | ||
| ? input.filename.replace(/\.[^.]+$/, '') | ||
| : `audio-${Date.now()}`; | ||
|
|
||
| const { observation } = await storeFn({ | ||
| entityName, | ||
| type: 'discovery', | ||
| title: `Audio transcript: ${entityName}`, | ||
| narrative: result.text, | ||
| concepts: ['audio', 'transcript', ...(result.language ? [result.language] : [])], | ||
| projectId, | ||
| }); | ||
|
|
||
| return { | ||
| observationId: observation.id, | ||
| text: result.text, | ||
| duration: result.duration, | ||
| }; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| /** | ||
| * Multimodal Ingestion — Unified Entry Point | ||
| * | ||
| * Re-exports all multimodal loaders for convenient access. | ||
| */ | ||
|
|
||
| export { | ||
| transcribeAudio, | ||
| ingestAudio, | ||
| type AudioInput, | ||
| type TranscriptionResult, | ||
| } from './audio-loader.js'; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,173 @@ | ||
| import { describe, it, expect, afterEach, beforeEach } from 'bun:test'; | ||
|
Check failure on line 1 in tests/multimodal/audio-loader.test.ts
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The repository’s test command runs Vitest and includes Useful? React with 👍 / 👎. |
||
| import { transcribeAudio, ingestAudio } from '../../src/multimodal/audio-loader.js'; | ||
| import { resetConfigCache } from '../../src/config.js'; | ||
|
|
||
| describe('audio-loader', () => { | ||
| const originalFetch = globalThis.fetch; | ||
|
|
||
| beforeEach(() => { | ||
| resetConfigCache(); | ||
| }); | ||
|
|
||
| afterEach(() => { | ||
| globalThis.fetch = originalFetch; | ||
| // Clean up env vars | ||
| delete process.env.OPENAI_API_KEY; | ||
| delete process.env.MEMORIX_LLM_API_KEY; | ||
| delete process.env.MEMORIX_API_KEY; | ||
| delete process.env.MEMORIX_AUDIO_PROVIDER; | ||
| resetConfigCache(); | ||
| }); | ||
|
|
||
| it('calls OpenAI Whisper endpoint by default', async () => { | ||
| let calledUrl = ''; | ||
| let calledHeaders: Record<string, string> = {}; | ||
| globalThis.fetch = (async (url: any, opts: any) => { | ||
| calledUrl = String(url); | ||
| calledHeaders = opts?.headers ?? {}; | ||
| return new Response(JSON.stringify({ text: 'hello world', duration: 5.2 }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key-123'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('fake audio data').toString('base64'), | ||
| }); | ||
|
|
||
| expect(calledUrl).toContain('api.openai.com'); | ||
| expect(calledUrl).toContain('/audio/transcriptions'); | ||
| expect(calledHeaders['Authorization']).toBe('Bearer test-key-123'); | ||
| expect(result.text).toBe('hello world'); | ||
| expect(result.duration).toBe(5.2); | ||
| expect(result.provider).toBe('openai'); | ||
| }); | ||
|
|
||
| it('calls Groq endpoint when provider=groq', async () => { | ||
| let calledUrl = ''; | ||
| globalThis.fetch = (async (url: any) => { | ||
| calledUrl = String(url); | ||
| return new Response(JSON.stringify({ text: 'groq result' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('fake').toString('base64'), | ||
| provider: 'groq', | ||
| }); | ||
|
|
||
| expect(calledUrl).toContain('api.groq.com'); | ||
| expect(result.provider).toBe('groq'); | ||
| }); | ||
|
|
||
| it('uses MEMORIX_AUDIO_PROVIDER env var', async () => { | ||
| let calledUrl = ''; | ||
| globalThis.fetch = (async (url: any) => { | ||
| calledUrl = String(url); | ||
| return new Response(JSON.stringify({ text: 'env result' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| process.env.MEMORIX_AUDIO_PROVIDER = 'groq'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('fake').toString('base64'), | ||
| }); | ||
|
|
||
| expect(calledUrl).toContain('api.groq.com'); | ||
| expect(result.provider).toBe('groq'); | ||
| }); | ||
|
|
||
| it('throws when no API key configured', async () => { | ||
| // Ensure no API keys are set | ||
| delete process.env.OPENAI_API_KEY; | ||
| delete process.env.MEMORIX_LLM_API_KEY; | ||
| delete process.env.MEMORIX_API_KEY; | ||
| delete process.env.ANTHROPIC_API_KEY; | ||
| delete process.env.OPENROUTER_API_KEY; | ||
|
|
||
| await expect( | ||
| transcribeAudio({ base64: 'dGVzdA==' }), | ||
| ).rejects.toThrow('No API key configured'); | ||
| }); | ||
|
|
||
| it('throws on API error response', async () => { | ||
| globalThis.fetch = (async () => { | ||
| return new Response('Rate limit exceeded', { status: 429 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| await expect( | ||
| transcribeAudio({ base64: Buffer.from('audio').toString('base64') }), | ||
| ).rejects.toThrow('Whisper API error (429)'); | ||
| }); | ||
|
|
||
| it('passes language parameter', async () => { | ||
| let formData: FormData | null = null; | ||
| globalThis.fetch = (async (_url: any, opts: any) => { | ||
| formData = opts?.body; | ||
| return new Response(JSON.stringify({ text: 'bonjour', language: 'fr' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
| const result = await transcribeAudio({ | ||
| base64: Buffer.from('french audio').toString('base64'), | ||
| language: 'fr', | ||
| }); | ||
|
|
||
| expect(result.text).toBe('bonjour'); | ||
| expect(result.language).toBe('fr'); | ||
| // FormData should have language field | ||
| expect(formData).toBeTruthy(); | ||
| }); | ||
|
|
||
| it('ingestAudio stores observation with correct fields', async () => { | ||
| globalThis.fetch = (async () => { | ||
| return new Response(JSON.stringify({ text: 'transcribed content', duration: 30 }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
|
|
||
| let storedObs: Record<string, unknown> | null = null; | ||
| const storeFn = async (obs: Record<string, unknown>) => { | ||
| storedObs = obs; | ||
| return { observation: { id: 42 }, upserted: false }; | ||
| }; | ||
|
|
||
| const result = await ingestAudio( | ||
| { base64: Buffer.from('audio data').toString('base64'), filename: 'meeting-notes.mp3' }, | ||
| storeFn as any, | ||
| 'project-123', | ||
| ); | ||
|
|
||
| expect(result.observationId).toBe(42); | ||
| expect(result.text).toBe('transcribed content'); | ||
| expect(result.duration).toBe(30); | ||
| expect(storedObs).toBeTruthy(); | ||
| expect(storedObs!.entityName).toBe('meeting-notes'); | ||
| expect(storedObs!.type).toBe('discovery'); | ||
| expect(storedObs!.projectId).toBe('project-123'); | ||
| expect((storedObs!.concepts as string[])).toContain('audio'); | ||
| expect((storedObs!.concepts as string[])).toContain('transcript'); | ||
| }); | ||
|
|
||
| it('ingestAudio uses timestamp for unnamed files', async () => { | ||
| globalThis.fetch = (async () => { | ||
| return new Response(JSON.stringify({ text: 'text' }), { status: 200 }); | ||
| }) as typeof fetch; | ||
|
|
||
| process.env.OPENAI_API_KEY = 'test-key'; | ||
|
|
||
| let storedObs: Record<string, unknown> | null = null; | ||
| const storeFn = async (obs: Record<string, unknown>) => { | ||
| storedObs = obs; | ||
| return { observation: { id: 1 }, upserted: false }; | ||
| }; | ||
|
|
||
| await ingestAudio( | ||
| { base64: Buffer.from('data').toString('base64') }, | ||
| storeFn as any, | ||
| 'proj', | ||
| ); | ||
|
|
||
| expect(storedObs).toBeTruthy(); | ||
| expect((storedObs!.entityName as string)).toMatch(/^audio-\d+$/); | ||
| }); | ||
| }); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`markInternalWrite()` is called before `ingestAudio()`, but audio ingestion performs a remote transcription that can run up to 120 seconds; the hot-reload skip window is only 10 seconds, so long transcriptions can expire the skip window before `storeObservation` writes. In that case the file watcher will treat this internal write as external and trigger unnecessary reload/reindex work, which can cause avoidable contention during ingest-heavy workflows. Useful? React with 👍 / 👎.