Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions src/multimodal/audio-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/**
* Audio Loader — Whisper API Integration
*
* Transcribes audio files via OpenAI Whisper or Groq Whisper API,
* then stores the transcript as a Memorix observation.
*
* Supports: mp3, wav, m4a, webm, mp4, ogg, flac
* Providers: OpenAI (whisper-1), Groq (whisper-large-v3)
*/

import { getLLMApiKey } from '../config.js';

// ── Types ────────────────────────────────────────────────────────────

/** Input payload for a single audio transcription request. */
export interface AudioInput {
/** Base64-encoded audio data */
base64: string;
/** Audio MIME type (default: audio/mp3) */
mimeType?: string;
/** Original filename — also used to derive the stored entity name */
filename?: string;
/** ISO language code for transcription hint */
language?: string;
/** Whisper provider: openai or groq */
provider?: 'openai' | 'groq';
}

/** Result returned by transcribeAudio. */
export interface TranscriptionResult {
/** Transcribed text */
text: string;
/** Audio duration in seconds (only present when the API reports it) */
duration?: number;
/** Detected language (only present when the API reports it) */
language?: string;
/** Provider used */
provider: string;
}

// ── Provider Config ──────────────────────────────────────────────────

// Per-provider endpoint and Whisper model. Both providers expose an
// OpenAI-compatible /audio/transcriptions route, so the request shape is
// identical; only baseUrl and model differ. `as const` keeps keys and
// model names as literal types.
const PROVIDERS = {
openai: {
baseUrl: 'https://api.openai.com/v1',
model: 'whisper-1',
},
groq: {
baseUrl: 'https://api.groq.com/openai/v1',
model: 'whisper-large-v3',
},
} as const;

// ── Core Functions ───────────────────────────────────────────────────

/**
 * Transcribe audio via Whisper API.
 *
 * Provider resolution order: explicit `input.provider`, then the
 * MEMORIX_AUDIO_PROVIDER env var, then 'openai'. An unrecognized provider
 * name (e.g. a typo in the env var) falls back to OpenAI, and the reported
 * `provider` field reflects the provider actually called — previously the
 * raw, possibly-invalid requested name leaked into the result.
 *
 * @param input Audio payload (base64 data plus optional hints).
 * @returns Transcript text with optional duration/language metadata.
 * @throws Error if no API key configured or API returns error.
 */
export async function transcribeAudio(input: AudioInput): Promise<TranscriptionResult> {
  const apiKey = getLLMApiKey();
  if (!apiKey) {
    throw new Error(
      'No API key configured for audio transcription. ' +
      'Set MEMORIX_LLM_API_KEY, MEMORIX_API_KEY, or OPENAI_API_KEY.',
    );
  }

  // Normalize the provider so `config` and the reported provider name can
  // never disagree: an unknown name resolves to 'openai' in BOTH places.
  const requested = input.provider
    ?? (process.env.MEMORIX_AUDIO_PROVIDER as 'openai' | 'groq' | undefined)
    ?? 'openai';
  const providerName: 'openai' | 'groq' = requested in PROVIDERS ? requested : 'openai';
  const config = PROVIDERS[providerName];

  // Build multipart form; fetch sets the multipart boundary header itself,
  // so only Authorization is supplied explicitly.
  const audioBuffer = Buffer.from(input.base64, 'base64');
  const blob = new Blob([audioBuffer], { type: input.mimeType ?? 'audio/mp3' });
  const form = new FormData();
  form.append('file', blob, input.filename ?? 'audio.mp3');
  form.append('model', config.model);
  form.append('response_format', 'json');
  if (input.language) {
    form.append('language', input.language);
  }

  const response = await fetch(`${config.baseUrl}/audio/transcriptions`, {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${apiKey}` },
    body: form,
    signal: AbortSignal.timeout(120_000), // 2 min timeout for large files
  });

  if (!response.ok) {
    // Body read is best-effort: a failed read must not mask the HTTP status.
    const errorText = await response.text().catch(() => 'unknown error');
    throw new Error(`Whisper API error (${response.status}): ${errorText}`);
  }

  const data = await response.json() as {
    text: string;
    duration?: number;
    language?: string;
  };

  return {
    text: data.text,
    duration: data.duration,
    language: data.language,
    provider: providerName,
  };
}

/**
 * Transcribe audio and store as a Memorix observation.
 *
 * The transcript becomes the observation narrative; the entity name is the
 * filename with its extension stripped, or a timestamped fallback when no
 * filename is supplied.
 *
 * @param input     Audio payload to transcribe.
 * @param storeFn   Callback that persists the observation. Its `type` field
 *                  is the literal 'discovery' (not plain string) so the obs
 *                  object is assignable to stores whose `type` parameter is
 *                  a narrower union (ObservationType) — fixes the typecheck
 *                  failure at the server.ts call site while remaining
 *                  assignable from callbacks that accept a wider string.
 * @param projectId Project the observation belongs to.
 * @returns Stored observation id plus the transcript text and duration.
 */
export async function ingestAudio(
  input: AudioInput,
  storeFn: (obs: {
    entityName: string;
    type: 'discovery';
    title: string;
    narrative: string;
    concepts: string[];
    projectId: string;
  }) => Promise<{ observation: { id: number }; upserted: boolean }>,
  projectId: string,
): Promise<{ observationId: number; text: string; duration?: number }> {
  const result = await transcribeAudio(input);

  // Entity name: filename without its trailing extension, else a unique
  // timestamp-based name.
  const entityName = input.filename
    ? input.filename.replace(/\.[^.]+$/, '')
    : `audio-${Date.now()}`;

  const { observation } = await storeFn({
    entityName,
    type: 'discovery',
    title: `Audio transcript: ${entityName}`,
    narrative: result.text,
    // Tag with the detected language (when reported) for later filtering.
    concepts: ['audio', 'transcript', ...(result.language ? [result.language] : [])],
    projectId,
  });

  return {
    observationId: observation.id,
    text: result.text,
    duration: result.duration,
  };
}
12 changes: 12 additions & 0 deletions src/multimodal/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/**
* Multimodal Ingestion — Unified Entry Point
*
* Re-exports all multimodal loaders for convenient access.
*/

// Runtime re-exports.
export { transcribeAudio, ingestAudio } from './audio-loader.js';

// Type-only re-exports (erased at compile time; keeps isolatedModules happy).
export type { AudioInput, TranscriptionResult } from './audio-loader.js';
45 changes: 45 additions & 0 deletions src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3055,6 +3055,51 @@
},
);

// ── Multimodal Ingestion Tools ─────────────────────────────────────

server.registerTool(
'memorix_ingest_audio',
{
title: 'Ingest Audio',
description:
'Transcribe audio via Whisper API (OpenAI or Groq) and store the transcript as a memory observation. ' +
'Supports mp3, wav, m4a, webm, ogg, flac formats.',
inputSchema: {
base64: z.string().describe('Base64-encoded audio data'),
mimeType: z.string().optional().describe('Audio MIME type (e.g. audio/mp3)'),
filename: z.string().optional().describe('Original filename'),
language: z.string().optional().describe('ISO language code for transcription hint'),
provider: z.enum(['openai', 'groq']).optional().describe('Whisper provider (default: openai)'),
},
},
async (args) => {
try {
const { ingestAudio } = await import('./multimodal/index.js');
markInternalWrite();
const result = await ingestAudio(
Comment on lines +3078 to +3079
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Move internal-write marker to the actual write point

markInternalWrite() is called before ingestAudio(), but audio ingestion performs a remote transcription that can run up to 120 seconds; the hot-reload skip window is only 10 seconds, so long transcriptions can expire the skip window before storeObservation writes. In that case the file watcher will treat this internal write as external and trigger unnecessary reload/reindex work, which can cause avoidable contention during ingest-heavy workflows.

Useful? React with 👍 / 👎.

args,
(obs) => storeObservation(obs),

Check failure on line 3081 in src/server.ts

View workflow job for this annotation

GitHub Actions / typecheck

Argument of type '{ entityName: string; type: string; title: string; narrative: string; concepts: string[]; projectId: string; }' is not assignable to parameter of type '{ entityName: string; type: ObservationType; title: string; narrative: string; facts?: string[] | undefined; filesModified?: string[] | undefined; concepts?: string[] | undefined; ... 7 more ...; relatedEntities?: string[] | undefined; }'.
project.id,
);
return {
content: [{
type: 'text' as const,
text: `🎤 Audio transcribed (${result.duration ? result.duration.toFixed(1) + 's' : 'unknown duration'})\n` +
`Observation #${result.observationId}\n` +
`Preview: ${result.text.slice(0, 300)}${result.text.length > 300 ? '…' : ''}`,
}],
};
} catch (err: unknown) {
return {
content: [{
type: 'text' as const,
text: `❌ Audio ingestion failed: ${err instanceof Error ? err.message : String(err)}`,
}],
isError: true,
};
}
},
);
// Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked.
// Sync advisory scan and file watcher are non-essential for tool functionality.
const deferredInit = async () => {
Expand Down
173 changes: 173 additions & 0 deletions tests/multimodal/audio-loader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import { describe, it, expect, afterEach, beforeEach } from 'bun:test';

Check failure on line 1 in tests/multimodal/audio-loader.test.ts

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, 22)

tests/multimodal/audio-loader.test.ts

Error: Cannot find package 'bun:test' imported from '/home/runner/work/memorix/memorix/tests/multimodal/audio-loader.test.ts' ❯ tests/multimodal/audio-loader.test.ts:1:1 ⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯ Serialized Error: { code: 'ERR_MODULE_NOT_FOUND' } Caused by: Caused by: Error: Failed to load url bun:test (resolved id: bun:test). Does the file exist? ❯ loadAndTransform node_modules/vite/dist/node/chunks/dep-D4NMHUTW.js:35729:17

Check failure on line 1 in tests/multimodal/audio-loader.test.ts

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, 20)

tests/multimodal/audio-loader.test.ts

Error: Cannot find package 'bun:test' imported from '/home/runner/work/memorix/memorix/tests/multimodal/audio-loader.test.ts' ❯ tests/multimodal/audio-loader.test.ts:1:1 ⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯ Serialized Error: { code: 'ERR_MODULE_NOT_FOUND' } Caused by: Caused by: Error: Failed to load url bun:test (resolved id: bun:test). Does the file exist? ❯ loadAndTransform node_modules/vite/dist/node/chunks/dep-D4NMHUTW.js:35729:17
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Use Vitest APIs in the new test file

The repository’s test command runs Vitest and includes tests/**/*.test.ts, but this new test imports from bun:test. Under the declared test runner, that import is unresolved, so CI/local npm test will fail before executing these assertions. Switching this file to Vitest imports (or globals) is necessary to keep the test suite runnable.

Useful? React with 👍 / 👎.

import { transcribeAudio, ingestAudio } from '../../src/multimodal/audio-loader.js';
import { resetConfigCache } from '../../src/config.js';

// Unit tests for audio-loader: every test mocks globalThis.fetch so no
// network is touched, and drives provider/API-key selection via env vars.
// Teardown order matters: afterEach restores fetch, clears every env var a
// test might have set, then resets the config cache so key lookups in the
// next test see fresh state.
describe('audio-loader', () => {
  // Saved once so each test's fetch mock can be undone in afterEach.
  const originalFetch = globalThis.fetch;

  beforeEach(() => {
    resetConfigCache();
  });

  afterEach(() => {
    globalThis.fetch = originalFetch;
    // Clean up env vars
    delete process.env.OPENAI_API_KEY;
    delete process.env.MEMORIX_LLM_API_KEY;
    delete process.env.MEMORIX_API_KEY;
    delete process.env.MEMORIX_AUDIO_PROVIDER;
    resetConfigCache();
  });

  // Default path: no provider hint anywhere → OpenAI endpoint, bearer auth.
  it('calls OpenAI Whisper endpoint by default', async () => {
    let calledUrl = '';
    let calledHeaders: Record<string, string> = {};
    globalThis.fetch = (async (url: any, opts: any) => {
      calledUrl = String(url);
      calledHeaders = opts?.headers ?? {};
      return new Response(JSON.stringify({ text: 'hello world', duration: 5.2 }), { status: 200 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key-123';
    const result = await transcribeAudio({
      base64: Buffer.from('fake audio data').toString('base64'),
    });

    expect(calledUrl).toContain('api.openai.com');
    expect(calledUrl).toContain('/audio/transcriptions');
    expect(calledHeaders['Authorization']).toBe('Bearer test-key-123');
    expect(result.text).toBe('hello world');
    expect(result.duration).toBe(5.2);
    expect(result.provider).toBe('openai');
  });

  // Explicit input.provider overrides the default.
  it('calls Groq endpoint when provider=groq', async () => {
    let calledUrl = '';
    globalThis.fetch = (async (url: any) => {
      calledUrl = String(url);
      return new Response(JSON.stringify({ text: 'groq result' }), { status: 200 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key';
    const result = await transcribeAudio({
      base64: Buffer.from('fake').toString('base64'),
      provider: 'groq',
    });

    expect(calledUrl).toContain('api.groq.com');
    expect(result.provider).toBe('groq');
  });

  // Env-var provider selection kicks in when input.provider is absent.
  it('uses MEMORIX_AUDIO_PROVIDER env var', async () => {
    let calledUrl = '';
    globalThis.fetch = (async (url: any) => {
      calledUrl = String(url);
      return new Response(JSON.stringify({ text: 'env result' }), { status: 200 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key';
    process.env.MEMORIX_AUDIO_PROVIDER = 'groq';
    const result = await transcribeAudio({
      base64: Buffer.from('fake').toString('base64'),
    });

    expect(calledUrl).toContain('api.groq.com');
    expect(result.provider).toBe('groq');
  });

  // Missing-key guard: deletes every key variant the config might read
  // before calling, so the error path is exercised deterministically.
  it('throws when no API key configured', async () => {
    // Ensure no API keys are set
    delete process.env.OPENAI_API_KEY;
    delete process.env.MEMORIX_LLM_API_KEY;
    delete process.env.MEMORIX_API_KEY;
    delete process.env.ANTHROPIC_API_KEY;
    delete process.env.OPENROUTER_API_KEY;

    await expect(
      transcribeAudio({ base64: 'dGVzdA==' }),
    ).rejects.toThrow('No API key configured');
  });

  // Non-2xx responses surface as an error carrying the HTTP status code.
  it('throws on API error response', async () => {
    globalThis.fetch = (async () => {
      return new Response('Rate limit exceeded', { status: 429 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key';
    await expect(
      transcribeAudio({ base64: Buffer.from('audio').toString('base64') }),
    ).rejects.toThrow('Whisper API error (429)');
  });

  // Language hint is forwarded in the multipart form and echoed back.
  it('passes language parameter', async () => {
    let formData: FormData | null = null;
    globalThis.fetch = (async (_url: any, opts: any) => {
      formData = opts?.body;
      return new Response(JSON.stringify({ text: 'bonjour', language: 'fr' }), { status: 200 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key';
    const result = await transcribeAudio({
      base64: Buffer.from('french audio').toString('base64'),
      language: 'fr',
    });

    expect(result.text).toBe('bonjour');
    expect(result.language).toBe('fr');
    // FormData should have language field
    expect(formData).toBeTruthy();
  });

  // End-to-end ingest: transcript becomes the narrative, filename (minus
  // extension) becomes the entity name, and concepts are tagged.
  it('ingestAudio stores observation with correct fields', async () => {
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({ text: 'transcribed content', duration: 30 }), { status: 200 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key';

    let storedObs: Record<string, unknown> | null = null;
    const storeFn = async (obs: Record<string, unknown>) => {
      storedObs = obs;
      return { observation: { id: 42 }, upserted: false };
    };

    const result = await ingestAudio(
      { base64: Buffer.from('audio data').toString('base64'), filename: 'meeting-notes.mp3' },
      storeFn as any,
      'project-123',
    );

    expect(result.observationId).toBe(42);
    expect(result.text).toBe('transcribed content');
    expect(result.duration).toBe(30);
    expect(storedObs).toBeTruthy();
    expect(storedObs!.entityName).toBe('meeting-notes');
    expect(storedObs!.type).toBe('discovery');
    expect(storedObs!.projectId).toBe('project-123');
    expect((storedObs!.concepts as string[])).toContain('audio');
    expect((storedObs!.concepts as string[])).toContain('transcript');
  });

  // No filename → entity name falls back to audio-<epoch-millis>.
  it('ingestAudio uses timestamp for unnamed files', async () => {
    globalThis.fetch = (async () => {
      return new Response(JSON.stringify({ text: 'text' }), { status: 200 });
    }) as typeof fetch;

    process.env.OPENAI_API_KEY = 'test-key';

    let storedObs: Record<string, unknown> | null = null;
    const storeFn = async (obs: Record<string, unknown>) => {
      storedObs = obs;
      return { observation: { id: 1 }, upserted: false };
    };

    await ingestAudio(
      { base64: Buffer.from('data').toString('base64') },
      storeFn as any,
      'proj',
    );

    expect(storedObs).toBeTruthy();
    expect((storedObs!.entityName as string)).toMatch(/^audio-\d+$/);
  });
});
Loading