diff --git a/src/multimodal/pdf-loader.ts b/src/multimodal/pdf-loader.ts
new file mode 100644
index 0000000..e0abb07
--- /dev/null
+++ b/src/multimodal/pdf-loader.ts
@@ -0,0 +1,182 @@
+/**
+ * PDF Loader — unpdf Integration
+ *
+ * Extracts text from PDFs using unpdf (pure JS, optional dependency).
+ * Creates per-page observations for searchable memory storage.
+ */
+
+// ── Types ────────────────────────────────────────────────────────────
+
+/** Input accepted by the PDF extraction and ingestion functions. */
+export interface PdfInput {
+  /** Base64-encoded bytes of the PDF document. */
+  base64: string;
+  /** Original filename; without its extension it names the stored entity. */
+  filename?: string;
+  /** Upper bound on the number of leading pages to extract (default: 100). */
+  maxPages?: number;
+}
+
+/** Text extracted from a single PDF page. */
+export interface PdfPage {
+  /** 1-based position of the page in the source document. */
+  pageNumber: number;
+  /** Whitespace-trimmed page text. */
+  text: string;
+  /** Length of `text` in UTF-16 code units. */
+  charCount: number;
+}
+
+/** Outcome of a PDF text extraction. */
+export interface PdfExtractionResult {
+  /** Pages within the `maxPages` cap that held enough text to keep. */
+  pages: PdfPage[];
+  /** Number of raw pages returned by the extractor, before cap and filtering. */
+  totalPages: number;
+  /** Library that produced the text. */
+  extractionMethod: 'unpdf';
+}
+
+// ── Constants ────────────────────────────────────────────────────────
+
+/** Pages examined when `PdfInput.maxPages` is omitted. */
+const DEFAULT_MAX_PAGES = 100;
+
+/** Pages whose trimmed text is shorter than this are skipped. */
+const MIN_PAGE_CHARS = 10;
+
+/** Longest narrative, in UTF-16 code units, stored for a single page. */
+const MAX_NARRATIVE_CHARS = 5000;
+
+// ── Core Functions ───────────────────────────────────────────────────
+
+/**
+ * Extract text from a PDF document page-by-page.
+ *
+ * Only the first `maxPages` pages are examined, and pages with fewer than
+ * `MIN_PAGE_CHARS` characters of trimmed text are dropped, so `pages` may be
+ * shorter than `totalPages`.
+ *
+ * @param input - Base64-encoded PDF plus optional filename and page cap.
+ * @returns The retained pages and the raw page count.
+ * @throws Error if unpdf is not installed (it's an optional dependency).
+ */
+export async function extractPdfText(input: PdfInput): Promise<PdfExtractionResult> {
+  // Dynamic import — unpdf is optional
+  const unpdf = await import('unpdf').catch(() => null) as {
+    extractText: (
+      data: Uint8Array,
+      options?: { mergePages?: boolean },
+    ) => Promise<{ totalPages: number; text: string | string[]; pages?: string[] }>;
+  } | null;
+
+  if (!unpdf) {
+    throw new Error(
+      'unpdf is not installed. To enable PDF ingestion, run:\n' +
+      ' npm install unpdf\n' +
+      'or: bun add unpdf',
+    );
+  }
+
+  const buffer = Buffer.from(input.base64, 'base64');
+  const maxPages = input.maxPages ?? DEFAULT_MAX_PAGES;
+
+  const result = await unpdf.extractText(new Uint8Array(buffer), { mergePages: false });
+
+  // With mergePages: false unpdf returns `text` as one string per page, so
+  // calling `.split` on it unconditionally would throw. A `pages` array or a
+  // single form-feed-delimited string is still accepted as a fallback.
+  const rawPages: string[] = Array.isArray(result.text)
+    ? result.text
+    : result.pages ?? result.text.split('\f');
+
+  const pages: PdfPage[] = [];
+  const limit = Math.min(rawPages.length, maxPages);
+
+  for (let i = 0; i < limit; i++) {
+    const text = String(rawPages[i] ?? '').trim();
+    // Skip pages with too little text to be worth storing.
+    if (text.length >= MIN_PAGE_CHARS) {
+      pages.push({
+        pageNumber: i + 1,
+        text,
+        charCount: text.length,
+      });
+    }
+  }
+
+  return {
+    pages,
+    totalPages: rawPages.length,
+    extractionMethod: 'unpdf',
+  };
+}
+
+/**
+ * Cap a page's text at `MAX_NARRATIVE_CHARS`, appending '…' when shortened.
+ * The cut moves back one code unit when it would split a surrogate pair, so
+ * the stored text never ends in a lone high surrogate.
+ */
+function truncateNarrative(text: string): string {
+  if (text.length <= MAX_NARRATIVE_CHARS) return text;
+
+  const lastKept = text.charCodeAt(MAX_NARRATIVE_CHARS - 1);
+  const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff;
+  const end = splitsPair ? MAX_NARRATIVE_CHARS - 1 : MAX_NARRATIVE_CHARS;
+  return text.slice(0, end) + '…';
+}
+
+/**
+ * Extract PDF text and store each page as a Memorix observation.
+ *
+ * Pages are stored one at a time, in page order, through `storeFn`.
+ *
+ * @param input - PDF to ingest; `filename` minus its extension becomes the entity name.
+ * @param storeFn - Persists one observation and resolves with the stored record.
+ * @param projectId - Project id attached to every stored observation.
+ * @returns Ids of the stored observations, the number of pages stored, and the
+ *   summed (untruncated) character count of those pages.
+ * @throws Whatever `extractPdfText` or `storeFn` throws; nothing is caught here.
+ */
+export async function ingestPdf(
+  input: PdfInput,
+  storeFn: (obs: {
+    entityName: string;
+    type: string;
+    title: string;
+    narrative: string;
+    concepts: string[];
+    projectId: string;
+  }) => Promise<{ observation: { id: number }; upserted: boolean }>,
+  projectId: string,
+): Promise<{ observationIds: number[]; pagesProcessed: number; totalChars: number }> {
+  const extraction = await extractPdfText(input);
+
+  // Fall back to a timestamped name when no filename is given, or when the
+  // filename is only an extension (e.g. ".pdf") and stripping it leaves nothing.
+  const stem = input.filename?.replace(/\.[^.]+$/, '').trim();
+  const entityName = stem || `pdf-${Date.now()}`;
+
+  const observationIds: number[] = [];
+  let totalChars = 0;
+
+  for (const page of extraction.pages) {
+    const { observation } = await storeFn({
+      entityName,
+      type: 'discovery',
+      title: `${entityName} — Page ${page.pageNumber}`,
+      narrative: truncateNarrative(page.text),
+      concepts: ['pdf', 'document', entityName],
+      projectId,
+    });
+
+    observationIds.push(observation.id);
+    totalChars += page.charCount;
+  }
+
+  return {
+    observationIds,
+    pagesProcessed: extraction.pages.length,
+    totalChars,
+  };
+}
diff --git a/src/server.ts b/src/server.ts
index 69162fd..248289f 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -3055,6 +3055,48 @@ export async function createMemorixServer(
     },
   );
 
+  // ── Multimodal Ingestion: PDF ───────────────────────────────────────
+
+  server.registerTool(
+    'memorix_ingest_pdf',
+    {
+      title: 'Ingest PDF',
+      description:
+        'Extract text from a PDF document and store each page as a memory observation. ' +
+        'Requires unpdf (optional dependency: npm install unpdf).',
+      inputSchema: {
+        base64: z.string().describe('Base64-encoded PDF data'),
+        filename: z.string().optional().describe('Original filename'),
+        maxPages: z.number().optional().describe('Max pages to extract (default: 100)'),
+      },
+    },
+    async (args) => {
+      try {
+        const { ingestPdf } = await import('./multimodal/pdf-loader.js');
+        markInternalWrite();
+        const result = await ingestPdf(
+          args,
+          (obs) => storeObservation(obs),
+          project.id,
+        );
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `\uD83D\uDCC4 PDF ingested: ${result.pagesProcessed} pages, ${result.totalChars} chars\n` +
+              `Observations: ${result.observationIds.join(', ')}`,
+          }],
+        };
+      } catch (err: unknown) {
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `\u274C PDF ingestion failed: ${err instanceof Error ? err.message : String(err)}`,
+          }],
+          isError: true,
+        };
+      }
+    },
+  );
   // Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked.
   // Sync advisory scan and file watcher are non-essential for tool functionality.
   const deferredInit = async () => {
diff --git a/tests/multimodal/pdf-loader.test.ts b/tests/multimodal/pdf-loader.test.ts
new file mode 100644
index 0000000..0035084
--- /dev/null
+++ b/tests/multimodal/pdf-loader.test.ts
@@ -0,0 +1,31 @@
+import { describe, it, expect } from 'bun:test';
+import { extractPdfText, ingestPdf, type PdfInput } from '../../src/multimodal/pdf-loader.js';
+
+describe('pdf-loader', () => {
+  it('throws clear error when unpdf is not installed', async () => {
+    await expect(
+      extractPdfText({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('unpdf is not installed');
+  });
+
+  it('error message includes install instructions', async () => {
+    // Asserting through `rejects` makes the test fail when nothing is thrown.
+    await expect(
+      extractPdfText({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('npm install unpdf');
+  });
+
+  it('ingestPdf propagates extractPdfText errors', async () => {
+    const storeFn = async () => ({ observation: { id: 1 }, upserted: false });
+    await expect(
+      ingestPdf({ base64: 'dGVzdA==' }, storeFn, 'proj-1'),
+    ).rejects.toThrow('unpdf is not installed');
+  });
+
+  it('PdfInput interface accepts all expected fields', () => {
+    const input: PdfInput = { base64: 'test', filename: 'doc.pdf', maxPages: 5 };
+    expect(input.base64).toBe('test');
+    expect(input.filename).toBe('doc.pdf');
+    expect(input.maxPages).toBe(5);
+  });
+});