AVIDS2 · RaviTharuma · Mar 29, 2026 · chatgpt-codex-connector · Mar 29, 2026 · chatgpt-codex-connector
diff --git a/src/multimodal/pdf-loader.ts b/src/multimodal/pdf-loader.ts
@@ -0,0 +1,125 @@
+/**
+ * PDF Loader — unpdf Integration
+ *
+ * Extracts text from PDFs using unpdf (pure JS, optional dependency).
+ * Creates per-page observations for searchable memory storage.
+ */
+
+// ── Types ────────────────────────────────────────────────────────────
+
+/** Input accepted by the PDF extraction and ingestion functions. */
+export interface PdfInput {
+  /** PDF file contents, base64-encoded. */
+  base64: string;
+  /** Optional original file name; `ingestPdf` strips its last extension to build the entity name. */
+  filename?: string;
+  /** Maximum number of leading pages to examine; defaults to 100 when omitted. */
+  maxPages?: number;
+}
+
+/** Text extracted from a single PDF page. */
+export interface PdfPage {
+  /** 1-based position of the page in the extractor's output. */
+  pageNumber: number;
+  /** Page text with leading and trailing whitespace removed. */
+  text: string;
+  /** Length of `text` (`text.length`, i.e. UTF-16 code units). */
+  charCount: number;
+}
+
+/** Result returned by `extractPdfText`. */
+export interface PdfExtractionResult {
+  /** Pages that were kept: within the `maxPages` limit and with at least 10 characters of trimmed text. */
+  pages: PdfPage[];
+  /** Page count of the whole document, before the `maxPages` limit or short-page filtering is applied. */
+  totalPages: number;
+  /** Identifies the extraction backend; always `'unpdf'`. */
+  extractionMethod: 'unpdf';
+}
+
+// ── Core Functions ───────────────────────────────────────────────────
+
+/**
+ * Extract text from a PDF document page-by-page.
+ *
+ * The base64 payload (optionally wrapped in a `data:…;base64,` URL) is decoded
+ * and handed to unpdf. Only the first `maxPages` pages are examined, and pages
+ * whose trimmed text is shorter than 10 characters are dropped, so
+ * `pages.length` can be smaller than `totalPages`.
+ *
+ * @param input - Base64 PDF data plus optional filename and page limit.
+ *   `maxPages` defaults to 100; fractional values are rounded down, negative
+ *   values are treated as 0, and `NaN` falls back to the default.
+ * @returns The retained pages, the document's total page count, and the
+ *   extraction backend identifier.
+ * @throws Error if unpdf is not installed (it's an optional dependency).
+ * @throws Error if the payload decodes to zero bytes.
+ */
+export async function extractPdfText(input: PdfInput): Promise<PdfExtractionResult> {
+  const defaultMaxPages = 100;
+  const minPageChars = 10;
+
+  // Loose structural view of the part of unpdf used here. `text` is an array
+  // when pages are not merged and a single string when they are.
+  type UnpdfModule = {
+    extractText: (
+      data: Uint8Array,
+      options?: { mergePages?: boolean },
+    ) => Promise<{ totalPages?: number; text?: string | string[]; pages?: string[] }>;
+  };
+
+  // Dynamic import — unpdf is optional, so a failed import is reported as a
+  // missing dependency rather than breaking module load.
+  let unpdf: UnpdfModule | null;
+  try {
+    unpdf = (await import('unpdf')) as UnpdfModule;
+  } catch {
+    unpdf = null;
+  }
+
+  if (!unpdf) {
+    throw new Error(
+      'unpdf is not installed. To enable PDF ingestion, run:\n' +
+      '  npm install unpdf\n' +
+      'or: bun add unpdf',
+    );
+  }
+
+  // Accept data URLs too: without stripping, the prefix characters would be
+  // decoded as if they were base64 and corrupt the start of the file.
+  const payload = input.base64.trim().replace(/^data:[^,]*;base64,/i, '');
+  const buffer = Buffer.from(payload, 'base64');
+  if (buffer.length === 0) {
+    throw new Error('PDF data is empty or is not valid base64.');
+  }
+
+  // Normalise the page limit: NaN would otherwise silently yield zero pages
+  // and a fractional limit would let one extra page through.
+  const requestedMax = input.maxPages ?? defaultMaxPages;
+  const maxPages = Number.isNaN(requestedMax)
+    ? defaultMaxPages
+    : Math.max(0, Math.floor(requestedMax));
+
+  // NOTE(review): Buffer is already a Uint8Array; the copy presumably exists to
+  // hand the parser a plain Uint8Array — confirm before removing it.
+  const result = await unpdf.extractText(new Uint8Array(buffer), { mergePages: false });
+
+  // With mergePages: false unpdf returns the per-page strings in `text`.
+  // `pages` and the form-feed split are kept as fallbacks for other shapes.
+  let rawPages: string[];
+  if (Array.isArray(result.pages)) {
+    rawPages = result.pages;
+  } else if (Array.isArray(result.text)) {
+    rawPages = result.text;
+  } else {
+    rawPages = String(result.text ?? '').split('\f');
+  }
+
+  const pages: PdfPage[] = [];
+  const limit = Math.min(rawPages.length, maxPages);
+
+  for (let i = 0; i < limit; i++) {
+    const text = String(rawPages[i] ?? '').trim();
+    // Skip blank or near-blank pages; they add noise to searchable memory.
+    if (text.length >= minPageChars) {
+      pages.push({
+        pageNumber: i + 1,
+        text,
+        charCount: text.length,
+      });
+    }
+  }
+
+  // Prefer the extractor's own page count when it reports one; it stays
+  // correct even if the text came back as a single merged string.
+  const reportedTotal =
+    typeof result.totalPages === 'number' && Number.isFinite(result.totalPages)
+      ? result.totalPages
+      : 0;
+
+  return {
+    pages,
+    totalPages: Math.max(rawPages.length, reportedTotal),
+    extractionMethod: 'unpdf',
+  };
+}
+
+/**
+ * Extract PDF text and store each page as a Memorix observation.
+ *
+ * Pages are stored one at a time, in page order. If `storeFn` rejects, the
+ * error propagates and pages stored before the failure are not rolled back.
+ *
+ * @param input - PDF payload; `filename` (minus its last extension) becomes the
+ *   entity name, falling back to `pdf-<timestamp>` when no usable name remains.
+ * @param storeFn - Persists one observation and resolves with its id.
+ * @param projectId - Project the observations are attached to.
+ * @returns Ids of the stored observations, the number of pages stored, and the
+ *   total character count of the extracted page text (counted before any
+ *   narrative truncation).
+ * @throws Error whatever `extractPdfText` or `storeFn` throws.
+ */
+export async function ingestPdf(
+  input: PdfInput,
+  storeFn: (obs: {
+    entityName: string;
+    type: string;
+    title: string;
+    narrative: string;
+    concepts: string[];
+    projectId: string;
+  }) => Promise<{ observation: { id: number }; upserted: boolean }>,
+  projectId: string,
+): Promise<{ observationIds: number[]; pagesProcessed: number; totalChars: number }> {
+  const maxNarrativeChars = 5000;
+
+  const extraction = await extractPdfText(input);
+
+  // Strip the last extension. A name such as ".pdf" leaves nothing behind, so
+  // fall back to the generated name instead of storing an empty entity.
+  const stem = (input.filename ?? '').replace(/\.[^.]+$/, '');
+  const entityName = stem.trim().length > 0 ? stem : `pdf-${Date.now()}`;
+
+  const observationIds: number[] = [];
+  let totalChars = 0;
+
+  for (const page of extraction.pages) {
+    let narrative = page.text;
+    if (narrative.length > maxNarrativeChars) {
+      let cut = maxNarrativeChars;
+      // Do not cut between the two halves of a surrogate pair; that would
+      // leave a lone surrogate in the stored text.
+      const lastUnit = narrative.charCodeAt(cut - 1);
+      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+        cut -= 1;
+      }
+      narrative = narrative.slice(0, cut) + '…';
+    }
+
+    const { observation } = await storeFn({
+      entityName,
+      type: 'discovery',
+      title: `${entityName} — Page ${page.pageNumber}`,
+      narrative,
+      concepts: ['pdf', 'document', entityName],
+      projectId,
+    });
+
+    observationIds.push(observation.id);
+    totalChars += page.charCount;
+  }
+
+  return {
+    observationIds,
+    pagesProcessed: extraction.pages.length,
+    totalChars,
+  };
+}
diff --git a/src/server.ts b/src/server.ts
@@ -3055,6 +3055,48 @@
     },
   );
 
+  // ── Multimodal Ingestion: PDF ───────────────────────────────────────
+
+  // Registers the `memorix_ingest_pdf` tool. The PDF loader is imported lazily
+  // inside the handler so the optional unpdf dependency is only touched when
+  // the tool is actually invoked. Failures are returned as an error result
+  // rather than thrown.
+  server.registerTool(
+    'memorix_ingest_pdf',
+    {
+      title: 'Ingest PDF',
+      description:
+        'Extract text from a PDF document and store each page as a memory observation. ' +
+        'Requires unpdf (optional dependency: npm install unpdf).',
+      inputSchema: {
+        // Reject obviously unusable input at the schema boundary instead of
+        // letting it reach the extractor.
+        base64: z.string().min(1).describe('Base64-encoded PDF data'),
+        filename: z.string().optional().describe('Original filename'),
+        maxPages: z.number().int().positive().optional().describe('Max pages to extract (default: 100)'),
+      },
+    },
+    async (args) => {
+      try {
+        const { ingestPdf } = await import('./multimodal/pdf-loader.js');
+        // NOTE(review): this runs before ingestion, so it also fires when
+        // ingestion fails and nothing is written — confirm that is intended.
+        markInternalWrite();
+        const result = await ingestPdf(
+          args,
+          (obs) => storeObservation(obs),
+          project.id,
+        );
+        // Avoid a dangling "Observations: " line when nothing was stored.
+        const observationsLine = result.observationIds.length > 0
+          ? `Observations: ${result.observationIds.join(', ')}`
+          : 'No observations stored: no page contained extractable text.';
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `\uD83D\uDCC4 PDF ingested: ${result.pagesProcessed} pages, ${result.totalChars} chars\n` +
+              observationsLine,
+          }],
+        };
+      } catch (err: unknown) {
+        return {
+          content: [{
+            type: 'text' as const,
+            text: `\u274C PDF ingestion failed: ${err instanceof Error ? err.message : String(err)}`,
+          }],
+          isError: true,
+        };
+      }
+    },
+  );
   // Deferred initialization — runs AFTER transport connect so MCP handshake isn't blocked.
   // Sync advisory scan and file watcher are non-essential for tool functionality.
   const deferredInit = async () => {

diff --git a/tests/multimodal/pdf-loader.test.ts b/tests/multimodal/pdf-loader.test.ts
@@ -0,0 +1,32 @@
+import { describe, it, expect } from 'bun:test';
+import { extractPdfText, ingestPdf } from '../../src/multimodal/pdf-loader.js';
+
+// The error-path cases below only hold when the optional unpdf dependency
+// cannot be imported. Detect that once and skip them otherwise, so the suite
+// does not fail on machines where unpdf happens to be installed.
+const unpdfInstalled = await import('unpdf').then(
+  () => true,
+  () => false,
+);
+
+describe('pdf-loader', () => {
+  it.skipIf(unpdfInstalled)('throws clear error when unpdf is not installed', async () => {
+    await expect(
+      extractPdfText({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('unpdf is not installed');
+  });
+
+  it.skipIf(unpdfInstalled)('error message includes install instructions', async () => {
+    // Assert on the rejection itself; a try/catch with the expectation inside
+    // the catch would pass silently if nothing were thrown.
+    await expect(
+      extractPdfText({ base64: 'dGVzdA==' }),
+    ).rejects.toThrow('npm install unpdf');
+  });
+
+  it.skipIf(unpdfInstalled)('ingestPdf propagates extractPdfText errors', async () => {
+    const storeFn = async (_obs: unknown) => ({ observation: { id: 1 }, upserted: false });
+    await expect(
+      ingestPdf({ base64: 'dGVzdA==' }, storeFn, 'proj-1'),
+    ).rejects.toThrow('unpdf is not installed');
+  });
+
+  it('PdfInput interface accepts all expected fields', () => {
+    // Typed via the function signature so the literal is actually checked
+    // against PdfInput at compile time.
+    const input: Parameters<typeof extractPdfText>[0] = {
+      base64: 'test',
+      filename: 'doc.pdf',
+      maxPages: 5,
+    };
+    expect(input.base64).toBe('test');
+    expect(input.filename).toBe('doc.pdf');
+    expect(input.maxPages).toBe(5);
+  });
+});