-
Notifications
You must be signed in to change notification settings - Fork 28
feat: PDF ingestion via unpdf with page chunking #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| /** | ||
| * PDF Loader — unpdf Integration | ||
| * | ||
| * Extracts text from PDFs using unpdf (pure JS, optional dependency). | ||
| * Creates per-page observations for searchable memory storage. | ||
| */ | ||
|
|
||
| // ── Types ──────────────────────────────────────────────────────────── | ||
|
|
||
/**
 * Input accepted by the PDF loader.
 */
export interface PdfInput {
  /** Base64-encoded bytes of the PDF document. */
  base64: string;
  /** Original file name; `ingestPdf` uses it (minus its extension) as the entity name. */
  filename?: string;
  /** Maximum number of leading pages to keep; `extractPdfText` defaults to 100. */
  maxPages?: number;
}
|
|
||
/**
 * Text extracted from a single PDF page.
 */
export interface PdfPage {
  /** 1-based position of the page in the source document. */
  pageNumber: number;
  /** Page text with leading and trailing whitespace removed. */
  text: string;
  /** Length of `text` in UTF-16 code units. */
  charCount: number;
}
|
|
||
/**
 * Outcome of a page-by-page PDF text extraction.
 */
export interface PdfExtractionResult {
  /** Pages that survived the page limit and the minimum-length filter. */
  pages: PdfPage[];
  /** Number of raw pages the extractor produced, before limiting or filtering. */
  totalPages: number;
  /** Identifies the extraction backend; currently always `'unpdf'`. */
  extractionMethod: 'unpdf';
}
|
|
||
| // ── Core Functions ─────────────────────────────────────────────────── | ||
|
|
||
/**
 * Extract text from a PDF document page-by-page.
 *
 * Decodes `input.base64`, passes the bytes to unpdf's `extractText` with
 * `mergePages: false`, and keeps at most `input.maxPages` (default 100)
 * leading pages. Pages whose trimmed text is shorter than 10 characters are
 * dropped; the remaining pages keep their original 1-based page numbers.
 *
 * @param input - Base64-encoded PDF plus optional filename and page limit.
 * @returns The retained pages, the raw page count, and the extraction method tag.
 * @throws Error if unpdf is not installed (it's an optional dependency).
 */
export async function extractPdfText(input: PdfInput): Promise<PdfExtractionResult> {
  // Dynamic import — unpdf is optional, so a failed import is mapped to null.
  const unpdf = (await import('unpdf').catch(() => null)) as {
    extractText: (
      data: Uint8Array,
      options?: { mergePages?: boolean },
    ) => Promise<{ totalPages: number; text: string | string[]; pages?: string[] }>;
  } | null;

  if (!unpdf) {
    throw new Error(
      'unpdf is not installed. To enable PDF ingestion, run:\n' +
      '  npm install unpdf\n' +
      'or: bun add unpdf',
    );
  }

  const buffer = Buffer.from(input.base64, 'base64');
  const maxPages = input.maxPages ?? 100;

  const result = await unpdf.extractText(new Uint8Array(buffer), { mergePages: false });

  // With mergePages: false unpdf returns the per-page strings as an array in
  // `text`. Calling `.split` on that array would throw, so handle the array
  // first; a `pages` array or a single form-feed-delimited string is still
  // accepted for compatibility with other result shapes.
  let rawPages: string[];
  if (Array.isArray(result.pages)) {
    rawPages = result.pages;
  } else if (Array.isArray(result.text)) {
    rawPages = result.text;
  } else {
    rawPages = result.text.split('\f');
  }

  const pages: PdfPage[] = [];
  const limit = Math.min(rawPages.length, maxPages);

  for (let i = 0; i < limit; i++) {
    const text = String(rawPages[i] ?? '').trim();
    // Skip pages with (almost) no text, e.g. blank or image-only pages.
    if (text.length >= 10) {
      pages.push({
        pageNumber: i + 1,
        text,
        charCount: text.length,
      });
    }
  }

  return {
    pages,
    totalPages: rawPages.length,
    extractionMethod: 'unpdf',
  };
}
|
|
||
/**
 * Extract PDF text and store each page as a Memorix observation.
 *
 * Pages are stored one at a time, in page order, through `storeFn`. Page text
 * longer than 5000 characters is truncated and suffixed with '…' before it is
 * stored; `totalChars` still counts the full extracted length of every page.
 *
 * @param input - PDF payload; `filename` without its extension becomes the
 *   entity name, otherwise a `pdf-<timestamp>` name is generated.
 * @param storeFn - Callback that persists one observation and returns its id.
 * @param projectId - Project identifier forwarded to every stored observation.
 * @returns Ids of the stored observations, the number of pages stored, and the
 *   total character count of the extracted pages.
 * @throws Whatever `extractPdfText` or `storeFn` throws; pages stored before a
 *   failure are not rolled back.
 */
export async function ingestPdf(
  input: PdfInput,
  storeFn: (obs: {
    entityName: string;
    type: string;
    title: string;
    narrative: string;
    concepts: string[];
    projectId: string;
  }) => Promise<{ observation: { id: number }; upserted: boolean }>,
  projectId: string,
): Promise<{ observationIds: number[]; pagesProcessed: number; totalChars: number }> {
  const maxNarrativeLength = 5000;

  const extraction = await extractPdfText(input);

  // Strip the extension. A name such as ".pdf" leaves nothing behind, so fall
  // back to the generated name instead of storing an empty entity name.
  const stem = input.filename ? input.filename.replace(/\.[^.]+$/, '') : '';
  const entityName = stem.trim().length > 0 ? stem : `pdf-${Date.now()}`;

  const observationIds: number[] = [];
  let totalChars = 0;

  for (const page of extraction.pages) {
    let narrative = page.text;
    if (narrative.length > maxNarrativeLength) {
      let cut = maxNarrativeLength;
      // Do not end on a high surrogate: that would split an astral character.
      const lastUnit = narrative.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
      }
      narrative = narrative.slice(0, cut) + '…';
    }

    const { observation } = await storeFn({
      entityName,
      type: 'discovery',
      title: `${entityName} — Page ${page.pageNumber}`,
      narrative,
      concepts: ['pdf', 'document', entityName],
      projectId,
    });

    observationIds.push(observation.id);
    totalChars += page.charCount;
  }

  return {
    observationIds,
    pagesProcessed: extraction.pages.length,
    totalChars,
  };
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| import { describe, it, expect } from 'bun:test'; | ||
|
Check failure on line 1 in tests/multimodal/pdf-loader.test.ts
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| import { extractPdfText, ingestPdf } from '../../src/multimodal/pdf-loader.js'; | ||
|
|
||
// These tests cover the "optional dependency missing" path, so they assume
// unpdf cannot be resolved in the test environment.
describe('pdf-loader', () => {
  const dummyPdf = { base64: 'dGVzdA==' };

  it('throws clear error when unpdf is not installed', async () => {
    await expect(extractPdfText(dummyPdf)).rejects.toThrow('unpdf is not installed');
  });

  it('error message includes install instructions', async () => {
    // `rejects` fails the test if the call unexpectedly resolves, unlike a
    // bare try/catch whose assertion is skipped when nothing is thrown.
    await expect(extractPdfText(dummyPdf)).rejects.toThrow('npm install unpdf');
  });

  it('ingestPdf propagates extractPdfText errors', async () => {
    let storeCalls = 0;
    const storeFn: Parameters<typeof ingestPdf>[1] = async () => {
      storeCalls += 1;
      return { observation: { id: 1 }, upserted: false };
    };

    await expect(ingestPdf(dummyPdf, storeFn, 'proj-1')).rejects.toThrow(
      'unpdf is not installed',
    );
    // Extraction failed, so nothing may have been stored.
    expect(storeCalls).toBe(0);
  });

  it('PdfInput interface accepts all expected fields', () => {
    // Typed against the real parameter type so a renamed or removed field
    // becomes a compile error instead of passing silently.
    const input: Parameters<typeof extractPdfText>[0] = {
      base64: 'test',
      filename: 'doc.pdf',
      maxPages: 5,
    };
    expect(input.base64).toBe('test');
    expect(input.filename).toBe('doc.pdf');
    expect(input.maxPages).toBe(5);
  });
});
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This new loader hard-depends on resolving `import('unpdf')`, but the package is not declared in Memorix’s dependency metadata, so default installs won’t provide it. In the documented global-install flow (`npm install -g memorix`), telling users to run a local `npm install unpdf` still won’t make this import resolvable from the global package tree, so `memorix_ingest_pdf` will consistently fail for those users.
Useful? React with 👍 / 👎.