From 73d133cbd556cf172e67ce05d5e8ad5cb7f1d194 Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:32:04 +0100 Subject: [PATCH 1/8] refactor(link-extraction): extract DEFAULT_ENTITY_DIRS constant Extracts the hardcoded entity directory list (people, companies, meetings, concepts, deal, civic, project, source, media, yc) into an exported frozen readonly array. This is the first step toward configurable entity dirs; subsequent commits build the regex dynamically from this list and add a config reader so users can extend or replace the defaults (e.g. Johnny Decimal filesystems). --- src/core/link-extraction.ts | 23 +++++++++++++++++++++++ test/link-extraction.test.ts | 22 ++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 55570b59..1266fb92 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -26,6 +26,29 @@ export interface EntityRef { dir: string; } +/** + * Canonical entity directory list. Each directory name is a top-level slug + * prefix that extractors recognise as an "entity" (e.g. `people/alice`, + * `companies/acme`, `meetings/2026-01-15`). Frozen so downstream callers can + * treat it as immutable. + * + * Users can extend or replace this list via the `entity_dirs` config key + * (see `getEntityDirs`). Custom dirs follow the same slug shape: + * `/^[a-z0-9][a-z0-9-]*$/`. + */ +export const DEFAULT_ENTITY_DIRS: readonly string[] = Object.freeze([ + 'people', + 'companies', + 'meetings', + 'concepts', + 'deal', + 'civic', + 'project', + 'source', + 'media', + 'yc', +]); + /** * Match `[Name](path)` markdown links pointing to `people/` or `companies/` * (and other entity directories). 
Accepts both filesystem-relative format diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index e5ed7ec5..f5e822c0 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -1,5 +1,6 @@ import { describe, test, expect } from 'bun:test'; import { + DEFAULT_ENTITY_DIRS, extractEntityRefs, extractPageLinks, inferLinkType, @@ -8,6 +9,27 @@ import { } from '../src/core/link-extraction.ts'; import type { BrainEngine } from '../src/core/engine.ts'; +// ─── DEFAULT_ENTITY_DIRS ─────────────────────────────────────── + +describe('DEFAULT_ENTITY_DIRS', () => { + test('is exported and contains the canonical entity dirs', () => { + expect(DEFAULT_ENTITY_DIRS).toContain('people'); + expect(DEFAULT_ENTITY_DIRS).toContain('companies'); + expect(DEFAULT_ENTITY_DIRS).toContain('meetings'); + expect(DEFAULT_ENTITY_DIRS).toContain('concepts'); + expect(DEFAULT_ENTITY_DIRS).toContain('deal'); + expect(DEFAULT_ENTITY_DIRS).toContain('civic'); + expect(DEFAULT_ENTITY_DIRS).toContain('project'); + expect(DEFAULT_ENTITY_DIRS).toContain('source'); + expect(DEFAULT_ENTITY_DIRS).toContain('media'); + expect(DEFAULT_ENTITY_DIRS).toContain('yc'); + }); + + test('is frozen (readonly at runtime)', () => { + expect(Object.isFrozen(DEFAULT_ENTITY_DIRS)).toBe(true); + }); +}); + // ─── extractEntityRefs ───────────────────────────────────────── describe('extractEntityRefs', () => { From db8bb0840bd36985b7298b5fea7d33d092ed1e90 Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:32:34 +0100 Subject: [PATCH 2/8] refactor(link-extraction): add buildEntityRefRegex helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the hardcoded ENTITY_REF_RE literal with a builder that composes the alternation from a dir list. The default regex is now built once at module load from DEFAULT_ENTITY_DIRS, preserving the fast path for the common case. 
Adds escapeRegexChars as defense-in-depth — getEntityDirs already validates dir names against /^[a-z0-9][a-z0-9-]*$/, so no metachars should ever reach the regex builder, but future callers who reach for buildEntityRefRegex directly still get safe output. Both helpers are internal (not exported) to keep the public surface small. --- src/core/link-extraction.ts | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 1266fb92..c7f05bc9 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -50,16 +50,38 @@ export const DEFAULT_ENTITY_DIRS: readonly string[] = Object.freeze([ ]); /** - * Match `[Name](path)` markdown links pointing to `people/` or `companies/` - * (and other entity directories). Accepts both filesystem-relative format - * (`[Name](../people/slug.md)`) AND engine-slug format (`[Name](people/slug)`). + * Escape regex metacharacters so a value can be embedded inside a larger + * regex without changing its structure. Defense-in-depth: `getEntityDirs` + * already validates entries against `/^[a-z0-9][a-z0-9-]*$/`, so no metachar + * should ever reach here — this is belt-and-braces for future callers. + */ +function escapeRegexChars(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Build the `[Name](dir/slug)` entity-ref regex from a dir list. Accepts both + * filesystem-relative format (`[Name](../people/slug.md)`) AND engine-slug + * format (`[Name](people/slug)`). * - * Captures: name, dir (people/companies/...), slug. + * Captures: name, full `dir/slug` path, slug segment alone. * - * The regex permits an optional `../` prefix (any number) and an optional - * `.md` suffix so the same function works for both filesystem and DB content. + * Internal — callers use `extractEntityRefs(content, dirs?)` which threads the + * dir list through. Keeping this private limits API surface. 
+ */ +function buildEntityRefRegex(dirs: readonly string[]): RegExp { + const alternation = dirs.map(escapeRegexChars).join('|'); + return new RegExp( + `\\[([^\\]]+)\\]\\((?:\\.\\.\\/)*((?:${alternation})\\/([^)\\s]+?))(?:\\.md)?\\)`, + 'g', + ); +} + +/** + * Default entity-ref regex built once from DEFAULT_ENTITY_DIRS. Callers that + * don't pass a custom dir list reuse its pattern (only a per-call clone remains). */ -const ENTITY_REF_RE = /\[([^\]]+)\]\((?:\.\.\/)*((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/([^)\s]+?))(?:\.md)?\)/g; +const ENTITY_REF_RE = buildEntityRefRegex(DEFAULT_ENTITY_DIRS); /** * Strip fenced code blocks (```...```) and inline code (`...`) from markdown, From 5b27cc8fdb13d240c0841f20563bbabe245e031f Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:33:18 +0100 Subject: [PATCH 3/8] feat(link-extraction): extractEntityRefs accepts optional dirs param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extractEntityRefs(content, dirs?) now takes an optional readonly dir list. When omitted, the module-level regex built from DEFAULT_ENTITY_DIRS is reused as the template (fast path: the pattern is not rebuilt, only cloned per call so g-flag state stays per-instance). When provided, a scoped regex is compiled from the custom list — only those dirs match. Callers who want custom dirs IN ADDITION to defaults must pass the union themselves; the upcoming getEntityDirs helper does exactly that. Unlocks non-default filesystem layouts (e.g. Johnny Decimal 01-notes/). --- src/core/link-extraction.ts | 12 ++++++++++-- test/link-extraction.test.ts | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index c7f05bc9..607dad9c 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -125,13 +125,21 @@ function stripCodeBlocks(content: string): string { * slugs (`people/slug`) are matched.
Returns one EntityRef per match (no dedup * here; caller dedups). Slugs appearing inside fenced or inline code blocks * are excluded — those are typically code samples, not real entity references. + * + * @param content Markdown text to scan. + * @param dirs Optional entity-dir list. When omitted, uses DEFAULT_ENTITY_DIRS. + * When provided, ONLY those dirs are matched — callers that want to extend + * the defaults should pass the union (see `getEntityDirs`). */ -export function extractEntityRefs(content: string): EntityRef[] { +export function extractEntityRefs(content: string, dirs?: readonly string[]): EntityRef[] { const stripped = stripCodeBlocks(content); const refs: EntityRef[] = []; let m: RegExpExecArray | null; + // When dirs omitted, reuse the module-level regex (built from DEFAULT_ENTITY_DIRS). + // When dirs provided, build a scoped regex from the custom list. // Fresh regex per call (g-flag state is per-instance). - const re = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); + const base = dirs ? 
buildEntityRefRegex(dirs) : ENTITY_REF_RE; + const re = new RegExp(base.source, base.flags); while ((m = re.exec(stripped)) !== null) { const name = m[1]; const fullPath = m[2]; diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index f5e822c0..e7d1ba14 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -89,6 +89,25 @@ describe('extractEntityRefs', () => { expect(refs.length).toBe(1); expect(refs[0].dir).toBe('meetings'); }); + + test('custom dirs: Johnny Decimal style ([Rushi](01-notes/rushi)) matches when dir is configured', () => { + const refs = extractEntityRefs('Met [Rushi](01-notes/rushi) for coffee.', ['01-notes']); + expect(refs.length).toBe(1); + expect(refs[0]).toEqual({ name: 'Rushi', slug: '01-notes/rushi', dir: '01-notes' }); + }); + + test('custom-only dir list replaces defaults — default dirs do not match', () => { + // When caller supplies an explicit dirs list, only those dirs are used. + // The default `people/` dir does NOT match. + const refs = extractEntityRefs('[Alice](people/alice)', ['01-notes']); + expect(refs).toEqual([]); + }); + + test('omitting dirs uses default list (backwards compatible)', () => { + const refs = extractEntityRefs('[Alice](people/alice)'); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('people/alice'); + }); }); // ─── extractPageLinks ────────────────────────────────────────── From 93e9219c70ba0d4044f046663d823327d949c688 Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:34:12 +0100 Subject: [PATCH 4/8] feat(link-extraction): extractPageLinks accepts optional dirs param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extractPageLinks(content, frontmatter, pageType, dirs?) now threads the optional dir list through both the markdown-ref extractor and the bare-slug regex. Both paths use the same dir list so behavior stays consistent. 
When dirs is omitted, module-level defaults are reused (the pattern is not rebuilt per call; each call still clones the regex so g-flag state stays per-instance). When provided, scoped regexes are built from the custom list. Adds buildBareSlugRegex as the bare-slug counterpart to buildEntityRefRegex — both internal, both built from the same escaped alternation. --- src/core/link-extraction.ts | 26 +++++++++++++++++++++++--- test/link-extraction.test.ts | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 607dad9c..3833d554 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -161,6 +161,19 @@ export interface LinkCandidate { context: string; } +/** + * Build the bare-slug regex (`dir/slug` appearing anywhere in text) from the + * same dir list as the markdown-ref regex. Keeps the two extractors in sync + * so callers who pass custom dirs see consistent behavior in both paths. + */ +function buildBareSlugRegex(dirs: readonly string[]): RegExp { + const alternation = dirs.map(escapeRegexChars).join('|'); + return new RegExp(`\\b((?:${alternation})\\/[a-z0-9][a-z0-9-]*)\\b`, 'g'); +} + +/** Default bare-slug regex, built once from DEFAULT_ENTITY_DIRS. */ +const BARE_SLUG_RE = buildBareSlugRegex(DEFAULT_ENTITY_DIRS); + /** * Extract all link candidates from a page. * @@ -171,16 +184,22 @@ export interface LinkCandidate { * * Within-page dedup: multiple mentions of the same (targetSlug, linkType) * collapse to one candidate. The first occurrence's context wins. + * + * @param dirs Optional entity-dir list. When omitted, uses DEFAULT_ENTITY_DIRS + * for both the markdown-ref extractor and the bare-slug regex. When provided, + * ONLY those dirs are matched — callers wanting union-with-defaults must pass + * the union themselves (see `getEntityDirs`).
*/ export function extractPageLinks( content: string, frontmatter: Record, pageType: PageType, + dirs?: readonly string[], ): LinkCandidate[] { const candidates: LinkCandidate[] = []; // 1. Markdown entity refs. - for (const ref of extractEntityRefs(content)) { + for (const ref of extractEntityRefs(content, dirs)) { const idx = content.indexOf(ref.name); // Wider context window (240 chars vs original 80) catches verbs that // appear at sentence-or-paragraph distance from the slug — common in @@ -195,10 +214,11 @@ export function extractPageLinks( } // 2. Bare slug references (e.g. "see people/alice-chen for context"). - // Limited to the same entity directories ENTITY_REF_RE covers. + // Same dir list as the markdown extractor to keep behavior consistent. // Code blocks are stripped first — slugs in code samples are not real refs. const strippedContent = stripCodeBlocks(content); - const bareRe = /\b((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/[a-z0-9][a-z0-9-]*)\b/g; + const bareBase = dirs ? buildBareSlugRegex(dirs) : BARE_SLUG_RE; + const bareRe = new RegExp(bareBase.source, bareBase.flags); let m: RegExpExecArray | null; while ((m = bareRe.exec(strippedContent)) !== null) { // Skip matches that are part of a markdown link (already handled above). 
diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index e7d1ba14..3dde0a8b 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -154,6 +154,39 @@ describe('extractPageLinks', () => { const aliceLink = candidates.find(c => c.targetSlug === 'people/alice'); expect(aliceLink!.linkType).toBe('attended'); }); + + test('custom dirs: [Name](custom-dir/slug) extracted when dir is configured', () => { + const candidates = extractPageLinks( + 'Met [Rushi](01-notes/rushi) yesterday.', + {}, + 'concept', + ['01-notes'], + ); + const rushi = candidates.find(c => c.targetSlug === '01-notes/rushi'); + expect(rushi).toBeDefined(); + }); + + test('custom dirs: bare slug references use same dir list', () => { + const candidates = extractPageLinks( + 'See 01-notes/rushi for details.', + {}, + 'concept', + ['01-notes'], + ); + const rushi = candidates.find(c => c.targetSlug === '01-notes/rushi'); + expect(rushi).toBeDefined(); + }); + + test('custom-only dir list excludes default dirs from bare slug match', () => { + // With dirs=['01-notes'], a bare `people/alice` token is NOT extracted. 
+ const candidates = extractPageLinks( + 'See people/alice for details.', + {}, + 'concept', + ['01-notes'], + ); + expect(candidates.find(c => c.targetSlug === 'people/alice')).toBeUndefined(); + }); }); // ─── inferLinkType ───────────────────────────────────────────── From 82308a85c9e3b45def015d531df85b4aa18089fb Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:35:23 +0100 Subject: [PATCH 5/8] feat(link-extraction): add getEntityDirs config reader with union+replace modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reads the effective entity-dir list from engine config: - entity_dirs: comma-separated custom dir names (optional, defaults empty) - entity_dirs_mode: 'union' (default) or 'replace' Union mode ADDS custom dirs to DEFAULT_ENTITY_DIRS (defaults first, custom appended, deduped). Replace mode uses ONLY the custom list. Empty replace falls back to defaults to prevent accidentally disabling extraction. Each custom entry is validated against /^[a-z0-9][a-z0-9-]*$/. On any invalid entry, the function logs a warning and returns defaults — a fail-safe that prevents malformed config from silently breaking the graph layer. Validation runs BEFORE mode resolution so bad input is caught once, regardless of mode. --- src/core/link-extraction.ts | 58 ++++++++++++++++++++ test/link-extraction.test.ts | 103 ++++++++++++++++++++++++++++++++++- 2 files changed, 160 insertions(+), 1 deletion(-) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 3833d554..76614c41 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -439,3 +439,61 @@ export async function isAutoLinkEnabled(engine: BrainEngine): Promise { const normalized = val.trim().toLowerCase(); return !['false', '0', 'no', 'off'].includes(normalized); } + +/** Regex that validates an entity-dir name. Matches slug shape. 
*/ +const ENTITY_DIR_NAME_RE = /^[a-z0-9][a-z0-9-]*$/; + +/** + * Resolve the effective entity-dir list from engine config. + * + * Reads two config keys: + * - `entity_dirs`: comma-separated list of custom dir names (optional). + * - `entity_dirs_mode`: `"union"` (default) or `"replace"`. + * + * Modes: + * - `union` (default): custom dirs are ADDED to DEFAULT_ENTITY_DIRS. + * Duplicates are deduped; defaults come first, custom dirs append. + * - `replace`: ONLY the custom list is used. If the custom list is empty, + * falls back to defaults (empty replace is meaningless). + * + * Validation: + * Each custom entry must match `/^[a-z0-9][a-z0-9-]*$/`. On ANY invalid + * entry, the function logs a warning via `console.warn` and returns + * DEFAULT_ENTITY_DIRS. This fail-safe prevents malformed config from + * silently disabling all entity extraction. + * + * @returns A fresh string[] (mutable; callers may not mutate DEFAULT_ENTITY_DIRS). + */ +export async function getEntityDirs(engine: BrainEngine): Promise { + const raw = await engine.getConfig('entity_dirs'); + if (raw == null || raw.trim() === '') { + return [...DEFAULT_ENTITY_DIRS]; + } + + const entries = raw.split(',').map(s => s.trim()).filter(s => s.length > 0); + for (const entry of entries) { + if (!ENTITY_DIR_NAME_RE.test(entry)) { + console.warn( + `[gbrain] entity_dirs rejected: ${entry} (must match [a-z0-9][a-z0-9-]*). Falling back to defaults.`, + ); + return [...DEFAULT_ENTITY_DIRS]; + } + } + + const mode = (await engine.getConfig('entity_dirs_mode'))?.trim().toLowerCase(); + if (mode === 'replace' && entries.length > 0) { + // Dedupe while preserving input order. + return Array.from(new Set(entries)); + } + + // Union mode: defaults first, then custom entries not already present. 
+ const seen = new Set(DEFAULT_ENTITY_DIRS); + const result: string[] = [...DEFAULT_ENTITY_DIRS]; + for (const entry of entries) { + if (!seen.has(entry)) { + seen.add(entry); + result.push(entry); + } + } + return result; +} diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index 3dde0a8b..8df1a6df 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -1,4 +1,4 @@ -import { describe, test, expect } from 'bun:test'; +import { describe, test, expect, spyOn } from 'bun:test'; import { DEFAULT_ENTITY_DIRS, extractEntityRefs, @@ -6,6 +6,7 @@ import { inferLinkType, parseTimelineEntries, isAutoLinkEnabled, + getEntityDirs, } from '../src/core/link-extraction.ts'; import type { BrainEngine } from '../src/core/engine.ts'; @@ -377,3 +378,103 @@ describe('isAutoLinkEnabled', () => { expect(await isAutoLinkEnabled(engine)).toBe(true); }); }); + +// ─── getEntityDirs ───────────────────────────────────────────── + +describe('getEntityDirs', () => { + test('null config -> DEFAULT_ENTITY_DIRS', async () => { + const engine = makeFakeEngine(new Map()); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + }); + + test('empty string config -> DEFAULT_ENTITY_DIRS', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', '']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + }); + + test('valid single custom dir -> union with defaults', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', '01-notes']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toContain('01-notes'); + // defaults preserved + for (const d of DEFAULT_ENTITY_DIRS) expect(dirs).toContain(d); + }); + + test('multiple comma-separated dirs with whitespace -> parsed and unioned', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', ' 01-notes , 02-projects ,03-archive']])); + const dirs = await getEntityDirs(engine); 
+ expect(dirs).toContain('01-notes'); + expect(dirs).toContain('02-projects'); + expect(dirs).toContain('03-archive'); + }); + + test('duplicate custom dir overlapping with defaults -> no duplicates', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', 'people,01-notes']])); + const dirs = await getEntityDirs(engine); + const peopleCount = dirs.filter(d => d === 'people').length; + expect(peopleCount).toBe(1); + expect(dirs).toContain('01-notes'); + }); + + test('entity_dirs_mode=replace -> only custom list, no defaults', async () => { + const engine = makeFakeEngine(new Map([ + ['entity_dirs', '01-notes,02-projects'], + ['entity_dirs_mode', 'replace'], + ])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual(['01-notes', '02-projects']); + // defaults NOT included in replace mode + expect(dirs).not.toContain('people'); + expect(dirs).not.toContain('companies'); + }); + + test('entity_dirs_mode=replace with empty entity_dirs -> defaults (empty replace is meaningless)', async () => { + const engine = makeFakeEngine(new Map([ + ['entity_dirs', ''], + ['entity_dirs_mode', 'replace'], + ])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + }); + + test('invalid entry (uppercase) -> warn + return defaults', async () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const engine = makeFakeEngine(new Map([['entity_dirs', '01-notes,BAD_DIR']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + expect(warnSpy).toHaveBeenCalled(); + const firstCallArg = warnSpy.mock.calls[0]![0]; + expect(firstCallArg).toContain('entity_dirs rejected'); + expect(firstCallArg).toContain('BAD_DIR'); + } finally { + warnSpy.mockRestore(); + } + }); + + test('invalid entry (starts with dash) -> warn + return defaults', async () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const engine = 
makeFakeEngine(new Map([['entity_dirs', '-bad']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + expect(warnSpy).toHaveBeenCalled(); + } finally { + warnSpy.mockRestore(); + } + }); + + test('invalid entry (contains slash) -> warn + return defaults', async () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const engine = makeFakeEngine(new Map([['entity_dirs', 'people/extra']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + expect(warnSpy).toHaveBeenCalled(); + } finally { + warnSpy.mockRestore(); + } + }); +}); From 111c263276b69ad628c0a3ee7b9ef0a739b1fac4 Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:37:13 +0100 Subject: [PATCH 6/8] feat(link-extraction): add wikilink [[dir/slug]] extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extractEntityRefs now also picks up explicit-path wikilinks: [[people/alice]] -> { name: 'alice', slug: 'people/alice' } [[people/alice|Alice Chen]] -> same slug, alias consumed but not captured Scope intentionally limited to explicit dir-prefixed wikilinks. Bare [[alice]] form is OUT OF SCOPE — resolving it requires engine page lookup (walk the slug table, disambiguate aliases), which breaks the pure-function contract of extractEntityRefs. Documented in code and README. The wikilink regex honors the same configured dir list as the markdown extractor, so a custom dir (e.g. 01-notes) matches in both [Name](…) and [[…]] forms. Alias segment is length-bounded (100 chars) to cap worst-case regex cost. Slug segment is bounded — no ReDoS surface. Wikilinks inside fenced or inline code blocks are excluded via the existing stripCodeBlocks pass. Full test suite: 1326 pass / 141 skip / 0 fail. 
--- src/core/link-extraction.ts | 70 ++++++++++++++++++++++++++++++------ test/link-extraction.test.ts | 70 ++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 11 deletions(-) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 76614c41..af508d5e 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -83,6 +83,35 @@ function buildEntityRefRegex(dirs: readonly string[]): RegExp { */ const ENTITY_REF_RE = buildEntityRefRegex(DEFAULT_ENTITY_DIRS); +/** + * Build the explicit-path wikilink regex (`[[dir/slug]]` and + * `[[dir/slug|alias]]`) from a dir list. + * + * Scope: ONLY explicit `[[dir/slug]]` form is matched. Bare `[[name]]` + * wikilinks are intentionally out of scope — resolving them requires engine + * page-lookup (walk the slug table, disambiguate aliases), which breaks the + * pure-function contract of extractEntityRefs. See README > entity_dirs for + * the design rationale. + * + * Captures: + * - group 1: full `dir/slug` path + * - alias segment `|display` is consumed but not captured. + * + * Safety: the slug segment is a single character class (`[a-z0-9][a-z0-9-]*`) + * with no nested quantifiers, so it cannot backtrack catastrophically. The + * optional alias is length-bounded at 100 chars to cap worst-case regex cost. + */ +function buildWikilinkRegex(dirs: readonly string[]): RegExp { + const alternation = dirs.map(escapeRegexChars).join('|'); + return new RegExp( + `\\[\\[((?:${alternation})\\/[a-z0-9][a-z0-9-]*)(?:\\|[^\\]|\\n]{1,100})?\\]\\]`, + 'g', + ); +} + +/** Default wikilink regex built once from DEFAULT_ENTITY_DIRS (fast path). */ +const WIKILINK_RE = buildWikilinkRegex(DEFAULT_ENTITY_DIRS); + /** * Strip fenced code blocks (```...```) and inline code (`...`) from markdown, * replacing them with whitespace of equivalent length. Preserves byte offsets @@ -120,11 +149,20 @@ function stripCodeBlocks(content: string): string { } /** - * Extract `[Name](path-to-people-or-company)` references from arbitrary content.
- * Both filesystem-relative paths (with `../` and `.md`) and bare engine-style - * slugs (`people/slug`) are matched. Returns one EntityRef per match (no dedup - * here; caller dedups). Slugs appearing inside fenced or inline code blocks - * are excluded — those are typically code samples, not real entity references. + * Extract entity references from arbitrary content. + * + * Two ref forms are matched: + * 1. Markdown links: `[Name](people/slug)` — both filesystem-relative paths + * (with `../` and `.md`) and bare engine-style slugs are accepted. + * 2. Explicit-path wikilinks: `[[dir/slug]]` and `[[dir/slug|alias]]` where + * `dir` is in the configured dir list. For wikilinks, `name` is the slug's + * last path segment (no display name is available). + * + * Bare `[[name]]` wikilinks (no dir prefix) are OUT OF SCOPE — resolving them + * requires engine page lookup, which breaks the pure-function contract. + * + * Returns one EntityRef per match (no dedup here; caller dedups). Slugs + * appearing inside fenced or inline code blocks are excluded. * * @param content Markdown text to scan. * @param dirs Optional entity-dir list. When omitted, uses DEFAULT_ENTITY_DIRS. @@ -134,19 +172,29 @@ function stripCodeBlocks(content: string): string { export function extractEntityRefs(content: string, dirs?: readonly string[]): EntityRef[] { const stripped = stripCodeBlocks(content); const refs: EntityRef[] = []; + + // 1. Markdown-style refs: [Name](dir/slug) + const mdBase = dirs ? buildEntityRefRegex(dirs) : ENTITY_REF_RE; + const mdRe = new RegExp(mdBase.source, mdBase.flags); let m: RegExpExecArray | null; - // When dirs omitted, reuse the module-level regex (built from DEFAULT_ENTITY_DIRS). - // When dirs provided, build a scoped regex from the custom list. - // Fresh regex per call (g-flag state is per-instance). - const base = dirs ? 
buildEntityRefRegex(dirs) : ENTITY_REF_RE; - const re = new RegExp(base.source, base.flags); - while ((m = re.exec(stripped)) !== null) { + while ((m = mdRe.exec(stripped)) !== null) { const name = m[1]; const fullPath = m[2]; const slug = fullPath; // dir/slug const dir = fullPath.split('/')[0]; refs.push({ name, slug, dir }); } + + // 2. Explicit-path wikilinks: [[dir/slug]] and [[dir/slug|alias]] + const wikiBase = dirs ? buildWikilinkRegex(dirs) : WIKILINK_RE; + const wikiRe = new RegExp(wikiBase.source, wikiBase.flags); + while ((m = wikiRe.exec(stripped)) !== null) { + const fullPath = m[1]; // dir/slug + const [dir, ...rest] = fullPath.split('/'); + const lastSegment = rest[rest.length - 1] ?? ''; + refs.push({ name: lastSegment, slug: fullPath, dir }); + } + return refs; } diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index 8df1a6df..945e97e5 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -109,6 +109,76 @@ describe('extractEntityRefs', () => { expect(refs.length).toBe(1); expect(refs[0].slug).toBe('people/alice'); }); + + // ── Explicit-path wikilinks [[dir/slug]] ── + // + // Scope note: only `[[dir/slug]]` and `[[dir/slug|alias]]` are in scope. + // Bare `[[name]]` wikilinks would need engine page-lookup to resolve, + // which breaks the pure-function contract of extractEntityRefs. See + // README for full explanation. + + test('extracts explicit-path wikilinks [[dir/slug]]', () => { + const refs = extractEntityRefs('See [[people/alice]] for context.'); + expect(refs.length).toBe(1); + expect(refs[0]).toEqual({ name: 'alice', slug: 'people/alice', dir: 'people' }); + }); + + test('extracts wikilinks with alias [[dir/slug|Display Name]]', () => { + const refs = extractEntityRefs('Met [[people/alice-chen|Alice Chen]] today.'); + expect(refs.length).toBe(1); + // Display is ignored; name falls back to the last slug segment. 
+ expect(refs[0].slug).toBe('people/alice-chen'); + expect(refs[0].dir).toBe('people'); + expect(refs[0].name).toBe('alice-chen'); + }); + + test('ignores wikilinks when dir is NOT in configured list', () => { + // `notes/` is not a default entity dir, so [[notes/foo]] is ignored. + const refs = extractEntityRefs('See [[notes/foo]] for context.'); + expect(refs).toEqual([]); + }); + + test('wikilink dir honors custom dirs param', () => { + const refs = extractEntityRefs('See [[01-notes/rushi]] for details.', ['01-notes']); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('01-notes/rushi'); + expect(refs[0].dir).toBe('01-notes'); + }); + + test('does NOT extract bare [[name]] wikilinks (out of scope)', () => { + // Bare wikilinks require engine page-lookup to resolve — out of scope + // for the pure-function extractor. + const refs = extractEntityRefs('See [[alice]] for context.'); + expect(refs).toEqual([]); + }); + + test('skips wikilinks inside fenced code blocks', () => { + const content = [ + 'Prose with [[people/alice]].', + '```', + '[[people/bob]]', + '```', + ].join('\n'); + const refs = extractEntityRefs(content); + const slugs = refs.map(r => r.slug); + expect(slugs).toContain('people/alice'); + expect(slugs).not.toContain('people/bob'); + }); + + test('skips wikilinks inside inline code', () => { + const refs = extractEntityRefs('Literal `[[people/ghost]]` in code vs [[people/alice]] real.'); + const slugs = refs.map(r => r.slug); + expect(slugs).toContain('people/alice'); + expect(slugs).not.toContain('people/ghost'); + }); + + test('dedupes markdown-ref and wikilink for the same slug (within extractEntityRefs returns both, caller dedups)', () => { + // extractEntityRefs does NOT dedupe (documented contract — caller dedups). + // Both forms should match and return 2 entries. 
+ const refs = extractEntityRefs('[Alice](people/alice) and [[people/alice]]'); + expect(refs.length).toBe(2); + expect(refs.map(r => r.slug)).toEqual(['people/alice', 'people/alice']); + }); }); // ─── extractPageLinks ────────────────────────────────────────── From 9ebd75020d46d1afa869fb1791e020385d9c3976 Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:38:42 +0100 Subject: [PATCH 7/8] feat(link-extraction): wire getEntityDirs into put_page + batch extract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both production callsites of extractPageLinks now resolve the entity-dir list from config before extracting candidates: - src/core/operations.ts runAutoLink (put_page post-hook) — one read per put_page. Runs inside the auto-link branch (after the remote/ disabled guards) so disabled callers skip the config read too. - src/commands/extract.ts extractLinksFromDB (batch backfill) — one read per run, outside the page loop. Config doesn't change mid-run. timeline extraction (parseTimelineEntries) has no dir dependency, so no changes there. Full test suite: 1326 pass / 141 skip / 0 fail. 
--- src/commands/extract.ts | 8 ++++++-- src/core/operations.ts | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 1b5abb7e..b91e635d 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -21,7 +21,7 @@ import { join, relative, dirname } from 'path'; import type { BrainEngine } from '../core/engine.ts'; import type { PageType } from '../core/types.ts'; import { parseMarkdown } from '../core/markdown.ts'; -import { extractPageLinks, parseTimelineEntries, inferLinkType } from '../core/link-extraction.ts'; +import { extractPageLinks, parseTimelineEntries, inferLinkType, getEntityDirs } from '../core/link-extraction.ts'; // --- Types --- @@ -453,6 +453,10 @@ async function extractLinksFromDB( ): Promise<{ created: number; pages: number }> { const allSlugs = await engine.getAllSlugs(); const slugList = Array.from(allSlugs); + // Load the effective entity-dir list once for the whole batch. Config + // doesn't change mid-run, so a single read is sufficient and cheaper than + // per-page lookup. 
+ const dirs = await getEntityDirs(engine); let processed = 0, created = 0; for (let i = 0; i < slugList.length; i++) { @@ -467,7 +471,7 @@ async function extractLinksFromDB( } const fullContent = page.compiled_truth + '\n' + page.timeline; - const candidates = extractPageLinks(fullContent, page.frontmatter, page.type); + const candidates = extractPageLinks(fullContent, page.frontmatter, page.type, dirs); for (const c of candidates) { if (!allSlugs.has(c.targetSlug)) continue; diff --git a/src/core/operations.ts b/src/core/operations.ts index 2f266cbe..821ec6ca 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -13,7 +13,7 @@ import { importFromContent } from './import-file.ts'; import { hybridSearch } from './search/hybrid.ts'; import { expandQuery } from './search/expansion.ts'; import { dedupResults } from './search/dedup.ts'; -import { extractPageLinks, isAutoLinkEnabled } from './link-extraction.ts'; +import { extractPageLinks, getEntityDirs, isAutoLinkEnabled } from './link-extraction.ts'; import * as db from './db.ts'; // --- Types --- @@ -288,7 +288,11 @@ async function runAutoLink( parsed: { type: PageType; compiled_truth: string; timeline: string; frontmatter: Record }, ): Promise<{ created: number; removed: number; errors: number }> { const fullContent = parsed.compiled_truth + '\n' + parsed.timeline; - const candidates = extractPageLinks(fullContent, parsed.frontmatter, parsed.type); + // Resolve the effective entity-dir list from config once per put_page. This + // reads `entity_dirs` + `entity_dirs_mode` and validates entries. On invalid + // config, getEntityDirs logs and returns defaults — the hook keeps running. + const dirs = await getEntityDirs(engine); + const candidates = extractPageLinks(fullContent, parsed.frontmatter, parsed.type, dirs); // Resolve which targets exist (skip refs to non-existent pages to avoid FK // violation churn in addLink). One getAllSlugs call upfront, O(1) lookup. 
From 492f2c95abf9a55c8cdcb921c65dac68d013faaa Mon Sep 17 00:00:00 2001 From: Gopal Patel Date: Sat, 18 Apr 2026 14:39:37 +0100 Subject: [PATCH 8/8] docs: add entity_dirs + wikilink section to README + CHANGELOG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit README gains a 'Configuring entity directories' subsection under Knowledge Graph explaining union vs replace modes, validation rules, and the fail-safe fallback on invalid input. Plus a 'Wikilink scope' subsection documenting the explicit-path-only design decision — bare [[name]] wikilinks are out of scope because resolving them requires engine-side slug lookup, which would break the pure-function contract of the extractor. CHANGELOG gets an Unreleased section covering the configurable dirs and wikilink additions, including the new exported API surface (DEFAULT_ENTITY_DIRS, getEntityDirs, optional dirs param on extractEntityRefs and extractPageLinks). --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29489ec9..e286ec6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,33 @@ All notable changes to GBrain will be documented in this file. +## [Unreleased] + +### Added + +- **Configurable entity directories.** The link extractor now reads the + `entity_dirs` config key (comma-separated) and treats those top-level slug + prefixes as entities in addition to the built-in set (`people`, `companies`, + `meetings`, `concepts`, `deal`, `civic`, `project`, `source`, `media`, `yc`). + Set `entity_dirs_mode=replace` to drop the defaults entirely. Each entry is + validated against `/^[a-z0-9][a-z0-9-]*$/`; invalid input logs a warning and + falls back to defaults. Unlocks Johnny Decimal, PARA, and other custom vault + taxonomies without forking the extractor. 
+- **Explicit-path wikilink extraction.** `[[people/alice]]` and + `[[people/alice|Alice Chen]]` are now recognised alongside Markdown-style + `[Name](people/alice)` references. Honors the configured dir list, so + custom dirs match in both forms. Bare `[[alice]]` (no dir prefix) is + intentionally out of scope — resolving bare names requires engine page + lookup, which breaks the pure-function contract of the extractor. +- `DEFAULT_ENTITY_DIRS` is now an exported frozen constant on + `src/core/link-extraction.ts`, for callers that want to reason about the + default set directly. +- `getEntityDirs(engine)` helper resolves the effective dir list from config + (union or replace mode, with validation). +- `extractEntityRefs(content, dirs?)` and `extractPageLinks(content, fm, type, dirs?)` + accept an optional dir list; existing callsites are unchanged (default + behavior preserved). + ## [0.12.0] - 2026-04-18 ## **The graph wires itself.** diff --git a/README.md b/README.md index f8e88a00..b500ba4e 100644 --- a/README.md +++ b/README.md @@ -350,6 +350,51 @@ gbrain extract timeline --source db # extract dated events from markdown tim Then ask graph questions or watch the search ranking improve. Benchmarked: **Recall@5 jumps from 83% to 95%, Precision@5 from 39% to 45%, +30 more correct answers in the agent's top-5 reads** on a 240-page Opus-generated rich-prose corpus. Graph-only F1 hits 86.6% vs grep's 57.8% (+28.8 pts). See [docs/benchmarks/2026-04-18-brainbench-v1.md](docs/benchmarks/2026-04-18-brainbench-v1.md). +### Configuring entity directories + +By default the extractor recognises these top-level slug prefixes as entities: +`people`, `companies`, `meetings`, `concepts`, `deal`, `civic`, `project`, +`source`, `media`, `yc`. Any `[Name](people/alice)` or `[[people/alice]]` +reference becomes a typed link. 
+ +If you organise your vault differently — Johnny Decimal, PARA, a custom +taxonomy — you can extend or replace the list via two config keys: + +```bash +# Union mode (default): custom dirs are ADDED to the defaults. +gbrain config set entity_dirs "01-notes,02-projects,03-archive" + +# Replace mode: ONLY the custom list is used. Defaults are dropped. +gbrain config set entity_dirs "01-notes,02-projects" +gbrain config set entity_dirs_mode "replace" +``` + +Rules: + +- Each dir name must match `/^[a-z0-9][a-z0-9-]*$/` (lowercase letters, digits, + hyphens; must start with a letter or digit). Invalid entries cause the + whole list to be rejected and extraction falls back to defaults with a + `console.warn`. This fail-safe prevents a typo from silently disabling the + graph layer. +- Comma-separated, whitespace around entries is trimmed. +- Empty / unset `entity_dirs` -> defaults, regardless of mode. +- Dedupes overlap with defaults in union mode. + +### Wikilink scope + +GBrain extracts **explicit-path** wikilinks: + +```markdown +[[people/alice]] -> links to people/alice +[[people/alice|Alice Chen]] -> same slug; alias consumed but ignored +[[01-notes/rushi]] -> works if 01-notes is a configured dir +``` + +**Bare `[[alice]]` wikilinks are not supported.** Resolving them requires +engine-side slug lookup (which page named "alice" — `people/alice-chen` or +`media/alice-in-wonderland`?), which breaks the pure-function contract of +the extractor. Use `[Alice](people/alice)` or `[[people/alice]]` instead. + ## Search Hybrid search: vector + keyword + RRF fusion + multi-query expansion + 4-layer dedup.