diff --git a/CHANGELOG.md b/CHANGELOG.md index 29489ec9..e286ec6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,33 @@ All notable changes to GBrain will be documented in this file. +## [Unreleased] + +### Added + +- **Configurable entity directories.** The link extractor now reads the + `entity_dirs` config key (comma-separated) and treats those top-level slug + prefixes as entities in addition to the built-in set (`people`, `companies`, + `meetings`, `concepts`, `deal`, `civic`, `project`, `source`, `media`, `yc`). + Set `entity_dirs_mode=replace` to drop the defaults entirely. Each entry is + validated against `/^[a-z0-9][a-z0-9-]*$/`; invalid input logs a warning and + falls back to defaults. Unlocks Johnny Decimal, PARA, and other custom vault + taxonomies without forking the extractor. +- **Explicit-path wikilink extraction.** `[[people/alice]]` and + `[[people/alice|Alice Chen]]` are now recognised alongside Markdown-style + `[Name](people/alice)` references. Honors the configured dir list, so + custom dirs match in both forms. Bare `[[alice]]` (no dir prefix) is + intentionally out of scope — resolving bare names requires engine page + lookup, which breaks the pure-function contract of the extractor. +- `DEFAULT_ENTITY_DIRS` is now an exported frozen constant on + `src/core/link-extraction.ts`, for callers that want to reason about the + default set directly. +- `getEntityDirs(engine)` helper resolves the effective dir list from config + (union or replace mode, with validation). +- `extractEntityRefs(content, dirs?)` and `extractPageLinks(content, fm, type, dirs?)` + accept an optional dir list; existing callsites are unchanged (default + behavior preserved). 
+ ## [0.12.0] - 2026-04-18 ## **The graph wires itself.** diff --git a/README.md b/README.md index f8e88a00..b500ba4e 100644 --- a/README.md +++ b/README.md @@ -350,6 +350,51 @@ gbrain extract timeline --source db # extract dated events from markdown tim Then ask graph questions or watch the search ranking improve. Benchmarked: **Recall@5 jumps from 83% to 95%, Precision@5 from 39% to 45%, +30 more correct answers in the agent's top-5 reads** on a 240-page Opus-generated rich-prose corpus. Graph-only F1 hits 86.6% vs grep's 57.8% (+28.8 pts). See [docs/benchmarks/2026-04-18-brainbench-v1.md](docs/benchmarks/2026-04-18-brainbench-v1.md). +### Configuring entity directories + +By default the extractor recognises these top-level slug prefixes as entities: +`people`, `companies`, `meetings`, `concepts`, `deal`, `civic`, `project`, +`source`, `media`, `yc`. Any `[Name](people/alice)` or `[[people/alice]]` +reference becomes a typed link. + +If you organise your vault differently — Johnny Decimal, PARA, a custom +taxonomy — you can extend or replace the list via two config keys: + +```bash +# Union mode (default): custom dirs are ADDED to the defaults. +gbrain config set entity_dirs "01-notes,02-projects,03-archive" + +# Replace mode: ONLY the custom list is used. Defaults are dropped. +gbrain config set entity_dirs "01-notes,02-projects" +gbrain config set entity_dirs_mode "replace" +``` + +Rules: + +- Each dir name must match `/^[a-z0-9][a-z0-9-]*$/` (lowercase letters, digits, + hyphens; must start with a letter or digit). Invalid entries cause the + whole list to be rejected and extraction falls back to defaults with a + `console.warn`. This fail-safe prevents a typo from silently disabling the + graph layer. +- Comma-separated, whitespace around entries is trimmed. +- Empty / unset `entity_dirs` -> defaults, regardless of mode. +- Custom entries that duplicate a default are deduplicated in union mode. 
+ +### Wikilink scope + +GBrain extracts **explicit-path** wikilinks: + +```markdown +[[people/alice]] -> links to people/alice +[[people/alice|Alice Chen]] -> same slug; alias consumed but ignored +[[01-notes/rushi]] -> works if 01-notes is a configured dir +``` + +**Bare `[[alice]]` wikilinks are not supported.** Resolving them requires +engine-side slug lookup (which page named "alice" — `people/alice-chen` or +`media/alice-in-wonderland`?), which breaks the pure-function contract of +the extractor. Use `[Alice](people/alice)` or `[[people/alice]]` instead. + ## Search Hybrid search: vector + keyword + RRF fusion + multi-query expansion + 4-layer dedup. diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 1b5abb7e..b91e635d 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -21,7 +21,7 @@ import { join, relative, dirname } from 'path'; import type { BrainEngine } from '../core/engine.ts'; import type { PageType } from '../core/types.ts'; import { parseMarkdown } from '../core/markdown.ts'; -import { extractPageLinks, parseTimelineEntries, inferLinkType } from '../core/link-extraction.ts'; +import { extractPageLinks, parseTimelineEntries, inferLinkType, getEntityDirs } from '../core/link-extraction.ts'; // --- Types --- @@ -453,6 +453,10 @@ async function extractLinksFromDB( ): Promise<{ created: number; pages: number }> { const allSlugs = await engine.getAllSlugs(); const slugList = Array.from(allSlugs); + // Load the effective entity-dir list once for the whole batch. Config + // doesn't change mid-run, so a single read is sufficient and cheaper than + // per-page lookup. 
+ const dirs = await getEntityDirs(engine); let processed = 0, created = 0; for (let i = 0; i < slugList.length; i++) { @@ -467,7 +471,7 @@ async function extractLinksFromDB( } const fullContent = page.compiled_truth + '\n' + page.timeline; - const candidates = extractPageLinks(fullContent, page.frontmatter, page.type); + const candidates = extractPageLinks(fullContent, page.frontmatter, page.type, dirs); for (const c of candidates) { if (!allSlugs.has(c.targetSlug)) continue; diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 55570b59..af508d5e 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -27,16 +27,90 @@ export interface EntityRef { } /** - * Match `[Name](path)` markdown links pointing to `people/` or `companies/` - * (and other entity directories). Accepts both filesystem-relative format - * (`[Name](../people/slug.md)`) AND engine-slug format (`[Name](people/slug)`). + * Canonical entity directory list. Each directory name is a top-level slug + * prefix that extractors recognise as an "entity" (e.g. `people/alice`, + * `companies/acme`, `meetings/2026-01-15`). Frozen so downstream callers can + * treat it as immutable. * - * Captures: name, dir (people/companies/...), slug. + * Users can extend or replace this list via the `entity_dirs` config key + * (see `getEntityDirs`). Custom dirs follow the same slug shape: + * `/^[a-z0-9][a-z0-9-]*$/`. + */ +export const DEFAULT_ENTITY_DIRS: readonly string[] = Object.freeze([ + 'people', + 'companies', + 'meetings', + 'concepts', + 'deal', + 'civic', + 'project', + 'source', + 'media', + 'yc', +]); + +/** + * Escape regex metacharacters so a value can be embedded inside a larger + * regex without changing its structure. Defense-in-depth: `getEntityDirs` + * already validates entries against `/^[a-z0-9][a-z0-9-]*$/`, so no metachar + * should ever reach here — this is belt-and-braces for future callers. 
+ */ +function escapeRegexChars(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Build the `[Name](dir/slug)` entity-ref regex from a dir list. Accepts both + * filesystem-relative format (`[Name](../people/slug.md)`) AND engine-slug + * format (`[Name](people/slug)`). * - * The regex permits an optional `../` prefix (any number) and an optional - * `.md` suffix so the same function works for both filesystem and DB content. + * Captures: name, full `dir/slug` path, slug segment alone. + * + * Internal — callers use `extractEntityRefs(content, dirs?)` which threads the + * dir list through. Keeping this private limits API surface. */ -const ENTITY_REF_RE = /\[([^\]]+)\]\((?:\.\.\/)*((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/([^)\s]+?))(?:\.md)?\)/g; +function buildEntityRefRegex(dirs: readonly string[]): RegExp { + const alternation = dirs.map(escapeRegexChars).join('|'); + return new RegExp( + `\\[([^\\]]+)\\]\\((?:\\.\\.\\/)*((?:${alternation})\\/([^)\\s]+?))(?:\\.md)?\\)`, + 'g', + ); +} + +/** + * Default entity-ref regex built once from DEFAULT_ENTITY_DIRS. Callers that + * don't pass a custom dir list get this fast-path (no per-call regex compile). + */ +const ENTITY_REF_RE = buildEntityRefRegex(DEFAULT_ENTITY_DIRS); + +/** + * Build the explicit-path wikilink regex (`[[dir/slug]]` and + * `[[dir/slug|alias]]`) from a dir list. + * + * Scope: ONLY explicit `[[dir/slug]]` form is matched. Bare `[[name]]` + * wikilinks are intentionally out of scope — resolving them requires engine + * page-lookup (walk the slug table, disambiguate aliases), which breaks the + * pure-function contract of extractEntityRefs. See README > entity_dirs for + * the design rationale. + * + * Captures: + * - group 1: full `dir/slug` path + * - alias segment `|display` is consumed but not captured. + * + * Safety: the slug segment is bounded (`[a-z0-9][a-z0-9-]*`) so there is no + * unbounded backtracking. 
The optional alias is bounded at 100 chars to cap + * worst-case regex cost. + */ +function buildWikilinkRegex(dirs: readonly string[]): RegExp { + const alternation = dirs.map(escapeRegexChars).join('|'); + return new RegExp( + `\\[\\[((?:${alternation})\\/[a-z0-9][a-z0-9-]*)(?:\\|[^\\]|\\n]{1,100})?\\]\\]`, + 'g', + ); +} + +/** Default wikilink regex built once from DEFAULT_ENTITY_DIRS (fast path). */ +const WIKILINK_RE = buildWikilinkRegex(DEFAULT_ENTITY_DIRS); /** * Strip fenced code blocks (```...```) and inline code (`...`) from markdown, @@ -75,25 +149,52 @@ function stripCodeBlocks(content: string): string { } /** - * Extract `[Name](path-to-people-or-company)` references from arbitrary content. - * Both filesystem-relative paths (with `../` and `.md`) and bare engine-style - * slugs (`people/slug`) are matched. Returns one EntityRef per match (no dedup - * here; caller dedups). Slugs appearing inside fenced or inline code blocks - * are excluded — those are typically code samples, not real entity references. + * Extract entity references from arbitrary content. + * + * Two ref forms are matched: + * 1. Markdown links: `[Name](people/slug)` — both filesystem-relative paths + * (with `../` and `.md`) and bare engine-style slugs are accepted. + * 2. Explicit-path wikilinks: `[[dir/slug]]` and `[[dir/slug|alias]]` where + * `dir` is in the configured dir list. For wikilinks, `name` is the slug's + * last path segment (no display name is available). + * + * Bare `[[name]]` wikilinks (no dir prefix) are OUT OF SCOPE — resolving them + * requires engine page lookup, which breaks the pure-function contract. + * + * Returns one EntityRef per match (no dedup here; caller dedups). Slugs + * appearing inside fenced or inline code blocks are excluded. + * + * @param content Markdown text to scan. + * @param dirs Optional entity-dir list. When omitted, uses DEFAULT_ENTITY_DIRS. 
+ * When provided, ONLY those dirs are matched — callers that want to extend + * the defaults should pass the union (see `getEntityDirs`). */ -export function extractEntityRefs(content: string): EntityRef[] { +export function extractEntityRefs(content: string, dirs?: readonly string[]): EntityRef[] { const stripped = stripCodeBlocks(content); const refs: EntityRef[] = []; + + // 1. Markdown-style refs: [Name](dir/slug) + const mdBase = dirs ? buildEntityRefRegex(dirs) : ENTITY_REF_RE; + const mdRe = new RegExp(mdBase.source, mdBase.flags); let m: RegExpExecArray | null; - // Fresh regex per call (g-flag state is per-instance). - const re = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); - while ((m = re.exec(stripped)) !== null) { + while ((m = mdRe.exec(stripped)) !== null) { const name = m[1]; const fullPath = m[2]; const slug = fullPath; // dir/slug const dir = fullPath.split('/')[0]; refs.push({ name, slug, dir }); } + + // 2. Explicit-path wikilinks: [[dir/slug]] and [[dir/slug|alias]] + const wikiBase = dirs ? buildWikilinkRegex(dirs) : WIKILINK_RE; + const wikiRe = new RegExp(wikiBase.source, wikiBase.flags); + while ((m = wikiRe.exec(stripped)) !== null) { + const fullPath = m[1]; // dir/slug + const [dir, ...rest] = fullPath.split('/'); + const lastSegment = rest[rest.length - 1] ?? ''; + refs.push({ name: lastSegment, slug: fullPath, dir }); + } + return refs; } @@ -108,6 +209,19 @@ export interface LinkCandidate { context: string; } +/** + * Build the bare-slug regex (`dir/slug` appearing anywhere in text) from the + * same dir list as the markdown-ref regex. Keeps the two extractors in sync + * so callers who pass custom dirs see consistent behavior in both paths. + */ +function buildBareSlugRegex(dirs: readonly string[]): RegExp { + const alternation = dirs.map(escapeRegexChars).join('|'); + return new RegExp(`\\b((?:${alternation})\\/[a-z0-9][a-z0-9-]*)\\b`, 'g'); +} + +/** Default bare-slug regex, built once from DEFAULT_ENTITY_DIRS. 
*/ +const BARE_SLUG_RE = buildBareSlugRegex(DEFAULT_ENTITY_DIRS); + /** * Extract all link candidates from a page. * @@ -118,16 +232,22 @@ export interface LinkCandidate { * * Within-page dedup: multiple mentions of the same (targetSlug, linkType) * collapse to one candidate. The first occurrence's context wins. + * + * @param dirs Optional entity-dir list. When omitted, uses DEFAULT_ENTITY_DIRS + * for both the markdown-ref extractor and the bare-slug regex. When provided, + * ONLY those dirs are matched — callers wanting union-with-defaults must pass + * the union themselves (see `getEntityDirs`). */ export function extractPageLinks( content: string, frontmatter: Record, pageType: PageType, + dirs?: readonly string[], ): LinkCandidate[] { const candidates: LinkCandidate[] = []; // 1. Markdown entity refs. - for (const ref of extractEntityRefs(content)) { + for (const ref of extractEntityRefs(content, dirs)) { const idx = content.indexOf(ref.name); // Wider context window (240 chars vs original 80) catches verbs that // appear at sentence-or-paragraph distance from the slug — common in @@ -142,10 +262,11 @@ export function extractPageLinks( } // 2. Bare slug references (e.g. "see people/alice-chen for context"). - // Limited to the same entity directories ENTITY_REF_RE covers. + // Same dir list as the markdown extractor to keep behavior consistent. // Code blocks are stripped first — slugs in code samples are not real refs. const strippedContent = stripCodeBlocks(content); - const bareRe = /\b((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/[a-z0-9][a-z0-9-]*)\b/g; + const bareBase = dirs ? buildBareSlugRegex(dirs) : BARE_SLUG_RE; + const bareRe = new RegExp(bareBase.source, bareBase.flags); let m: RegExpExecArray | null; while ((m = bareRe.exec(strippedContent)) !== null) { // Skip matches that are part of a markdown link (already handled above). 
@@ -366,3 +487,61 @@ export async function isAutoLinkEnabled(engine: BrainEngine): Promise { const normalized = val.trim().toLowerCase(); return !['false', '0', 'no', 'off'].includes(normalized); } + +/** Regex that validates an entity-dir name. Matches slug shape. */ +const ENTITY_DIR_NAME_RE = /^[a-z0-9][a-z0-9-]*$/; + +/** + * Resolve the effective entity-dir list from engine config. + * + * Reads two config keys: + * - `entity_dirs`: comma-separated list of custom dir names (optional). + * - `entity_dirs_mode`: `"union"` (default) or `"replace"`. + * + * Modes: + * - `union` (default): custom dirs are ADDED to DEFAULT_ENTITY_DIRS. + * Duplicates are deduped; defaults come first, custom dirs append. + * - `replace`: ONLY the custom list is used. If the custom list is empty, + * falls back to defaults (empty replace is meaningless). + * + * Validation: + * Each custom entry must match `/^[a-z0-9][a-z0-9-]*$/`. On ANY invalid + * entry, the function logs a warning via `console.warn` and returns + * DEFAULT_ENTITY_DIRS. This fail-safe prevents malformed config from + * silently disabling all entity extraction. + * + * @returns A fresh string[] (mutable; callers may not mutate DEFAULT_ENTITY_DIRS). + */ +export async function getEntityDirs(engine: BrainEngine): Promise { + const raw = await engine.getConfig('entity_dirs'); + if (raw == null || raw.trim() === '') { + return [...DEFAULT_ENTITY_DIRS]; + } + + const entries = raw.split(',').map(s => s.trim()).filter(s => s.length > 0); + for (const entry of entries) { + if (!ENTITY_DIR_NAME_RE.test(entry)) { + console.warn( + `[gbrain] entity_dirs rejected: ${entry} (must match [a-z0-9][a-z0-9-]*). Falling back to defaults.`, + ); + return [...DEFAULT_ENTITY_DIRS]; + } + } + + const mode = (await engine.getConfig('entity_dirs_mode'))?.trim().toLowerCase(); + if (mode === 'replace' && entries.length > 0) { + // Dedupe while preserving input order. 
+ return Array.from(new Set(entries)); + } + + // Union mode: defaults first, then custom entries not already present. + const seen = new Set(DEFAULT_ENTITY_DIRS); + const result: string[] = [...DEFAULT_ENTITY_DIRS]; + for (const entry of entries) { + if (!seen.has(entry)) { + seen.add(entry); + result.push(entry); + } + } + return result; +} diff --git a/src/core/operations.ts b/src/core/operations.ts index 2f266cbe..821ec6ca 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -13,7 +13,7 @@ import { importFromContent } from './import-file.ts'; import { hybridSearch } from './search/hybrid.ts'; import { expandQuery } from './search/expansion.ts'; import { dedupResults } from './search/dedup.ts'; -import { extractPageLinks, isAutoLinkEnabled } from './link-extraction.ts'; +import { extractPageLinks, getEntityDirs, isAutoLinkEnabled } from './link-extraction.ts'; import * as db from './db.ts'; // --- Types --- @@ -288,7 +288,11 @@ async function runAutoLink( parsed: { type: PageType; compiled_truth: string; timeline: string; frontmatter: Record }, ): Promise<{ created: number; removed: number; errors: number }> { const fullContent = parsed.compiled_truth + '\n' + parsed.timeline; - const candidates = extractPageLinks(fullContent, parsed.frontmatter, parsed.type); + // Resolve the effective entity-dir list from config once per put_page. This + // reads `entity_dirs` + `entity_dirs_mode` and validates entries. On invalid + // config, getEntityDirs logs and returns defaults — the hook keeps running. + const dirs = await getEntityDirs(engine); + const candidates = extractPageLinks(fullContent, parsed.frontmatter, parsed.type, dirs); // Resolve which targets exist (skip refs to non-existent pages to avoid FK // violation churn in addLink). One getAllSlugs call upfront, O(1) lookup. 
diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index e5ed7ec5..945e97e5 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -1,13 +1,36 @@ -import { describe, test, expect } from 'bun:test'; +import { describe, test, expect, spyOn } from 'bun:test'; import { + DEFAULT_ENTITY_DIRS, extractEntityRefs, extractPageLinks, inferLinkType, parseTimelineEntries, isAutoLinkEnabled, + getEntityDirs, } from '../src/core/link-extraction.ts'; import type { BrainEngine } from '../src/core/engine.ts'; +// ─── DEFAULT_ENTITY_DIRS ─────────────────────────────────────── + +describe('DEFAULT_ENTITY_DIRS', () => { + test('is exported and contains the canonical entity dirs', () => { + expect(DEFAULT_ENTITY_DIRS).toContain('people'); + expect(DEFAULT_ENTITY_DIRS).toContain('companies'); + expect(DEFAULT_ENTITY_DIRS).toContain('meetings'); + expect(DEFAULT_ENTITY_DIRS).toContain('concepts'); + expect(DEFAULT_ENTITY_DIRS).toContain('deal'); + expect(DEFAULT_ENTITY_DIRS).toContain('civic'); + expect(DEFAULT_ENTITY_DIRS).toContain('project'); + expect(DEFAULT_ENTITY_DIRS).toContain('source'); + expect(DEFAULT_ENTITY_DIRS).toContain('media'); + expect(DEFAULT_ENTITY_DIRS).toContain('yc'); + }); + + test('is frozen (readonly at runtime)', () => { + expect(Object.isFrozen(DEFAULT_ENTITY_DIRS)).toBe(true); + }); +}); + // ─── extractEntityRefs ───────────────────────────────────────── describe('extractEntityRefs', () => { @@ -67,6 +90,95 @@ describe('extractEntityRefs', () => { expect(refs.length).toBe(1); expect(refs[0].dir).toBe('meetings'); }); + + test('custom dirs: Johnny Decimal style ([Rushi](01-notes/rushi)) matches when dir is configured', () => { + const refs = extractEntityRefs('Met [Rushi](01-notes/rushi) for coffee.', ['01-notes']); + expect(refs.length).toBe(1); + expect(refs[0]).toEqual({ name: 'Rushi', slug: '01-notes/rushi', dir: '01-notes' }); + }); + + test('custom-only dir list replaces defaults — default dirs do not 
match', () => { + // When caller supplies an explicit dirs list, only those dirs are used. + // The default `people/` dir does NOT match. + const refs = extractEntityRefs('[Alice](people/alice)', ['01-notes']); + expect(refs).toEqual([]); + }); + + test('omitting dirs uses default list (backwards compatible)', () => { + const refs = extractEntityRefs('[Alice](people/alice)'); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('people/alice'); + }); + + // ── Explicit-path wikilinks [[dir/slug]] ── + // + // Scope note: only `[[dir/slug]]` and `[[dir/slug|alias]]` are in scope. + // Bare `[[name]]` wikilinks would need engine page-lookup to resolve, + // which breaks the pure-function contract of extractEntityRefs. See + // README for full explanation. + + test('extracts explicit-path wikilinks [[dir/slug]]', () => { + const refs = extractEntityRefs('See [[people/alice]] for context.'); + expect(refs.length).toBe(1); + expect(refs[0]).toEqual({ name: 'alice', slug: 'people/alice', dir: 'people' }); + }); + + test('extracts wikilinks with alias [[dir/slug|Display Name]]', () => { + const refs = extractEntityRefs('Met [[people/alice-chen|Alice Chen]] today.'); + expect(refs.length).toBe(1); + // Display is ignored; name falls back to the last slug segment. + expect(refs[0].slug).toBe('people/alice-chen'); + expect(refs[0].dir).toBe('people'); + expect(refs[0].name).toBe('alice-chen'); + }); + + test('ignores wikilinks when dir is NOT in configured list', () => { + // `notes/` is not a default entity dir, so [[notes/foo]] is ignored. 
+ const refs = extractEntityRefs('See [[notes/foo]] for context.'); + expect(refs).toEqual([]); + }); + + test('wikilink dir honors custom dirs param', () => { + const refs = extractEntityRefs('See [[01-notes/rushi]] for details.', ['01-notes']); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('01-notes/rushi'); + expect(refs[0].dir).toBe('01-notes'); + }); + + test('does NOT extract bare [[name]] wikilinks (out of scope)', () => { + // Bare wikilinks require engine page-lookup to resolve — out of scope + // for the pure-function extractor. + const refs = extractEntityRefs('See [[alice]] for context.'); + expect(refs).toEqual([]); + }); + + test('skips wikilinks inside fenced code blocks', () => { + const content = [ + 'Prose with [[people/alice]].', + '```', + '[[people/bob]]', + '```', + ].join('\n'); + const refs = extractEntityRefs(content); + const slugs = refs.map(r => r.slug); + expect(slugs).toContain('people/alice'); + expect(slugs).not.toContain('people/bob'); + }); + + test('skips wikilinks inside inline code', () => { + const refs = extractEntityRefs('Literal `[[people/ghost]]` in code vs [[people/alice]] real.'); + const slugs = refs.map(r => r.slug); + expect(slugs).toContain('people/alice'); + expect(slugs).not.toContain('people/ghost'); + }); + + test('dedupes markdown-ref and wikilink for the same slug (within extractEntityRefs returns both, caller dedups)', () => { + // extractEntityRefs does NOT dedupe (documented contract — caller dedups). + // Both forms should match and return 2 entries. 
+ const refs = extractEntityRefs('[Alice](people/alice) and [[people/alice]]'); + expect(refs.length).toBe(2); + expect(refs.map(r => r.slug)).toEqual(['people/alice', 'people/alice']); + }); }); // ─── extractPageLinks ────────────────────────────────────────── @@ -113,6 +225,39 @@ describe('extractPageLinks', () => { const aliceLink = candidates.find(c => c.targetSlug === 'people/alice'); expect(aliceLink!.linkType).toBe('attended'); }); + + test('custom dirs: [Name](custom-dir/slug) extracted when dir is configured', () => { + const candidates = extractPageLinks( + 'Met [Rushi](01-notes/rushi) yesterday.', + {}, + 'concept', + ['01-notes'], + ); + const rushi = candidates.find(c => c.targetSlug === '01-notes/rushi'); + expect(rushi).toBeDefined(); + }); + + test('custom dirs: bare slug references use same dir list', () => { + const candidates = extractPageLinks( + 'See 01-notes/rushi for details.', + {}, + 'concept', + ['01-notes'], + ); + const rushi = candidates.find(c => c.targetSlug === '01-notes/rushi'); + expect(rushi).toBeDefined(); + }); + + test('custom-only dir list excludes default dirs from bare slug match', () => { + // With dirs=['01-notes'], a bare `people/alice` token is NOT extracted. 
+ const candidates = extractPageLinks( + 'See people/alice for details.', + {}, + 'concept', + ['01-notes'], + ); + expect(candidates.find(c => c.targetSlug === 'people/alice')).toBeUndefined(); + }); }); // ─── inferLinkType ───────────────────────────────────────────── @@ -303,3 +448,103 @@ describe('isAutoLinkEnabled', () => { expect(await isAutoLinkEnabled(engine)).toBe(true); }); }); + +// ─── getEntityDirs ───────────────────────────────────────────── + +describe('getEntityDirs', () => { + test('null config -> DEFAULT_ENTITY_DIRS', async () => { + const engine = makeFakeEngine(new Map()); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + }); + + test('empty string config -> DEFAULT_ENTITY_DIRS', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', '']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + }); + + test('valid single custom dir -> union with defaults', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', '01-notes']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toContain('01-notes'); + // defaults preserved + for (const d of DEFAULT_ENTITY_DIRS) expect(dirs).toContain(d); + }); + + test('multiple comma-separated dirs with whitespace -> parsed and unioned', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', ' 01-notes , 02-projects ,03-archive']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toContain('01-notes'); + expect(dirs).toContain('02-projects'); + expect(dirs).toContain('03-archive'); + }); + + test('duplicate custom dir overlapping with defaults -> no duplicates', async () => { + const engine = makeFakeEngine(new Map([['entity_dirs', 'people,01-notes']])); + const dirs = await getEntityDirs(engine); + const peopleCount = dirs.filter(d => d === 'people').length; + expect(peopleCount).toBe(1); + expect(dirs).toContain('01-notes'); + }); + + 
test('entity_dirs_mode=replace -> only custom list, no defaults', async () => { + const engine = makeFakeEngine(new Map([ + ['entity_dirs', '01-notes,02-projects'], + ['entity_dirs_mode', 'replace'], + ])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual(['01-notes', '02-projects']); + // defaults NOT included in replace mode + expect(dirs).not.toContain('people'); + expect(dirs).not.toContain('companies'); + }); + + test('entity_dirs_mode=replace with empty entity_dirs -> defaults (empty replace is meaningless)', async () => { + const engine = makeFakeEngine(new Map([ + ['entity_dirs', ''], + ['entity_dirs_mode', 'replace'], + ])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + }); + + test('invalid entry (uppercase) -> warn + return defaults', async () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const engine = makeFakeEngine(new Map([['entity_dirs', '01-notes,BAD_DIR']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + expect(warnSpy).toHaveBeenCalled(); + const firstCallArg = warnSpy.mock.calls[0]![0]; + expect(firstCallArg).toContain('entity_dirs rejected'); + expect(firstCallArg).toContain('BAD_DIR'); + } finally { + warnSpy.mockRestore(); + } + }); + + test('invalid entry (starts with dash) -> warn + return defaults', async () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const engine = makeFakeEngine(new Map([['entity_dirs', '-bad']])); + const dirs = await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + expect(warnSpy).toHaveBeenCalled(); + } finally { + warnSpy.mockRestore(); + } + }); + + test('invalid entry (contains slash) -> warn + return defaults', async () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const engine = makeFakeEngine(new Map([['entity_dirs', 'people/extra']])); + const dirs = 
await getEntityDirs(engine); + expect(dirs).toEqual([...DEFAULT_ENTITY_DIRS]); + expect(warnSpy).toHaveBeenCalled(); + } finally { + warnSpy.mockRestore(); + } + }); +});