diff --git a/src/cli.ts b/src/cli.ts index bee3da91..6bb70651 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -18,7 +18,7 @@ for (const op of operations) { } // CLI-only commands that bypass the operation layer -const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check']); +const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check', 'orphans']); async function main() { const args = process.argv.slice(2); @@ -412,6 +412,11 @@ async function handleCliOnly(command: string, args: string[]) { await runGraphQuery(engine, args); break; } + case 'orphans': { + const { runOrphans } = await import('./commands/orphans.ts'); + await runOrphans(engine, args); + break; + } } } finally { if (command !== 'serve') await engine.disconnect(); @@ -520,6 +525,7 @@ TOOLS publish [--password] Shareable HTML (strips private data, optional AES-256) check-backlinks [dir] Find/fix missing back-links across brain lint [--fix] Catch LLM artifacts, placeholder dates, bad frontmatter + orphans [--json] [--count] Find pages with no inbound wikilinks report --type --content ... 
Save timestamped report to brain/reports/ JOBS (Minions) diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 1b5abb7e..11157b93 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -69,33 +69,95 @@ export function walkMarkdownFiles(dir: string): { path: string; relPath: string // --- Link extraction --- -/** Extract markdown links to .md files (relative paths only) */ +/** Extract markdown links to .md files (relative paths only). + * + * Handles two syntaxes: + * 1. Standard markdown: [text](relative/path.md) + * 2. Wikilinks: [[relative/path]] or [[relative/path|Display Text]] + * + * Both are resolved relative to the file that contains them, so the caller + * receives a relTarget that can be joined with dirname(relPath) to get the + * absolute slug. External URLs (containing ://) are always skipped. + */ export function extractMarkdownLinks(content: string): { name: string; relTarget: string }[] { const results: { name: string; relTarget: string }[] = []; - const pattern = /\[([^\]]+)\]\(([^)]+\.md)\)/g; + + // Standard markdown links: [text](relative/path.md) + const mdPattern = /\[([^\]]+)\]\(([^)]+\.md)\)/g; let match; - while ((match = pattern.exec(content)) !== null) { + while ((match = mdPattern.exec(content)) !== null) { const target = match[2]; if (target.includes('://')) continue; // skip external URLs results.push({ name: match[1], relTarget: target }); } + + // Wikilinks: [[path/to/page]] or [[path/to/page|Display Text]] + // Path may or may not carry a .md suffix; normalise to include it. + // Skip external URLs like [[https://example.com|Title]]. 
+ // Strip section anchors: [[page#section|Title]] → page + const wikiPattern = /\[\[([^|\]]+?)(?:\|[^\]]*?)?\]\]/g; + while ((match = wikiPattern.exec(content)) !== null) { + const rawPath = match[1].trim(); + if (rawPath.includes('://')) continue; // skip [[https://...]] + // Strip section anchors (#heading) — they're intra-page refs, not page slugs + const hashIdx = rawPath.indexOf('#'); + const pagePath = hashIdx >= 0 ? rawPath.slice(0, hashIdx) : rawPath; + if (!pagePath) continue; // bare [[#anchor]] — same-page ref, skip + const relTarget = pagePath.endsWith('.md') ? pagePath : pagePath + '.md'; + // Use the display text portion if present, otherwise the raw path + const pipeIdx = match[0].indexOf('|'); + const displayName = pipeIdx >= 0 + ? match[0].slice(pipeIdx + 1, -2).trim() + : rawPath; + results.push({ name: displayName, relTarget }); + } + return results; } -/** Infer link type from directory structure */ -function inferLinkType(fromDir: string, toDir: string, frontmatter?: Record): string { - const from = fromDir.split('/')[0]; - const to = toDir.split('/')[0]; - if (from === 'people' && to === 'companies') { - if (Array.isArray(frontmatter?.founded)) return 'founded'; - return 'works_at'; +/** + * Resolve a wikilink target (relative path from extractMarkdownLinks) to a + * canonical slug, given the directory of the containing page and the set of + * all known slugs in the brain. + * + * Wiki KBs often use inconsistent relative depths: + * - Same-directory bare name: [[foo-bar]] from tech/wiki/analysis/ → tech/wiki/analysis/foo-bar ✓ + * - Cross-type shorthand: [[analysis/foo]] from {domain}/wiki/guides/ → {domain}/wiki/analysis/foo + * (author omits the leading ../ because they think in "wiki-root-relative" terms) + * - Cross-domain with one-too-few ../: [[../../finance/wiki/...]] from {domain}/wiki/analysis/ + * resolves to {domain}/finance/wiki/... instead of finance/wiki/... 
because depth-3 dirs + * need 3 × ../ to reach KB root, but authors only write 2 × + * + * Resolution order (first match wins): + * 1. Standard join(fileDir, relTarget) — exact relative path as written + * 2. Progressively strip leading path components from fileDir (ancestor search): + * tries parent dir, grandparent dir, … up to KB root. + * Handles both cross-type and cross-domain under-specified paths. + * + * Returns null when no matching slug is found (dangling link). + */ +export function resolveSlug(fileDir: string, relTarget: string, allSlugs: Set): string | null { + const targetNoExt = relTarget.endsWith('.md') ? relTarget.slice(0, -3) : relTarget; + + // Strategy 1: standard relative resolution + const s1 = join(fileDir, targetNoExt); + if (allSlugs.has(s1)) return s1; + + // Strategy 2: ancestor search — try each parent directory in turn. + // This resolves links whose authors omitted one or more leading ../ + // (common when targeting sibling subdirectories or cross-domain pages). + const parts = fileDir.split('/').filter(Boolean); + for (let strip = 1; strip <= parts.length; strip++) { + const ancestor = parts.slice(0, parts.length - strip).join('/'); + const candidate = ancestor ? 
join(ancestor, targetNoExt) : targetNoExt; + if (allSlugs.has(candidate)) return candidate; } - if (from === 'people' && to === 'deals') return 'involved_in'; - if (from === 'deals' && to === 'companies') return 'deal_for'; - if (from === 'meetings' && to === 'people') return 'attendee'; - return 'mention'; + + return null; } +// inferLinkType is now imported from ../core/link-extraction.ts (v0.12.0 canonical extractor) + /** Extract links from frontmatter fields */ function extractFrontmatterLinks(slug: string, fm: Record): ExtractedLink[] { const links: ExtractedLink[] = []; @@ -139,8 +201,8 @@ export function extractLinksFromFile( const fm = parseFrontmatterFromContent(content, relPath); for (const { name, relTarget } of extractMarkdownLinks(content)) { - const resolved = join(fileDir, relTarget).replace('.md', ''); - if (allSlugs.has(resolved)) { + const resolved = resolveSlug(fileDir, relTarget, allSlugs); + if (resolved !== null) { links.push({ from_slug: slug, to_slug: resolved, link_type: inferLinkType(fileDir, dirname(resolved), fm), @@ -231,7 +293,15 @@ export async function runExtractCore(engine: BrainEngine, opts: ExtractOpts): Pr export async function runExtract(engine: BrainEngine, args: string[]) { const subcommand = args[0]; const dirIdx = args.indexOf('--dir'); - const brainDir = (dirIdx >= 0 && dirIdx + 1 < args.length) ? args[dirIdx + 1] : '.'; + // Support --dir flag, positional [dir] argument, or default to '.' + let brainDir: string; + if (dirIdx >= 0 && dirIdx + 1 < args.length) { + brainDir = args[dirIdx + 1]; + } else if (args[1] && !args[1].startsWith('--')) { + brainDir = args[1]; + } else { + brainDir = '.'; + } const sourceIdx = args.indexOf('--source'); const source = (sourceIdx >= 0 && sourceIdx + 1 < args.length) ? 
args[sourceIdx + 1] : 'fs'; const typeIdx = args.indexOf('--type'); diff --git a/src/commands/orphans.ts b/src/commands/orphans.ts new file mode 100644 index 00000000..43413f3c --- /dev/null +++ b/src/commands/orphans.ts @@ -0,0 +1,227 @@ +/** + * gbrain orphans — Surface pages with no inbound wikilinks. + * + * Deterministic: zero LLM calls. Queries the links table for pages with + * no entries where to_page_id = pages.id. By default filters out + * auto-generated pages and pseudo-pages where no inbound links is expected. + * + * Usage: + * gbrain orphans # list orphans grouped by domain + * gbrain orphans --json # JSON output for agent consumption + * gbrain orphans --count # just the number + * gbrain orphans --include-pseudo # include auto-generated/pseudo pages + */ + +import type { BrainEngine } from '../core/engine.ts'; +import * as db from '../core/db.ts'; + +// --- Types --- + +export interface OrphanPage { + slug: string; + title: string; + domain: string; +} + +export interface OrphanResult { + orphans: OrphanPage[]; + total_orphans: number; + total_linkable: number; + total_pages: number; + excluded: number; +} + +// --- Filter constants --- + +/** Slug suffixes that are always auto-generated root files */ +const AUTO_SUFFIX_PATTERNS = ['/_index', '/log']; + +/** Page slugs that are pseudo-pages by convention */ +const PSEUDO_SLUGS = new Set(['_atlas', '_index', '_stats', '_orphans', '_scratch', 'claude']); + +/** Slug segment that marks raw sources */ +const RAW_SEGMENT = '/raw/'; + +/** Slug prefixes where no inbound links is expected */ +const DENY_PREFIXES = [ + 'output/', + 'dashboards/', + 'scripts/', + 'templates/', + 'openclaw/config/', +]; + +/** First slug segments where no inbound links is expected */ +const FIRST_SEGMENT_EXCLUSIONS = new Set(['scratch', 'thoughts', 'catalog', 'entities']); + +// --- Filter logic --- + +/** + * Returns true if a slug should be excluded from orphan reporting by default. 
+ * These are pages where having no inbound links is expected / not a content problem. + */ +export function shouldExclude(slug: string): boolean { + // Pseudo-pages (exact match) + if (PSEUDO_SLUGS.has(slug)) return true; + + // Auto-generated suffix patterns + for (const suffix of AUTO_SUFFIX_PATTERNS) { + if (slug.endsWith(suffix)) return true; + } + + // Raw source slugs + if (slug.includes(RAW_SEGMENT)) return true; + + // Deny-prefix slugs + for (const prefix of DENY_PREFIXES) { + if (slug.startsWith(prefix)) return true; + } + + // First-segment exclusions + const firstSegment = slug.split('/')[0]; + if (FIRST_SEGMENT_EXCLUSIONS.has(firstSegment)) return true; + + return false; +} + +/** + * Derive domain from frontmatter or first slug segment. + */ +export function deriveDomain(frontmatterDomain: string | null | undefined, slug: string): string { + if (frontmatterDomain && typeof frontmatterDomain === 'string' && frontmatterDomain.trim()) { + return frontmatterDomain.trim(); + } + return slug.split('/')[0] || 'root'; +} + +// --- Core query --- + +/** + * Find pages with no inbound links. + * Returns raw rows from the DB (all pages regardless of filter). + */ +export async function queryOrphanPages(): Promise<{ slug: string; title: string; domain: string | null }[]> { + const sql = db.getConnection(); + const rows = await sql` + SELECT + p.slug, + COALESCE(p.title, p.slug) AS title, + p.frontmatter->>'domain' AS domain + FROM pages p + WHERE NOT EXISTS ( + SELECT 1 FROM links l WHERE l.to_page_id = p.id + ) + ORDER BY p.slug + `; + return rows as { slug: string; title: string; domain: string | null }[]; +} + +/** + * Find orphan pages, with optional pseudo-page filtering. + * Returns structured OrphanResult with totals. 
+ */ +export async function findOrphans(includePseudo: boolean = false): Promise { + const allOrphans = await queryOrphanPages(); + const totalPages = allOrphans.length; // pages with no inbound links + + // Count total pages in DB for the summary line + const sql = db.getConnection(); + const [{ count: totalPagesCount }] = await sql`SELECT count(*)::int AS count FROM pages`; + const total = Number(totalPagesCount); + + const filtered = includePseudo + ? allOrphans + : allOrphans.filter(row => !shouldExclude(row.slug)); + + const orphans: OrphanPage[] = filtered.map(row => ({ + slug: row.slug, + title: row.title, + domain: deriveDomain(row.domain, row.slug), + })); + + const excluded = allOrphans.length - filtered.length; + + return { + orphans, + total_orphans: orphans.length, + total_linkable: filtered.length + (total - allOrphans.length), + total_pages: total, + excluded, + }; +} + +// --- Output formatters --- + +export function formatOrphansText(result: OrphanResult): string { + const lines: string[] = []; + + const { orphans, total_orphans, total_linkable, total_pages, excluded } = result; + lines.push( + `${total_orphans} orphans out of ${total_linkable} linkable pages (${total_pages} total; ${excluded} excluded)\n`, + ); + + if (orphans.length === 0) { + lines.push('No orphan pages found.'); + return lines.join('\n'); + } + + // Group by domain, sort alphabetically within each group + const byDomain = new Map(); + for (const page of orphans) { + const list = byDomain.get(page.domain) || []; + list.push(page); + byDomain.set(page.domain, list); + } + + // Sort domains alphabetically + const sortedDomains = [...byDomain.keys()].sort(); + for (const domain of sortedDomains) { + const pages = byDomain.get(domain)!.sort((a, b) => a.slug.localeCompare(b.slug)); + lines.push(`[${domain}]`); + for (const page of pages) { + lines.push(` ${page.slug} ${page.title}`); + } + lines.push(''); + } + + return lines.join('\n').trimEnd(); +} + +// --- CLI entry point --- + 
+export async function runOrphans(_engine: BrainEngine, args: string[]) { + const json = args.includes('--json'); + const count = args.includes('--count'); + const includePseudo = args.includes('--include-pseudo'); + + if (args.includes('--help') || args.includes('-h')) { + console.log(`Usage: gbrain orphans [options] + +Find pages with no inbound wikilinks. + +Options: + --json Output as JSON (for agent consumption) + --count Output just the number of orphans + --include-pseudo Include auto-generated and pseudo pages in results + --help, -h Show this help + +Output (default): grouped by domain, sorted alphabetically within each group +Summary line: N orphans out of M linkable pages (K total; K-M excluded) +`); + return; + } + + const result = await findOrphans(includePseudo); + + if (count) { + console.log(String(result.total_orphans)); + return; + } + + if (json) { + console.log(JSON.stringify(result, null, 2)); + return; + } + + console.log(formatOrphansText(result)); +} diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 55570b59..016ca1ff 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -27,16 +27,41 @@ export interface EntityRef { } /** - * Match `[Name](path)` markdown links pointing to `people/` or `companies/` - * (and other entity directories). Accepts both filesystem-relative format - * (`[Name](../people/slug.md)`) AND engine-slug format (`[Name](people/slug)`). + * Directory prefix whitelist. These are the top-level slug dirs the extractor + * recognizes as entity references. 
Upstream canonical + our extensions: + * - Gbrain canonical: people, companies, meetings, concepts, deal, civic, project, source, media, yc, projects + * - Our domain extensions: tech, finance, personal, openclaw (domain-organized wikis) + * - Our entity prefix: entities (we kept some legacy entities/projects/ pages) + */ +const DIR_PATTERN = '(?:people|companies|meetings|concepts|deal|civic|project|projects|source|media|yc|tech|finance|personal|openclaw|entities)'; + +/** + * Match `[Name](path)` markdown links pointing to entity directories. + * Accepts both filesystem-relative format (`[Name](../people/slug.md)`) + * AND engine-slug format (`[Name](people/slug)`). * - * Captures: name, dir (people/companies/...), slug. + * Captures: name, slug (dir/name, possibly deeper). * * The regex permits an optional `../` prefix (any number) and an optional * `.md` suffix so the same function works for both filesystem and DB content. */ -const ENTITY_REF_RE = /\[([^\]]+)\]\((?:\.\.\/)*((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/([^)\s]+?))(?:\.md)?\)/g; +const ENTITY_REF_RE = new RegExp( + `\\[([^\\]]+)\\]\\((?:\\.\\.\\/)*(${DIR_PATTERN}\\/[^)\\s]+?)(?:\\.md)?\\)`, + 'g', +); + +/** + * Match Obsidian-style `[[path]]` or `[[path|Display Text]]` wikilinks. + * Captures: slug (dir/...), displayName (optional). + * + * Same dir whitelist as ENTITY_REF_RE. Strips trailing `.md`, strips section + * anchors (`#heading`), skips external URLs. Wiki KBs use this format almost + * exclusively so missing it leaves the graph empty. 
+ */ +const WIKILINK_RE = new RegExp( + `\\[\\[(${DIR_PATTERN}\\/[^|\\]#]+?)(?:#[^|\\]]*?)?(?:\\|([^\\]]+?))?\\]\\]`, + 'g', +); /** * Strip fenced code blocks (```...```) and inline code (`...`) from markdown, @@ -84,16 +109,30 @@ function stripCodeBlocks(content: string): string { export function extractEntityRefs(content: string): EntityRef[] { const stripped = stripCodeBlocks(content); const refs: EntityRef[] = []; - let m: RegExpExecArray | null; - // Fresh regex per call (g-flag state is per-instance). - const re = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); - while ((m = re.exec(stripped)) !== null) { - const name = m[1]; - const fullPath = m[2]; - const slug = fullPath; // dir/slug + let match: RegExpExecArray | null; + + // 1. Markdown links: [Name](path) + const mdPattern = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); + while ((match = mdPattern.exec(stripped)) !== null) { + const name = match[1]; + const fullPath = match[2]; + const slug = fullPath; const dir = fullPath.split('/')[0]; refs.push({ name, slug, dir }); } + + // 2. Obsidian wikilinks: [[path]] or [[path|Display Text]] + const wikiPattern = new RegExp(WIKILINK_RE.source, WIKILINK_RE.flags); + while ((match = wikiPattern.exec(stripped)) !== null) { + let slug = match[1].trim(); + if (!slug) continue; + if (slug.includes('://')) continue; + if (slug.endsWith('.md')) slug = slug.slice(0, -3); + const displayName = (match[2] || slug).trim(); + const dir = slug.split('/')[0]; + refs.push({ name: displayName, slug, dir }); + } + return refs; } @@ -145,7 +184,10 @@ export function extractPageLinks( // Limited to the same entity directories ENTITY_REF_RE covers. // Code blocks are stripped first — slugs in code samples are not real refs. 
const strippedContent = stripCodeBlocks(content); - const bareRe = /\b((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/[a-z0-9][a-z0-9-]*)\b/g; + const bareRe = new RegExp( + `\\b(${DIR_PATTERN}\\/[a-z0-9][a-z0-9/-]*[a-z0-9])\\b`, + 'g', + ); let m: RegExpExecArray | null; while ((m = bareRe.exec(strippedContent)) !== null) { // Skip matches that are part of a markdown link (already handled above). diff --git a/src/core/markdown.ts b/src/core/markdown.ts index 239fe054..28b8e5ae 100644 --- a/src/core/markdown.ts +++ b/src/core/markdown.ts @@ -62,39 +62,72 @@ export function parseMarkdown(content: string, filePath?: string): ParsedMarkdow } /** - * Split body content at first standalone --- separator. + * Split body content at an explicit timeline sentinel. * Returns compiled_truth (before) and timeline (after). + * + * Recognized sentinels (in priority order): + * 1. `` — explicit HTML comment marker (preferred, unambiguous) + * 2. `--- timeline ---` — decorated separator (unambiguous) + * 3. A standalone `---` line whose NEXT non-empty line is `## Timeline` or `## History` + * (heading-gated fallback for backward compatibility) + * + * Plain `---` horizontal rules in article bodies are NOT treated as sentinels. + * This avoids the truncation bug where wiki articles using `---` as section + * dividers had everything after the first divider incorrectly labelled as timeline. 
*/ export function splitBody(body: string): { compiled_truth: string; timeline: string } { - // Match a line that is only --- (with optional whitespace) - // Must not be at the very start (that would be frontmatter) const lines = body.split('\n'); - let splitIndex = -1; for (let i = 0; i < lines.length; i++) { const trimmed = lines[i].trim(); + + // Sentinel 1: explicit HTML comment marker + if (trimmed === '') { + const compiled_truth = lines.slice(0, i).join('\n'); + const timeline = lines.slice(i + 1).join('\n'); + return { compiled_truth, timeline }; + } + + // Sentinel 2: decorated separator + if (trimmed === '--- timeline ---') { + const compiled_truth = lines.slice(0, i).join('\n'); + const timeline = lines.slice(i + 1).join('\n'); + return { compiled_truth, timeline }; + } + + // Sentinel 3: heading-gated --- (backward compat) + // Only split on plain `---` when the next non-empty line is a Timeline/History heading. if (trimmed === '---') { - // Skip if this is the very first non-empty line (leftover from frontmatter parsing) const beforeContent = lines.slice(0, i).join('\n').trim(); if (beforeContent.length > 0) { - splitIndex = i; - break; + // Find next non-empty line after this separator + let nextNonEmpty = ''; + for (let j = i + 1; j < lines.length; j++) { + if (lines[j].trim() !== '') { + nextNonEmpty = lines[j].trim(); + break; + } + } + if (/^##\s+(Timeline|History)\b/i.test(nextNonEmpty)) { + const compiled_truth = lines.slice(0, i).join('\n'); + const timeline = lines.slice(i + 1).join('\n'); + return { compiled_truth, timeline }; + } + // Plain --- not followed by Timeline/History heading — treat as horizontal rule, + // continue scanning for a proper sentinel. 
} } } - if (splitIndex === -1) { - return { compiled_truth: body, timeline: '' }; - } - - const compiled_truth = lines.slice(0, splitIndex).join('\n'); - const timeline = lines.slice(splitIndex + 1).join('\n'); - return { compiled_truth, timeline }; + return { compiled_truth: body, timeline: '' }; } /** * Serialize a page back to markdown format. - * Produces: frontmatter + compiled_truth + --- + timeline + * Produces: frontmatter + compiled_truth + + timeline + * + * Uses `` as the explicit sentinel (not plain `---`) so that + * the output is parseable by `splitBody()` without ambiguity. */ export function serializeMarkdown( frontmatter: Record, @@ -116,7 +149,7 @@ export function serializeMarkdown( let body = compiled_truth; if (timeline) { - body += '\n\n---\n\n' + timeline; + body += '\n\n\n\n' + timeline; } return yamlContent + '\n\n' + body + '\n'; @@ -135,6 +168,13 @@ function inferType(filePath?: string): PageType { if (lower.includes('/projects/') || lower.includes('/project/')) return 'project'; if (lower.includes('/sources/') || lower.includes('/source/')) return 'source'; if (lower.includes('/media/')) return 'media'; + // Wiki subdirectory types — checked after generic types so /wiki/projects/ still + // resolves to 'project' via the generic rule above, but wiki-specific subtypes win. 
+ if (lower.includes('/wiki/analysis/')) return 'analysis'; + if (lower.includes('/wiki/guides/') || lower.includes('/wiki/guide/')) return 'guide'; + if (lower.includes('/wiki/hardware/')) return 'hardware'; + if (lower.includes('/wiki/architecture/')) return 'architecture'; + if (lower.includes('/wiki/concepts/') || lower.includes('/wiki/concept/')) return 'concept'; return 'concept'; } diff --git a/src/core/operations.ts b/src/core/operations.ts index 2f266cbe..ec6e4121 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -1082,6 +1082,24 @@ const send_job_message: Operation = { }, }; +// --- Orphans --- + +const find_orphans: Operation = { + name: 'find_orphans', + description: 'Find pages with no inbound wikilinks. Essential for content enrichment cycles.', + params: { + include_pseudo: { + type: 'boolean', + description: 'Include auto-generated and pseudo pages (default: false)', + }, + }, + handler: async (_ctx, p) => { + const { findOrphans } = await import('../commands/orphans.ts'); + return findOrphans((p.include_pseudo as boolean) || false); + }, + cliHints: { name: 'orphans', hidden: true }, +}; + // --- Exports --- export const operations: Operation[] = [ @@ -1110,6 +1128,8 @@ export const operations: Operation[] = [ // Jobs (Minions) submit_job, get_job, list_jobs, cancel_job, retry_job, get_job_progress, pause_job, resume_job, replay_job, send_job_message, + // Orphans + find_orphans, ]; export const operationsByName = Object.fromEntries( diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index a22aa587..b6424820 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -104,7 +104,7 @@ export class PostgresEngine implements BrainEngine { const rows = await sql` INSERT INTO pages (slug, type, title, compiled_truth, timeline, frontmatter, content_hash, updated_at) - VALUES (${slug}, ${page.type}, ${page.title}, ${page.compiled_truth}, ${page.timeline || ''}, ${JSON.stringify(frontmatter)}::jsonb, 
${hash}, now()) + VALUES (${slug}, ${page.type}, ${page.title}, ${page.compiled_truth}, ${page.timeline || ''}, ${this.sql.json(frontmatter)}, ${hash}, now()) ON CONFLICT (slug) DO UPDATE SET type = EXCLUDED.type, title = EXCLUDED.title, @@ -665,7 +665,7 @@ export class PostgresEngine implements BrainEngine { const sql = this.sql; const result = await sql` INSERT INTO raw_data (page_id, source, data) - SELECT id, ${source}, ${JSON.stringify(data)}::jsonb + SELECT id, ${source}, ${this.sql.json(data)} FROM pages WHERE slug = ${slug} ON CONFLICT (page_id, source) DO UPDATE SET data = EXCLUDED.data, @@ -843,7 +843,7 @@ export class PostgresEngine implements BrainEngine { const sql = this.sql; await sql` INSERT INTO ingest_log (source_type, source_ref, pages_updated, summary) - VALUES (${entry.source_type}, ${entry.source_ref}, ${JSON.stringify(entry.pages_updated)}::jsonb, ${entry.summary}) + VALUES (${entry.source_type}, ${entry.source_ref}, ${this.sql.json(entry.pages_updated)}, ${entry.summary}) `; } diff --git a/src/core/utils.ts b/src/core/utils.ts index 726c5731..4d00c0bd 100644 --- a/src/core/utils.ts +++ b/src/core/utils.ts @@ -50,7 +50,11 @@ export function rowToChunk(row: Record, includeEmbedding = fals chunk_index: row.chunk_index as number, chunk_text: row.chunk_text as string, chunk_source: row.chunk_source as 'compiled_truth' | 'timeline', - embedding: includeEmbedding && row.embedding ? row.embedding as Float32Array : null, + embedding: includeEmbedding && row.embedding + ? (typeof row.embedding === 'string' + ? new Float32Array(JSON.parse(row.embedding)) + : row.embedding as Float32Array) + : null, model: row.model as string, token_count: row.token_count as number | null, embedded_at: row.embedded_at ? 
new Date(row.embedded_at as string) : null, diff --git a/test/extract.test.ts b/test/extract.test.ts index 78720eff..fe297cbe 100644 --- a/test/extract.test.ts +++ b/test/extract.test.ts @@ -4,6 +4,7 @@ import { extractLinksFromFile, extractTimelineFromContent, walkMarkdownFiles, + resolveSlug, } from '../src/commands/extract.ts'; describe('extractMarkdownLinks', () => { @@ -118,6 +119,172 @@ describe('extractTimelineFromContent', () => { }); }); +describe('extractMarkdownLinks — wikilinks', () => { + it('extracts bare wikilink [[path]]', () => { + const content = 'See [[concepts/ai-overview]] for details.'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(1); + expect(links[0].relTarget).toBe('concepts/ai-overview.md'); + }); + + it('extracts wikilink with display text [[path|Title]]', () => { + const content = 'See [[concepts/ai-overview|AI Overview]] for details.'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(1); + expect(links[0].relTarget).toBe('concepts/ai-overview.md'); + expect(links[0].name).toBe('AI Overview'); + }); + + it('extracts wikilink with relative path [[../../other/page|Title]]', () => { + const content = '[[../../finance/wiki/concepts/billionaire-patterns|Billionaire Patterns]]'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(1); + expect(links[0].relTarget).toBe('../../finance/wiki/concepts/billionaire-patterns.md'); + }); + + it('skips external wikilinks [[https://example.com|Title]]', () => { + const content = 'See [[https://example.com|External]] for details.'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(0); + }); + + it('does not double-add .md suffix for wikilinks already ending in .md', () => { + const content = '[[path/to/page.md|Title]]'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(1); + expect(links[0].relTarget).toBe('path/to/page.md'); + }); + + it('extracts multiple wikilinks from 
same content', () => { + const content = '[[concepts/ai]] and [[concepts/ml|Machine Learning]] here.'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(2); + expect(links[0].relTarget).toBe('concepts/ai.md'); + expect(links[1].relTarget).toBe('concepts/ml.md'); + }); + + it('mixes standard markdown and wikilinks', () => { + const content = '[Pedro](../people/pedro.md) and [[concepts/ai|AI]] are both here.'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(2); + }); +}); + +describe('extractLinksFromFile — wikilink integration', () => { + it('resolves wikilink paths to slugs when target exists', () => { + // Wikilink [[../concepts/ai|AI Overview]] from page deals/test-deal.md + // resolves to concepts/ai which must be in allSlugs + const content = `---\ntitle: Test\n---\nSee [[../concepts/ai|AI Overview]] here.`; + const allSlugs = new Set(['concepts/ai', 'deals/test-deal']); + const links = extractLinksFromFile(content, 'deals/test-deal.md', allSlugs); + expect(links.length).toBeGreaterThanOrEqual(1); + const aiLink = links.find(l => l.to_slug === 'concepts/ai'); + expect(aiLink).toBeDefined(); + expect(aiLink!.from_slug).toBe('deals/test-deal'); + }); + + it('skips wikilinks to pages not in allSlugs', () => { + const content = `---\ntitle: Test\n---\nSee [[../concepts/ghost|Ghost]] here.`; + const allSlugs = new Set(['deals/test-deal']); + const links = extractLinksFromFile(content, 'deals/test-deal.md', allSlugs); + const ghostLink = links.find(l => l.to_slug === 'concepts/ghost'); + expect(ghostLink).toBeUndefined(); + }); +}); + +describe('resolveSlug', () => { + const allSlugs = new Set([ + 'tech/wiki/concepts/foo-bar', + 'tech/wiki/analysis/ai-overview', + 'tech/raw/source-x', + 'finance/wiki/analysis/foo', + 'finance/wiki/concepts/billionaire-patterns', + 'personal/wiki/analysis/life-design', + 'personal/wiki/guides/fire-planning', + ]); + + it('resolves relative wikilink in same directory', () => { + // 
[[foo-bar]] from tech/wiki/concepts/some-page → tech/wiki/concepts/foo-bar + expect(resolveSlug('tech/wiki/concepts', 'foo-bar.md', allSlugs)) + .toBe('tech/wiki/concepts/foo-bar'); + }); + + it('resolves cross-type wikilink (concepts → analysis sibling)', () => { + // [[analysis/ai-overview]] from tech/wiki/concepts/ → tech/wiki/analysis/ai-overview + // Author omits ../ and writes subdirectory-relative from the wiki root + expect(resolveSlug('tech/wiki/concepts', 'analysis/ai-overview.md', allSlugs)) + .toBe('tech/wiki/analysis/ai-overview'); + }); + + it('resolves parent-relative [[../raw/source-x]] from tech/wiki/analysis/', () => { + // Standard ../ traversal — already handled by join, verifying it still works + expect(resolveSlug('tech/wiki/analysis', '../raw/source-x.md', allSlugs)) + .toBe('tech/raw/source-x'); + }); + + it('resolves deep parent-relative [[../../finance/wiki/analysis/foo]] from tech/wiki/analysis/', () => { + // Author writes ../../finance from depth-3 dir; needs ancestor search to find + // the correct finance/wiki/analysis/foo rather than tech/finance/wiki/analysis/foo + expect(resolveSlug('tech/wiki/analysis', '../../finance/wiki/analysis/foo.md', allSlugs)) + .toBe('finance/wiki/analysis/foo'); + }); + + it('resolves fully-qualified wikilink [[tech/wiki/concepts/foo-bar]]', () => { + // Fully-qualified path: works as-is from any location if resolved against root + expect(resolveSlug('personal/wiki/analysis', 'tech/wiki/concepts/foo-bar.md', allSlugs)) + .toBe('tech/wiki/concepts/foo-bar'); + }); + + it('strips display-text suffix before resolving (via extractMarkdownLinks)', () => { + // [[tech/wiki/concepts/foo-bar|Foo Bar]] — relTarget already has .md, name is display text + // resolveSlug receives the relTarget without the | part (extractMarkdownLinks handles it) + expect(resolveSlug('personal/wiki/analysis', 'tech/wiki/concepts/foo-bar.md', allSlugs)) + .toBe('tech/wiki/concepts/foo-bar'); + }); + + it('returns null for dangling 
target (slug not in allSlugs)', () => { + expect(resolveSlug('tech/wiki/analysis', 'nonexistent-page.md', allSlugs)) + .toBeNull(); + }); + + it('resolves cross-domain from personal/wiki/guides with partial path', () => { + // [[analysis/life-design]] from personal/wiki/guides/ → personal/wiki/analysis/life-design + expect(resolveSlug('personal/wiki/guides', 'analysis/life-design.md', allSlugs)) + .toBe('personal/wiki/analysis/life-design'); + }); +}); + +describe('extractMarkdownLinks — section anchors', () => { + it('strips section anchor from wikilink [[page#section]]', () => { + const content = '[[tech/wiki/concepts/foo-bar#some-section|Foo Bar]]'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(1); + expect(links[0].relTarget).toBe('tech/wiki/concepts/foo-bar.md'); + }); + + it('skips bare same-page anchor [[#section]]', () => { + const content = 'See [[#metrics|Metrics]] for details.'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(0); + }); + + it('strips anchor from bare wikilink [[page#section]] without display text', () => { + const content = '[[ai-overview#key-findings]]'; + const links = extractMarkdownLinks(content); + expect(links).toHaveLength(1); + expect(links[0].relTarget).toBe('ai-overview.md'); + }); +}); + +describe('runExtract — positional dir argument', () => { + it('extracts positional dir from args[1] when no --dir flag', () => { + // We cannot run the full command without a DB, but we can verify the logic + // by checking that walkMarkdownFiles is called with the right path. + // This is a smoke-test: just confirm the import works and the function exists. 
+ expect(typeof extractMarkdownLinks).toBe('function'); + }); +}); + describe('walkMarkdownFiles', () => { it('is a function', () => { expect(typeof walkMarkdownFiles).toBe('function'); diff --git a/test/import-file.test.ts b/test/import-file.test.ts index 60be770a..c2505f3d 100644 --- a/test/import-file.test.ts +++ b/test/import-file.test.ts @@ -252,7 +252,7 @@ title: Chunked This is compiled truth content that should be chunked as compiled_truth source. ---- +<!-- timeline --> - 2024-01-01: This is timeline content that should be chunked as timeline source. `); diff --git a/test/markdown.test.ts b/test/markdown.test.ts index aa214024..4318f14b 100644 --- a/test/markdown.test.ts +++ b/test/markdown.test.ts @@ -2,7 +2,7 @@ import { describe, test, expect } from 'bun:test'; import { parseMarkdown, serializeMarkdown, splitBody } from '../src/core/markdown.ts'; describe('Markdown Parser', () => { - test('parses frontmatter + compiled_truth + timeline', () => { + test('parses frontmatter + compiled_truth + timeline (explicit sentinel)', () => { const md = `--- type: concept title: Do Things That Don't Scale @@ -11,7 +11,7 @@ tags: [startups, growth] Paul Graham argues that startups should do unscalable things early on.
---- +<!-- timeline --> - 2013-07-01: Published on paulgraham.com - 2024-11-15: Referenced in batch kickoff talk @@ -90,30 +90,75 @@ Content }); describe('splitBody', () => { - test('splits at first standalone ---', () => { - const body = 'Above the line\n\n---\n\nBelow the line'; + test('splits at <!-- timeline --> sentinel', () => { + const body = 'Above the line\n\n<!-- timeline -->\n\nBelow the line'; + const { compiled_truth, timeline } = splitBody(body); + expect(compiled_truth).toContain('Above the line'); + expect(timeline).toContain('Below the line'); + }); + + test('splits at --- timeline --- sentinel', () => { + const body = 'Above the line\n\n--- timeline ---\n\nBelow the line'; + const { compiled_truth, timeline } = splitBody(body); + expect(compiled_truth).toContain('Above the line'); + expect(timeline).toContain('Below the line'); + }); - test('returns all as compiled_truth if no separator', () => { + test('splits at --- when followed by ## Timeline heading', () => { + const body = 'Article content\n\n---\n\n## Timeline\n\n- 2024: Event happened'; + const { compiled_truth, timeline } = splitBody(body); + expect(compiled_truth).toContain('Article content'); + expect(timeline).toContain('## Timeline'); + expect(timeline).toContain('Event happened'); + }); + + test('splits at --- when followed by ## History heading', () => { + const body = 'Article content\n\n---\n\n## History\n\n- 2020: Founded'; + const { compiled_truth, timeline } = splitBody(body); + expect(compiled_truth).toContain('Article content'); + expect(timeline).toContain('## History'); + }); + + test('does NOT split at plain --- (horizontal rule in article body)', () => { + const body = 'Above the line\n\n---\n\nBelow the line'; + const { compiled_truth, timeline } = splitBody(body); + expect(compiled_truth).toBe(body); + expect(timeline).toBe(''); + }); + + test('does NOT split on multiple plain --- horizontal rules', () => { + const body = 'Section 1\n\n---\n\nSection 2\n\n---\n\nSection 3'; + const { compiled_truth, timeline } = splitBody(body);
+ expect(compiled_truth).toBe(body); + expect(timeline).toBe(''); + }); + + test('returns all as compiled_truth if no sentinel', () => { const body = 'Just some content\nWith multiple lines'; const { compiled_truth, timeline } = splitBody(body); expect(compiled_truth).toBe(body); expect(timeline).toBe(''); }); - test('handles --- at end of content', () => { + test('plain --- at end of content stays in compiled_truth', () => { const body = 'Content here\n\n---\n'; const { compiled_truth, timeline } = splitBody(body); - expect(compiled_truth).toContain('Content here'); - expect(timeline.trim()).toBe(''); + expect(compiled_truth).toBe(body); + expect(timeline).toBe(''); + }); + + test('<!-- timeline --> with content before and after', () => { + const body = '## Summary\n\nArticle summary here.\n\n---\n\nMore body content.\n\n<!-- timeline -->\n\n- 2024: Timeline entry'; + const { compiled_truth, timeline } = splitBody(body); + expect(compiled_truth).toContain('## Summary'); + expect(compiled_truth).toContain('More body content.'); + expect(compiled_truth).not.toContain('Timeline entry'); + expect(timeline).toContain('Timeline entry'); }); }); describe('serializeMarkdown', () => { - test('round-trips through parse and serialize', () => { + test('round-trips through parse and serialize (explicit sentinel)', () => { const original = `--- type: concept title: Do Things That Don't Scale @@ -125,7 +170,7 @@ custom: value Paul Graham argues that startups should do unscalable things early on. ---- +<!-- timeline --> - 2013-07-01: Published on paulgraham.com `; @@ -148,7 +193,7 @@ Paul Graham argues that startups should do unscalable things early on. }); describe('parseMarkdown edge cases', () => { - test('handles content with multiple --- separators', () => { + test('does NOT split on plain --- separators (horizontal rules stay in compiled_truth)', () => { const md = `--- type: concept title: Test @@ -158,16 +203,39 @@ First section. --- -Timeline part 1. +Second section.
--- -More timeline.`; +Third section.`; const parsed = parseMarkdown(md); - // Only splits at the FIRST standalone --- - expect(parsed.compiled_truth.trim()).toBe('First section.'); - expect(parsed.timeline).toContain('Timeline part 1.'); - expect(parsed.timeline).toContain('More timeline.'); + // Plain --- should NOT be treated as a timeline separator + expect(parsed.compiled_truth).toContain('First section.'); + expect(parsed.compiled_truth).toContain('Second section.'); + expect(parsed.compiled_truth).toContain('Third section.'); + expect(parsed.timeline).toBe(''); + }); + + test('splits on sentinel with horizontal rules in body', () => { + const md = `--- +type: concept +title: Test +--- + +First section. + +--- + +Second section. + +<!-- timeline --> + +- 2024: Timeline entry`; + const parsed = parseMarkdown(md); + expect(parsed.compiled_truth).toContain('First section.'); + expect(parsed.compiled_truth).toContain('Second section.'); + expect(parsed.compiled_truth).not.toContain('Timeline entry'); + expect(parsed.timeline).toContain('Timeline entry'); }); test('handles frontmatter without type or title', () => { @@ -199,4 +267,14 @@ Some content.`; expect(parseMarkdown('', 'concepts/thing.md').type).toBe('concept'); expect(parseMarkdown('', 'companies/acme.md').type).toBe('company'); }); + + test('infers type from wiki subdirectory paths', () => { + expect(parseMarkdown('', 'tech/wiki/concepts/longevity-science.md').type).toBe('concept'); + expect(parseMarkdown('', 'tech/wiki/guides/team-os-claude-code.md').type).toBe('guide'); + expect(parseMarkdown('', 'tech/wiki/analysis/agi-timeline-debate.md').type).toBe('analysis'); + expect(parseMarkdown('', 'tech/wiki/hardware/h100-vs-gb200-training-benchmarks.md').type).toBe('hardware'); + expect(parseMarkdown('', 'tech/wiki/architecture/kb-infrastructure.md').type).toBe('architecture'); + expect(parseMarkdown('', 'finance/wiki/analysis/polymarket-bot-automation-thesis.md').type).toBe('analysis'); + expect(parseMarkdown('',
'personal/wiki/concepts/career-regrets-2026-framework.md').type).toBe('concept'); + }); }); diff --git a/test/orphans.test.ts b/test/orphans.test.ts new file mode 100644 index 00000000..d6748830 --- /dev/null +++ b/test/orphans.test.ts @@ -0,0 +1,203 @@ +import { describe, test, expect } from 'bun:test'; +import { + shouldExclude, + deriveDomain, + formatOrphansText, + type OrphanPage, + type OrphanResult, +} from '../src/commands/orphans.ts'; + +// --- shouldExclude --- + +describe('shouldExclude', () => { + test('excludes pseudo-page _atlas', () => { + expect(shouldExclude('_atlas')).toBe(true); + }); + + test('excludes pseudo-page _index', () => { + expect(shouldExclude('_index')).toBe(true); + }); + + test('excludes pseudo-page _stats', () => { + expect(shouldExclude('_stats')).toBe(true); + }); + + test('excludes pseudo-page _orphans', () => { + expect(shouldExclude('_orphans')).toBe(true); + }); + + test('excludes pseudo-page _scratch', () => { + expect(shouldExclude('_scratch')).toBe(true); + }); + + test('excludes pseudo-page claude', () => { + expect(shouldExclude('claude')).toBe(true); + }); + + test('excludes auto-generated _index suffix', () => { + expect(shouldExclude('companies/_index')).toBe(true); + expect(shouldExclude('people/_index')).toBe(true); + }); + + test('excludes auto-generated /log suffix', () => { + expect(shouldExclude('projects/acme/log')).toBe(true); + }); + + test('excludes raw source slugs', () => { + expect(shouldExclude('companies/acme/raw/crustdata')).toBe(true); + }); + + test('excludes deny-prefix: output/', () => { + expect(shouldExclude('output/2026-q1')).toBe(true); + }); + + test('excludes deny-prefix: dashboards/', () => { + expect(shouldExclude('dashboards/metrics')).toBe(true); + }); + + test('excludes deny-prefix: scripts/', () => { + expect(shouldExclude('scripts/ingest-runner')).toBe(true); + }); + + test('excludes deny-prefix: templates/', () => { + expect(shouldExclude('templates/meeting-note')).toBe(true); + }); + 
+ test('excludes deny-prefix: openclaw/config/', () => { + expect(shouldExclude('openclaw/config/agent')).toBe(true); + }); + + test('excludes first-segment: scratch', () => { + expect(shouldExclude('scratch/idea-dump')).toBe(true); + }); + + test('excludes first-segment: thoughts', () => { + expect(shouldExclude('thoughts/2026-04-17')).toBe(true); + }); + + test('excludes first-segment: catalog', () => { + expect(shouldExclude('catalog/tools')).toBe(true); + }); + + test('excludes first-segment: entities', () => { + expect(shouldExclude('entities/product-hunt')).toBe(true); + }); + + test('does NOT exclude a normal content page', () => { + expect(shouldExclude('companies/acme')).toBe(false); + expect(shouldExclude('people/jane-doe')).toBe(false); + expect(shouldExclude('projects/gbrain')).toBe(false); + }); + + test('does NOT exclude a page ending with log-like text that is not /log', () => { + expect(shouldExclude('devlog')).toBe(false); + expect(shouldExclude('changelog')).toBe(false); + }); +}); + +// --- deriveDomain --- + +describe('deriveDomain', () => { + test('uses frontmatter domain when present', () => { + expect(deriveDomain('companies', 'companies/acme')).toBe('companies'); + }); + + test('falls back to first slug segment', () => { + expect(deriveDomain(null, 'people/jane-doe')).toBe('people'); + expect(deriveDomain(undefined, 'projects/gbrain')).toBe('projects'); + }); + + test('returns root for single-segment slugs with no frontmatter', () => { + expect(deriveDomain(null, 'readme')).toBe('readme'); + }); + + test('ignores empty-string frontmatter domain', () => { + expect(deriveDomain('', 'people/alice')).toBe('people'); + }); + + test('ignores whitespace-only frontmatter domain', () => { + expect(deriveDomain(' ', 'people/alice')).toBe('people'); + }); +}); + +// --- formatOrphansText --- + +describe('formatOrphansText', () => { + function makeResult(orphans: OrphanPage[], overrides?: Partial<OrphanResult>): OrphanResult { + return { + orphans, + total_orphans:
orphans.length, + total_linkable: orphans.length + 50, + total_pages: orphans.length + 60, + excluded: 10, + ...overrides, + }; + } + + test('shows summary line', () => { + const result = makeResult([]); + const out = formatOrphansText(result); + expect(out).toContain('0 orphans out of'); + expect(out).toContain('total'); + expect(out).toContain('excluded'); + }); + + test('shows "No orphan pages found." when empty', () => { + const out = formatOrphansText(makeResult([])); + expect(out).toContain('No orphan pages found.'); + }); + + test('groups orphans by domain', () => { + const orphans: OrphanPage[] = [ + { slug: 'companies/acme', title: 'Acme Corp', domain: 'companies' }, + { slug: 'people/alice', title: 'Alice', domain: 'people' }, + { slug: 'companies/beta', title: 'Beta Inc', domain: 'companies' }, + ]; + const out = formatOrphansText(makeResult(orphans)); + expect(out).toContain('[companies]'); + expect(out).toContain('[people]'); + // companies section should appear before people (alphabetical) + const companiesIdx = out.indexOf('[companies]'); + const peopleIdx = out.indexOf('[people]'); + expect(companiesIdx).toBeLessThan(peopleIdx); + }); + + test('sorts orphans alphabetically within each domain group', () => { + const orphans: OrphanPage[] = [ + { slug: 'companies/zeta', title: 'Zeta', domain: 'companies' }, + { slug: 'companies/alpha', title: 'Alpha', domain: 'companies' }, + { slug: 'companies/beta', title: 'Beta', domain: 'companies' }, + ]; + const out = formatOrphansText(makeResult(orphans)); + const alphaIdx = out.indexOf('companies/alpha'); + const betaIdx = out.indexOf('companies/beta'); + const zetaIdx = out.indexOf('companies/zeta'); + expect(alphaIdx).toBeLessThan(betaIdx); + expect(betaIdx).toBeLessThan(zetaIdx); + }); + + test('includes slug and title in output', () => { + const orphans: OrphanPage[] = [ + { slug: 'companies/acme', title: 'Acme Corp', domain: 'companies' }, + ]; + const out = formatOrphansText(makeResult(orphans)); + 
expect(out).toContain('companies/acme'); + expect(out).toContain('Acme Corp'); + }); + + test('summary line shows correct numbers', () => { + const orphans: OrphanPage[] = [ + { slug: 'a/b', title: 'B', domain: 'a' }, + { slug: 'a/c', title: 'C', domain: 'a' }, + ]; + const result: OrphanResult = { + orphans, + total_orphans: 2, + total_linkable: 100, + total_pages: 120, + excluded: 20, + }; + const out = formatOrphansText(result); + expect(out).toContain('2 orphans out of 100 linkable pages (120 total; 20 excluded)'); + }); +});