Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ for (const op of operations) {
}

// CLI-only commands that bypass the operation layer
const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check']);
const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check', 'orphans']);

async function main() {
const args = process.argv.slice(2);
Expand Down Expand Up @@ -412,6 +412,11 @@ async function handleCliOnly(command: string, args: string[]) {
await runGraphQuery(engine, args);
break;
}
case 'orphans': {
const { runOrphans } = await import('./commands/orphans.ts');
await runOrphans(engine, args);
break;
}
}
} finally {
if (command !== 'serve') await engine.disconnect();
Expand Down Expand Up @@ -520,6 +525,7 @@ TOOLS
publish <page.md> [--password] Shareable HTML (strips private data, optional AES-256)
check-backlinks <check|fix> [dir] Find/fix missing back-links across brain
lint <dir|file> [--fix] Catch LLM artifacts, placeholder dates, bad frontmatter
orphans [--json] [--count] Find pages with no inbound wikilinks
report --type <name> --content ... Save timestamped report to brain/reports/

JOBS (Minions)
Expand Down
104 changes: 87 additions & 17 deletions src/commands/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,33 +69,95 @@ export function walkMarkdownFiles(dir: string): { path: string; relPath: string

// --- Link extraction ---

/** Extract markdown links to .md files (relative paths only) */
/** Extract markdown links to .md files (relative paths only).
*
* Handles two syntaxes:
* 1. Standard markdown: [text](relative/path.md)
* 2. Wikilinks: [[relative/path]] or [[relative/path|Display Text]]
*
* Both are resolved relative to the file that contains them, so the caller
* receives a relTarget that can be joined with dirname(relPath) to get the
* absolute slug. External URLs (containing ://) are always skipped.
*/
export function extractMarkdownLinks(content: string): { name: string; relTarget: string }[] {
const results: { name: string; relTarget: string }[] = [];
const pattern = /\[([^\]]+)\]\(([^)]+\.md)\)/g;

// Standard markdown links: [text](relative/path.md)
const mdPattern = /\[([^\]]+)\]\(([^)]+\.md)\)/g;
let match;
while ((match = pattern.exec(content)) !== null) {
while ((match = mdPattern.exec(content)) !== null) {
const target = match[2];
if (target.includes('://')) continue; // skip external URLs
results.push({ name: match[1], relTarget: target });
}

// Wikilinks: [[path/to/page]] or [[path/to/page|Display Text]]
// Path may or may not carry a .md suffix; normalise to include it.
// Skip external URLs like [[https://example.com|Title]].
// Strip section anchors: [[page#section|Title]] → page
const wikiPattern = /\[\[([^|\]]+?)(?:\|[^\]]*?)?\]\]/g;
while ((match = wikiPattern.exec(content)) !== null) {
const rawPath = match[1].trim();
if (rawPath.includes('://')) continue; // skip [[https://...]]
// Strip section anchors (#heading) — they're intra-page refs, not page slugs
const hashIdx = rawPath.indexOf('#');
const pagePath = hashIdx >= 0 ? rawPath.slice(0, hashIdx) : rawPath;
if (!pagePath) continue; // bare [[#anchor]] — same-page ref, skip
const relTarget = pagePath.endsWith('.md') ? pagePath : pagePath + '.md';
// Use the display text portion if present, otherwise the raw path
const pipeIdx = match[0].indexOf('|');
const displayName = pipeIdx >= 0
? match[0].slice(pipeIdx + 1, -2).trim()
: rawPath;
results.push({ name: displayName, relTarget });
}

return results;
}

/** Infer link type from directory structure */
function inferLinkType(fromDir: string, toDir: string, frontmatter?: Record<string, unknown>): string {
const from = fromDir.split('/')[0];
const to = toDir.split('/')[0];
if (from === 'people' && to === 'companies') {
if (Array.isArray(frontmatter?.founded)) return 'founded';
return 'works_at';
/**
* Resolve a wikilink target (relative path from extractMarkdownLinks) to a
* canonical slug, given the directory of the containing page and the set of
* all known slugs in the brain.
*
* Wiki KBs often use inconsistent relative depths:
* - Same-directory bare name: [[foo-bar]] from tech/wiki/analysis/ → tech/wiki/analysis/foo-bar ✓
* - Cross-type shorthand: [[analysis/foo]] from {domain}/wiki/guides/ → {domain}/wiki/analysis/foo
* (author omits the leading ../ because they think in "wiki-root-relative" terms)
* - Cross-domain with one-too-few ../: [[../../finance/wiki/...]] from {domain}/wiki/analysis/
* resolves to {domain}/finance/wiki/... instead of finance/wiki/... because depth-3 dirs
* need 3 × ../ to reach KB root, but authors only write 2 ×
*
* Resolution order (first match wins):
* 1. Standard join(fileDir, relTarget) — exact relative path as written
* 2. Progressively strip leading path components from fileDir (ancestor search):
* tries parent dir, grandparent dir, … up to KB root.
* Handles both cross-type and cross-domain under-specified paths.
*
* Returns null when no matching slug is found (dangling link).
*/
export function resolveSlug(fileDir: string, relTarget: string, allSlugs: Set<string>): string | null {
const targetNoExt = relTarget.endsWith('.md') ? relTarget.slice(0, -3) : relTarget;

// Strategy 1: standard relative resolution
const s1 = join(fileDir, targetNoExt);
if (allSlugs.has(s1)) return s1;

// Strategy 2: ancestor search — try each parent directory in turn.
// This resolves links whose authors omitted one or more leading ../
// (common when targeting sibling subdirectories or cross-domain pages).
const parts = fileDir.split('/').filter(Boolean);
for (let strip = 1; strip <= parts.length; strip++) {
const ancestor = parts.slice(0, parts.length - strip).join('/');
const candidate = ancestor ? join(ancestor, targetNoExt) : targetNoExt;
if (allSlugs.has(candidate)) return candidate;
}
if (from === 'people' && to === 'deals') return 'involved_in';
if (from === 'deals' && to === 'companies') return 'deal_for';
if (from === 'meetings' && to === 'people') return 'attendee';
return 'mention';

return null;
}

// inferLinkType is now imported from ../core/link-extraction.ts (v0.12.0 canonical extractor)

/** Extract links from frontmatter fields */
function extractFrontmatterLinks(slug: string, fm: Record<string, unknown>): ExtractedLink[] {
const links: ExtractedLink[] = [];
Expand Down Expand Up @@ -139,8 +201,8 @@ export function extractLinksFromFile(
const fm = parseFrontmatterFromContent(content, relPath);

for (const { name, relTarget } of extractMarkdownLinks(content)) {
const resolved = join(fileDir, relTarget).replace('.md', '');
if (allSlugs.has(resolved)) {
const resolved = resolveSlug(fileDir, relTarget, allSlugs);
if (resolved !== null) {
links.push({
from_slug: slug, to_slug: resolved,
link_type: inferLinkType(fileDir, dirname(resolved), fm),
Expand Down Expand Up @@ -231,7 +293,15 @@ export async function runExtractCore(engine: BrainEngine, opts: ExtractOpts): Pr
export async function runExtract(engine: BrainEngine, args: string[]) {
const subcommand = args[0];
const dirIdx = args.indexOf('--dir');
const brainDir = (dirIdx >= 0 && dirIdx + 1 < args.length) ? args[dirIdx + 1] : '.';
// Support --dir <path> flag, positional [dir] argument, or default to '.'
let brainDir: string;
if (dirIdx >= 0 && dirIdx + 1 < args.length) {
brainDir = args[dirIdx + 1];
} else if (args[1] && !args[1].startsWith('--')) {
brainDir = args[1];
} else {
brainDir = '.';
}
const sourceIdx = args.indexOf('--source');
const source = (sourceIdx >= 0 && sourceIdx + 1 < args.length) ? args[sourceIdx + 1] : 'fs';
const typeIdx = args.indexOf('--type');
Expand Down
227 changes: 227 additions & 0 deletions src/commands/orphans.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
/**
* gbrain orphans — Surface pages with no inbound wikilinks.
*
* Deterministic: zero LLM calls. Queries the links table for pages with
* no entries where to_page_id = pages.id. By default filters out
* auto-generated pages and pseudo-pages where no inbound links is expected.
*
* Usage:
* gbrain orphans # list orphans grouped by domain
* gbrain orphans --json # JSON output for agent consumption
* gbrain orphans --count # just the number
* gbrain orphans --include-pseudo # include auto-generated/pseudo pages
*/

import type { BrainEngine } from '../core/engine.ts';
import * as db from '../core/db.ts';

// --- Types ---

// One orphan page as reported to the user / JSON consumer.
export interface OrphanPage {
  slug: string;   // canonical page slug (path without .md)
  title: string;  // page title, falling back to slug when untitled (see queryOrphanPages)
  domain: string; // frontmatter 'domain' when present, else first slug segment
}

// Aggregate result of an orphan scan, including summary totals.
export interface OrphanResult {
  orphans: OrphanPage[];     // orphan pages remaining after filtering
  total_orphans: number;     // orphans.length
  total_linkable: number;    // pages counted toward the orphan ratio (linked + non-excluded orphans)
  total_pages: number;       // every page in the DB
  excluded: number;          // orphans filtered out as pseudo/auto-generated
}

// --- Filter constants ---

// Auto-generated root files always carry one of these slug suffixes.
const AUTO_SUFFIX_PATTERNS = ['/_index', '/log'];

// Pseudo-pages by convention (compared against the full slug).
const PSEUDO_SLUGS = new Set(['_atlas', '_index', '_stats', '_orphans', '_scratch', 'claude']);

// Raw source material lives under a /raw/ path segment.
const RAW_SEGMENT = '/raw/';

// Slug prefixes of page trees where inbound links are never expected.
const DENY_PREFIXES = [
  'output/',
  'dashboards/',
  'scripts/',
  'templates/',
  'openclaw/config/',
];

// Top-level directories whose pages are expected to be unlinked.
const FIRST_SEGMENT_EXCLUSIONS = new Set(['scratch', 'thoughts', 'catalog', 'entities']);

// --- Filter logic ---

/**
 * Returns true if a slug should be excluded from orphan reporting by default.
 * These are pages where having no inbound links is expected / not a content problem.
 *
 * Checks, in order: exact pseudo-slug match, auto-generated suffixes, the
 * /raw/ segment, deny-listed prefixes, and excluded top-level directories.
 */
export function shouldExclude(slug: string): boolean {
  return (
    PSEUDO_SLUGS.has(slug) ||
    AUTO_SUFFIX_PATTERNS.some((suffix) => slug.endsWith(suffix)) ||
    slug.includes(RAW_SEGMENT) ||
    DENY_PREFIXES.some((prefix) => slug.startsWith(prefix)) ||
    FIRST_SEGMENT_EXCLUSIONS.has(slug.split('/')[0])
  );
}

/**
 * Derive domain from frontmatter or first slug segment.
 *
 * A non-empty, trimmed frontmatter string wins; otherwise the slug's first
 * path segment is used, with 'root' as the last-resort fallback for an
 * empty slug.
 */
export function deriveDomain(frontmatterDomain: string | null | undefined, slug: string): string {
  const fromFrontmatter = typeof frontmatterDomain === 'string' ? frontmatterDomain.trim() : '';
  if (fromFrontmatter) return fromFrontmatter;
  return slug.split('/')[0] || 'root';
}

// --- Core query ---

/**
 * Find pages with no inbound links.
 * Returns raw rows from the DB (all pages regardless of filter) — callers
 * apply shouldExclude() themselves.
 *
 * NOTE(review): assumes db.getConnection() returns a tagged-template SQL
 * client (postgres.js-style) — confirm against ../core/db.ts.
 */
export async function queryOrphanPages(): Promise<{ slug: string; title: string; domain: string | null }[]> {
  const sql = db.getConnection();
  // Anti-join: a page is an orphan when no links row has to_page_id = p.id.
  // COALESCE guarantees a non-null title; domain comes from JSONB frontmatter
  // and may be null (deriveDomain handles that).
  const rows = await sql`
    SELECT
      p.slug,
      COALESCE(p.title, p.slug) AS title,
      p.frontmatter->>'domain' AS domain
    FROM pages p
    WHERE NOT EXISTS (
      SELECT 1 FROM links l WHERE l.to_page_id = p.id
    )
    ORDER BY p.slug
  `;
  return rows as { slug: string; title: string; domain: string | null }[];
}

/**
 * Find orphan pages, with optional pseudo-page filtering.
 * Returns structured OrphanResult with totals.
 *
 * @param includePseudo when true, skip the shouldExclude() filter and report
 *        auto-generated/pseudo pages too (default false)
 * @returns OrphanResult — orphans plus summary totals for the report line
 */
export async function findOrphans(includePseudo: boolean = false): Promise<OrphanResult> {
  const allOrphans = await queryOrphanPages();

  // Count total pages in DB for the summary line
  const sql = db.getConnection();
  const [{ count: totalPagesCount }] = await sql`SELECT count(*)::int AS count FROM pages`;
  const total = Number(totalPagesCount);

  const filtered = includePseudo
    ? allOrphans
    : allOrphans.filter(row => !shouldExclude(row.slug));

  const orphans: OrphanPage[] = filtered.map(row => ({
    slug: row.slug,
    title: row.title,
    domain: deriveDomain(row.domain, row.slug),
  }));

  return {
    orphans,
    total_orphans: orphans.length,
    // linkable = pages that DO have inbound links (total - allOrphans.length)
    // plus the orphans that survived filtering
    total_linkable: filtered.length + (total - allOrphans.length),
    total_pages: total,
    excluded: allOrphans.length - filtered.length,
  };
}

// --- Output formatters ---

/**
 * Render an OrphanResult as human-readable text: one summary line, then
 * orphan slugs grouped under [domain] headers — domains alphabetical,
 * pages sorted by slug within each group.
 */
export function formatOrphansText(result: OrphanResult): string {
  const { orphans, total_orphans, total_linkable, total_pages, excluded } = result;

  const out: string[] = [
    `${total_orphans} orphans out of ${total_linkable} linkable pages (${total_pages} total; ${excluded} excluded)\n`,
  ];

  if (orphans.length === 0) {
    out.push('No orphan pages found.');
    return out.join('\n');
  }

  // Bucket pages by domain.
  const byDomain = new Map<string, OrphanPage[]>();
  for (const page of orphans) {
    const bucket = byDomain.get(page.domain);
    if (bucket) {
      bucket.push(page);
    } else {
      byDomain.set(page.domain, [page]);
    }
  }

  // Emit each domain group in alphabetical order.
  for (const domain of [...byDomain.keys()].sort()) {
    out.push(`[${domain}]`);
    const pages = byDomain.get(domain)!;
    pages.sort((a, b) => a.slug.localeCompare(b.slug));
    for (const page of pages) {
      out.push(`  ${page.slug} ${page.title}`);
    }
    out.push('');
  }

  return out.join('\n').trimEnd();
}

// --- CLI entry point ---

/**
 * CLI handler for `gbrain orphans`. Parses flags, runs the orphan scan, and
 * prints in the requested format (text by default, --json, or --count).
 * The engine parameter is unused: this command talks to the DB directly.
 */
export async function runOrphans(_engine: BrainEngine, args: string[]) {
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`Usage: gbrain orphans [options]

Find pages with no inbound wikilinks.

Options:
  --json            Output as JSON (for agent consumption)
  --count           Output just the number of orphans
  --include-pseudo  Include auto-generated and pseudo pages in results
  --help, -h        Show this help

Output (default): grouped by domain, sorted alphabetically within each group
Summary line: N orphans out of M linkable pages (K total; K-M excluded)
`);
    return;
  }

  const wantJson = args.includes('--json');
  const wantCount = args.includes('--count');
  const result = await findOrphans(args.includes('--include-pseudo'));

  if (wantCount) {
    console.log(String(result.total_orphans));
    return;
  }
  if (wantJson) {
    console.log(JSON.stringify(result, null, 2));
    return;
  }
  console.log(formatOrphansText(result));
}
Loading