From 89515455c6f3168801e0f10285b2ce1688a7985a Mon Sep 17 00:00:00 2001 From: sunnnybala Date: Wed, 15 Apr 2026 15:03:05 +0530 Subject: [PATCH 1/9] fix(sync): remove nested transaction that deadlocks > 10 file syncs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sync.ts wraps the add/modify loop in engine.transaction(), and each importFromContent inside opens another one. PGLite's _runExclusiveTransaction is a non-reentrant mutex — the second call queues on the mutex the first is holding, and the process hangs forever in ep_poll. Reproduced with a 15-file commit: unpatched hangs, patched runs in 3.4s. Fix drops the outer wrap; per-file atomicity is correct anyway (one file's failure should not roll back the others). (cherry picked from commit 4a1ac00105226695d16fb343b44e55a52f44b95b) --- src/commands/sync.ts | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/commands/sync.ts b/src/commands/sync.ts index 7034ea0d..601e356d 100644 --- a/src/commands/sync.ts +++ b/src/commands/sync.ts @@ -203,29 +203,29 @@ export async function performSync(engine: BrainEngine, opts: SyncOpts): Promise< pagesAffected.push(newSlug); } - // Process adds and modifies - const useTransaction = (filtered.added.length + filtered.modified.length) > 10; - const processAddsModifies = async () => { - for (const path of [...filtered.added, ...filtered.modified]) { - const filePath = join(repoPath, path); - if (!existsSync(filePath)) continue; - try { - const result = await importFile(engine, filePath, path, { noEmbed }); - if (result.status === 'imported') { - chunksCreated += result.chunks; - pagesAffected.push(result.slug); - } - } catch (e: unknown) { - const msg = e instanceof Error ? e.message : String(e); - console.error(` Warning: skipped ${path}: ${msg}`); + // Process adds and modifies. + // + // NOTE: do NOT wrap this loop in engine.transaction(). 
importFromContent + // already opens its own inner transaction per file, and PGLite transactions + // are not reentrant — they acquire the same _runExclusiveTransaction mutex, + // so a nested call from inside a user callback queues forever on the mutex + // the outer transaction is still holding. Result: incremental sync hangs in + // ep_poll whenever the diff crosses the old > 10 threshold that used to + // trigger the outer wrap. Per-file atomicity is also the right granularity: + // one file's failure should not roll back the others' successful imports. + for (const path of [...filtered.added, ...filtered.modified]) { + const filePath = join(repoPath, path); + if (!existsSync(filePath)) continue; + try { + const result = await importFile(engine, filePath, path, { noEmbed }); + if (result.status === 'imported') { + chunksCreated += result.chunks; + pagesAffected.push(result.slug); } + } catch (e: unknown) { + const msg = e instanceof Error ? e.message : String(e); + console.error(` Warning: skipped ${path}: ${msg}`); } - }; - - if (useTransaction) { - await engine.transaction(async () => { await processAddsModifies(); }); - } else { - await processAddsModifies(); } const elapsed = Date.now() - start; From 8fdfa0c1f9c3e8c565a08eae2eda8576b4bc0f24 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 19 Apr 2026 08:43:42 +0800 Subject: [PATCH 2/9] test(sync): regression guard for #132 top-level engine.transaction wrap Reads src/commands/sync.ts verbatim and asserts no uncommented engine.transaction() call appears above the add/modify loop. Protects against silent reintroduction of the nested-mutex deadlock that hung > 10-file syncs forever in ep_poll. 
--- test/sync.test.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/sync.test.ts b/test/sync.test.ts index 9b19daed..9c473f14 100644 --- a/test/sync.test.ts +++ b/test/sync.test.ts @@ -190,3 +190,18 @@ describe('buildSyncManifest edge cases', () => { expect(manifest.renamed).toEqual([]); }); }); + +describe('sync regression — #132 nested transaction deadlock', () => { + test('src/commands/sync.ts does not wrap the add/modify loop in engine.transaction()', async () => { + const source = await Bun.file(new URL('../src/commands/sync.ts', import.meta.url)).text(); + const loopStart = source.indexOf('for (const path of [...filtered.added, ...filtered.modified]'); + expect(loopStart).toBeGreaterThan(-1); + const prelude = source.slice(0, loopStart); + const lastTxIdx = prelude.lastIndexOf('engine.transaction'); + if (lastTxIdx !== -1) { + const lineStart = prelude.lastIndexOf('\n', lastTxIdx) + 1; + const line = prelude.slice(lineStart, prelude.indexOf('\n', lastTxIdx)); + expect(line.trim().startsWith('//')).toBe(true); + } + }); +}); From 0443cb2c4111633f04e455dc4dcf9640309ac0d3 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 19 Apr 2026 08:48:09 +0800 Subject: [PATCH 3/9] feat(utils): tryParseEmbedding() skip+warn sibling for availability path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parseEmbedding() throws on structural corruption — right call for ingest/ migrate paths where silent skips would be data loss. Wrong call for search/rescore paths where one corrupt row in 10K would kill every query that touches it. tryParseEmbedding() wraps parseEmbedding in try/catch: returns null on any shape that would throw, warns once per session so the bad row is visible in logs. Use it anywhere we'd rather degrade ranking than blow up the whole query. Retrofit postgres-engine.getEmbeddingsByChunkIds (the #175 slice call site) — the 5-line rescore loop was the direct motivator. 
Keep the throwing parseEmbedding() for everything else (pglite-engine rowToChunk, migrate-engine round-trips, ingest). --- src/core/postgres-engine.ts | 6 +++--- src/core/utils.ts | 22 ++++++++++++++++++++++ test/utils.test.ts | 29 ++++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index 04ef3803..fb17c2b3 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -17,7 +17,7 @@ import type { } from './types.ts'; import { GBrainError } from './types.ts'; import * as db from './db.ts'; -import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult, parseEmbedding } from './utils.ts'; +import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult, parseEmbedding, tryParseEmbedding } from './utils.ts'; export class PostgresEngine implements BrainEngine { private _sql: ReturnType | null = null; @@ -272,8 +272,8 @@ export class PostgresEngine implements BrainEngine { `; const result = new Map(); for (const row of rows) { - const parsed = parseEmbedding(row.embedding); - if (parsed) result.set(row.id as number, parsed); + const embedding = tryParseEmbedding(row.embedding); + if (embedding) result.set(row.id as number, embedding); } return result; } diff --git a/src/core/utils.ts b/src/core/utils.ts index 22572121..4d9313e7 100644 --- a/src/core/utils.ts +++ b/src/core/utils.ts @@ -88,6 +88,28 @@ export function parseEmbedding(value: unknown): Float32Array | null { return null; } +let _tryParseEmbeddingWarned = false; + +/** + * Availability-path sibling of parseEmbedding(). Returns null + warns once + * on any shape parseEmbedding would throw on. Use this on read/rescore paths + * where one corrupt row should degrade ranking, not kill the whole query. + * Use parseEmbedding() (throws) on ingest/migrate paths where silent skips + * would be data loss. 
+ */ +export function tryParseEmbedding(value: unknown): Float32Array | null { + try { + return parseEmbedding(value); + } catch (err) { + if (!_tryParseEmbeddingWarned) { + _tryParseEmbeddingWarned = true; + const msg = err instanceof Error ? err.message : String(err); + console.warn(`tryParseEmbedding: skipping corrupt embedding row (${msg}). Further warnings suppressed this session.`); + } + return null; + } +} + export function rowToChunk(row: Record, includeEmbedding = false): Chunk { return { id: row.id as number, diff --git a/test/utils.test.ts b/test/utils.test.ts index da80dcb3..6682b290 100644 --- a/test/utils.test.ts +++ b/test/utils.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'bun:test'; -import { validateSlug, contentHash, parseEmbedding, rowToPage, rowToChunk, rowToSearchResult } from '../src/core/utils.ts'; +import { validateSlug, contentHash, parseEmbedding, tryParseEmbedding, rowToPage, rowToChunk, rowToSearchResult } from '../src/core/utils.ts'; describe('validateSlug', () => { test('accepts valid slugs', () => { @@ -146,6 +146,33 @@ describe('parseEmbedding', () => { }); }); +describe('tryParseEmbedding', () => { + test('returns null on corrupt embedding instead of throwing', () => { + expect(tryParseEmbedding('[0.1,NaN,0.3]')).toBeNull(); + expect(tryParseEmbedding(['bad' as unknown as number, 1])).toBeNull(); + }); + + test('delegates happy path to parseEmbedding', () => { + const out = tryParseEmbedding('[0.1, 0.2]'); + expect(out).toBeInstanceOf(Float32Array); + expect(out?.length).toBe(2); + }); + + test('warns once per session on corrupt rows', () => { + const orig = console.warn; + let warnCount = 0; + console.warn = () => { warnCount++; }; + try { + tryParseEmbedding('[NaN]'); + tryParseEmbedding('[NaN]'); + tryParseEmbedding('[NaN]'); + } finally { + console.warn = orig; + } + expect(warnCount).toBeLessThanOrEqual(1); + }); +}); + describe('rowToSearchResult', () => { test('coerces score to number', () => { const r = 
rowToSearchResult({ From ce14c50848b03ece1ac6d9bd8ee14289110b04a2 Mon Sep 17 00:00:00 2001 From: Gustavo Aragon Date: Thu, 16 Apr 2026 15:04:36 -0300 Subject: [PATCH 4/9] postgres-engine: scope search statement_timeout to the transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit searchKeyword and searchVector run on a pooled postgres.js client (max: 10 by default). The original code bounded each search with await sql`SET statement_timeout = '8s'` try { await sql`` } finally { await sql`SET statement_timeout = '0'` } but every tagged template is an independent round-trip that picks an arbitrary connection from the pool. The SET, the query, and the reset could all land on DIFFERENT connections. In practice the GUC sticks to whichever connection ran the SET and then gets returned to the pool — the next unrelated caller on that connection inherits the 8s timeout (clipping legitimate long queries) or the reset-to-0 (disabling the guard for whoever expected it). A crash in the middle leaves the state set permanently. Wrap each search in sql.begin(async sql => …). postgres.js reserves a single connection for the transaction body, so the SET LOCAL, the query, and the implicit COMMIT all run on the same connection. SET LOCAL scopes the GUC to the transaction — COMMIT or ROLLBACK restores the previous value automatically, regardless of the code path out. Error paths can no longer leak the GUC. No API change. Timeout value and semantics are identical (8s cap on search queries, no effect on embed --all / bulk import which runs outside these methods). Only one transaction per search — BEGIN + COMMIT round-trips are negligible next to a ranked FTS or pgvector query. Also closes the earlier audit finding R4-F002 which reported the same pattern on searchKeyword. This PR covers both searchKeyword and searchVector so the pool-leak class is fully closed. 
Tests (test/postgres-engine.test.ts, new file): - No bare SET statement_timeout remains after stripping comments. - searchKeyword and searchVector each wrap their query in sql.begin. - Both use SET LOCAL. - Neither explicitly clears the timeout with SET statement_timeout=0. Source-level guardrails keep the fast unit suite DB-free. Live Postgres coverage of the search path is in test/e2e/search-quality.test.ts, which continues to exercise these methods end-to-end against pgvector when DATABASE_URL is set. (cherry picked from commit 6146c3b470dce7380da024a238eab9e6b2174296) --- src/core/postgres-engine.ts | 36 ++++++----- test/postgres-engine.test.ts | 112 +++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 16 deletions(-) create mode 100644 test/postgres-engine.test.ts diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index fb17c2b3..0c540403 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -188,11 +188,17 @@ export class PostgresEngine implements BrainEngine { const detailLow = opts?.detail === 'low'; // Search-only timeout: prevents DoS via expensive queries without - // affecting long-running operations like embed --all or bulk import - await sql`SET statement_timeout = '8s'`; - try { + // affecting long-running operations like embed --all or bulk import. + // SET LOCAL inside sql.begin() scopes the GUC to the transaction so + // it can never leak onto a pooled connection returned to other + // callers. A bare `SET statement_timeout` goes to an arbitrary + // connection from the pool, lives past this method, and either + // clips an unrelated caller's long-running query (DoS) or — via + // `SET statement_timeout = 0` — disables the guard for them. 
+ const rows = await sql.begin(async sql => { + await sql`SET LOCAL statement_timeout = '8s'`; // CTE: rank pages by FTS score, then pick the best chunk per page in SQL - const rows = await sql` + return await sql` WITH ranked_pages AS ( SELECT p.id, p.slug, p.title, p.type, ts_rank(p.search_vector, websearch_to_tsquery('english', ${query})) AS score @@ -218,10 +224,8 @@ export class PostgresEngine implements BrainEngine { FROM best_chunks ORDER BY score DESC `; - return rows.map(rowToSearchResult); - } finally { - await sql`SET statement_timeout = '0'`; - } + }); + return rows.map(rowToSearchResult); } async searchVector(embedding: Float32Array, opts?: SearchOpts): Promise { @@ -238,10 +242,12 @@ export class PostgresEngine implements BrainEngine { const vecStr = '[' + Array.from(embedding).join(',') + ']'; - // Search-only timeout (see searchKeyword for rationale) - await sql`SET statement_timeout = '8s'`; - try { - const rows = await sql` + // Search-only timeout (see searchKeyword for rationale). SET LOCAL + + // sql.begin ensures the GUC stays transaction-scoped on the pooled + // connection. + const rows = await sql.begin(async sql => { + await sql`SET LOCAL statement_timeout = '8s'`; + return await sql` SELECT p.slug, p.id as page_id, p.title, p.type, cc.id as chunk_id, cc.chunk_index, cc.chunk_text, cc.chunk_source, @@ -257,10 +263,8 @@ export class PostgresEngine implements BrainEngine { LIMIT ${limit} OFFSET ${offset} `; - return rows.map(rowToSearchResult); - } finally { - await sql`SET statement_timeout = '0'`; - } + }); + return rows.map(rowToSearchResult); } async getEmbeddingsByChunkIds(ids: number[]): Promise> { diff --git a/test/postgres-engine.test.ts b/test/postgres-engine.test.ts new file mode 100644 index 00000000..31e968b6 --- /dev/null +++ b/test/postgres-engine.test.ts @@ -0,0 +1,112 @@ +/** + * postgres-engine.ts source-level guardrails. + * + * Live Postgres coverage for search paths lives in test/e2e/search-quality.test.ts. 
+ * This file stays fast and DB-free: it inspects the source of + * src/core/postgres-engine.ts to lock in decisions that protect the + * shared connection pool from per-request GUC leaks. + * + * Regression: R6-F006 / R4-F002. + * searchKeyword and searchVector used to call bare + * await sql`SET statement_timeout = '8s'` + * ...query... + * finally { await sql`SET statement_timeout = '0'` } + * against the shared pool. Each tagged template picks an arbitrary + * connection, so the SET, the query, and the reset could all land on + * DIFFERENT connections. Worst case: the 8s GUC sticks on some pooled + * connection and clips the next caller's long-running query; or the + * reset to 0 lands on a connection that other code expected to be + * protected. The fix wraps each query in sql.begin() and uses + * SET LOCAL so the GUC is transaction-scoped and auto-resets on + * COMMIT/ROLLBACK, regardless of error path. + */ + +import { describe, test, expect } from 'bun:test'; +import { readFileSync } from 'fs'; +import { join } from 'path'; + +const SRC = readFileSync( + join(import.meta.dir, '..', 'src', 'core', 'postgres-engine.ts'), + 'utf-8', +); + +describe('postgres-engine / search path timeout isolation', () => { + test('no bare `SET statement_timeout` statement survives', () => { + // Strip comments so the commentary mentioning the anti-pattern does + // not trigger a false positive. Block-comment + line-comment strip. + const stripped = SRC + .replace(/\/\*[\s\S]*?\*\//g, '') + .replace(/(^|\s)\/\/[^\n]*/g, '$1'); + + // Match a tagged-template statement of the form + // sql`SET statement_timeout = ...` + // that is NOT preceded by LOCAL. This is the exact shape that bleeds + // onto pooled connections; SET LOCAL is safe inside a transaction. 
+ const bare = stripped.match( + /sql`\s*SET\s+(?!LOCAL\s)statement_timeout\b[^`]*`/gi, + ); + expect(bare).toBeNull(); + }); + + test('searchKeyword wraps its query in sql.begin()', () => { + const fn = extractMethod(SRC, 'searchKeyword'); + expect(fn).toMatch(/sql\.begin\s*\(\s*async\s+sql\s*=>/); + }); + + test('searchVector wraps its query in sql.begin()', () => { + const fn = extractMethod(SRC, 'searchVector'); + expect(fn).toMatch(/sql\.begin\s*\(\s*async\s+sql\s*=>/); + }); + + test('both search methods use SET LOCAL for the timeout', () => { + const keyword = extractMethod(SRC, 'searchKeyword'); + const vector = extractMethod(SRC, 'searchVector'); + expect(keyword).toMatch(/SET\s+LOCAL\s+statement_timeout/); + expect(vector).toMatch(/SET\s+LOCAL\s+statement_timeout/); + }); + + test('neither search method clears the timeout with `SET statement_timeout = 0`', () => { + // The reset-to-zero pattern was the other half of the leak: if SET + // LOCAL is in play, COMMIT handles the reset and an explicit + // `SET statement_timeout = '0'` would itself leak the GUC change + // onto the returned connection. Strip comments first so the + // commentary in the method itself (which quotes the anti-pattern + // to explain it) does not trigger a false positive. + const keyword = stripComments(extractMethod(SRC, 'searchKeyword')); + const vector = stripComments(extractMethod(SRC, 'searchVector')); + expect(keyword).not.toMatch(/SET\s+statement_timeout\s*=\s*['"]?0/); + expect(vector).not.toMatch(/SET\s+statement_timeout\s*=\s*['"]?0/); + }); +}); + +function stripComments(s: string): string { + return s + .replace(/\/\*[\s\S]*?\*\//g, '') + .replace(/(^|\s)\/\/[^\n]*/g, '$1'); +} + +// extractMethod grabs the body of a class method by brace-matching from +// its opening line. Returns the method body up to the matching closing +// brace. Good enough for the small number of methods in this file. 
+function extractMethod(source: string, name: string): string { + // Find "async (" at method-definition indentation (2 spaces). + const openRe = new RegExp(`^\\s+async\\s+${name}\\s*\\(`, 'm'); + const match = openRe.exec(source); + if (!match) { + throw new Error(`method ${name} not found in postgres-engine.ts`); + } + // Scan forward balancing braces. + let i = source.indexOf('{', match.index); + if (i < 0) throw new Error(`no opening brace for ${name}`); + const start = i; + let depth = 0; + for (; i < source.length; i++) { + const c = source[i]; + if (c === '{') depth++; + else if (c === '}') { + depth--; + if (depth === 0) return source.slice(start, i + 1); + } + } + throw new Error(`unbalanced braces in ${name}`); +} From c3ba92c907c2f287b55a8c3a6ce8e0cfacddb56d Mon Sep 17 00:00:00 2001 From: Clevin Canales Date: Sat, 18 Apr 2026 12:20:58 -0400 Subject: [PATCH 5/9] feat(orphans): add gbrain orphans command for finding under-connected pages Surfaces pages with zero inbound wikilinks. Essential for content enrichment cycles in KBs with 1000+ pages. By default filters out auto-generated pages, raw sources, and pseudo-pages where no inbound links is expected; --include-pseudo to disable. Supports text (grouped by domain), --json, --count outputs. Also exposed as find_orphans MCP operation. Tests cover basic detection, filtering, all output modes. 
Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit f50954f8e03f85803c6133c85c530bd45e9aceaa) --- src/cli.ts | 8 +- src/commands/orphans.ts | 227 ++++++++++++++++++++++++++++++++++++++++ src/core/operations.ts | 20 ++++ test/orphans.test.ts | 203 +++++++++++++++++++++++++++++++++++ 4 files changed, 457 insertions(+), 1 deletion(-) create mode 100644 src/commands/orphans.ts create mode 100644 test/orphans.test.ts diff --git a/src/cli.ts b/src/cli.ts index d31044ce..51122ec9 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -18,7 +18,7 @@ for (const op of operations) { } // CLI-only commands that bypass the operation layer -const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check', 'repair-jsonb']); +const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'apply-migrations', 'skillpack-check', 'repair-jsonb', 'orphans']); async function main() { const args = process.argv.slice(2); @@ -417,6 +417,11 @@ async function handleCliOnly(command: string, args: string[]) { await runGraphQuery(engine, args); break; } + case 'orphans': { + const { runOrphans } = await import('./commands/orphans.ts'); + await runOrphans(engine, args); + break; + } } } finally { if (command !== 'serve') await engine.disconnect(); @@ -525,6 +530,7 @@ TOOLS publish [--password] Shareable HTML (strips private data, optional AES-256) check-backlinks [dir] Find/fix missing back-links across brain lint [--fix] Catch LLM artifacts, placeholder dates, bad frontmatter + orphans 
[--json] [--count] Find pages with no inbound wikilinks report --type --content ... Save timestamped report to brain/reports/ JOBS (Minions) diff --git a/src/commands/orphans.ts b/src/commands/orphans.ts new file mode 100644 index 00000000..43413f3c --- /dev/null +++ b/src/commands/orphans.ts @@ -0,0 +1,227 @@ +/** + * gbrain orphans — Surface pages with no inbound wikilinks. + * + * Deterministic: zero LLM calls. Queries the links table for pages with + * no entries where to_page_id = pages.id. By default filters out + * auto-generated pages and pseudo-pages where no inbound links is expected. + * + * Usage: + * gbrain orphans # list orphans grouped by domain + * gbrain orphans --json # JSON output for agent consumption + * gbrain orphans --count # just the number + * gbrain orphans --include-pseudo # include auto-generated/pseudo pages + */ + +import type { BrainEngine } from '../core/engine.ts'; +import * as db from '../core/db.ts'; + +// --- Types --- + +export interface OrphanPage { + slug: string; + title: string; + domain: string; +} + +export interface OrphanResult { + orphans: OrphanPage[]; + total_orphans: number; + total_linkable: number; + total_pages: number; + excluded: number; +} + +// --- Filter constants --- + +/** Slug suffixes that are always auto-generated root files */ +const AUTO_SUFFIX_PATTERNS = ['/_index', '/log']; + +/** Page slugs that are pseudo-pages by convention */ +const PSEUDO_SLUGS = new Set(['_atlas', '_index', '_stats', '_orphans', '_scratch', 'claude']); + +/** Slug segment that marks raw sources */ +const RAW_SEGMENT = '/raw/'; + +/** Slug prefixes where no inbound links is expected */ +const DENY_PREFIXES = [ + 'output/', + 'dashboards/', + 'scripts/', + 'templates/', + 'openclaw/config/', +]; + +/** First slug segments where no inbound links is expected */ +const FIRST_SEGMENT_EXCLUSIONS = new Set(['scratch', 'thoughts', 'catalog', 'entities']); + +// --- Filter logic --- + +/** + * Returns true if a slug should be excluded 
from orphan reporting by default. + * These are pages where having no inbound links is expected / not a content problem. + */ +export function shouldExclude(slug: string): boolean { + // Pseudo-pages (exact match) + if (PSEUDO_SLUGS.has(slug)) return true; + + // Auto-generated suffix patterns + for (const suffix of AUTO_SUFFIX_PATTERNS) { + if (slug.endsWith(suffix)) return true; + } + + // Raw source slugs + if (slug.includes(RAW_SEGMENT)) return true; + + // Deny-prefix slugs + for (const prefix of DENY_PREFIXES) { + if (slug.startsWith(prefix)) return true; + } + + // First-segment exclusions + const firstSegment = slug.split('/')[0]; + if (FIRST_SEGMENT_EXCLUSIONS.has(firstSegment)) return true; + + return false; +} + +/** + * Derive domain from frontmatter or first slug segment. + */ +export function deriveDomain(frontmatterDomain: string | null | undefined, slug: string): string { + if (frontmatterDomain && typeof frontmatterDomain === 'string' && frontmatterDomain.trim()) { + return frontmatterDomain.trim(); + } + return slug.split('/')[0] || 'root'; +} + +// --- Core query --- + +/** + * Find pages with no inbound links. + * Returns raw rows from the DB (all pages regardless of filter). + */ +export async function queryOrphanPages(): Promise<{ slug: string; title: string; domain: string | null }[]> { + const sql = db.getConnection(); + const rows = await sql` + SELECT + p.slug, + COALESCE(p.title, p.slug) AS title, + p.frontmatter->>'domain' AS domain + FROM pages p + WHERE NOT EXISTS ( + SELECT 1 FROM links l WHERE l.to_page_id = p.id + ) + ORDER BY p.slug + `; + return rows as { slug: string; title: string; domain: string | null }[]; +} + +/** + * Find orphan pages, with optional pseudo-page filtering. + * Returns structured OrphanResult with totals. 
+ */ +export async function findOrphans(includePseudo: boolean = false): Promise { + const allOrphans = await queryOrphanPages(); + const totalPages = allOrphans.length; // pages with no inbound links + + // Count total pages in DB for the summary line + const sql = db.getConnection(); + const [{ count: totalPagesCount }] = await sql`SELECT count(*)::int AS count FROM pages`; + const total = Number(totalPagesCount); + + const filtered = includePseudo + ? allOrphans + : allOrphans.filter(row => !shouldExclude(row.slug)); + + const orphans: OrphanPage[] = filtered.map(row => ({ + slug: row.slug, + title: row.title, + domain: deriveDomain(row.domain, row.slug), + })); + + const excluded = allOrphans.length - filtered.length; + + return { + orphans, + total_orphans: orphans.length, + total_linkable: filtered.length + (total - allOrphans.length), + total_pages: total, + excluded, + }; +} + +// --- Output formatters --- + +export function formatOrphansText(result: OrphanResult): string { + const lines: string[] = []; + + const { orphans, total_orphans, total_linkable, total_pages, excluded } = result; + lines.push( + `${total_orphans} orphans out of ${total_linkable} linkable pages (${total_pages} total; ${excluded} excluded)\n`, + ); + + if (orphans.length === 0) { + lines.push('No orphan pages found.'); + return lines.join('\n'); + } + + // Group by domain, sort alphabetically within each group + const byDomain = new Map(); + for (const page of orphans) { + const list = byDomain.get(page.domain) || []; + list.push(page); + byDomain.set(page.domain, list); + } + + // Sort domains alphabetically + const sortedDomains = [...byDomain.keys()].sort(); + for (const domain of sortedDomains) { + const pages = byDomain.get(domain)!.sort((a, b) => a.slug.localeCompare(b.slug)); + lines.push(`[${domain}]`); + for (const page of pages) { + lines.push(` ${page.slug} ${page.title}`); + } + lines.push(''); + } + + return lines.join('\n').trimEnd(); +} + +// --- CLI entry point --- + 
+export async function runOrphans(_engine: BrainEngine, args: string[]) { + const json = args.includes('--json'); + const count = args.includes('--count'); + const includePseudo = args.includes('--include-pseudo'); + + if (args.includes('--help') || args.includes('-h')) { + console.log(`Usage: gbrain orphans [options] + +Find pages with no inbound wikilinks. + +Options: + --json Output as JSON (for agent consumption) + --count Output just the number of orphans + --include-pseudo Include auto-generated and pseudo pages in results + --help, -h Show this help + +Output (default): grouped by domain, sorted alphabetically within each group +Summary line: N orphans out of M linkable pages (K total; K-M excluded) +`); + return; + } + + const result = await findOrphans(includePseudo); + + if (count) { + console.log(String(result.total_orphans)); + return; + } + + if (json) { + console.log(JSON.stringify(result, null, 2)); + return; + } + + console.log(formatOrphansText(result)); +} diff --git a/src/core/operations.ts b/src/core/operations.ts index 2f266cbe..ec6e4121 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -1082,6 +1082,24 @@ const send_job_message: Operation = { }, }; +// --- Orphans --- + +const find_orphans: Operation = { + name: 'find_orphans', + description: 'Find pages with no inbound wikilinks. 
Essential for content enrichment cycles.', + params: { + include_pseudo: { + type: 'boolean', + description: 'Include auto-generated and pseudo pages (default: false)', + }, + }, + handler: async (_ctx, p) => { + const { findOrphans } = await import('../commands/orphans.ts'); + return findOrphans((p.include_pseudo as boolean) || false); + }, + cliHints: { name: 'orphans', hidden: true }, +}; + // --- Exports --- export const operations: Operation[] = [ @@ -1110,6 +1128,8 @@ export const operations: Operation[] = [ // Jobs (Minions) submit_job, get_job, list_jobs, cancel_job, retry_job, get_job_progress, pause_job, resume_job, replay_job, send_job_message, + // Orphans + find_orphans, ]; export const operationsByName = Object.fromEntries( diff --git a/test/orphans.test.ts b/test/orphans.test.ts new file mode 100644 index 00000000..d6748830 --- /dev/null +++ b/test/orphans.test.ts @@ -0,0 +1,203 @@ +import { describe, test, expect } from 'bun:test'; +import { + shouldExclude, + deriveDomain, + formatOrphansText, + type OrphanPage, + type OrphanResult, +} from '../src/commands/orphans.ts'; + +// --- shouldExclude --- + +describe('shouldExclude', () => { + test('excludes pseudo-page _atlas', () => { + expect(shouldExclude('_atlas')).toBe(true); + }); + + test('excludes pseudo-page _index', () => { + expect(shouldExclude('_index')).toBe(true); + }); + + test('excludes pseudo-page _stats', () => { + expect(shouldExclude('_stats')).toBe(true); + }); + + test('excludes pseudo-page _orphans', () => { + expect(shouldExclude('_orphans')).toBe(true); + }); + + test('excludes pseudo-page _scratch', () => { + expect(shouldExclude('_scratch')).toBe(true); + }); + + test('excludes pseudo-page claude', () => { + expect(shouldExclude('claude')).toBe(true); + }); + + test('excludes auto-generated _index suffix', () => { + expect(shouldExclude('companies/_index')).toBe(true); + expect(shouldExclude('people/_index')).toBe(true); + }); + + test('excludes auto-generated /log suffix', () 
=> { + expect(shouldExclude('projects/acme/log')).toBe(true); + }); + + test('excludes raw source slugs', () => { + expect(shouldExclude('companies/acme/raw/crustdata')).toBe(true); + }); + + test('excludes deny-prefix: output/', () => { + expect(shouldExclude('output/2026-q1')).toBe(true); + }); + + test('excludes deny-prefix: dashboards/', () => { + expect(shouldExclude('dashboards/metrics')).toBe(true); + }); + + test('excludes deny-prefix: scripts/', () => { + expect(shouldExclude('scripts/ingest-runner')).toBe(true); + }); + + test('excludes deny-prefix: templates/', () => { + expect(shouldExclude('templates/meeting-note')).toBe(true); + }); + + test('excludes deny-prefix: openclaw/config/', () => { + expect(shouldExclude('openclaw/config/agent')).toBe(true); + }); + + test('excludes first-segment: scratch', () => { + expect(shouldExclude('scratch/idea-dump')).toBe(true); + }); + + test('excludes first-segment: thoughts', () => { + expect(shouldExclude('thoughts/2026-04-17')).toBe(true); + }); + + test('excludes first-segment: catalog', () => { + expect(shouldExclude('catalog/tools')).toBe(true); + }); + + test('excludes first-segment: entities', () => { + expect(shouldExclude('entities/product-hunt')).toBe(true); + }); + + test('does NOT exclude a normal content page', () => { + expect(shouldExclude('companies/acme')).toBe(false); + expect(shouldExclude('people/jane-doe')).toBe(false); + expect(shouldExclude('projects/gbrain')).toBe(false); + }); + + test('does NOT exclude a page ending with log-like text that is not /log', () => { + expect(shouldExclude('devlog')).toBe(false); + expect(shouldExclude('changelog')).toBe(false); + }); +}); + +// --- deriveDomain --- + +describe('deriveDomain', () => { + test('uses frontmatter domain when present', () => { + expect(deriveDomain('companies', 'companies/acme')).toBe('companies'); + }); + + test('falls back to first slug segment', () => { + expect(deriveDomain(null, 'people/jane-doe')).toBe('people'); + 
expect(deriveDomain(undefined, 'projects/gbrain')).toBe('projects');
+  });
+
+  test('returns root for single-segment slugs with no frontmatter', () => {
+    expect(deriveDomain(null, 'readme')).toBe('readme');
+  });
+
+  test('ignores empty-string frontmatter domain', () => {
+    expect(deriveDomain('', 'people/alice')).toBe('people');
+  });
+
+  test('ignores whitespace-only frontmatter domain', () => {
+    expect(deriveDomain('   ', 'people/alice')).toBe('people');
+  });
+});
+
+// --- formatOrphansText ---
+
+describe('formatOrphansText', () => {
+  function makeResult(orphans: OrphanPage[], overrides?: Partial<OrphanResult>): OrphanResult {
+    return {
+      orphans,
+      total_orphans: orphans.length,
+      total_linkable: orphans.length + 50,
+      total_pages: orphans.length + 60,
+      excluded: 10,
+      ...overrides,
+    };
+  }
+
+  test('shows summary line', () => {
+    const result = makeResult([]);
+    const out = formatOrphansText(result);
+    expect(out).toContain('0 orphans out of');
+    expect(out).toContain('total');
+    expect(out).toContain('excluded');
+  });
+
+  test('shows "No orphan pages found." 
when empty', () => { + const out = formatOrphansText(makeResult([])); + expect(out).toContain('No orphan pages found.'); + }); + + test('groups orphans by domain', () => { + const orphans: OrphanPage[] = [ + { slug: 'companies/acme', title: 'Acme Corp', domain: 'companies' }, + { slug: 'people/alice', title: 'Alice', domain: 'people' }, + { slug: 'companies/beta', title: 'Beta Inc', domain: 'companies' }, + ]; + const out = formatOrphansText(makeResult(orphans)); + expect(out).toContain('[companies]'); + expect(out).toContain('[people]'); + // companies section should appear before people (alphabetical) + const companiesIdx = out.indexOf('[companies]'); + const peopleIdx = out.indexOf('[people]'); + expect(companiesIdx).toBeLessThan(peopleIdx); + }); + + test('sorts orphans alphabetically within each domain group', () => { + const orphans: OrphanPage[] = [ + { slug: 'companies/zeta', title: 'Zeta', domain: 'companies' }, + { slug: 'companies/alpha', title: 'Alpha', domain: 'companies' }, + { slug: 'companies/beta', title: 'Beta', domain: 'companies' }, + ]; + const out = formatOrphansText(makeResult(orphans)); + const alphaIdx = out.indexOf('companies/alpha'); + const betaIdx = out.indexOf('companies/beta'); + const zetaIdx = out.indexOf('companies/zeta'); + expect(alphaIdx).toBeLessThan(betaIdx); + expect(betaIdx).toBeLessThan(zetaIdx); + }); + + test('includes slug and title in output', () => { + const orphans: OrphanPage[] = [ + { slug: 'companies/acme', title: 'Acme Corp', domain: 'companies' }, + ]; + const out = formatOrphansText(makeResult(orphans)); + expect(out).toContain('companies/acme'); + expect(out).toContain('Acme Corp'); + }); + + test('summary line shows correct numbers', () => { + const orphans: OrphanPage[] = [ + { slug: 'a/b', title: 'B', domain: 'a' }, + { slug: 'a/c', title: 'C', domain: 'a' }, + ]; + const result: OrphanResult = { + orphans, + total_orphans: 2, + total_linkable: 100, + total_pages: 120, + excluded: 20, + }; + const out = 
formatOrphansText(result); + expect(out).toContain('2 orphans out of 100 linkable pages (120 total; 20 excluded)'); + }); +}); From 7243436237334dd73ec1311c6c846bfef7b1c35b Mon Sep 17 00:00:00 2001 From: Clevin Canales Date: Sat, 18 Apr 2026 15:51:42 -0400 Subject: [PATCH 6/9] feat(extract): support Obsidian wikilinks + wiki-style domain slugs in canonical extractor extractEntityRefs now recognizes both syntaxes equally: [Name](people/slug) -- upstream original [[people/slug|Name]] -- Obsidian wikilink (new) Extends DIR_PATTERN to include domain-organized wiki slugs used by Karpathy-style knowledge bases: - entities (legacy prefix some brains keep during migration) - projects (gbrain canonical, was missing from regex) - tech, finance, personal, openclaw (domain-organized wiki roots) Before this change, a 2,100-page brain with wikilinks throughout extracted zero auto-links on put_page because the regex only matched markdown-style [name](path). After: 1,377 new typed edges on a single extract --source db pass over the same corpus. Matches the behavior of the extract.ts filesystem walker (which already handled wikilinks as of the wiki-markdown-compat fix wave), so the db and fs sources now produce the same link graph from the same content. Both patterns share the DIR_PATTERN constant so adding a new entity dir only requires updating one string. Co-Authored-By: Claude Opus 4.7 (1M context) (cherry picked from commit 1cfb15679a684e94bec5a48c537a0a40a85f57ab) --- src/core/link-extraction.ts | 68 ++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 55570b59..016ca1ff 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -27,16 +27,41 @@ export interface EntityRef { } /** - * Match `[Name](path)` markdown links pointing to `people/` or `companies/` - * (and other entity directories). 
Accepts both filesystem-relative format - * (`[Name](../people/slug.md)`) AND engine-slug format (`[Name](people/slug)`). + * Directory prefix whitelist. These are the top-level slug dirs the extractor + * recognizes as entity references. Upstream canonical + our extensions: + * - Gbrain canonical: people, companies, meetings, concepts, deal, civic, project, source, media, yc, projects + * - Our domain extensions: tech, finance, personal, openclaw (domain-organized wikis) + * - Our entity prefix: entities (we kept some legacy entities/projects/ pages) + */ +const DIR_PATTERN = '(?:people|companies|meetings|concepts|deal|civic|project|projects|source|media|yc|tech|finance|personal|openclaw|entities)'; + +/** + * Match `[Name](path)` markdown links pointing to entity directories. + * Accepts both filesystem-relative format (`[Name](../people/slug.md)`) + * AND engine-slug format (`[Name](people/slug)`). * - * Captures: name, dir (people/companies/...), slug. + * Captures: name, slug (dir/name, possibly deeper). * * The regex permits an optional `../` prefix (any number) and an optional * `.md` suffix so the same function works for both filesystem and DB content. */ -const ENTITY_REF_RE = /\[([^\]]+)\]\((?:\.\.\/)*((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/([^)\s]+?))(?:\.md)?\)/g; +const ENTITY_REF_RE = new RegExp( + `\\[([^\\]]+)\\]\\((?:\\.\\.\\/)*(${DIR_PATTERN}\\/[^)\\s]+?)(?:\\.md)?\\)`, + 'g', +); + +/** + * Match Obsidian-style `[[path]]` or `[[path|Display Text]]` wikilinks. + * Captures: slug (dir/...), displayName (optional). + * + * Same dir whitelist as ENTITY_REF_RE. Strips trailing `.md`, strips section + * anchors (`#heading`), skips external URLs. Wiki KBs use this format almost + * exclusively so missing it leaves the graph empty. 
+ */ +const WIKILINK_RE = new RegExp( + `\\[\\[(${DIR_PATTERN}\\/[^|\\]#]+?)(?:#[^|\\]]*?)?(?:\\|([^\\]]+?))?\\]\\]`, + 'g', +); /** * Strip fenced code blocks (```...```) and inline code (`...`) from markdown, @@ -84,16 +109,30 @@ function stripCodeBlocks(content: string): string { export function extractEntityRefs(content: string): EntityRef[] { const stripped = stripCodeBlocks(content); const refs: EntityRef[] = []; - let m: RegExpExecArray | null; - // Fresh regex per call (g-flag state is per-instance). - const re = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); - while ((m = re.exec(stripped)) !== null) { - const name = m[1]; - const fullPath = m[2]; - const slug = fullPath; // dir/slug + let match: RegExpExecArray | null; + + // 1. Markdown links: [Name](path) + const mdPattern = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); + while ((match = mdPattern.exec(stripped)) !== null) { + const name = match[1]; + const fullPath = match[2]; + const slug = fullPath; const dir = fullPath.split('/')[0]; refs.push({ name, slug, dir }); } + + // 2. Obsidian wikilinks: [[path]] or [[path|Display Text]] + const wikiPattern = new RegExp(WIKILINK_RE.source, WIKILINK_RE.flags); + while ((match = wikiPattern.exec(stripped)) !== null) { + let slug = match[1].trim(); + if (!slug) continue; + if (slug.includes('://')) continue; + if (slug.endsWith('.md')) slug = slug.slice(0, -3); + const displayName = (match[2] || slug).trim(); + const dir = slug.split('/')[0]; + refs.push({ name: displayName, slug, dir }); + } + return refs; } @@ -145,7 +184,10 @@ export function extractPageLinks( // Limited to the same entity directories ENTITY_REF_RE covers. // Code blocks are stripped first — slugs in code samples are not real refs. 
const strippedContent = stripCodeBlocks(content); - const bareRe = /\b((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/[a-z0-9][a-z0-9-]*)\b/g; + const bareRe = new RegExp( + `\\b(${DIR_PATTERN}\\/[a-z0-9][a-z0-9/-]*[a-z0-9])\\b`, + 'g', + ); let m: RegExpExecArray | null; while ((m = bareRe.exec(strippedContent)) !== null) { // Skip matches that are part of a markdown link (already handled above). From 885dc62df41b57d01aa63faed4a0703e7c19378f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 19 Apr 2026 16:17:35 +0800 Subject: [PATCH 7/9] feat(doctor): jsonb_integrity + markdown_body_completeness detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two v0.12.1-era reliability checks to `gbrain doctor`: - `jsonb_integrity` scans the 4 known write sites from the v0.12.0 double-encode bug (pages.frontmatter, raw_data.data, ingest_log.pages_updated, files.metadata) and reports rows where jsonb_typeof(col) = 'string'. The fix hint points at `gbrain repair-jsonb` (the standalone repair command shipped in v0.12.1). - `markdown_body_completeness` flags pages whose compiled_truth is <30% of the raw source content length when raw has multiple H2/H3 boundaries. Heuristic only; suggests `gbrain sync --force` or `gbrain import --force `. Also adds test/e2e/jsonb-roundtrip.test.ts — the regression coverage that should have caught the original double-encode bug. Hits all four write sites against real Postgres and asserts jsonb_typeof='object' plus `->>'key'` returns the expected scalar. Detection only: doctor diagnoses, `gbrain repair-jsonb` treats. No overlap with the standalone repair path. 
---
 docs/integrations/reliability-repair.md |  66 ++++++++++++
 src/commands/doctor.ts                  |  68 +++++++++++++
 test/doctor.test.ts                     |  18 ++++
 test/e2e/jsonb-roundtrip.test.ts        | 129 ++++++++++++++++++++++++
 4 files changed, 281 insertions(+)
 create mode 100644 docs/integrations/reliability-repair.md
 create mode 100644 test/e2e/jsonb-roundtrip.test.ts

diff --git a/docs/integrations/reliability-repair.md b/docs/integrations/reliability-repair.md
new file mode 100644
index 00000000..3e5840bf
--- /dev/null
+++ b/docs/integrations/reliability-repair.md
@@ -0,0 +1,66 @@
+# Reliability repair (v0.12.2)
+
+If you ran v0.12.0 on real Postgres or Supabase, two bugs may have corrupted
+data already in your brain. v0.12.1 fixed the code going forward.
+v0.12.2 adds detection in `gbrain doctor` and a standalone `gbrain repair-jsonb`
+command for the mechanically fixable class. PGLite users are not affected.
+
+## What got corrupted
+
+**JSONB double-encode.** Four write sites used
+`${JSON.stringify(x)}::jsonb` with postgres.js, which stored a JSONB
+*string literal* instead of an object. `frontmatter ->> 'key'` returns NULL;
+GIN indexes are ineffective. Affected: `pages.frontmatter`,
+`raw_data.data`, `ingest_log.pages_updated`, `files.metadata`.
+
+**Markdown body truncation.** `splitBody()` treated `---` horizontal rules
+as a body/timeline delimiter, dropping everything after the first rule.
+Wiki-style pages with multiple `##`/`###` sections lost the bulk of their
+content at import time.
+
+## Detect
+
+```
+gbrain doctor
+```
+
+Reports two new checks:
+
+- `jsonb_integrity` — counts double-encoded rows per table and points you
+  at `gbrain repair-jsonb`.
+- `markdown_body_completeness` — heuristic for pages whose `compiled_truth`
+  is suspiciously short compared to `raw_data.data ->> 'content'`.
+
+## Repair
+
+For JSONB (mechanically fixable):
+
+```
+gbrain repair-jsonb
+```
+
+Runs `UPDATE <table> SET <col> = (<col> #>> '{}')::jsonb WHERE jsonb_typeof(<col>) = 'string'`
+across every affected column. 
Idempotent. Second run reports 0 rows. Use
+`--dry-run` to preview, `--json` for structured output. The `v0_12_2`
+migration runs this automatically on `gbrain upgrade`.
+
+For truncated markdown bodies (source-dependent):
+
+```
+gbrain sync --force
+# or per-page
+gbrain import --force <file>
+```
+
+v0.12.2 cannot recover content that was already lost if you no longer have
+the source markdown file. `gbrain doctor` tells you which pages look short;
+you decide whether to re-import from source or accept the truncation.
+
+## Verify
+
+```
+gbrain doctor
+```
+
+All four `jsonb_integrity` rows should read zero. `markdown_body_completeness`
+should match your expectations for the corpus.
diff --git a/src/commands/doctor.ts b/src/commands/doctor.ts
index acfdfd40..b0a7d7ee 100644
--- a/src/commands/doctor.ts
+++ b/src/commands/doctor.ts
@@ -208,6 +208,74 @@ export async function runDoctor(engine: BrainEngine | null, args: string[]) {
     checks.push({ name: 'graph_coverage', status: 'warn', message: 'Could not check graph coverage' });
   }
 
+  // 9. JSONB integrity (v0.12.1 reliability wave).
+  // v0.12.0's JSON.stringify()::jsonb pattern stored JSONB string literals
+  // instead of objects on real Postgres. PGLite masked this; Supabase did not.
+  // Scan the 4 known sites (pages.frontmatter, raw_data.data, ingest_log.pages_updated,
+  // files.metadata) for rows whose top-level jsonb_typeof is 'string'. 
+ try { + const sql = db.getConnection(); + const targets: Array<{ table: string; col: string; expected: 'object' | 'array' }> = [ + { table: 'pages', col: 'frontmatter', expected: 'object' }, + { table: 'raw_data', col: 'data', expected: 'object' }, + { table: 'ingest_log', col: 'pages_updated', expected: 'array' }, + { table: 'files', col: 'metadata', expected: 'object' }, + ]; + let totalBad = 0; + const breakdown: string[] = []; + for (const { table, col } of targets) { + const rows = await sql.unsafe( + `SELECT count(*)::int AS n FROM ${table} WHERE jsonb_typeof(${col}) = 'string'`, + ); + const n = Number((rows as any)[0]?.n ?? 0); + if (n > 0) { totalBad += n; breakdown.push(`${table}.${col}=${n}`); } + } + if (totalBad === 0) { + checks.push({ name: 'jsonb_integrity', status: 'ok', message: 'All JSONB columns store objects/arrays' }); + } else { + checks.push({ + name: 'jsonb_integrity', + status: 'warn', + message: `${totalBad} row(s) double-encoded (${breakdown.join(', ')}). Fix: gbrain repair-jsonb`, + }); + } + } catch { + checks.push({ name: 'jsonb_integrity', status: 'warn', message: 'Could not check JSONB integrity' }); + } + + // 10. Markdown body completeness (v0.12.1 reliability wave). + // v0.12.0's splitBody ate everything after the first `---` horizontal rule, + // truncating wiki-style pages. Heuristic: pages whose body is <30% of the + // raw source content length when raw has multiple H2/H3 boundaries. + try { + const sql = db.getConnection(); + const rows = await sql` + SELECT p.slug, + length(p.compiled_truth) AS body_len, + length(rd.data ->> 'content') AS raw_len + FROM pages p + JOIN raw_data rd ON rd.page_id = p.id + WHERE rd.data ? 
'content' + AND length(rd.data ->> 'content') > 1000 + AND length(p.compiled_truth) < length(rd.data ->> 'content') * 0.3 + AND (rd.data ->> 'content') ~ '(^|\n)##+ ' + LIMIT 100 + `; + if (rows.length === 0) { + checks.push({ name: 'markdown_body_completeness', status: 'ok', message: 'No truncated bodies detected' }); + } else { + const sample = rows.slice(0, 3).map((r: any) => r.slug).join(', '); + checks.push({ + name: 'markdown_body_completeness', + status: 'warn', + message: `${rows.length} page(s) appear truncated (sample: ${sample}). Re-import with: gbrain sync --force`, + }); + } + } catch { + // pages_raw.raw_data may not exist on older schemas; best-effort. + checks.push({ name: 'markdown_body_completeness', status: 'ok', message: 'Skipped (raw_data unavailable)' }); + } + const hasFail = outputResults(checks, jsonOutput); // Features teaser (non-JSON, non-failing only) diff --git a/test/doctor.test.ts b/test/doctor.test.ts index 72585835..213e93a3 100644 --- a/test/doctor.test.ts +++ b/test/doctor.test.ts @@ -40,4 +40,22 @@ describe('doctor command', () => { // We can't call it directly (it calls process.exit), but we verify the signature expect(runDoctor.length).toBe(2); // engine, args }); + + // v0.12.2 reliability wave — doctor detects JSONB double-encode + truncated + // bodies and points users at the standalone `gbrain repair-jsonb` command. + // Detection only; repair lives in src/commands/repair-jsonb.ts. 
+ test('doctor source contains jsonb_integrity and markdown_body_completeness checks', async () => { + const source = await Bun.file(new URL('../src/commands/doctor.ts', import.meta.url)).text(); + expect(source).toContain('jsonb_integrity'); + expect(source).toContain('markdown_body_completeness'); + expect(source).toContain('gbrain repair-jsonb'); + }); + + test('jsonb_integrity check covers the four JSONB sites fixed in v0.12.1', async () => { + const source = await Bun.file(new URL('../src/commands/doctor.ts', import.meta.url)).text(); + expect(source).toMatch(/table:\s*'pages'.*col:\s*'frontmatter'/); + expect(source).toMatch(/table:\s*'raw_data'.*col:\s*'data'/); + expect(source).toMatch(/table:\s*'ingest_log'.*col:\s*'pages_updated'/); + expect(source).toMatch(/table:\s*'files'.*col:\s*'metadata'/); + }); }); diff --git a/test/e2e/jsonb-roundtrip.test.ts b/test/e2e/jsonb-roundtrip.test.ts new file mode 100644 index 00000000..e744b4e8 --- /dev/null +++ b/test/e2e/jsonb-roundtrip.test.ts @@ -0,0 +1,129 @@ +/** + * E2E JSONB Roundtrip Tests — v0.12.1 Reliability Wave + * + * Guards the four JSONB write sites against double-encoding regressions: + * 1. PostgresEngine.putPage → pages.frontmatter + * 2. PostgresEngine.putRawData → raw_data.data + * 3. PostgresEngine.logIngest → ingest_log.pages_updated + * 4. commands/files.ts:254 → files.metadata + * + * The v0.12.0 bug: `${JSON.stringify(x)}::jsonb` sends a JSON-encoded string + * to postgres.js, which stores it as a JSONB *string literal* instead of an + * object. `col ->> 'key'` returns NULL; GIN indexes are ineffective. + * PGLite masks this because its driver parses the string. Real Postgres does not. + * + * The fix: `sql.json(x)` uses postgres.js v3's native JSONB serialization. + */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { hasDatabase, setupDB, teardownDB, getEngine, getConn } from './helpers.ts'; + +const skip = !hasDatabase(); +const describeE2E = skip ? 
describe.skip : describe; + +describeE2E('E2E: JSONB roundtrip — v0.12.1 reliability wave', () => { + beforeAll(async () => { await setupDB(); }); + afterAll(async () => { await teardownDB(); }); + + test('putPage writes frontmatter as object, not double-encoded string', async () => { + const engine = getEngine(); + await engine.putPage('test/jsonb-putpage', { + type: 'concept', + title: 'JSONB putPage test', + compiled_truth: 'body', + timeline: '', + frontmatter: { marker: 'putpage-value', tags: ['a', 'b'] }, + }); + const sql = getConn(); + const [row] = await sql` + SELECT jsonb_typeof(frontmatter) AS t, frontmatter ->> 'marker' AS marker + FROM pages WHERE slug = 'test/jsonb-putpage' + `; + expect(row.t).toBe('object'); + expect(row.marker).toBe('putpage-value'); + }); + + test('putRawData writes raw_data.data as object, not double-encoded string', async () => { + const engine = getEngine(); + await engine.putPage('test/jsonb-rawdata', { + type: 'concept', + title: 'RawData test', + compiled_truth: 'body', + timeline: '', + frontmatter: {}, + }); + await engine.putRawData('test/jsonb-rawdata', 'unit-test', { + marker: 'rawdata-value', + nested: { k: 'v' }, + }); + const sql = getConn(); + const [row] = await sql` + SELECT jsonb_typeof(rd.data) AS t, rd.data ->> 'marker' AS marker + FROM raw_data rd + JOIN pages p ON p.id = rd.page_id + WHERE p.slug = 'test/jsonb-rawdata' + `; + expect(row.t).toBe('object'); + expect(row.marker).toBe('rawdata-value'); + }); + + test('logIngest writes pages_updated as array, not double-encoded string', async () => { + const engine = getEngine(); + await engine.logIngest({ + source_type: 'unit-test', + source_ref: 'jsonb-roundtrip', + pages_updated: ['test/a', 'test/b', 'test/c'], + summary: 'jsonb logingest check', + }); + const sql = getConn(); + const [row] = await sql` + SELECT jsonb_typeof(pages_updated) AS t, + jsonb_array_length(pages_updated) AS n, + pages_updated ->> 0 AS first + FROM ingest_log + WHERE source_ref = 
'jsonb-roundtrip' + ORDER BY id DESC LIMIT 1 + `; + expect(row.t).toBe('array'); + expect(Number(row.n)).toBe(3); + expect(row.first).toBe('test/a'); + }); + + // files.ts:254 (uploadRaw's cloud-upload branch) was changed from + // `${JSON.stringify({...})}::jsonb` to `${sql.json({...})}` in v0.12.1. + // The function reads config and touches cloud storage, so we exercise the + // driver-level pattern directly against the same table/column. + test('files.metadata writes as object via sql.json(), not double-encoded string', async () => { + const sql = getConn(); + const payload = { type: 'pdf', upload_method: 'TUS resumable' }; + await sql` + INSERT INTO files (page_slug, filename, storage_path, mime_type, size_bytes, content_hash, metadata) + VALUES (NULL, 'jsonb-check.bin', 'unsorted/jsonb-check.bin', 'application/octet-stream', 1, 'sha256:deadbeef', ${sql.json(payload)}) + ON CONFLICT (storage_path) DO UPDATE SET metadata = EXCLUDED.metadata + `; + const [row] = await sql` + SELECT jsonb_typeof(metadata) AS t, + metadata ->> 'type' AS type, + metadata ->> 'upload_method' AS method + FROM files WHERE storage_path = 'unsorted/jsonb-check.bin' + `; + expect(row.t).toBe('object'); + expect(row.type).toBe('pdf'); + expect(row.method).toBe('TUS resumable'); + }); + + // Source-level tripwire: if anyone re-introduces the old `${JSON.stringify(x)}::jsonb` + // pattern for the fixed sites, fail loudly. Greps actual source files per the + // files-test-reimplements-production tripwire (CLAUDE.md). + test('no ${JSON.stringify(x)}::jsonb pattern remains in fixed sites', async () => { + const files = [ + '../../src/core/postgres-engine.ts', + '../../src/commands/files.ts', + ]; + const bad = /\$\{[^}]*JSON\.stringify\([^}]*\)[^}]*\}::jsonb/; + for (const rel of files) { + const source = await Bun.file(new URL(rel, import.meta.url)).text(); + expect(source.match(bad)?.[0] ?? 
null).toBeNull(); + } + }); +}); From 1f5d2d485e234309e609e8e0b72182a030dfa659 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 19 Apr 2026 18:06:43 +0800 Subject: [PATCH 8/9] chore: bump to v0.12.3 + changelog (reliability wave) Master shipped v0.12.1 (extract N+1 + migration timeout) and v0.12.2 (JSONB double-encode + splitBody + wiki types + parseEmbedding) while this wave was mid-flight. Ships the remaining pieces as v0.12.3: - sync deadlock (#132, @sunnnybala) - statement_timeout scoping (#158, @garagon) - Obsidian wikilinks + domain patterns (#187 slice, @knee5) - gbrain orphans command (#187 slice, @knee5) - tryParseEmbedding() availability helper - doctor detection for jsonb_integrity + markdown_body_completeness No schema, no migration, no data touch. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++ VERSION | 2 +- package.json | 2 +- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 130fd5e2..c09a0843 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,78 @@ All notable changes to GBrain will be documented in this file. +## [0.12.3] - 2026-04-19 + +## **Reliability wave: the pieces v0.12.2 didn't cover.** +## **Sync stops hanging. Search timeouts stop leaking. `[[Wikilinks]]` are edges.** + +v0.12.2 shipped the data-correctness hotfix (JSONB double-encode, splitBody, `/wiki/` types, parseEmbedding). This wave lands the remaining reliability fixes from the same community review pass, plus a graph-layer feature a 2,100-page brain needed to stop bleeding edges. No schema changes. No migration. `gbrain upgrade` pulls it. + +### What was broken + +**Incremental sync deadlocked past 10 files.** `src/commands/sync.ts` wrapped the whole import in `engine.transaction`, and `importFromContent` also wrapped each file. PGLite's `_runExclusiveTransaction` is non-reentrant — the inner call parks on the mutex the outer call holds, forever. 
In practice: 3 files synced fine, 15 files hung in `ep_poll` until you killed the process. Bulk Minions jobs and citation-fixer dream-cycles regularly hit this. Discovered by @sunnnybala. + +**`statement_timeout` leaked across the postgres.js pool.** `searchKeyword` and `searchVector` bounded queries with `SET statement_timeout='8s'` + `finally SET 0`. But every tagged template picks an arbitrary pool connection, so the SET, the query, and the reset could land on three different sockets. The 8s cap stuck to whichever connection ran the SET, got returned to the pool, and the next unrelated caller inherited it. Long-running `embed --all` jobs and imports clipped silently. Fix by @garagon. + +**Obsidian `[[WikiLinks]]` were invisible to the auto-link post-hook.** `extractEntityRefs` only matched `[Name](people/slug)`. On a 2,100-page brain with wikilinks throughout, `put_page` extracted zero auto-links. `DIR_PATTERN` also missed domain-organized wiki roots (`entities`, `projects`, `tech`, `finance`, `personal`, `openclaw`). After the fix: 1,377 new typed edges on a single `extract --source db` pass. Discovered and fixed by @knee5. + +**Corrupt embedding rows broke every query that touched them.** `getEmbeddingsByChunkIds` on Supabase could return a pgvector string instead of a `Float32Array`. v0.12.2 fixed the normal path by normalizing inputs, but one genuinely bad row still threw and killed the ranking pass. Availability matters more than strictness on the read path. + +### What you can do now that you couldn't before + +- **Sync 100 files without hanging.** Per-file atomicity preserved, outer wrap removed. Regression test asserts `engine.transaction` is not called at the top level of `src/commands/sync.ts`. Contributed by @sunnnybala. +- **Run a long `embed --all` on Supabase without strangling unrelated queries.** `searchKeyword` / `searchVector` use `sql.begin` + `SET LOCAL` so the timeout dies with the transaction. 
5 regression tests in `test/postgres-engine.test.ts` pin the new shape. Contributed by @garagon. +- **Write `[[people/balaji|Balaji Srinivasan]]` in a page and see a typed edge.** Same extractor, two syntaxes. Matches the filesystem walker — the db and fs sources now produce the same link graph from the same content. Contributed by @knee5. +- **Find your under-connected pages.** `gbrain orphans` surfaces pages with zero inbound wikilinks, grouped by domain. `--json`, `--count`, and `--include-pseudo` flags. Also exposed as the `find_orphans` MCP operation so agents can run enrichment cycles without CLI glue. Contributed by @knee5. +- **Degraded embedding rows skip+warn instead of throwing.** New `tryParseEmbedding()` sibling of `parseEmbedding()`: returns `null` on unknown input and warns once per process. Used on the search/rescore path. Migration and ingest paths still throw — data integrity there is non-negotiable. +- **`gbrain doctor` tells you which brains still need repair.** Two new checks: `jsonb_integrity` scans the four v0.12.0 write sites and reports rows where `jsonb_typeof = 'string'`; `markdown_body_completeness` heuristically flags pages whose `compiled_truth` is <30% of raw source length when raw has multiple H2/H3 boundaries. Fix hint points at `gbrain repair-jsonb` and `gbrain sync --force`. + +### How to upgrade + +```bash +gbrain upgrade +``` + +No migration, no schema change, no data touch. If you're on Postgres and haven't run `gbrain repair-jsonb` since v0.12.2, the v0.12.2 orchestrator still runs on upgrade. New `gbrain doctor` will tell you if anything still looks off. + +### Itemized changes + +**Sync deadlock fix (#132)** +- `src/commands/sync.ts` — remove outer `engine.transaction` wrap; per-file atomicity preserved by `importFromContent`'s own wrap. +- `test/sync.test.ts` — new regression guard asserting top-level `engine.transaction` is not called on > 10-file sync paths. +- Contributed by @sunnnybala. 
+ +**postgres-engine statement_timeout scoping (#158)** +- `src/core/postgres-engine.ts` — `searchKeyword` and `searchVector` rewritten to `sql.begin(async (tx) => { await tx\`SET LOCAL statement_timeout = ...\`; ... })`. GUC dies with the transaction; pool reuse is safe. +- `test/postgres-engine.test.ts` — 5 regression tests including a source-level guardrail grep against the production file (not a test fixture) asserting no bare `SET statement_timeout` outside `sql.begin`. +- Contributed by @garagon. + +**Obsidian wikilinks + extended domain patterns (#187 slice)** +- `src/core/link-extraction.ts` — `extractEntityRefs` matches both `[Name](people/slug)` and `[[people/slug|Name]]`. `DIR_PATTERN` extended with `entities`, `projects`, `tech`, `finance`, `personal`, `openclaw`. +- Matches existing filesystem-walker behavior. +- Contributed by @knee5. + +**`gbrain orphans` command (#187 slice)** +- `src/commands/orphans.ts` — new command with text/JSON/count outputs and domain grouping. +- `src/core/operations.ts` — `find_orphans` MCP operation. +- `src/cli.ts` — `orphans` added to `CLI_ONLY`. +- `test/orphans.test.ts` — 203 lines covering detection, filters, and all output modes. +- Contributed by @knee5. + +**`tryParseEmbedding()` availability helper** +- `src/core/utils.ts` — new `tryParseEmbedding(value)`: returns `null` on unknown input, warns once per process via a module-level flag. +- `src/core/postgres-engine.ts` — `getEmbeddingsByChunkIds` uses `tryParseEmbedding` so one bad row degrades ranking instead of killing the query. +- `test/utils.test.ts` — new cases for null-return and single-warn. +- Hand-authored; codifies the split-by-call-site rule from the #97/#175 review. 
+ +**Doctor detection checks** +- `src/commands/doctor.ts` — `jsonb_integrity` scans `pages.frontmatter`, `raw_data.data`, `ingest_log.pages_updated`, `files.metadata` and reports `jsonb_typeof='string'` counts; `markdown_body_completeness` heuristic for ≥30% shrinkage vs raw source on multi-H2 pages. +- `test/doctor.test.ts` — detection unit tests assert both checks exist and cover the four JSONB sites. +- `test/e2e/jsonb-roundtrip.test.ts` — the regression test that should have caught the original v0.12.0 double-encode bug; round-trips all four JSONB write sites against real Postgres. +- `docs/integrations/reliability-repair.md` — guide for v0.12.0 users: detect via `gbrain doctor`, repair via `gbrain repair-jsonb`. + +**No schema changes. No migration. No data touch.** + ## [0.12.2] - 2026-04-19 ## **Postgres frontmatter queries actually work now.** diff --git a/VERSION b/VERSION index 26acbf08..aa22d3ce 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.12.2 +0.12.3 diff --git a/package.json b/package.json index 740147f3..e68625c3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gbrain", - "version": "0.12.2", + "version": "0.12.3", "description": "Postgres-native personal knowledge brain with hybrid RAG search", "type": "module", "main": "src/core/index.ts", From a925ce1f5212a68dbcdf35eaa502cb40cd136876 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 19 Apr 2026 18:18:34 +0800 Subject: [PATCH 9/9] docs: update project documentation for v0.12.3 CLAUDE.md: - Add src/commands/orphans.ts entry - Expand src/commands/doctor.ts with v0.12.3 jsonb_integrity + markdown_body_completeness check descriptions - Update src/core/link-extraction.ts to mention Obsidian wikilinks + extended DIR_PATTERN (entities/projects/tech/finance/personal/openclaw) - Update src/core/utils.ts to mention tryParseEmbedding sibling - Update src/core/postgres-engine.ts to note statement_timeout scoping + tryParseEmbedding usage in getEmbeddingsByChunkIds - Add Key 
commands added in v0.12.3 section (orphans, doctor checks) - Add test/orphans.test.ts, test/postgres-engine.test.ts, updated descriptions for test/sync.test.ts, test/doctor.test.ts, test/utils.test.ts - Add test/e2e/jsonb-roundtrip.test.ts with note on intentional overlap - Bump operation count from ~36 to ~41 (find_orphans shipped in v0.12.3) README.md: - Add gbrain orphans to ADMIN commands block Co-Authored-By: Claude Opus 4.7 --- CLAUDE.md | 28 ++++++++++++++++++++-------- README.md | 1 + 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 9578a49d..912077dc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -9,7 +9,7 @@ cron scheduling, reports, identity, and access control. ## Architecture -Contract-first: `src/core/operations.ts` defines ~36 shared operations. CLI and MCP +Contract-first: `src/core/operations.ts` defines ~41 shared operations (adds `find_orphans` in v0.12.3). CLI and MCP server are both generated from this single source. Engine factory (`src/core/engine-factory.ts`) dynamically imports the configured engine (`'pglite'` or `'postgres'`). Skills are fat markdown files (tool-agnostic, work with both CLI and plugin contexts). @@ -27,8 +27,8 @@ strict behavior when unset. - `src/core/engine-factory.ts` — Engine factory with dynamic imports (`'pglite'` | `'postgres'`) - `src/core/pglite-engine.ts` — PGLite (embedded Postgres 17.5 via WASM) implementation, all 40 BrainEngine methods. `addLinksBatch` / `addTimelineEntriesBatch` use multi-row `unnest()` with manual `$N` placeholders. - `src/core/pglite-schema.ts` — PGLite-specific DDL (pgvector, pg_trgm, triggers) -- `src/core/postgres-engine.ts` — Postgres + pgvector implementation (Supabase / self-hosted). `addLinksBatch` / `addTimelineEntriesBatch` use `INSERT ... SELECT FROM unnest($1::text[], ...) JOIN pages ON CONFLICT DO NOTHING RETURNING 1` — 4-5 array params regardless of batch size, sidesteps the 65535-parameter cap. 
-- `src/core/utils.ts` — Shared SQL utilities extracted from postgres-engine.ts +- `src/core/postgres-engine.ts` — Postgres + pgvector implementation (Supabase / self-hosted). `addLinksBatch` / `addTimelineEntriesBatch` use `INSERT ... SELECT FROM unnest($1::text[], ...) JOIN pages ON CONFLICT DO NOTHING RETURNING 1` — 4-5 array params regardless of batch size, sidesteps the 65535-parameter cap. As of v0.12.3, `searchKeyword` / `searchVector` scope `statement_timeout` via `sql.begin` + `SET LOCAL` so the GUC dies with the transaction instead of leaking across the pooled postgres.js connection (contributed by @garagon). `getEmbeddingsByChunkIds` uses `tryParseEmbedding` so one corrupt row skips+warns instead of killing the query. +- `src/core/utils.ts` — Shared SQL utilities extracted from postgres-engine.ts. Exports `parseEmbedding(value)` (throws on unknown input, used by migration + ingest paths where data integrity matters) and as of v0.12.3 `tryParseEmbedding(value)` (returns `null` + warns once per process, used by search/rescore paths where availability matters more than strictness). - `src/core/db.ts` — Connection management, schema initialization - `src/commands/migrate-engine.ts` — Bidirectional engine migration (`gbrain migrate --to supabase/pglite`) - `src/core/import-file.ts` — importFromFile + importFromContent (chunk + embed + tags) @@ -50,7 +50,7 @@ strict behavior when unset. - `src/core/data-research.ts` — Recipe validation, field extraction (MRR/ARR regex), dedup, tracker parsing, HTML stripping - `src/commands/extract.ts` — `gbrain extract links|timeline|all [--source fs|db]`: batch link/timeline extraction. fs walks markdown files, db walks pages from the engine (mutation-immune snapshot iteration; use this for live brains with no local checkout). 
As of v0.12.1 there is no in-memory dedup pre-load — candidates are buffered 100 at a time and flushed via `addLinksBatch` / `addTimelineEntriesBatch`; `ON CONFLICT DO NOTHING` enforces uniqueness at the DB layer, and the `created` counter returns real rows inserted (truthful on re-runs). - `src/commands/graph-query.ts` — `gbrain graph-query [--type T] [--depth N] [--direction in|out|both]`: typed-edge relationship traversal (renders indented tree) -- `src/core/link-extraction.ts` — shared library for the v0.12.0 graph layer. extractEntityRefs (canonical, replaces backlinks.ts duplicate), extractPageLinks, inferLinkType heuristics (attended/works_at/invested_in/founded/advises/source/mentions), parseTimelineEntries, isAutoLinkEnabled config helper. Used by extract.ts, operations.ts auto-link post-hook, and backlinks.ts. +- `src/core/link-extraction.ts` — shared library for the v0.12.0 graph layer. extractEntityRefs (canonical, replaces backlinks.ts duplicate) matches both `[Name](people/slug)` markdown links and Obsidian `[[people/slug|Name]]` wikilinks as of v0.12.3. extractPageLinks, inferLinkType heuristics (attended/works_at/invested_in/founded/advises/source/mentions), parseTimelineEntries, isAutoLinkEnabled config helper. `DIR_PATTERN` covers `people`, `companies`, `deals`, `topics`, `concepts`, `projects`, `entities`, `tech`, `finance`, `personal`, `openclaw`. Used by extract.ts, operations.ts auto-link post-hook, and backlinks.ts. - `src/core/minions/` — Minions job queue: BullMQ-inspired, Postgres-native (queue, worker, backoff, types) - `src/core/minions/queue.ts` — MinionQueue class (submit, claim, complete, fail, stall detection, parent-child, depth/child-cap, per-job timeouts, cascade-kill, attachments, idempotency keys, child_done inbox, removeOnComplete/Fail) - `src/core/minions/worker.ts` — MinionWorker class (handler registry, lock renewal, graceful shutdown, timeout safety net) @@ -63,6 +63,8 @@ strict behavior when unset. 
- `src/commands/upgrade.ts` — Self-update CLI. `runPostUpgrade()` enumerates migrations from the TS registry (src/commands/migrations/index.ts) and tail-calls `runApplyMigrations(['--yes', '--non-interactive'])` so the mechanical side of every outstanding migration runs unconditionally. - `src/commands/migrations/` — TS migration registry (compiled into the binary; no filesystem walk of `skills/migrations/*.md` needed at runtime). `index.ts` lists migrations in semver order. `v0_11_0.ts` = Minions adoption orchestrator (8 phases). `v0_12_0.ts` = Knowledge Graph auto-wire orchestrator (5 phases: schema → config check → backfill links → backfill timeline → verify). `phaseASchema` has a 600s timeout (bumped from 60s in v0.12.1 for duplicate-heavy brains). `v0_12_2.ts` = JSONB double-encode repair orchestrator (4 phases: schema → repair-jsonb → verify → record). All orchestrators are idempotent and resumable from `partial` status. - `src/commands/repair-jsonb.ts` — `gbrain repair-jsonb [--dry-run] [--json]`: rewrites `jsonb_typeof='string'` rows in place across 5 affected columns (pages.frontmatter, raw_data.data, ingest_log.pages_updated, files.metadata, page_versions.frontmatter). Fixes v0.12.0 double-encode bug on Postgres; PGLite no-ops. Idempotent. +- `src/commands/orphans.ts` — `gbrain orphans [--json] [--count] [--include-pseudo]`: surfaces pages with zero inbound wikilinks, grouped by domain. Auto-generated/raw/pseudo pages filtered by default. Also exposed as `find_orphans` MCP operation. Shipped in v0.12.3 (contributed by @knee5). +- `src/commands/doctor.ts` — `gbrain doctor [--json] [--fast] [--fix]`: health checks. v0.12.3 adds two reliability detection checks: `jsonb_integrity` (scans pages.frontmatter, raw_data.data, ingest_log.pages_updated, files.metadata for `jsonb_typeof='string'` rows left over from v0.12.0) and `markdown_body_completeness` (flags pages whose compiled_truth is <30% of raw source when raw has multiple H2/H3 boundaries). 
Fix hints point at `gbrain repair-jsonb` and `gbrain sync --force`. - `src/core/markdown.ts` — Frontmatter parsing + body splitter. `splitBody` requires an explicit timeline sentinel (``, `--- timeline ---`, or `---` immediately before `## Timeline`/`## History`). Plain `---` in body text is a markdown horizontal rule, not a separator. `inferType` auto-types `/wiki/analysis/` → analysis, `/wiki/guides/` → guide, `/wiki/hardware/` → hardware, `/wiki/architecture/` → architecture, `/writing/` → writing (plus the existing people/companies/deals/etc heuristics). - `scripts/check-jsonb-pattern.sh` — CI grep guard. Fails the build if anyone reintroduces the `${JSON.stringify(x)}::jsonb` interpolation pattern (which postgres.js v3 double-encodes). Wired into `bun test`. - `docs/UPGRADING_DOWNSTREAM_AGENTS.md` — Patches for downstream agent skill forks (Wintermute etc.) to apply when upgrading. Each release appends a new section. v0.10.3 includes diffs for brain-ops, meeting-ingestion, signal-detector, enrich. @@ -135,23 +137,27 @@ Key commands added for Minions (job queue): Key commands added in v0.12.2: - `gbrain repair-jsonb [--dry-run] [--json]` — repair double-encoded JSONB rows left over from v0.12.0-and-earlier Postgres writes. Idempotent; PGLite no-ops. The `v0_12_2` migration runs this automatically on `gbrain upgrade`. +Key commands added in v0.12.3: +- `gbrain orphans [--json] [--count] [--include-pseudo]` — surface pages with zero inbound wikilinks, grouped by domain. Auto-generated/raw/pseudo pages filtered by default. Also exposed as `find_orphans` MCP operation. The natural consumer of the v0.12.0 knowledge graph layer: once edges are captured, find the gaps. +- `gbrain doctor` gains two new reliability detection checks: `jsonb_integrity` (v0.12.0 Postgres double-encode damage) and `markdown_body_completeness` (pages truncated by the old splitBody bug). Detection only; fix hints point at `gbrain repair-jsonb` and `gbrain sync --force`. 
+ ## Testing `bun test` runs all tests. After the v0.12.1 release: ~75 unit test files + 8 E2E test files (1412 unit pass, 119 E2E when `DATABASE_URL` is set — skip gracefully otherwise). Unit tests run without a database. E2E tests skip gracefully when `DATABASE_URL` is not set. Unit tests: `test/markdown.test.ts` (frontmatter parsing), `test/chunkers/recursive.test.ts` -(chunking), `test/sync.test.ts` (sync logic), `test/parity.test.ts` (operations contract +(chunking), `test/parity.test.ts` (operations contract parity), `test/cli.test.ts` (CLI structure), `test/config.test.ts` (config redaction), `test/files.test.ts` (MIME/hash), `test/import-file.test.ts` (import pipeline), -`test/upgrade.test.ts` (schema migrations), `test/doctor.test.ts` (doctor command), +`test/upgrade.test.ts` (schema migrations), `test/file-migration.test.ts` (file migration), `test/file-resolver.test.ts` (file resolution), `test/import-resume.test.ts` (import checkpoints), `test/migrate.test.ts` (migration; v8/v9 helper-btree-index SQL structural assertions + 1000-row wall-clock fixtures that guard the O(n²)→O(n log n) fix), `test/setup-branching.test.ts` (setup flow), `test/slug-validation.test.ts` (slug validation), `test/storage.test.ts` (storage backends), `test/supabase-admin.test.ts` (Supabase admin), `test/yaml-lite.test.ts` (YAML parsing), `test/check-update.test.ts` (version check + update CLI), `test/pglite-engine.test.ts` (PGLite engine, all 40 BrainEngine methods including 11 cases for `addLinksBatch` / `addTimelineEntriesBatch`: empty batch, missing optionals, within-batch dedup via ON CONFLICT, missing-slug rows dropped by JOIN, half-existing batch, batch of 100), -`test/utils.test.ts` (shared SQL utilities), `test/engine-factory.test.ts` (engine factory + dynamic imports), +`test/engine-factory.test.ts` (engine factory + dynamic imports), `test/integrations.test.ts` (recipe parsing, CLI routing, recipe validation), `test/publish.test.ts` (content stripping, encryption, 
password generation, HTML output), `test/backlinks.test.ts` (entity extraction, back-link detection, timeline entry generation), @@ -181,13 +187,19 @@ parity), `test/cli.test.ts` (CLI structure), `test/config.test.ts` (config redac `test/search-limit.test.ts` (clampSearchLimit default/cap behavior across list_pages and get_ingest_log), `test/repair-jsonb.test.ts` (v0.12.2 JSONB repair: TARGETS list, idempotency, engine-awareness), `test/migrations-v0_12_2.test.ts` (v0.12.2 orchestrator phases: schema → repair → verify → record), -`test/markdown.test.ts` (splitBody sentinel precedence, horizontal-rule preservation, inferType wiki subtypes). +`test/markdown.test.ts` (splitBody sentinel precedence, horizontal-rule preservation, inferType wiki subtypes), +`test/orphans.test.ts` (v0.12.3 orphans command: detection, pseudo filtering, text/json/count outputs, MCP op), +`test/postgres-engine.test.ts` (v0.12.3 statement_timeout scoping: `sql.begin` + `SET LOCAL` shape, source-level grep guardrail against reintroduced bare `SET statement_timeout`), +`test/sync.test.ts` (sync logic + v0.12.3 regression guard asserting top-level `engine.transaction` is not called), +`test/doctor.test.ts` (doctor command + v0.12.3 assertions that `jsonb_integrity` scans the four v0.12.0 write sites and `markdown_body_completeness` is present), +`test/utils.test.ts` (shared SQL utilities + `tryParseEmbedding` null-return and single-warn semantics). E2E tests (`test/e2e/`): Run against real Postgres+pgvector. Require `DATABASE_URL`. - `bun run test:e2e` runs Tier 1 (mechanical, all operations, no API keys). Includes 9 dedicated cases for the postgres-engine `addLinksBatch` / `addTimelineEntriesBatch` bind path — postgres-js's `unnest()` binding is structurally different from PGLite's and gets its own coverage. 
- `test/e2e/search-quality.test.ts` runs search quality E2E against PGLite (no API keys, in-memory) - `test/e2e/graph-quality.test.ts` runs the v0.10.3 knowledge graph pipeline (auto-link via put_page, reconciliation, traversePaths) against PGLite in-memory - `test/e2e/postgres-jsonb.test.ts` — v0.12.2 regression test. Round-trips all 5 JSONB write sites (pages.frontmatter, raw_data.data, ingest_log.pages_updated, files.metadata, page_versions.frontmatter) against real Postgres and asserts `jsonb_typeof='object'` plus `->>'key'` returns the expected scalar. The test that should have caught the original double-encode bug. +- `test/e2e/jsonb-roundtrip.test.ts` — v0.12.3 companion regression against the 4 doctor-scanned JSONB sites. Assertion-level overlap with `postgres-jsonb.test.ts` is intentional defense-in-depth: if doctor's scan surface ever drifts from the actual write surface, one of these tests catches it. - `test/e2e/upgrade.test.ts` runs check-update E2E against real GitHub API (network required) - Tier 2 (`skills.test.ts`) requires OpenClaw + API keys, runs nightly in CI - If `.env.testing` doesn't exist in this directory, check sibling worktrees for one: diff --git a/README.md b/README.md index 309731b4..14b6cfd1 100644 --- a/README.md +++ b/README.md @@ -537,6 +537,7 @@ ADMIN gbrain check-backlinks check|fix Back-link enforcement gbrain lint [--fix] LLM artifact detection gbrain repair-jsonb [--dry-run] Repair v0.12.0 double-encoded JSONB (Postgres) + gbrain orphans [--json] [--count] Find pages with zero inbound wikilinks gbrain transcribe