From 178109c22cdcff71455475c445bc79cc8fbb114a Mon Sep 17 00:00:00 2001 From: sharziki Date: Sat, 18 Apr 2026 16:01:13 -0400 Subject: [PATCH] fix(extract): skip _-prefixed directories in walkMarkdownFiles (#202) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extract walker matched file-level behavior — `_foo.md` was skipped — but its recursion still walked into `_pending/originals/`, so `gbrain extract all --source fs` counted quarantined pages that the sync path correctly excluded. On a brain with 61 authoritative pages and a sizeable `_pending/` tree, extract reported "154 pages walked" instead of 61, wasting link/timeline extraction effort and producing misleading counts. Hoist the `_`-prefix skip above the `lstatSync` branch so it applies to directories as well. Dotted entries already behave this way. Added a regression test covering a nested `_pending/originals/buried.md` layout alongside a sibling `concepts/alpha.md` that must still be walked. Fixes #202 --- src/commands/extract.ts | 10 +++++++++- test/extract.test.ts | 26 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 1b5abb7e..9c3c70c7 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -52,12 +52,20 @@ export function walkMarkdownFiles(dir: string): { path: string; relPath: string const files: { path: string; relPath: string }[] = []; function walk(d: string) { for (const entry of readdirSync(d)) { + // Skip `.`-prefixed entries (hidden / dotfiles) and `_`-prefixed entries + // (user-quarantine convention — e.g. `_pending/` ambient captures) at + // BOTH file and directory level. Previously the file-level check + // skipped `_foo.md` but the directory recursion still walked into + // `_pending/originals/foo.md`, so extract counted quarantined content + // that `sync` correctly excluded — "N pages walked" ballooned from 61 + // to 154 on brains with large `_pending/` trees (issue #202). if (entry.startsWith('.')) continue; + if (entry.startsWith('_')) continue; const full = join(d, entry); try { if (lstatSync(full).isDirectory()) { walk(full); - } else if (entry.endsWith('.md') && !entry.startsWith('_')) { + } else if (entry.endsWith('.md')) { files.push({ path: full, relPath: relative(dir, full) }); } } catch { /* skip unreadable */ } diff --git a/test/extract.test.ts b/test/extract.test.ts index 78720eff..698f49a2 100644 --- a/test/extract.test.ts +++ b/test/extract.test.ts @@ -122,4 +122,30 @@ describe('walkMarkdownFiles', () => { it('is a function', () => { expect(typeof walkMarkdownFiles).toBe('function'); }); + + it('skips _-prefixed directories, matching the _-prefixed file behavior', async () => { + // Quarantine convention (e.g. `_pending/` for signal-detector ambient + // captures): users extend exclusions to skip these paths on sync. The + // walker already skipped `_foo.md` files but still recursed into + // `_pending/originals/`, so extract counted quarantined content that + // sync excluded (issue #202). + const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('fs'); + const { tmpdir } = await import('os'); + const { join } = await import('path'); + + const root = mkdtempSync(join(tmpdir(), 'gbrain-walk-test-')); + try { + mkdirSync(join(root, 'concepts')); + mkdirSync(join(root, '_pending', 'originals'), { recursive: true }); + writeFileSync(join(root, 'concepts', 'alpha.md'), '# alpha'); + writeFileSync(join(root, '_pending', 'ambient.md'), '# ambient'); + writeFileSync(join(root, '_pending', 'originals', 'buried.md'), '# buried'); + writeFileSync(join(root, '_skip-me.md'), '# file-level skip'); + + const files = walkMarkdownFiles(root).map(f => f.relPath).sort(); + expect(files).toEqual(['concepts/alpha.md']); + } finally { + rmSync(root, { recursive: true, force: true }); + } + }); });