diff --git a/.changeset/foraging-indexer.md b/.changeset/foraging-indexer.md new file mode 100644 index 0000000..adbdcc8 --- /dev/null +++ b/.changeset/foraging-indexer.md @@ -0,0 +1,34 @@ +--- +"@colony/foraging": minor +"@colony/storage": minor +"@colony/config": minor +--- + +Add the foraging indexer and a storage-aware `scanExamples` wrapper. + +`indexFoodSource(food, store, opts)` converts a discovered `FoodSource` +into 1–N `foraged-pattern` observations (manifest, README, +entrypoints, filetree), scrubs env-assignment secrets through +`redact`, and persists via `MemoryStore` so compression and the +tag stripper both run on the write path. + +`scanExamples({ repo_root, store, session_id, limits?, extra_secret_env_names? })` +walks `/examples/*`, compares each discovered source's +`content_hash` against `storage.getExample(...)`, and only re-indexes +when the hash has shifted. Before re-indexing it calls the new +`Storage.deleteForagedObservations(repo_root, example_name)` so the +observation set never duplicates across scans. + +Two helpers on `Storage` to let the indexer (and the forthcoming MCP +tool) work without opening the DB themselves: + +- `deleteForagedObservations(repo_root, example_name): number` +- `listForagedObservations(repo_root, example_name): ObservationRow[]` + +New `settings.foraging` block (defaults: enabled, `maxDepth: 2`, +`maxFileBytes: 200_000`, `maxFilesPerSource: 50`, +`scanOnSessionStart: true`, `extraSecretEnvNames: []`). `colony config +show` and `settingsDocs()` pick it up automatically. + +No MCP tools, CLI commands, or hook wiring yet — those arrive in the +next PR. 
diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index e18a0ec..624afd6 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -110,6 +110,49 @@ export const SettingsSchema = z .record(z.string(), z.boolean()) .default({}) .describe('Installed IDE integrations (set by `colony install`).'), + foraging: z + .object({ + enabled: z + .boolean() + .default(true) + .describe('Auto-index /examples food sources on SessionStart.'), + maxDepth: z + .number() + .int() + .positive() + .max(5) + .default(2) + .describe('How deep to walk into each example directory.'), + maxFileBytes: z + .number() + .int() + .positive() + .default(200_000) + .describe('Truncate indexed files larger than this.'), + maxFilesPerSource: z + .number() + .int() + .positive() + .default(50) + .describe('Stop indexing after this many files per example.'), + scanOnSessionStart: z + .boolean() + .default(true) + .describe('Fire-and-forget the scanner when SessionStart fires.'), + extraSecretEnvNames: z + .array(z.string()) + .default([]) + .describe('Additional env-var names to treat as secrets during redaction.'), + }) + .default({ + enabled: true, + maxDepth: 2, + maxFileBytes: 200_000, + maxFilesPerSource: 50, + scanOnSessionStart: true, + extraSecretEnvNames: [], + }) + .describe('Foraging: turn /examples into a reusable food source.'), }) .strict(); diff --git a/packages/foraging/package.json b/packages/foraging/package.json index a08a32e..1ca18cd 100644 --- a/packages/foraging/package.json +++ b/packages/foraging/package.json @@ -18,6 +18,11 @@ "test": "vitest run", "typecheck": "tsc --noEmit" }, + "dependencies": { + "@colony/config": "workspace:*", + "@colony/core": "workspace:*", + "@colony/storage": "workspace:*" + }, "devDependencies": { "tsup": "^8.3.5", "typescript": "^5.6.3", diff --git a/packages/foraging/src/index.ts b/packages/foraging/src/index.ts index f966667..b826fc4 100644 --- a/packages/foraging/src/index.ts +++ 
b/packages/foraging/src/index.ts @@ -1,7 +1,9 @@ -export { scanExamplesFs } from './scanner.js'; -export type { ScanFsOptions, ScanFsResult } from './scanner.js'; +export { scanExamples, scanExamplesFs } from './scanner.js'; +export type { ScanFsOptions, ScanFsResult, ScanOptions } from './scanner.js'; export { extract, readCapped } from './extractor.js'; export type { ExtractedShape } from './extractor.js'; +export { indexFoodSource } from './indexer.js'; +export type { IndexFoodSourceOptions } from './indexer.js'; export { redact } from './redact.js'; export type { ExampleManifestKind, diff --git a/packages/foraging/src/indexer.ts b/packages/foraging/src/indexer.ts new file mode 100644 index 0000000..9fbf6ad --- /dev/null +++ b/packages/foraging/src/indexer.ts @@ -0,0 +1,169 @@ +import { type Stats, readdirSync, statSync } from 'node:fs'; +import { join, relative } from 'node:path'; +import type { MemoryStore } from '@colony/core'; +import { readCapped } from './extractor.js'; +import { redact } from './redact.js'; +import { DEFAULT_SCAN_LIMITS, type FoodSource, type ForagedPattern } from './types.js'; + +export interface IndexFoodSourceOptions { + /** Session id that owns the foraged observations (scanner spawns one). */ + session_id: string; + max_file_bytes?: number; + extra_secret_env_names?: readonly string[]; +} + +/** + * Convert a discovered food source into 1–N `foraged-pattern` + * observations and persist them via `MemoryStore`. Returns the number + * of observations actually written. + * + * The function assumes the caller has already cleared stale observations + * for this (repo_root, example_name) — see `deleteForagedObservations` + * on `Storage`. Not clearing here lets the caller distinguish "same + * source, re-indexing" from "new source, first scan" in test assertions. + */ +export function indexFoodSource( + food: FoodSource, + store: MemoryStore, + opts: IndexFoodSourceOptions, +): number { + const maxBytes = opts.max_file_bytes ?? 
DEFAULT_SCAN_LIMITS.max_file_bytes; + const patterns = buildPatterns(food, maxBytes); + + let written = 0; + for (const p of patterns) { + const safe = redact(p.content, opts.extra_secret_env_names ?? []); + if (!safe.trim()) continue; + const id = store.addObservation({ + session_id: opts.session_id, + kind: 'foraged-pattern', + content: safe, + metadata: { + repo_root: food.repo_root, + example_name: food.example_name, + manifest_kind: food.manifest_kind, + file_path: p.file_path, + entry_kind: p.entry_kind, + }, + }); + if (id > 0) written += 1; + } + return written; +} + +/** + * Emit patterns in a stable order so the indexed observations sit in a + * predictable sequence: manifest first (highest signal for + * integration), README next (human prose with usage examples), + * entrypoints after (canonical call sites), filetree last (tail + * context). + */ +function buildPatterns(food: FoodSource, maxBytes: number): ForagedPattern[] { + const out: ForagedPattern[] = []; + + if (food.manifest_path) { + const text = readCapped(join(food.abs_path, food.manifest_path), maxBytes); + if (text !== null) { + out.push({ + example_name: food.example_name, + file_path: food.manifest_path, + entry_kind: 'manifest', + content: text, + }); + } + } + + if (food.readme_path) { + const text = readCapped(join(food.abs_path, food.readme_path), maxBytes); + if (text !== null) { + out.push({ + example_name: food.example_name, + file_path: food.readme_path, + entry_kind: 'readme', + content: text, + }); + } + } + + for (const ep of food.entrypoints) { + const text = readCapped(join(food.abs_path, ep), maxBytes); + if (text === null) continue; + out.push({ + example_name: food.example_name, + file_path: ep, + entry_kind: 'entrypoint', + content: text, + }); + } + + const tree = renderFiletree(food.abs_path); + if (tree) { + out.push({ + example_name: food.example_name, + file_path: '__filetree__', + entry_kind: 'filetree', + content: tree, + }); + } + + return out; +} + +/** + * Render 
a small, sorted two-line-per-dir outline of the example. + * Deliberately flat — deep directory trees get truncated by the caller + * (`max_files_per_source` on the scanner). The output is human-readable + * so when an agent calls `get_observations(ids[])` on a filetree + * observation they see something they can reason about. + */ +function renderFiletree(abs_path: string): string { + const lines: string[] = []; + const seenDirs = new Set(); + + function visit(dir: string, depth: number): void { + if (depth > 3 || lines.length > 200) return; + let entries: string[]; + try { + entries = readdirSync(dir).sort(); + } catch { + return; + } + for (const name of entries) { + if (SKIP_NAMES.has(name)) continue; + const abs = join(dir, name); + let st: Stats; + try { + st = statSync(abs); + } catch { + continue; + } + const rel = relative(abs_path, abs); + if (st.isDirectory()) { + if (!seenDirs.has(rel)) { + seenDirs.add(rel); + lines.push(`${rel}/`); + visit(abs, depth + 1); + } + } else if (st.isFile()) { + lines.push(rel); + } + } + } + + visit(abs_path, 0); + return lines.join('\n'); +} + +const SKIP_NAMES = new Set([ + 'node_modules', + '.git', + '.venv', + 'venv', + 'dist', + 'build', + 'target', + '.next', + '.turbo', + '.cache', + '__pycache__', +]); diff --git a/packages/foraging/src/scanner.ts b/packages/foraging/src/scanner.ts index ae61aa8..305d4ef 100644 --- a/packages/foraging/src/scanner.ts +++ b/packages/foraging/src/scanner.ts @@ -1,8 +1,10 @@ import { createHash } from 'node:crypto'; import { readdirSync, statSync } from 'node:fs'; import { join } from 'node:path'; +import type { MemoryStore } from '@colony/core'; import { type ExtractedShape, extract, readCapped } from './extractor.js'; -import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits } from './types.js'; +import { indexFoodSource } from './indexer.js'; +import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits, type ScanResult } from './types.js'; export interface ScanFsOptions { 
repo_root: string; @@ -94,3 +96,63 @@ function mergeLimits(partial?: Partial): ScanLimits { max_files_per_source: partial?.max_files_per_source ?? DEFAULT_SCAN_LIMITS.max_files_per_source, }; } + +export interface ScanOptions { + repo_root: string; + store: MemoryStore; + session_id: string; + limits?: Partial; + extra_secret_env_names?: readonly string[]; +} + +/** + * Storage-aware scan. For each discovered food source: check the + * cached `content_hash` on `storage.examples`. If unchanged, skip. + * Otherwise clear stale observations, re-index, and upsert the + * examples row with the new hash + observation count. + * + * Idempotent by construction: running twice on an unchanged tree + * yields the same result the second time (all skipped). A partial + * failure mid-index means the examples row is not upserted, so the + * next run treats the source as changed and retries cleanly. + */ +export function scanExamples(opts: ScanOptions): ScanResult { + const { scanned } = scanExamplesFs({ + repo_root: opts.repo_root, + ...(opts.limits !== undefined ? { limits: opts.limits } : {}), + }); + let skipped_unchanged = 0; + let indexed_observations = 0; + + for (const food of scanned) { + const existing = opts.store.storage.getExample(food.repo_root, food.example_name); + if (existing && existing.content_hash === food.content_hash) { + skipped_unchanged += 1; + continue; + } + + opts.store.storage.deleteForagedObservations(food.repo_root, food.example_name); + + const options: Parameters[2] = { + session_id: opts.session_id, + ...(opts.limits?.max_file_bytes !== undefined + ? { max_file_bytes: opts.limits.max_file_bytes } + : {}), + ...(opts.extra_secret_env_names !== undefined + ? 
{ extra_secret_env_names: opts.extra_secret_env_names } + : {}), + }; + const count = indexFoodSource(food, opts.store, options); + indexed_observations += count; + + opts.store.storage.upsertExample({ + repo_root: food.repo_root, + example_name: food.example_name, + content_hash: food.content_hash, + manifest_kind: food.manifest_kind, + observation_count: count, + }); + } + + return { scanned, skipped_unchanged, indexed_observations }; +} diff --git a/packages/foraging/test/indexer.test.ts b/packages/foraging/test/indexer.test.ts new file mode 100644 index 0000000..4d5fdc9 --- /dev/null +++ b/packages/foraging/test/indexer.test.ts @@ -0,0 +1,179 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { SettingsSchema } from '@colony/config'; +import { MemoryStore } from '@colony/core'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { indexFoodSource } from '../src/indexer.js'; +import { scanExamples } from '../src/scanner.js'; +import { scanExamplesFs } from '../src/scanner.js'; + +let repo: string; +let dbPath: string; +let store: MemoryStore; + +beforeEach(() => { + repo = mkdtempSync(join(tmpdir(), 'colony-index-')); + dbPath = join(repo, 'colony.db'); + const settings = SettingsSchema.parse({}); + store = new MemoryStore({ dbPath, settings }); + store.startSession({ id: 'session-forage', ide: 'claude-code', cwd: repo }); +}); + +afterEach(() => { + store.close(); + rmSync(repo, { recursive: true, force: true }); +}); + +function write(rel: string, contents: string): void { + const abs = join(repo, rel); + mkdirSync(join(abs, '..'), { recursive: true }); + writeFileSync(abs, contents); +} + +describe('indexFoodSource', () => { + it('writes observations for manifest, README, entrypoint, and filetree', () => { + write('examples/stripe/package.json', '{"name":"stripe"}'); + write('examples/stripe/README.md', '# stripe\nUsage 
example.'); + write('examples/stripe/src/index.ts', 'export const x = 1'); + + const { scanned } = scanExamplesFs({ repo_root: repo }); + const stripe = scanned.find((s) => s.example_name === 'stripe'); + expect(stripe).toBeDefined(); + if (!stripe) throw new Error('fixture missing'); + + const count = indexFoodSource(stripe, store, { session_id: 'session-forage' }); + expect(count).toBeGreaterThanOrEqual(4); + + const rows = store.storage.listForagedObservations(repo, 'stripe'); + const kinds = rows.map((r) => { + const md = r.metadata ? (JSON.parse(r.metadata) as { entry_kind: string }) : null; + return md?.entry_kind; + }); + expect(kinds).toContain('manifest'); + expect(kinds).toContain('readme'); + expect(kinds).toContain('entrypoint'); + expect(kinds).toContain('filetree'); + }); + + it('persists repo_root and example_name metadata so listForagedObservations can filter', () => { + write('examples/alpha/package.json', '{"name":"alpha"}'); + write('examples/beta/package.json', '{"name":"beta"}'); + + const { scanned } = scanExamplesFs({ repo_root: repo }); + for (const f of scanned) { + indexFoodSource(f, store, { session_id: 'session-forage' }); + } + + const alpha = store.storage.listForagedObservations(repo, 'alpha'); + const beta = store.storage.listForagedObservations(repo, 'beta'); + expect(alpha.length).toBeGreaterThan(0); + expect(beta.length).toBeGreaterThan(0); + + const alphaNames = new Set( + alpha.map((r) => { + const md = r.metadata ? 
(JSON.parse(r.metadata) as { example_name: string }) : null; + return md?.example_name; + }), + ); + expect(alphaNames).toEqual(new Set(['alpha'])); + }); + + it('scrubs env-assignment secrets that appear in indexed content', () => { + write('examples/leaky/package.json', '{"name":"leaky"}'); + write( + 'examples/leaky/README.md', + [ + '# leaky', + '', + 'Copy this into `.env`:', + '', + '```', + 'GITHUB_TOKEN=ghp_LEAKEDvalue', + 'NORMAL_FLAG=ok', + '```', + ].join('\n'), + ); + + const { scanned } = scanExamplesFs({ repo_root: repo }); + const leaky = scanned.find((s) => s.example_name === 'leaky'); + if (!leaky) throw new Error('fixture missing'); + indexFoodSource(leaky, store, { session_id: 'session-forage' }); + + const rows = store.storage.listForagedObservations(repo, 'leaky'); + for (const r of rows) { + expect(r.content).not.toContain('ghp_LEAKEDvalue'); + } + // Benign config survives. + const readmeRow = rows.find((r) => { + const md = r.metadata ? (JSON.parse(r.metadata) as { entry_kind: string }) : null; + return md?.entry_kind === 'readme'; + }); + expect(readmeRow).toBeDefined(); + }); +}); + +describe('scanExamples (storage-aware)', () => { + it('indexes on first run and skips on second run when unchanged', () => { + write('examples/one/package.json', '{"name":"one"}'); + write('examples/one/src/index.ts', 'export {}'); + + const first = scanExamples({ + repo_root: repo, + store, + session_id: 'session-forage', + }); + expect(first.skipped_unchanged).toBe(0); + expect(first.indexed_observations).toBeGreaterThan(0); + expect(store.storage.listExamples(repo)).toHaveLength(1); + + const second = scanExamples({ + repo_root: repo, + store, + session_id: 'session-forage', + }); + expect(second.skipped_unchanged).toBe(1); + expect(second.indexed_observations).toBe(0); + // No new observations beyond the first pass. 
+ expect(store.storage.listForagedObservations(repo, 'one')).toHaveLength( + first.indexed_observations, + ); + }); + + it('re-indexes and clears stale observations when content changes', () => { + write('examples/one/package.json', '{"name":"one"}'); + write('examples/one/src/index.ts', 'export {}'); + + scanExamples({ repo_root: repo, store, session_id: 'session-forage' }); + const before = store.storage.listForagedObservations(repo, 'one').length; + + // Mutate a file so the content_hash shifts. + write('examples/one/src/index.ts', 'export const y = 2 /* bigger */'); + + const result = scanExamples({ + repo_root: repo, + store, + session_id: 'session-forage', + }); + expect(result.skipped_unchanged).toBe(0); + expect(result.indexed_observations).toBeGreaterThan(0); + + const after = store.storage.listForagedObservations(repo, 'one'); + // Exactly one generation of observations, not two — the stale set + // must have been cleared before re-indexing. + expect(after.length).toBe(before); + }); + + it('caches observation_count on the examples row', () => { + write('examples/one/package.json', '{"name":"one"}'); + + const result = scanExamples({ + repo_root: repo, + store, + session_id: 'session-forage', + }); + const row = store.storage.getExample(repo, 'one'); + expect(row?.observation_count).toBe(result.indexed_observations); + expect(row?.content_hash).toBe(result.scanned[0]?.content_hash); + }); +}); diff --git a/packages/storage/src/storage.ts b/packages/storage/src/storage.ts index 403be0c..1e121c7 100644 --- a/packages/storage/src/storage.ts +++ b/packages/storage/src/storage.ts @@ -709,6 +709,41 @@ export class Storage { .run(repo_root, example_name); } + /** + * Drop every `foraged-pattern` observation that belongs to a food + * source. Called by the indexer before re-indexing a changed example — + * without it, each rescan would accumulate a parallel copy of the + * same content. Returns the number of rows deleted. 
+ */ + deleteForagedObservations(repo_root: string, example_name: string): number { + const info = this.db + .prepare( + `DELETE FROM observations + WHERE kind = 'foraged-pattern' + AND json_extract(metadata, '$.repo_root') = ? + AND json_extract(metadata, '$.example_name') = ?`, + ) + .run(repo_root, example_name); + return Number(info.changes); + } + + /** + * All `foraged-pattern` observations for a (repo_root, example_name). + * Ordered oldest→newest so the MCP `examples_query` consumers (and + * tests here) see a stable shape. + */ + listForagedObservations(repo_root: string, example_name: string): ObservationRow[] { + return this.db + .prepare( + `SELECT * FROM observations + WHERE kind = 'foraged-pattern' + AND json_extract(metadata, '$.repo_root') = ? + AND json_extract(metadata, '$.example_name') = ? + ORDER BY ts ASC`, + ) + .all(repo_root, example_name) as ObservationRow[]; + } + // --- observe / debrief analytics --- // // These are read-heavy queries serving the CLI dashboards. They stay on diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fc8c4ff..e3c7d07 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -267,6 +267,28 @@ importers: specifier: ^2.1.5 version: 2.1.9(@types/node@22.19.17) + packages/foraging: + dependencies: + '@colony/config': + specifier: workspace:* + version: link:../config + '@colony/core': + specifier: workspace:* + version: link:../core + '@colony/storage': + specifier: workspace:* + version: link:../storage + devDependencies: + tsup: + specifier: ^8.3.5 + version: 8.5.1(postcss@8.5.10)(tsx@4.21.0)(typescript@5.9.3) + typescript: + specifier: ^5.6.3 + version: 5.9.3 + vitest: + specifier: ^2.1.5 + version: 2.1.9(@types/node@22.19.17) + packages/hooks: dependencies: '@colony/config':