diff --git a/.changeset/foraging-skeleton.md b/.changeset/foraging-skeleton.md
new file mode 100644
index 0000000..2e1fd1b
--- /dev/null
+++ b/.changeset/foraging-skeleton.md
@@ -0,0 +1,18 @@
+---
+"@colony/foraging": minor
+---
+
+Introduce `@colony/foraging` package skeleton. Ships pure-fs primitives
+for foraging — scanning `/examples/<name>/` food sources,
+classifying each by manifest kind (`npm` / `pypi` / `cargo` / `go` /
+`unknown`), computing a change-signal `content_hash` over manifest +
+file tree, and best-effort redaction of common cloud-service secrets
+before anything hits storage.
+
+No storage writes, no MCP wiring, no hooks yet — those arrive in the
+follow-up PR. This layer stands alone so it can be unit-tested without
+dragging `MemoryStore` into the test fixture.
+
+Public API: `scanExamplesFs`, `extract`, `readCapped`, `redact`, plus
+the `FoodSource` / `ForagedPattern` / `IntegrationPlan` / `ScanLimits`
+types and `DEFAULT_SCAN_LIMITS` constants.
diff --git a/.changeset/storage-examples-table.md b/.changeset/storage-examples-table.md
new file mode 100644
index 0000000..0104059
--- /dev/null
+++ b/.changeset/storage-examples-table.md
@@ -0,0 +1,10 @@
+---
+"@colony/storage": minor
+---
+
+Add an `examples` table and `upsertExample` / `getExample` / `listExamples` /
+`deleteExample` methods to support the forthcoming `@colony/foraging`
+package. Each row caches the content hash and observation count for a
+`/examples/<name>` food source so repeat scans on
+`SessionStart` can skip unchanged directories without touching the
+observation table. Schema version bumped 6 → 7.
diff --git a/packages/foraging/package.json b/packages/foraging/package.json
new file mode 100644
index 0000000..a08a32e
--- /dev/null
+++ b/packages/foraging/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@colony/foraging",
+  "version": "0.0.0",
+  "license": "MIT",
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "files": ["dist"],
+  "scripts": {
+    "build": "tsup src/index.ts --format esm --dts --clean",
+    "dev": "tsup src/index.ts --format esm --dts --watch",
+    "test": "vitest run",
+    "typecheck": "tsc --noEmit"
+  },
+  "devDependencies": {
+    "tsup": "^8.3.5",
+    "typescript": "^5.6.3",
+    "vitest": "^2.1.5"
+  }
+}
diff --git a/packages/foraging/src/extractor.ts b/packages/foraging/src/extractor.ts
new file mode 100644
index 0000000..297b07c
--- /dev/null
+++ b/packages/foraging/src/extractor.ts
@@ -0,0 +1,146 @@
+import { type Stats, readFileSync, readdirSync, statSync } from 'node:fs';
+import { join, relative } from 'node:path';
+import type { ExampleManifestKind, ScanLimits } from './types.js';
+
+/**
+ * The subset of an `examples/<name>/` directory that the extractor can
+ * classify without reading every file byte. Paths are relative to `abs_path`.
+ */
+export interface ExtractedShape {
+  manifest_kind: ExampleManifestKind;
+  manifest_path: string | null;
+  readme_path: string | null;
+  entrypoints: string[];
+  /** Flat list of files visited — useful for `content_hash` computation. */
+  file_tree: Array<{ path: string; size: number }>;
+}
+
+const MANIFEST_BY_FILE: ReadonlyArray<{ name: string; kind: ExampleManifestKind }> = [
+  { name: 'package.json', kind: 'npm' },
+  { name: 'pyproject.toml', kind: 'pypi' },
+  { name: 'setup.py', kind: 'pypi' },
+  { name: 'requirements.txt', kind: 'pypi' },
+  { name: 'Cargo.toml', kind: 'cargo' },
+  { name: 'go.mod', kind: 'go' },
+];
+
+const README_NAMES: readonly string[] = [
+  'README.md',
+  'README.mdx',
+  'README.rst',
+  'README.txt',
+  'README',
+];
+
+const ENTRYPOINT_CANDIDATES: readonly string[] = [
+  'src/index.ts',
+  'src/index.tsx',
+  'src/index.js',
+  'src/index.mjs',
+  'src/main.ts',
+  'src/main.js',
+  'src/main.rs',
+  'src/main.go',
+  'src/main.py',
+  'index.ts',
+  'index.js',
+  'main.py',
+  'main.go',
+  'main.rs',
+];
+
+/**
+ * Scan a single food source directory and return its shape. The walk
+ * respects `limits` so pathological examples (node_modules copy, giant
+ * test fixtures) don't stall a SessionStart hook.
+ */
+export function extract(abs_path: string, limits: ScanLimits): ExtractedShape {
+  const file_tree = walk(abs_path, limits);
+  const relPaths = new Set(file_tree.map((f) => f.path));
+
+  const manifestHit = MANIFEST_BY_FILE.find((m) => relPaths.has(m.name));
+  const manifest_kind: ExampleManifestKind = manifestHit?.kind ?? 'unknown';
+  const manifest_path = manifestHit?.name ?? null;
+
+  const readme_path = README_NAMES.find((n) => relPaths.has(n)) ?? null;
+
+  const entrypoints = ENTRYPOINT_CANDIDATES.filter((c) => relPaths.has(c));
+
+  return { manifest_kind, manifest_path, readme_path, entrypoints, file_tree };
+}
+
+/**
+ * Small hand-rolled BFS because we want to (a) enforce depth, (b) stop
+ * at `max_files_per_source`, and (c) skip dependency caches by name
+ * without pulling in a globbing library. Ordering inside a directory is
+ * alphabetical (`readdirSync` is platform-dependent otherwise).
+ */
+function walk(root: string, limits: ScanLimits): Array<{ path: string; size: number }> {
+  const out: Array<{ path: string; size: number }> = [];
+  const queue: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];
+
+  while (queue.length > 0 && out.length < limits.max_files_per_source) {
+    const next = queue.shift();
+    if (!next) break;
+    const { dir, depth } = next;
+    let entries: string[];
+    try {
+      entries = readdirSync(dir);
+    } catch {
+      continue;
+    }
+    entries.sort();
+
+    for (const name of entries) {
+      if (out.length >= limits.max_files_per_source) break;
+      if (SKIP_NAMES.has(name)) continue;
+      const abs = join(dir, name);
+      let st: Stats;
+      try {
+        st = statSync(abs);
+      } catch {
+        continue;
+      }
+      const rel = relative(root, abs);
+      if (st.isDirectory()) {
+        if (depth + 1 < limits.max_depth) {
+          queue.push({ dir: abs, depth: depth + 1 });
+        }
+      } else if (st.isFile()) {
+        out.push({ path: rel, size: st.size });
+      }
+    }
+  }
+  return out;
+}
+
+const SKIP_NAMES = new Set([
+  'node_modules',
+  '.git',
+  '.venv',
+  'venv',
+  'dist',
+  'build',
+  'target',
+  '.next',
+  '.turbo',
+  '.cache',
+  '__pycache__',
+]);
+
+/**
+ * Read a manifest file and return its raw text capped at `max_file_bytes`.
+ * Returning null instead of throwing keeps the scanner tolerant of files
+ * that disappear mid-walk.
+ */
+export function readCapped(abs: string, max_file_bytes: number): string | null {
+  try {
+    const buf = readFileSync(abs);
+    if (buf.byteLength > max_file_bytes) {
+      return buf.subarray(0, max_file_bytes).toString('utf8');
+    }
+    return buf.toString('utf8');
+  } catch {
+    return null;
+  }
+}
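A minimal usage sketch of the extractor (not part of this diff): classify one example directory with the default limits. The `/tmp/my-repo` path is a placeholder; both imports come from the public API listed in the changeset above.

```ts
import { join } from 'node:path';
import { DEFAULT_SCAN_LIMITS, extract, readCapped } from '@colony/foraging';

const dir = join('/tmp/my-repo', 'examples', 'stripe-webhook'); // placeholder path
const shape = extract(dir, DEFAULT_SCAN_LIMITS);
if (shape.manifest_path) {
  // Capped read, so an oversized manifest can't blow the hook's time budget.
  const manifest = readCapped(join(dir, shape.manifest_path), DEFAULT_SCAN_LIMITS.max_file_bytes);
  console.log(shape.manifest_kind, manifest?.length ?? 0);
}
```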
diff --git a/packages/foraging/src/index.ts b/packages/foraging/src/index.ts
new file mode 100644
index 0000000..f966667
--- /dev/null
+++ b/packages/foraging/src/index.ts
@@ -0,0 +1,14 @@
+export { scanExamplesFs } from './scanner.js';
+export type { ScanFsOptions, ScanFsResult } from './scanner.js';
+export { extract, readCapped } from './extractor.js';
+export type { ExtractedShape } from './extractor.js';
+export { redact } from './redact.js';
+export type {
+  ExampleManifestKind,
+  FoodSource,
+  ForagedPattern,
+  IntegrationPlan,
+  ScanLimits,
+  ScanResult,
+} from './types.js';
+export { DEFAULT_SCAN_LIMITS } from './types.js';
+ */ +export function readCapped(abs: string, max_file_bytes: number): string | null { + try { + const buf = readFileSync(abs); + if (buf.byteLength > max_file_bytes) { + return buf.subarray(0, max_file_bytes).toString('utf8'); + } + return buf.toString('utf8'); + } catch { + return null; + } +} diff --git a/packages/foraging/src/index.ts b/packages/foraging/src/index.ts new file mode 100644 index 0000000..f966667 --- /dev/null +++ b/packages/foraging/src/index.ts @@ -0,0 +1,14 @@ +export { scanExamplesFs } from './scanner.js'; +export type { ScanFsOptions, ScanFsResult } from './scanner.js'; +export { extract, readCapped } from './extractor.js'; +export type { ExtractedShape } from './extractor.js'; +export { redact } from './redact.js'; +export type { + ExampleManifestKind, + FoodSource, + ForagedPattern, + IntegrationPlan, + ScanLimits, + ScanResult, +} from './types.js'; +export { DEFAULT_SCAN_LIMITS } from './types.js'; diff --git a/packages/foraging/src/redact.ts b/packages/foraging/src/redact.ts new file mode 100644 index 0000000..56d2544 --- /dev/null +++ b/packages/foraging/src/redact.ts @@ -0,0 +1,70 @@ +/** + * Best-effort secret scrubbing for foraged content. + * + * Examples directories often carry `.env.example`, API-key snippets in + * README blocks, or copy-pasted Dockerfile secrets that a well-meaning + * author forgot to trim. We do a conservative pass before the text + * reaches SQLite — enough to strip the obvious cases without trying to + * be a fully general DLP engine. + * + * The three tiers: + * 1. Common cloud / service env-var names whose values are tokens. + * 2. Long opaque base64/hex strings that sit on their own assignment. + * 3. Armored PEM blocks. + */ + +const DEFAULT_ENV_NAME_PATTERNS: readonly RegExp[] = [ + /AWS_[A-Z0-9_]*(?:KEY|SECRET|TOKEN)[A-Z0-9_]*/, + /GITHUB_TOKEN/, + /GH_TOKEN/, + /OPENAI_API_KEY/, + /ANTHROPIC_API_KEY/, + /HUGGINGFACE_[A-Z0-9_]*TOKEN/, + /SLACK_[A-Z0-9_]*TOKEN/, + /STRIPE_[A-Z0-9_]*KEY/, + /TWILIO_[A-Z0-9_]*TOKEN/, + /[A-Z0-9_]*(?:SECRET|PASSWORD|PRIVATE_KEY|ACCESS_KEY)[A-Z0-9_]*/, +]; + +const PEM_BLOCK = /-----BEGIN [A-Z ]+PRIVATE KEY-----[\s\S]*?-----END [A-Z ]+PRIVATE KEY-----/g; + +/** + * Scrubs the text in place. Emits `***REDACTED***` wherever a secret was + * removed so downstream readers can see *that* a redaction happened + * without seeing the value. + */ +export function redact(text: string, extraEnvNames: readonly string[] = []): string { + let out = text; + + // Tier 3 first — PEM blocks span many lines, easier to match before + // we start mangling assignment lines. + out = out.replace(PEM_BLOCK, '***REDACTED_PRIVATE_KEY***'); + + // Tier 1 — env-var-like assignments. Match `FOO_SECRET=value` and + // `FOO_SECRET: "value"` in both .env and YAML forms, then zero the + // value while keeping the key for context. + const extraPatterns = extraEnvNames.map((n) => new RegExp(`^${escapeRegex(n)}$`, 'i')); + const envMatchers = [...DEFAULT_ENV_NAME_PATTERNS, ...extraPatterns]; + out = out + .split('\n') + .map((line) => redactEnvLine(line, envMatchers)) + .join('\n'); + + return out; +} + +function redactEnvLine(line: string, matchers: readonly RegExp[]): string { + const match = line.match(/^(\s*)([A-Z][A-Z0-9_]*)(\s*[:=]\s*)(.*)$/); + if (!match) return line; + const indent = match[1] ?? ''; + const name = match[2]; + const sep = match[3]; + if (!name || !sep) return line; + if (!matchers.some((re) => re.test(name))) return line; + // Keep the key + separator for debugging context; drop the value. 
diff --git a/packages/foraging/src/scanner.ts b/packages/foraging/src/scanner.ts
new file mode 100644
index 0000000..ae61aa8
--- /dev/null
+++ b/packages/foraging/src/scanner.ts
@@ -0,0 +1,96 @@
+import { createHash } from 'node:crypto';
+import { readdirSync, statSync } from 'node:fs';
+import { join } from 'node:path';
+import { type ExtractedShape, extract, readCapped } from './extractor.js';
+import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits } from './types.js';
+
+export interface ScanFsOptions {
+  repo_root: string;
+  limits?: Partial<ScanLimits>;
+}
+
+export interface ScanFsResult {
+  scanned: FoodSource[];
+}
+
+/**
+ * Discover food sources on disk without touching storage. Storage-aware
+ * `scanExamples` (next PR) wraps this and decides which of the returned
+ * sources to actually index based on `storage.getExample` hashes.
+ *
+ * Decoupling is deliberate: (a) the fs walk is pure and easy to test in
+ * isolation, (b) the storage-aware wrapper can stay a thin orchestrator
+ * with no fs logic of its own.
+ */
+export function scanExamplesFs(opts: ScanFsOptions): ScanFsResult {
+  const limits = mergeLimits(opts.limits);
+  const examplesDir = join(opts.repo_root, 'examples');
+
+  let names: string[];
+  try {
+    names = readdirSync(examplesDir);
+  } catch {
+    return { scanned: [] };
+  }
+  names.sort();
+
+  const scanned: FoodSource[] = [];
+  for (const example_name of names) {
+    const abs_path = join(examplesDir, example_name);
+    let isDir = false;
+    try {
+      isDir = statSync(abs_path).isDirectory();
+    } catch {
+      continue;
+    }
+    if (!isDir) continue;
+
+    const shape = extract(abs_path, limits);
+    const content_hash = computeContentHash(abs_path, shape, limits);
+    scanned.push({
+      repo_root: opts.repo_root,
+      example_name,
+      abs_path,
+      manifest_kind: shape.manifest_kind,
+      manifest_path: shape.manifest_path,
+      readme_path: shape.readme_path,
+      entrypoints: shape.entrypoints,
+      content_hash,
+    });
+  }
+  return { scanned };
+}
+
+/**
+ * Stable hash of (manifest bytes, sorted {path,size} pairs). Chosen
+ * over "hash every file" because the hash runs on every SessionStart
+ * and must finish in milliseconds. Size + path shifts are a sufficient
+ * change signal: most edits to a tracked file move its size, a rename
+ * moves the path, a new file moves the set. A same-length edit (or a
+ * bare touch) will miss — acceptable since the cached observations
+ * already encode the meaningful content.
+ */
+function computeContentHash(abs_path: string, shape: ExtractedShape, limits: ScanLimits): string {
+  const hash = createHash('sha256');
+  if (shape.manifest_path) {
+    const manifest = readCapped(join(abs_path, shape.manifest_path), limits.max_file_bytes);
+    if (manifest !== null) {
+      hash.update(`manifest:${shape.manifest_path}\n`);
+      hash.update(manifest);
+      hash.update('\n');
+    }
+  }
+  hash.update('filetree:\n');
+  for (const f of shape.file_tree.slice().sort((a, b) => a.path.localeCompare(b.path))) {
+    hash.update(`${f.path}\t${f.size}\n`);
+  }
+  return hash.digest('hex');
+}
+
+function mergeLimits(partial?: Partial<ScanLimits>): ScanLimits {
+  return {
+    max_depth: partial?.max_depth ?? DEFAULT_SCAN_LIMITS.max_depth,
+    max_file_bytes: partial?.max_file_bytes ?? DEFAULT_SCAN_LIMITS.max_file_bytes,
+    max_files_per_source: partial?.max_files_per_source ??
+      DEFAULT_SCAN_LIMITS.max_files_per_source,
+  };
+}
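A sketch of the pure-fs discovery pass on its own (not part of this diff). The `limits` object is a partial override; unspecified fields fall back to `DEFAULT_SCAN_LIMITS`:

```ts
import { scanExamplesFs } from '@colony/foraging';

const { scanned } = scanExamplesFs({
  repo_root: '/tmp/my-repo', // placeholder path
  limits: { max_files_per_source: 25 },
});
for (const source of scanned) {
  // content_hash is the change signal the storage-aware wrapper will diff.
  console.log(source.example_name, source.manifest_kind, source.content_hash.slice(0, 12));
}
```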
diff --git a/packages/foraging/src/types.ts b/packages/foraging/src/types.ts
new file mode 100644
index 0000000..b15a094
--- /dev/null
+++ b/packages/foraging/src/types.ts
@@ -0,0 +1,73 @@
+/**
+ * Foraging's domain model: an `examples/<name>/` directory is a "food
+ * source" an agent can forage from. Scanner discovers food sources,
+ * extractor classifies their shape, indexer (PR 3) turns the shape into
+ * observations. This module owns the type boundary between those stages.
+ */
+
+export type ExampleManifestKind = 'npm' | 'pypi' | 'cargo' | 'go' | 'unknown';
+
+/**
+ * What a single `/examples/<name>/` looks like after the
+ * extractor classifies it. Paths inside are *relative to abs_path* so
+ * the indexer can stitch them onto whatever `repo_root` it receives
+ * later without re-walking.
+ */
+export interface FoodSource {
+  repo_root: string;
+  example_name: string;
+  abs_path: string;
+  manifest_kind: ExampleManifestKind;
+  manifest_path: string | null;
+  readme_path: string | null;
+  entrypoints: string[];
+  content_hash: string;
+}
+
+/**
+ * A unit of content the indexer will persist as one observation. Stays
+ * intentionally minimal: the redacted, pre-compress text plus enough
+ * metadata for `examples_query` to filter without a JOIN.
+ */
+export interface ForagedPattern {
+  example_name: string;
+  file_path: string;
+  entry_kind: 'manifest' | 'readme' | 'filetree' | 'entrypoint' | 'config';
+  content: string;
+}
+
+/**
+ * Deterministic plan handed to an agent by `examples_integrate_plan`.
+ * No LLM in the loop — the plan is derived from the example's manifest
+ * diffed against the target repo's manifest. `uncertainty_notes`
+ * captures everything the planner couldn't resolve so the agent knows
+ * where to apply judgement.
+ */
+export interface IntegrationPlan {
+  example_name: string;
+  dependency_delta: {
+    add: Record<string, string>;
+    remove: string[];
+  };
+  files_to_copy: Array<{ from: string; to_suggestion: string; rationale: string }>;
+  config_steps: string[];
+  uncertainty_notes: string[];
+}
+
+export interface ScanResult {
+  scanned: FoodSource[];
+  skipped_unchanged: number;
+  indexed_observations: number;
+}
+
+export interface ScanLimits {
+  max_depth: number;
+  max_file_bytes: number;
+  max_files_per_source: number;
+}
+
+export const DEFAULT_SCAN_LIMITS: ScanLimits = {
+  max_depth: 2,
+  max_file_bytes: 200_000,
+  max_files_per_source: 50,
+};
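Since `IntegrationPlan` is only produced in the follow-up PR, a hypothetical value helps pin down the shape. Every field value below is invented for illustration:

```ts
import type { IntegrationPlan } from '@colony/foraging';

// Hypothetical plan for an npm example; values are illustrative only.
const plan: IntegrationPlan = {
  example_name: 'stripe-webhook',
  dependency_delta: { add: { stripe: '^14.0.0' }, remove: [] },
  files_to_copy: [
    { from: 'src/webhook.ts', to_suggestion: 'src/payments/webhook.ts', rationale: 'core handler' },
  ],
  config_steps: ['Set STRIPE_WEBHOOK_SECRET in the deployment environment.'],
  uncertainty_notes: ['Target repo already depends on stripe@^13; the version bump needs review.'],
};
```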
+ */ +export interface IntegrationPlan { + example_name: string; + dependency_delta: { + add: Record; + remove: string[]; + }; + files_to_copy: Array<{ from: string; to_suggestion: string; rationale: string }>; + config_steps: string[]; + uncertainty_notes: string[]; +} + +export interface ScanResult { + scanned: FoodSource[]; + skipped_unchanged: number; + indexed_observations: number; +} + +export interface ScanLimits { + max_depth: number; + max_file_bytes: number; + max_files_per_source: number; +} + +export const DEFAULT_SCAN_LIMITS: ScanLimits = { + max_depth: 2, + max_file_bytes: 200_000, + max_files_per_source: 50, +}; diff --git a/packages/foraging/test/extractor.test.ts b/packages/foraging/test/extractor.test.ts new file mode 100644 index 0000000..1ad56c6 --- /dev/null +++ b/packages/foraging/test/extractor.test.ts @@ -0,0 +1,87 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { extract, readCapped } from '../src/extractor.js'; +import { DEFAULT_SCAN_LIMITS } from '../src/types.js'; + +let dir: string; + +beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'colony-forage-extract-')); +}); + +afterEach(() => { + rmSync(dir, { recursive: true, force: true }); +}); + +function write(rel: string, contents: string): void { + const abs = join(dir, rel); + mkdirSync(join(abs, '..'), { recursive: true }); + writeFileSync(abs, contents); +} + +describe('extract', () => { + it('returns unknown kind with null manifest on an empty directory', () => { + const shape = extract(dir, DEFAULT_SCAN_LIMITS); + expect(shape.manifest_kind).toBe('unknown'); + expect(shape.manifest_path).toBeNull(); + expect(shape.readme_path).toBeNull(); + expect(shape.entrypoints).toEqual([]); + expect(shape.file_tree).toEqual([]); + }); + + it('picks package.json over ambiguous markers and tags it as npm', () => { + write('package.json', '{"name":"ex"}'); + write('README.md', '# ex'); + write('src/index.ts', 'export {}'); + + const shape = extract(dir, DEFAULT_SCAN_LIMITS); + expect(shape.manifest_kind).toBe('npm'); + expect(shape.manifest_path).toBe('package.json'); + expect(shape.readme_path).toBe('README.md'); + expect(shape.entrypoints).toEqual(['src/index.ts']); + }); + + it('records file_tree entries with sizes', () => { + write('Cargo.toml', '[package]\nname = "x"'); + write('src/main.rs', 'fn main() {}'); + + const shape = extract(dir, DEFAULT_SCAN_LIMITS); + const cargo = shape.file_tree.find((f) => f.path === 'Cargo.toml'); + expect(cargo?.size).toBeGreaterThan(0); + expect(shape.file_tree.some((f) => f.path === 'src/main.rs')).toBe(true); + }); + + it('stops walking at max_depth', () => { + write('a.txt', 'a'); + write('nested/b.txt', 'b'); + write('nested/deep/c.txt', 'c'); + + const shape = extract(dir, { ...DEFAULT_SCAN_LIMITS, max_depth: 1 }); + // Only the top-level file is visible at depth 1 (root is depth 0 and we + // recurse when depth+1 < max_depth, so max_depth=1 means "no children"). 
diff --git a/packages/foraging/test/redact.test.ts b/packages/foraging/test/redact.test.ts
new file mode 100644
index 0000000..b6c22a8
--- /dev/null
+++ b/packages/foraging/test/redact.test.ts
@@ -0,0 +1,63 @@
+import { describe, expect, it } from 'vitest';
+import { redact } from '../src/redact.js';
+
+describe('redact', () => {
+  it('is a no-op on content with no secret signals', () => {
+    const input = 'Plain README body.\n\nNo secrets here.';
+    expect(redact(input)).toBe(input);
+  });
+
+  it('masks common cloud token assignments while keeping the key', () => {
+    const input = [
+      'AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE',
+      'GITHUB_TOKEN=ghp_secretvaluehere',
+      'OPENAI_API_KEY: "sk-proj-abcdef"',
+      'NORMAL_VAR=value',
+    ].join('\n');
+
+    const out = redact(input);
+    expect(out).toContain('AWS_ACCESS_KEY_ID=***REDACTED***');
+    expect(out).toContain('GITHUB_TOKEN=***REDACTED***');
+    expect(out).toContain('OPENAI_API_KEY: ***REDACTED***');
+    // Non-matching keys pass through unchanged.
+    expect(out).toContain('NORMAL_VAR=value');
+    // Original secret values must not survive.
+    expect(out).not.toContain('AKIAIOSFODNN7EXAMPLE');
+    expect(out).not.toContain('ghp_secretvaluehere');
+    expect(out).not.toContain('sk-proj-abcdef');
+  });
+
+  it('redacts armored PEM private-key blocks', () => {
+    const input = [
+      'Header line',
+      '-----BEGIN RSA PRIVATE KEY-----',
+      'MIIEowIBAAKCAQEAv9',
+      '-----END RSA PRIVATE KEY-----',
+      'Trailing line',
+    ].join('\n');
+
+    const out = redact(input);
+    expect(out).toContain('***REDACTED_PRIVATE_KEY***');
+    expect(out).not.toContain('MIIEowIBAAKCAQEAv9');
+    expect(out).toContain('Header line');
+    expect(out).toContain('Trailing line');
+  });
+
+  it('matches caller-supplied extra env names', () => {
+    const input = 'APP_SIGNING_SEED=super-secret-value';
+    const out = redact(input, ['APP_SIGNING_SEED']);
+    expect(out).toBe('APP_SIGNING_SEED=***REDACTED***');
+  });
+
+  it('catches generic *_SECRET / *_PASSWORD / *_PRIVATE_KEY names', () => {
+    const input = [
+      'MY_DB_PASSWORD=pw',
+      'STRIPE_WEBHOOK_SECRET=whsec_live',
+      'APP_PRIVATE_KEY=pk',
+    ].join('\n');
+    const out = redact(input);
+    expect(out).not.toContain('pw');
+    expect(out).not.toContain('whsec_live');
+    expect(out.split('\n').every((l) => l.includes('***REDACTED***'))).toBe(true);
+  });
+});
diff --git a/packages/foraging/test/scanner.test.ts b/packages/foraging/test/scanner.test.ts
new file mode 100644
index 0000000..c0590d5
--- /dev/null
+++ b/packages/foraging/test/scanner.test.ts
@@ -0,0 +1,114 @@
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { scanExamplesFs } from '../src/scanner.js';
+
+let repo: string;
+
+beforeEach(() => {
+  repo = mkdtempSync(join(tmpdir(), 'colony-forage-'));
+});
+
+afterEach(() => {
+  rmSync(repo, { recursive: true, force: true });
+});
+
+function write(rel: string, contents: string): void {
+  const abs = join(repo, rel);
+  mkdirSync(join(abs, '..'), { recursive: true });
+  writeFileSync(abs, contents);
+}
+
+describe('scanExamplesFs', () => {
+  it('returns an empty list when /examples does not exist', () => {
+    const result = scanExamplesFs({ repo_root: repo });
+    expect(result.scanned).toEqual([]);
+  });
+
+  it('discovers each subdirectory as a food source', () => {
+    write('examples/stripe/package.json', '{"name":"stripe"}');
+    write('examples/stripe/src/index.ts', 'export {}');
+    write('examples/rust-cli/Cargo.toml', '[package]\nname = "rust-cli"');
+    write('examples/rust-cli/src/main.rs', 'fn main() {}');
+
+    const { scanned } = scanExamplesFs({ repo_root: repo });
+    expect(scanned.map((s) => s.example_name)).toEqual(['rust-cli', 'stripe']);
+
+    const stripe = scanned.find((s) => s.example_name === 'stripe');
+    expect(stripe?.manifest_kind).toBe('npm');
+    expect(stripe?.manifest_path).toBe('package.json');
+    expect(stripe?.entrypoints).toContain('src/index.ts');
+
+    const rust = scanned.find((s) => s.example_name === 'rust-cli');
+    expect(rust?.manifest_kind).toBe('cargo');
+    expect(rust?.entrypoints).toContain('src/main.rs');
+  });
+
+  it('classifies pypi / go / unknown manifest kinds', () => {
+    write('examples/py/pyproject.toml', '[project]\nname = "py"');
+    write('examples/goapp/go.mod', 'module goapp');
+    write('examples/bare/hello.txt', 'hi');
+
+    const { scanned } = scanExamplesFs({ repo_root: repo });
+    expect(scanned.find((s) => s.example_name === 'py')?.manifest_kind).toBe('pypi');
+    expect(scanned.find((s) => s.example_name === 'goapp')?.manifest_kind).toBe('go');
+    expect(scanned.find((s) => s.example_name === 'bare')?.manifest_kind).toBe('unknown');
+  });
+
+  it('content_hash is stable across repeat scans of identical trees', () => {
+    write('examples/one/package.json', '{"name":"one"}');
+    write('examples/one/src/index.ts', 'export const x = 1');
+
+    const first = scanExamplesFs({ repo_root: repo }).scanned[0];
+    const second = scanExamplesFs({ repo_root: repo }).scanned[0];
+    expect(first?.content_hash).toBeDefined();
+    expect(first?.content_hash).toBe(second?.content_hash);
+  });
+
+  it('content_hash changes when a tracked file size changes', () => {
+    write('examples/one/package.json', '{"name":"one"}');
+    write('examples/one/src/index.ts', 'export const x = 1');
+
+    const before = scanExamplesFs({ repo_root: repo }).scanned[0]?.content_hash;
+
+    write('examples/one/src/index.ts', 'export const x = 1 /* edited */');
+
+    const after = scanExamplesFs({ repo_root: repo }).scanned[0]?.content_hash;
+    expect(after).not.toBe(before);
+  });
+
+  it('picks up README and notes it on the food source', () => {
+    write('examples/readme-only/README.md', '# hi');
+    write('examples/readme-only/package.json', '{"name":"r"}');
+
+    const source = scanExamplesFs({ repo_root: repo }).scanned[0];
+    expect(source?.readme_path).toBe('README.md');
+  });
+
+  it('honors max_files_per_source by stopping traversal early', () => {
+    for (let i = 0; i < 10; i++) {
+      write(`examples/many/src/f${i}.ts`, `// ${i}`);
+    }
+    write('examples/many/package.json', '{"name":"many"}');
+
+    const { scanned } = scanExamplesFs({
+      repo_root: repo,
+      limits: { max_files_per_source: 3 },
+    });
+    // The hash must still be computed; content_hash presence is the proof
+    // the walk terminated cleanly rather than scanning all 11 files.
+    expect(scanned[0]?.content_hash).toBeDefined();
+  });
+
+  it('ignores node_modules and other skip-listed directories', () => {
+    write('examples/app/package.json', '{"name":"app"}');
+    write('examples/app/node_modules/dep/index.js', '// should be ignored');
+    write('examples/app/src/index.ts', 'export {}');
+
+    const source = scanExamplesFs({ repo_root: repo }).scanned[0];
+    // Must not see node_modules via entrypoint list; src/index.ts must.
+    expect(source?.entrypoints).toContain('src/index.ts');
+    expect(source?.entrypoints.some((e) => e.includes('node_modules'))).toBe(false);
+  });
+});
diff --git a/packages/foraging/tsconfig.json b/packages/foraging/tsconfig.json
new file mode 100644
index 0000000..1e20cd9
--- /dev/null
+++ b/packages/foraging/tsconfig.json
@@ -0,0 +1,7 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "dist"
+  },
+  "include": ["src"]
+}
diff --git a/packages/storage/src/index.ts b/packages/storage/src/index.ts
index 2a91627..7c34520 100644
--- a/packages/storage/src/index.ts
+++ b/packages/storage/src/index.ts
@@ -20,4 +20,7 @@ export type {
   ReinforcementKind,
   AgentProfileRow,
   NewAgentProfile,
+  ExampleRow,
+  NewExample,
+  ExampleManifestKind,
 } from './types.js';
diff --git a/packages/storage/src/schema.ts b/packages/storage/src/schema.ts
index 942f883..cfd08ac 100644
--- a/packages/storage/src/schema.ts
+++ b/packages/storage/src/schema.ts
@@ -164,7 +164,24 @@ CREATE TABLE IF NOT EXISTS agent_profiles (
   updated_at INTEGER NOT NULL
 );
 
-INSERT OR IGNORE INTO schema_version(version) VALUES (6);
+-- Foraging food sources: one row per indexed /examples/<name>.
+-- content_hash is sha256 over (manifest bytes + file-tree paths and sizes); the
+-- scanner uses it to skip work on repeat SessionStarts. observation_count
+-- is cached here so listExamples doesn't need to fan out into observations
+-- just to render the session-start preface.
+CREATE TABLE IF NOT EXISTS examples (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  repo_root TEXT NOT NULL,
+  example_name TEXT NOT NULL,
+  content_hash TEXT NOT NULL,
+  manifest_kind TEXT,
+  last_scanned_at INTEGER NOT NULL,
+  observation_count INTEGER NOT NULL DEFAULT 0,
+  UNIQUE(repo_root, example_name)
+);
+CREATE INDEX IF NOT EXISTS idx_examples_repo ON examples(repo_root);
+
+INSERT OR IGNORE INTO schema_version(version) VALUES (7);
 `;
 
 /**
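The denormalized `observation_count` is easiest to see in a sketch (not part of this diff). Assuming a `Storage` instance, a session-start preface can be rendered from this table alone; the formatting below is invented:

```ts
import { Storage } from '@colony/storage';

// Sketch only: one preface line per food source, without touching the
// observation table, since observation_count is already cached here.
function examplesPreface(storage: Storage, repoRoot: string): string {
  return storage
    .listExamples(repoRoot)
    .map((r) => `- ${r.example_name} (${r.manifest_kind ?? 'unknown'}): ${r.observation_count} observations`)
    .join('\n');
}
```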
diff --git a/packages/storage/src/storage.ts b/packages/storage/src/storage.ts
index 549e054..403be0c 100644
--- a/packages/storage/src/storage.ts
+++ b/packages/storage/src/storage.ts
@@ -4,7 +4,9 @@ import Database from 'better-sqlite3';
 import { COLUMN_MIGRATIONS, POST_MIGRATION_SQL, SCHEMA_SQL } from './schema.js';
 import type {
   AgentProfileRow,
+  ExampleRow,
   NewAgentProfile,
+  NewExample,
   NewObservation,
   NewPheromone,
   NewProposal,
@@ -651,6 +653,62 @@
   return this.db.transaction(fn)();
 }
 
+  // --- foraging food sources (indexed /examples/<name>) ---
+
+  /**
+   * Insert-or-replace an `examples` row for a (repo_root, example_name).
+   * The scanner owns `content_hash` semantics — we accept whatever it
+   * computes and last-writer-wins. Replacing the row (rather than merging)
+   * matches the data's identity: a food source is defined by its current
+   * content, so stale metadata must not survive a rescan.
+   */
+  upsertExample(e: NewExample): number {
+    const now = e.last_scanned_at ?? Date.now();
+    const info = this.db
+      .prepare(
+        `INSERT INTO examples(repo_root, example_name, content_hash, manifest_kind,
+           last_scanned_at, observation_count)
+         VALUES (?, ?, ?, ?, ?, ?)
+         ON CONFLICT(repo_root, example_name) DO UPDATE SET
+           content_hash = excluded.content_hash,
+           manifest_kind = excluded.manifest_kind,
+           last_scanned_at = excluded.last_scanned_at,
+           observation_count = excluded.observation_count
+         RETURNING id`,
+      )
+      .get(
+        e.repo_root,
+        e.example_name,
+        e.content_hash,
+        e.manifest_kind,
+        now,
+        e.observation_count ?? 0,
+      ) as { id: number };
+    return info.id;
+  }
+
+  /** One example row for (repo_root, example_name) or undefined. */
+  getExample(repo_root: string, example_name: string): ExampleRow | undefined {
+    return this.db
+      .prepare('SELECT * FROM examples WHERE repo_root = ? AND example_name = ?')
+      .get(repo_root, example_name) as ExampleRow | undefined;
+  }
+
+  /** Every example for a repo, newest-scan-first. */
+  listExamples(repo_root: string): ExampleRow[] {
+    return this.db
+      .prepare('SELECT * FROM examples WHERE repo_root = ? ORDER BY last_scanned_at DESC')
+      .all(repo_root) as ExampleRow[];
+  }
+
+  /** Delete a single food source's row. Observations are kept — the caller
+   * (CLI `foraging clear`) decides whether to purge those separately. */
+  deleteExample(repo_root: string, example_name: string): void {
+    this.db
+      .prepare('DELETE FROM examples WHERE repo_root = ? AND example_name = ?')
+      .run(repo_root, example_name);
+  }
+
   // --- observe / debrief analytics ---
   //
   // These are read-heavy queries serving the CLI dashboards. They stay on
diff --git a/packages/storage/src/types.ts b/packages/storage/src/types.ts
index 6245d97..676768c 100644
--- a/packages/storage/src/types.ts
+++ b/packages/storage/src/types.ts
@@ -163,3 +163,24 @@ export interface SearchHit {
   score: number;
   ts: number;
 }
+
+export type ExampleManifestKind = 'npm' | 'pypi' | 'cargo' | 'go' | 'unknown';
+
+export interface ExampleRow {
+  id: number;
+  repo_root: string;
+  example_name: string;
+  content_hash: string;
+  manifest_kind: ExampleManifestKind | null;
+  last_scanned_at: number;
+  observation_count: number;
+}
+
+export interface NewExample {
+  repo_root: string;
+  example_name: string;
+  content_hash: string;
+  manifest_kind: ExampleManifestKind | null;
+  observation_count?: number;
+  last_scanned_at?: number;
+}
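How these methods meet the scanner is deferred to the follow-up PR; below is a sketch of the skip-unchanged loop they imply. The name `scanExamples` comes from the scanner.ts comment, but this body is an assumption, not shipped code:

```ts
import { scanExamplesFs } from '@colony/foraging';
import { Storage } from '@colony/storage';

// Sketch only: compare fresh content_hash against the cached row and
// skip unchanged sources; re-upsert the rest.
function scanExamples(storage: Storage, repo_root: string): void {
  const { scanned } = scanExamplesFs({ repo_root });
  for (const source of scanned) {
    const cached = storage.getExample(repo_root, source.example_name);
    if (cached?.content_hash === source.content_hash) continue; // unchanged, skip
    storage.upsertExample({
      repo_root,
      example_name: source.example_name,
      content_hash: source.content_hash,
      manifest_kind: source.manifest_kind,
    });
  }
}
```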
diff --git a/packages/storage/test/examples.test.ts b/packages/storage/test/examples.test.ts
new file mode 100644
index 0000000..c4818bb
--- /dev/null
+++ b/packages/storage/test/examples.test.ts
@@ -0,0 +1,136 @@
+import { mkdtempSync, rmSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { Storage } from '../src/index.js';
+
+let dir: string;
+let storage: Storage;
+
+beforeEach(() => {
+  dir = mkdtempSync(join(tmpdir(), 'colony-examples-'));
+  storage = new Storage(join(dir, 'test.db'));
+});
+
+afterEach(() => {
+  storage.close();
+  rmSync(dir, { recursive: true, force: true });
+});
+
+describe('Storage — examples (foraging food sources)', () => {
+  it('upsert inserts a new row and returns its id', () => {
+    const id = storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'stripe-webhook',
+      content_hash: 'hash-1',
+      manifest_kind: 'npm',
+      observation_count: 5,
+      last_scanned_at: 1_000,
+    });
+    expect(id).toBeGreaterThan(0);
+
+    const row = storage.getExample('/repo/a', 'stripe-webhook');
+    expect(row).toMatchObject({
+      id,
+      repo_root: '/repo/a',
+      example_name: 'stripe-webhook',
+      content_hash: 'hash-1',
+      manifest_kind: 'npm',
+      observation_count: 5,
+      last_scanned_at: 1_000,
+    });
+  });
+
+  it('upsert replaces content_hash, manifest_kind, observation_count, last_scanned_at on conflict', () => {
+    const firstId = storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'stripe-webhook',
+      content_hash: 'hash-1',
+      manifest_kind: 'npm',
+      observation_count: 5,
+      last_scanned_at: 1_000,
+    });
+    const secondId = storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'stripe-webhook',
+      content_hash: 'hash-2',
+      manifest_kind: 'npm',
+      observation_count: 7,
+      last_scanned_at: 2_000,
+    });
+
+    // Same natural key → same row id, not a new one.
+    expect(secondId).toBe(firstId);
+
+    const row = storage.getExample('/repo/a', 'stripe-webhook');
+    expect(row).toMatchObject({
+      id: firstId,
+      content_hash: 'hash-2',
+      observation_count: 7,
+      last_scanned_at: 2_000,
+    });
+  });
+
+  it('listExamples returns rows for the repo, newest-scan-first', () => {
+    storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'older',
+      content_hash: 'h-a',
+      manifest_kind: 'npm',
+      last_scanned_at: 1_000,
+    });
+    storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'newer',
+      content_hash: 'h-b',
+      manifest_kind: 'cargo',
+      last_scanned_at: 2_000,
+    });
+    storage.upsertExample({
+      repo_root: '/repo/b',
+      example_name: 'other-repo',
+      content_hash: 'h-c',
+      manifest_kind: 'go',
+      last_scanned_at: 3_000,
+    });
+
+    const rows = storage.listExamples('/repo/a');
+    expect(rows).toHaveLength(2);
+    expect(rows.map((r) => r.example_name)).toEqual(['newer', 'older']);
+  });
+
+  it('deleteExample removes the row without affecting others', () => {
+    storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'keep',
+      content_hash: 'h-k',
+      manifest_kind: 'npm',
+      last_scanned_at: 1_000,
+    });
+    storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'drop',
+      content_hash: 'h-d',
+      manifest_kind: 'npm',
+      last_scanned_at: 1_000,
+    });
+
+    storage.deleteExample('/repo/a', 'drop');
+    expect(storage.getExample('/repo/a', 'drop')).toBeUndefined();
+    expect(storage.getExample('/repo/a', 'keep')).toBeDefined();
+  });
+
+  it('accepts null manifest_kind and defaults observation_count to 0', () => {
+    storage.upsertExample({
+      repo_root: '/repo/a',
+      example_name: 'unknown-kind',
+      content_hash: 'h',
+      manifest_kind: null,
+      last_scanned_at: 1_000,
+    });
+
+    const row = storage.getExample('/repo/a', 'unknown-kind');
+    expect(row?.manifest_kind).toBeNull();
+    expect(row?.observation_count).toBe(0);
+  });
+});
diff --git a/tsconfig.base.json b/tsconfig.base.json
index 501ead5..d7abc38 100644
--- a/tsconfig.base.json
+++ b/tsconfig.base.json
@@ -25,6 +25,7 @@
       "@colony/config": ["./packages/config/src/index.ts"],
       "@colony/core": ["./packages/core/src/index.ts"],
       "@colony/embedding": ["./packages/embedding/src/index.ts"],
+      "@colony/foraging": ["./packages/foraging/src/index.ts"],
       "@colony/storage": ["./packages/storage/src/index.ts"],
       "@colony/hooks": ["./packages/hooks/src/index.ts"],
       "@colony/installers": ["./packages/installers/src/index.ts"],