Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .changeset/foraging-skeleton.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
"@colony/foraging": minor
---

Introduce `@colony/foraging` package skeleton. Ships pure-fs primitives
for foraging — scanning `<repo_root>/examples/<name>/` food sources,
classifying each by manifest kind (`npm` / `pypi` / `cargo` / `go` /
`unknown`), computing a change-signal `content_hash` over manifest +
file tree, and best-effort redaction of common cloud-service secrets
before anything hits storage.

No storage writes, no MCP wiring, no hooks yet — those arrive in the
follow-up PR. This layer stands alone so it can be unit-tested without
dragging `MemoryStore` into the test fixture.

Public API: `scanExamplesFs`, `extract`, `readCapped`, `redact`, plus
the `FoodSource` / `ForagedPattern` / `IntegrationPlan` / `ScanLimits`
types and the `DEFAULT_SCAN_LIMITS` constant.
10 changes: 10 additions & 0 deletions .changeset/storage-examples-table.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
"@colony/storage": minor
---

Add an `examples` table and `upsertExample` / `getExample` / `listExamples` /
`deleteExample` methods to support the forthcoming `@colony/foraging`
package. Each row caches the content hash and observation count for a
`<repo_root>/examples/<name>` food source so repeat scans on
`SessionStart` can skip unchanged directories without touching the
observation table. Schema version bumped 6 → 7.
26 changes: 26 additions & 0 deletions packages/foraging/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"name": "@colony/foraging",
"version": "0.0.0",
"license": "MIT",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"files": ["dist"],
"scripts": {
"build": "tsup src/index.ts --format esm --dts --clean",
"dev": "tsup src/index.ts --format esm --dts --watch",
"test": "vitest run",
"typecheck": "tsc --noEmit"
},
"devDependencies": {
"tsup": "^8.3.5",
"typescript": "^5.6.3",
"vitest": "^2.1.5"
}
}
146 changes: 146 additions & 0 deletions packages/foraging/src/extractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import { type Stats, readFileSync, readdirSync, statSync } from 'node:fs';
import { join, relative } from 'node:path';
import type { ExampleManifestKind, ScanLimits } from './types.js';

/**
 * The subset of an `examples/<name>/` directory that the extractor can
 * classify without reading every file byte. All paths are relative to
 * `abs_path` (the scanned example's root).
 */
export interface ExtractedShape {
  /** Ecosystem inferred from the first matching manifest file; 'unknown' when none matched. */
  manifest_kind: ExampleManifestKind;
  /** Relative path of the matched manifest file, or null when none was found. */
  manifest_path: string | null;
  /** Relative path of the first README variant present, or null. */
  readme_path: string | null;
  /** Every entrypoint candidate (e.g. `src/index.ts`) that exists in the tree. */
  entrypoints: string[];
  /** Flat list of files visited — useful for `content_hash` computation. */
  file_tree: Array<{ path: string; size: number }>;
}

// Manifest filenames mapped to their ecosystem kind. Order matters:
// `extract` uses the FIRST entry present in the file tree, so e.g.
// package.json wins over pyproject.toml when an example ships both.
const MANIFEST_BY_FILE: ReadonlyArray<{ name: string; kind: ExampleManifestKind }> = [
  { name: 'package.json', kind: 'npm' },
  { name: 'pyproject.toml', kind: 'pypi' },
  { name: 'setup.py', kind: 'pypi' },
  { name: 'requirements.txt', kind: 'pypi' },
  { name: 'Cargo.toml', kind: 'cargo' },
  { name: 'go.mod', kind: 'go' },
];

// README filename variants, in preference order — `extract` reports the
// first one present.
const README_NAMES: readonly string[] = [
  'README.md',
  'README.mdx',
  'README.rst',
  'README.txt',
  'README',
];

// Well-known entrypoint locations probed by `extract`. Unlike the two
// lists above, EVERY candidate that exists is reported (filter, not
// first-match).
const ENTRYPOINT_CANDIDATES: readonly string[] = [
  'src/index.ts',
  'src/index.tsx',
  'src/index.js',
  'src/index.mjs',
  'src/main.ts',
  'src/main.js',
  'src/main.rs',
  'src/main.go',
  'src/main.py',
  'index.ts',
  'index.js',
  'main.py',
  'main.go',
  'main.rs',
];

/**
 * Classify a single food-source directory. Walks its tree (bounded by
 * `limits` so pathological examples — a stray node_modules copy, giant
 * fixtures — can't stall a SessionStart hook), then matches the visited
 * paths against the known manifest, README, and entrypoint names.
 */
export function extract(abs_path: string, limits: ScanLimits): ExtractedShape {
  const file_tree = walk(abs_path, limits);
  const present = new Set(file_tree.map((entry) => entry.path));

  // First matching manifest decides the ecosystem; README is likewise
  // first-match, while entrypoints report every candidate that exists.
  const manifest = MANIFEST_BY_FILE.find((candidate) => present.has(candidate.name));

  return {
    manifest_kind: manifest?.kind ?? 'unknown',
    manifest_path: manifest?.name ?? null,
    readme_path: README_NAMES.find((name) => present.has(name)) ?? null,
    entrypoints: ENTRYPOINT_CANDIDATES.filter((candidate) => present.has(candidate)),
    file_tree,
  };
}

/**
 * Breadth-first walk of `root`, bounded by `limits`:
 *  - subdirectories are only entered while `depth + 1 < max_depth`,
 *  - collection stops once `max_files_per_source` files are recorded,
 *  - dependency caches / build outputs (see SKIP_NAMES) are pruned by name.
 *
 * Hand-rolled rather than a globbing library because the constraints are
 * tiny. Entries within a directory are visited in sorted order so the
 * result is stable across platforms (`readdirSync` order is otherwise
 * platform-dependent). Unreadable directories and entries that vanish
 * mid-walk are silently skipped.
 */
function walk(root: string, limits: ScanLimits): Array<{ path: string; size: number }> {
  const collected: Array<{ path: string; size: number }> = [];
  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];

  while (pending.length > 0 && collected.length < limits.max_files_per_source) {
    const current = pending.shift();
    if (current === undefined) break;

    let names: string[];
    try {
      names = readdirSync(current.dir);
    } catch {
      continue; // unreadable directory — skip its subtree
    }

    for (const entryName of names.sort()) {
      if (collected.length >= limits.max_files_per_source) break;
      if (SKIP_NAMES.has(entryName)) continue;

      const absolute = join(current.dir, entryName);
      let info: Stats;
      try {
        info = statSync(absolute);
      } catch {
        continue; // vanished between readdir and stat
      }

      if (info.isFile()) {
        collected.push({ path: relative(root, absolute), size: info.size });
      } else if (info.isDirectory() && current.depth + 1 < limits.max_depth) {
        pending.push({ dir: absolute, depth: current.depth + 1 });
      }
    }
  }
  return collected;
}

// Directory names pruned wherever they appear in the walk — dependency
// caches and build outputs that would blow past the file cap without
// adding signal.
const SKIP_NAMES = new Set([
  'node_modules',
  '.git',
  '.venv',
  'venv',
  'dist',
  'build',
  'target',
  '.next',
  '.turbo',
  '.cache',
  '__pycache__',
]);

/**
 * Read a file's raw text, truncated to at most `max_file_bytes` bytes.
 *
 * Returns null instead of throwing so the scanner stays tolerant of
 * files that disappear (or become unreadable) mid-walk. Note the cap is
 * applied to BYTES before UTF-8 decoding, so a truncated read may end on
 * a partial multi-byte sequence — acceptable for change-signal hashing.
 */
export function readCapped(abs: string, max_file_bytes: number): string | null {
  let raw: Buffer;
  try {
    raw = readFileSync(abs);
  } catch {
    return null; // missing / unreadable — caller treats as absent
  }
  const capped = raw.byteLength > max_file_bytes ? raw.subarray(0, max_file_bytes) : raw;
  return capped.toString('utf8');
}
14 changes: 14 additions & 0 deletions packages/foraging/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Public surface of @colony/foraging — everything re-exported here is
// the supported API; all other module members stay package-private.

// Pure-fs scan of <repo_root>/examples/ (no storage access).
export { scanExamplesFs } from './scanner.js';
export type { ScanFsOptions, ScanFsResult } from './scanner.js';

// Per-directory shape extraction and byte-capped file reads.
export { extract, readCapped } from './extractor.js';
export type { ExtractedShape } from './extractor.js';

// Best-effort secret scrubbing for foraged text.
export { redact } from './redact.js';

// Shared types and tuning knobs.
export type {
  ExampleManifestKind,
  FoodSource,
  ForagedPattern,
  IntegrationPlan,
  ScanLimits,
  ScanResult,
} from './types.js';
export { DEFAULT_SCAN_LIMITS } from './types.js';
70 changes: 70 additions & 0 deletions packages/foraging/src/redact.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/**
* Best-effort secret scrubbing for foraged content.
*
* Examples directories often carry `.env.example`, API-key snippets in
* README blocks, or copy-pasted Dockerfile secrets that a well-meaning
* author forgot to trim. We do a conservative pass before the text
* reaches SQLite — enough to strip the obvious cases without trying to
* be a fully general DLP engine.
*
 * The tiers implemented below:
 *  1. Common cloud / service env-var names whose values are tokens.
 *  2. Armored PEM private-key blocks.
 * (A pass for long opaque base64/hex strings sitting on their own
 * assignment lines is planned but not implemented here yet.)
*/

// Env-var names whose values are treated as secrets. Each pattern is
// tested against the variable NAME only (see redactEnvLine) and is
// deliberately unanchored, so e.g. CI_GITHUB_TOKEN is caught by
// /GITHUB_TOKEN/. The final pattern is a broad catch-all for anything
// mentioning SECRET / PASSWORD / PRIVATE_KEY / ACCESS_KEY.
const DEFAULT_ENV_NAME_PATTERNS: readonly RegExp[] = [
  /AWS_[A-Z0-9_]*(?:KEY|SECRET|TOKEN)[A-Z0-9_]*/,
  /GITHUB_TOKEN/,
  /GH_TOKEN/,
  /OPENAI_API_KEY/,
  /ANTHROPIC_API_KEY/,
  /HUGGINGFACE_[A-Z0-9_]*TOKEN/,
  /SLACK_[A-Z0-9_]*TOKEN/,
  /STRIPE_[A-Z0-9_]*KEY/,
  /TWILIO_[A-Z0-9_]*TOKEN/,
  /[A-Z0-9_]*(?:SECRET|PASSWORD|PRIVATE_KEY|ACCESS_KEY)[A-Z0-9_]*/,
];

// Armored private-key blocks (RFC 7468 style). NOTE: `[A-Z ]*` — not
// `+` — so the bare PKCS#8 label "BEGIN PRIVATE KEY" matches as well as
// prefixed labels like "BEGIN RSA PRIVATE KEY" / "BEGIN EC PRIVATE KEY"
// / "BEGIN OPENSSH PRIVATE KEY". With `+` the plain form required at
// least one extra character between "BEGIN " and "PRIVATE KEY" and was
// silently missed.
const PEM_BLOCK =
  /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/g;

/**
 * Return a copy of `text` with likely secrets replaced. (Strings are
 * immutable — nothing is modified "in place"; callers must use the
 * return value.) Emits `***REDACTED***` / `***REDACTED_PRIVATE_KEY***`
 * markers so downstream readers can see THAT a redaction happened
 * without seeing the value.
 *
 * @param text          Raw foraged content (.env, YAML, README, …).
 * @param extraEnvNames Additional exact env-var names (matched
 *                      case-insensitively against the variable name) to
 *                      treat as secrets.
 */
export function redact(text: string, extraEnvNames: readonly string[] = []): string {
  let out = text;

  // PEM blocks first — they span many lines, so scrub them before the
  // line-by-line assignment pass starts mangling individual lines.
  out = out.replace(PEM_BLOCK, '***REDACTED_PRIVATE_KEY***');

  // Env-var-like assignments: matches `FOO_SECRET=value` (.env) and
  // `FOO_SECRET: "value"` (YAML), zeroing the value but keeping the key
  // for context.
  const extraPatterns = extraEnvNames.map((n) => new RegExp(`^${escapeRegex(n)}$`, 'i'));
  const envMatchers = [...DEFAULT_ENV_NAME_PATTERNS, ...extraPatterns];
  out = out
    .split('\n')
    .map((line) => redactEnvLine(line, envMatchers))
    .join('\n');

  return out;
}

/**
 * Redact one line if it looks like `NAME=value` / `NAME: value` and
 * NAME matches any of `matchers`. Only uppercase names are considered
 * (the leading `[A-Z]` in the line pattern), which fits the env-var
 * convention this pass targets.
 */
function redactEnvLine(line: string, matchers: readonly RegExp[]): string {
  const match = line.match(/^(\s*)([A-Z][A-Z0-9_]*)(\s*[:=]\s*)(.*)$/);
  if (!match) return line;
  const indent = match[1] ?? '';
  const name = match[2];
  const sep = match[3];
  if (!name || !sep) return line;
  if (!matchers.some((re) => re.test(name))) return line;
  // Keep the key + separator for debugging context; drop the value.
  return `${indent}${name}${sep}***REDACTED***`;
}

/** Escape a literal string for safe embedding inside a RegExp. */
function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
96 changes: 96 additions & 0 deletions packages/foraging/src/scanner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { createHash } from 'node:crypto';
import { readdirSync, statSync } from 'node:fs';
import { join } from 'node:path';
import { type ExtractedShape, extract, readCapped } from './extractor.js';
import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits } from './types.js';

/** Inputs for `scanExamplesFs`. */
export interface ScanFsOptions {
  /** Repository root; the scan reads `<repo_root>/examples/`. */
  repo_root: string;
  /** Optional per-field overrides of `DEFAULT_SCAN_LIMITS`. */
  limits?: Partial<ScanLimits>;
}

/** Result of the pure-fs scan: one `FoodSource` per example directory found. */
export interface ScanFsResult {
  scanned: FoodSource[];
}

/**
 * Discover food sources on disk without touching storage. A
 * storage-aware `scanExamples` (next PR) wraps this and uses
 * `storage.getExample` hashes to decide which returned sources actually
 * need indexing.
 *
 * The decoupling is deliberate: (a) the fs walk is pure and easy to
 * test in isolation, (b) the storage-aware wrapper stays a thin
 * orchestrator with no fs logic of its own.
 *
 * A missing or unreadable `examples/` directory yields an empty result
 * rather than an error.
 */
export function scanExamplesFs(opts: ScanFsOptions): ScanFsResult {
  const limits = mergeLimits(opts.limits);
  const examplesDir = join(opts.repo_root, 'examples');

  let entryNames: string[];
  try {
    entryNames = readdirSync(examplesDir);
  } catch {
    // No examples/ directory — nothing to forage.
    return { scanned: [] };
  }

  const scanned: FoodSource[] = [];
  // Sorted so the output order is stable across platforms.
  for (const example_name of entryNames.sort()) {
    const abs_path = join(examplesDir, example_name);

    // Only directories count as food sources; entries that vanish
    // between readdir and stat are skipped.
    try {
      if (!statSync(abs_path).isDirectory()) continue;
    } catch {
      continue;
    }

    const shape = extract(abs_path, limits);
    scanned.push({
      repo_root: opts.repo_root,
      example_name,
      abs_path,
      manifest_kind: shape.manifest_kind,
      manifest_path: shape.manifest_path,
      readme_path: shape.readme_path,
      entrypoints: shape.entrypoints,
      content_hash: computeContentHash(abs_path, shape, limits),
    });
  }
  return { scanned };
}

/**
 * Stable sha256 over (manifest bytes, sorted {path, size} pairs).
 *
 * Chosen over "hash every file" because this runs on every SessionStart
 * and must finish in milliseconds. Size + path shifts are a sufficient
 * change signal: editing a tracked file moves its size, a rename moves
 * its path, a new file changes the set. A pure content-preserving edit
 * (touch, same-length swap) will miss — acceptable since the cached
 * observations already encode the meaningful content.
 */
function computeContentHash(abs_path: string, shape: ExtractedShape, limits: ScanLimits): string {
  const digest = createHash('sha256');

  // Fold in the manifest bytes (capped) when a manifest exists and is
  // still readable; a vanished manifest simply contributes nothing.
  if (shape.manifest_path) {
    const manifestText = readCapped(join(abs_path, shape.manifest_path), limits.max_file_bytes);
    if (manifestText !== null) {
      digest.update(`manifest:${shape.manifest_path}\n`);
      digest.update(manifestText);
      digest.update('\n');
    }
  }

  // Sort a copy (never mutate the caller's tree) so the hash is
  // independent of walk order.
  digest.update('filetree:\n');
  const ordered = shape.file_tree.slice().sort((a, b) => a.path.localeCompare(b.path));
  for (const entry of ordered) {
    digest.update(`${entry.path}\t${entry.size}\n`);
  }
  return digest.digest('hex');
}

/** Fill any limit the caller omitted from `DEFAULT_SCAN_LIMITS`. */
function mergeLimits(overrides?: Partial<ScanLimits>): ScanLimits {
  const { max_depth, max_file_bytes, max_files_per_source } = overrides ?? {};
  return {
    max_depth: max_depth ?? DEFAULT_SCAN_LIMITS.max_depth,
    max_file_bytes: max_file_bytes ?? DEFAULT_SCAN_LIMITS.max_file_bytes,
    max_files_per_source: max_files_per_source ?? DEFAULT_SCAN_LIMITS.max_files_per_source,
  };
}
Loading
Loading