Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .changeset/foraging-skeleton.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
"@colony/foraging": minor
---

Introduce `@colony/foraging` package skeleton. Ships pure-fs primitives
for foraging — scanning `<repo_root>/examples/<name>/` food sources,
classifying each by manifest kind (`npm` / `pypi` / `cargo` / `go` /
`unknown`), computing a change-signal `content_hash` over manifest +
file tree, and best-effort redaction of common cloud-service secrets
before anything hits storage.

No storage writes, no MCP wiring, no hooks yet — those arrive in the
follow-up PR. This layer stands alone so it can be unit-tested without
dragging `MemoryStore` into the test fixture.

Public API: `scanExamplesFs`, `extract`, `readCapped`, `redact`, plus
the `FoodSource` / `ForagedPattern` / `IntegrationPlan` / `ScanLimits`
types and the `DEFAULT_SCAN_LIMITS` constant.
10 changes: 10 additions & 0 deletions .changeset/storage-examples-table.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
"@colony/storage": minor
---

Add an `examples` table and `upsertExample` / `getExample` / `listExamples` /
`deleteExample` methods to support the forthcoming `@colony/foraging`
package. Each row caches the content hash and observation count for a
`<repo_root>/examples/<name>` food source so repeat scans on
`SessionStart` can skip unchanged directories without touching the
observation table. Schema version bumped 6 → 7.
26 changes: 26 additions & 0 deletions packages/foraging/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"name": "@colony/foraging",
"version": "0.0.0",
"license": "MIT",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"files": ["dist"],
"scripts": {
"build": "tsup src/index.ts --format esm --dts --clean",
"dev": "tsup src/index.ts --format esm --dts --watch",
"test": "vitest run",
"typecheck": "tsc --noEmit"
},
"devDependencies": {
"tsup": "^8.3.5",
"typescript": "^5.6.3",
"vitest": "^2.1.5"
}
}
146 changes: 146 additions & 0 deletions packages/foraging/src/extractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import { type Stats, readFileSync, readdirSync, statSync } from 'node:fs';
import { join, relative } from 'node:path';
import type { ExampleManifestKind, ScanLimits } from './types.js';

/**
 * The subset of an `examples/<name>/` directory that the extractor can
 * classify without reading every file byte. All paths are relative to
 * `abs_path` (the scanned example's root).
 */
export interface ExtractedShape {
  /** Ecosystem inferred from the first matching manifest file; 'unknown' when none matched. */
  manifest_kind: ExampleManifestKind;
  /** Relative path of the matched manifest file, or null when none was found. */
  manifest_path: string | null;
  /** Relative path of the first README variant present, or null. */
  readme_path: string | null;
  /** Every entrypoint candidate (e.g. `src/index.ts`) that exists in the tree. */
  entrypoints: string[];
  /** Flat list of files visited — useful for `content_hash` computation. */
  file_tree: Array<{ path: string; size: number }>;
}

// Manifest filenames mapped to their ecosystem kind. Order matters:
// `extract` uses the FIRST entry present in the file tree, so e.g.
// package.json wins over pyproject.toml when an example ships both.
const MANIFEST_BY_FILE: ReadonlyArray<{ name: string; kind: ExampleManifestKind }> = [
  { name: 'package.json', kind: 'npm' },
  { name: 'pyproject.toml', kind: 'pypi' },
  { name: 'setup.py', kind: 'pypi' },
  { name: 'requirements.txt', kind: 'pypi' },
  { name: 'Cargo.toml', kind: 'cargo' },
  { name: 'go.mod', kind: 'go' },
];

// README filename variants, in preference order — `extract` reports the
// first one present.
const README_NAMES: readonly string[] = [
  'README.md',
  'README.mdx',
  'README.rst',
  'README.txt',
  'README',
];

// Well-known entrypoint locations probed by `extract`. Unlike the two
// lists above, EVERY candidate that exists is reported (filter, not
// first-match).
const ENTRYPOINT_CANDIDATES: readonly string[] = [
  'src/index.ts',
  'src/index.tsx',
  'src/index.js',
  'src/index.mjs',
  'src/main.ts',
  'src/main.js',
  'src/main.rs',
  'src/main.go',
  'src/main.py',
  'index.ts',
  'index.js',
  'main.py',
  'main.go',
  'main.rs',
];

/**
 * Classify a single food-source directory. Walks its tree (bounded by
 * `limits` so pathological examples — a stray node_modules copy, giant
 * fixtures — can't stall a SessionStart hook), then matches the visited
 * paths against the known manifest, README, and entrypoint names.
 */
export function extract(abs_path: string, limits: ScanLimits): ExtractedShape {
  const file_tree = walk(abs_path, limits);
  const present = new Set(file_tree.map((entry) => entry.path));

  // First matching manifest decides the ecosystem; README is likewise
  // first-match, while entrypoints report every candidate that exists.
  const manifest = MANIFEST_BY_FILE.find((candidate) => present.has(candidate.name));

  return {
    manifest_kind: manifest?.kind ?? 'unknown',
    manifest_path: manifest?.name ?? null,
    readme_path: README_NAMES.find((name) => present.has(name)) ?? null,
    entrypoints: ENTRYPOINT_CANDIDATES.filter((candidate) => present.has(candidate)),
    file_tree,
  };
}

/**
 * Breadth-first walk of `root`, bounded by `limits`:
 *  - subdirectories are only entered while `depth + 1 < max_depth`,
 *  - collection stops once `max_files_per_source` files are recorded,
 *  - dependency caches / build outputs (see SKIP_NAMES) are pruned by name.
 *
 * Hand-rolled rather than a globbing library because the constraints are
 * tiny. Entries within a directory are visited in sorted order so the
 * result is stable across platforms (`readdirSync` order is otherwise
 * platform-dependent). Unreadable directories and entries that vanish
 * mid-walk are silently skipped.
 */
function walk(root: string, limits: ScanLimits): Array<{ path: string; size: number }> {
  const collected: Array<{ path: string; size: number }> = [];
  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];

  while (pending.length > 0 && collected.length < limits.max_files_per_source) {
    const current = pending.shift();
    if (current === undefined) break;

    let names: string[];
    try {
      names = readdirSync(current.dir);
    } catch {
      continue; // unreadable directory — skip its subtree
    }

    for (const entryName of names.sort()) {
      if (collected.length >= limits.max_files_per_source) break;
      if (SKIP_NAMES.has(entryName)) continue;

      const absolute = join(current.dir, entryName);
      let info: Stats;
      try {
        info = statSync(absolute);
      } catch {
        continue; // vanished between readdir and stat
      }

      if (info.isFile()) {
        collected.push({ path: relative(root, absolute), size: info.size });
      } else if (info.isDirectory() && current.depth + 1 < limits.max_depth) {
        pending.push({ dir: absolute, depth: current.depth + 1 });
      }
    }
  }
  return collected;
}

// Directory names pruned wherever they appear in the walk — dependency
// caches and build outputs that would blow past the file cap without
// adding signal.
const SKIP_NAMES = new Set([
  'node_modules',
  '.git',
  '.venv',
  'venv',
  'dist',
  'build',
  'target',
  '.next',
  '.turbo',
  '.cache',
  '__pycache__',
]);

/**
 * Read a file's raw text, truncated to at most `max_file_bytes` bytes.
 *
 * Returns null instead of throwing so the scanner stays tolerant of
 * files that disappear (or become unreadable) mid-walk. Note the cap is
 * applied to BYTES before UTF-8 decoding, so a truncated read may end on
 * a partial multi-byte sequence — acceptable for change-signal hashing.
 */
export function readCapped(abs: string, max_file_bytes: number): string | null {
  let raw: Buffer;
  try {
    raw = readFileSync(abs);
  } catch {
    return null; // missing / unreadable — caller treats as absent
  }
  const capped = raw.byteLength > max_file_bytes ? raw.subarray(0, max_file_bytes) : raw;
  return capped.toString('utf8');
}
14 changes: 14 additions & 0 deletions packages/foraging/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Public surface of @colony/foraging — everything re-exported here is
// the supported API; all other module members stay package-private.

// Pure-fs scan of <repo_root>/examples/ (no storage access).
export { scanExamplesFs } from './scanner.js';
export type { ScanFsOptions, ScanFsResult } from './scanner.js';

// Per-directory shape extraction and byte-capped file reads.
export { extract, readCapped } from './extractor.js';
export type { ExtractedShape } from './extractor.js';

// Best-effort secret scrubbing for foraged text.
export { redact } from './redact.js';

// Shared types and tuning knobs.
export type {
  ExampleManifestKind,
  FoodSource,
  ForagedPattern,
  IntegrationPlan,
  ScanLimits,
  ScanResult,
} from './types.js';
export { DEFAULT_SCAN_LIMITS } from './types.js';
70 changes: 70 additions & 0 deletions packages/foraging/src/redact.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/**
* Best-effort secret scrubbing for foraged content.
*
* Examples directories often carry `.env.example`, API-key snippets in
* README blocks, or copy-pasted Dockerfile secrets that a well-meaning
* author forgot to trim. We do a conservative pass before the text
* reaches SQLite — enough to strip the obvious cases without trying to
* be a fully general DLP engine.
*
 * The tiers implemented below:
 *  1. Common cloud / service env-var names whose values are tokens.
 *  2. Armored PEM private-key blocks.
 * (A pass for long opaque base64/hex strings sitting on their own
 * assignment lines is planned but not implemented here yet.)
*/

// Env-var names whose values are treated as secrets. Each pattern is
// tested against the variable NAME only (see redactEnvLine) and is
// deliberately unanchored, so e.g. CI_GITHUB_TOKEN is caught by
// /GITHUB_TOKEN/. The final pattern is a broad catch-all for anything
// mentioning SECRET / PASSWORD / PRIVATE_KEY / ACCESS_KEY.
const DEFAULT_ENV_NAME_PATTERNS: readonly RegExp[] = [
  /AWS_[A-Z0-9_]*(?:KEY|SECRET|TOKEN)[A-Z0-9_]*/,
  /GITHUB_TOKEN/,
  /GH_TOKEN/,
  /OPENAI_API_KEY/,
  /ANTHROPIC_API_KEY/,
  /HUGGINGFACE_[A-Z0-9_]*TOKEN/,
  /SLACK_[A-Z0-9_]*TOKEN/,
  /STRIPE_[A-Z0-9_]*KEY/,
  /TWILIO_[A-Z0-9_]*TOKEN/,
  /[A-Z0-9_]*(?:SECRET|PASSWORD|PRIVATE_KEY|ACCESS_KEY)[A-Z0-9_]*/,
];

// Armored private-key blocks (RFC 7468 style). NOTE: `[A-Z ]*` — not
// `+` — so the bare PKCS#8 label "BEGIN PRIVATE KEY" matches as well as
// prefixed labels like "BEGIN RSA PRIVATE KEY" / "BEGIN EC PRIVATE KEY"
// / "BEGIN OPENSSH PRIVATE KEY". With `+` the plain form required at
// least one extra character between "BEGIN " and "PRIVATE KEY" and was
// silently missed.
const PEM_BLOCK =
  /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/g;

/**
 * Return a copy of `text` with likely secrets replaced. (Strings are
 * immutable — nothing is modified "in place"; callers must use the
 * return value.) Emits `***REDACTED***` / `***REDACTED_PRIVATE_KEY***`
 * markers so downstream readers can see THAT a redaction happened
 * without seeing the value.
 *
 * @param text          Raw foraged content (.env, YAML, README, …).
 * @param extraEnvNames Additional exact env-var names (matched
 *                      case-insensitively against the variable name) to
 *                      treat as secrets.
 */
export function redact(text: string, extraEnvNames: readonly string[] = []): string {
  let out = text;

  // PEM blocks first — they span many lines, so scrub them before the
  // line-by-line assignment pass starts mangling individual lines.
  out = out.replace(PEM_BLOCK, '***REDACTED_PRIVATE_KEY***');

  // Env-var-like assignments: matches `FOO_SECRET=value` (.env) and
  // `FOO_SECRET: "value"` (YAML), zeroing the value but keeping the key
  // for context.
  const extraPatterns = extraEnvNames.map((n) => new RegExp(`^${escapeRegex(n)}$`, 'i'));
  const envMatchers = [...DEFAULT_ENV_NAME_PATTERNS, ...extraPatterns];
  out = out
    .split('\n')
    .map((line) => redactEnvLine(line, envMatchers))
    .join('\n');

  return out;
}

/**
 * Redact one line if it looks like `NAME=value` / `NAME: value` and
 * NAME matches any of `matchers`. Only uppercase names are considered
 * (the leading `[A-Z]` in the line pattern), which fits the env-var
 * convention this pass targets.
 */
function redactEnvLine(line: string, matchers: readonly RegExp[]): string {
  const match = line.match(/^(\s*)([A-Z][A-Z0-9_]*)(\s*[:=]\s*)(.*)$/);
  if (!match) return line;
  const indent = match[1] ?? '';
  const name = match[2];
  const sep = match[3];
  if (!name || !sep) return line;
  if (!matchers.some((re) => re.test(name))) return line;
  // Keep the key + separator for debugging context; drop the value.
  return `${indent}${name}${sep}***REDACTED***`;
}

/** Escape a literal string for safe embedding inside a RegExp. */
function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
96 changes: 96 additions & 0 deletions packages/foraging/src/scanner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { createHash } from 'node:crypto';
import { readdirSync, statSync } from 'node:fs';
import { join } from 'node:path';
import { type ExtractedShape, extract, readCapped } from './extractor.js';
import { DEFAULT_SCAN_LIMITS, type FoodSource, type ScanLimits } from './types.js';

/** Inputs for `scanExamplesFs`. */
export interface ScanFsOptions {
  /** Repository root; the scan reads `<repo_root>/examples/`. */
  repo_root: string;
  /** Optional per-field overrides of `DEFAULT_SCAN_LIMITS`. */
  limits?: Partial<ScanLimits>;
}

/** Result of the pure-fs scan: one `FoodSource` per example directory found. */
export interface ScanFsResult {
  scanned: FoodSource[];
}

/**
 * Discover food sources on disk without touching storage. A
 * storage-aware `scanExamples` (next PR) wraps this and uses
 * `storage.getExample` hashes to decide which returned sources actually
 * need indexing.
 *
 * The decoupling is deliberate: (a) the fs walk is pure and easy to
 * test in isolation, (b) the storage-aware wrapper stays a thin
 * orchestrator with no fs logic of its own.
 *
 * A missing or unreadable `examples/` directory yields an empty result
 * rather than an error.
 */
export function scanExamplesFs(opts: ScanFsOptions): ScanFsResult {
  const limits = mergeLimits(opts.limits);
  const examplesDir = join(opts.repo_root, 'examples');

  let entryNames: string[];
  try {
    entryNames = readdirSync(examplesDir);
  } catch {
    // No examples/ directory — nothing to forage.
    return { scanned: [] };
  }

  const scanned: FoodSource[] = [];
  // Sorted so the output order is stable across platforms.
  for (const example_name of entryNames.sort()) {
    const abs_path = join(examplesDir, example_name);

    // Only directories count as food sources; entries that vanish
    // between readdir and stat are skipped.
    try {
      if (!statSync(abs_path).isDirectory()) continue;
    } catch {
      continue;
    }

    const shape = extract(abs_path, limits);
    scanned.push({
      repo_root: opts.repo_root,
      example_name,
      abs_path,
      manifest_kind: shape.manifest_kind,
      manifest_path: shape.manifest_path,
      readme_path: shape.readme_path,
      entrypoints: shape.entrypoints,
      content_hash: computeContentHash(abs_path, shape, limits),
    });
  }
  return { scanned };
}

/**
 * Stable sha256 over (manifest bytes, sorted {path, size} pairs).
 *
 * Chosen over "hash every file" because this runs on every SessionStart
 * and must finish in milliseconds. Size + path shifts are a sufficient
 * change signal: editing a tracked file moves its size, a rename moves
 * its path, a new file changes the set. A pure content-preserving edit
 * (touch, same-length swap) will miss — acceptable since the cached
 * observations already encode the meaningful content.
 */
function computeContentHash(abs_path: string, shape: ExtractedShape, limits: ScanLimits): string {
  const digest = createHash('sha256');

  // Fold in the manifest bytes (capped) when a manifest exists and is
  // still readable; a vanished manifest simply contributes nothing.
  if (shape.manifest_path) {
    const manifestText = readCapped(join(abs_path, shape.manifest_path), limits.max_file_bytes);
    if (manifestText !== null) {
      digest.update(`manifest:${shape.manifest_path}\n`);
      digest.update(manifestText);
      digest.update('\n');
    }
  }

  // Sort a copy (never mutate the caller's tree) so the hash is
  // independent of walk order.
  digest.update('filetree:\n');
  const ordered = shape.file_tree.slice().sort((a, b) => a.path.localeCompare(b.path));
  for (const entry of ordered) {
    digest.update(`${entry.path}\t${entry.size}\n`);
  }
  return digest.digest('hex');
}

/** Fill any limit the caller omitted from `DEFAULT_SCAN_LIMITS`. */
function mergeLimits(overrides?: Partial<ScanLimits>): ScanLimits {
  const { max_depth, max_file_bytes, max_files_per_source } = overrides ?? {};
  return {
    max_depth: max_depth ?? DEFAULT_SCAN_LIMITS.max_depth,
    max_file_bytes: max_file_bytes ?? DEFAULT_SCAN_LIMITS.max_file_bytes,
    max_files_per_source: max_files_per_source ?? DEFAULT_SCAN_LIMITS.max_files_per_source,
  };
}
Loading
Loading