diff --git a/examples/real-autoresearch/grove.md b/examples/real-autoresearch/grove.md index 4a449a71..9ace58d9 100644 --- a/examples/real-autoresearch/grove.md +++ b/examples/real-autoresearch/grove.md @@ -16,25 +16,23 @@ metrics: unit: GB description: Peak VRAM usage during training outcome_policy: - auto_evaluate: true - accept_if: - metric: val_bpb - condition: improved_over_parent + auto_accept: + metric_improves: val_bpb stop_conditions: - no_improvement_rounds: 5 - max_rounds: 20 + max_rounds_without_improvement: 5 target_metric: metric: val_bpb - threshold: 0.85 - wall_clock_budget: "3h" -enforcement: - claim_policy: - max_concurrent: 3 - lease_duration: "10m" + value: 0.85 + budget: + max_wall_clock_seconds: 10800 + deliberation_limit: + max_rounds: 20 concurrency: max_active_claims: 3 max_claims_per_agent: 1 max_claims_per_target: 1 +execution: + default_lease_seconds: 600 rate_limits: max_contributions_per_agent_per_hour: 100 max_contributions_per_grove_per_hour: 300 diff --git a/src/core/event-bus.test.ts b/src/core/event-bus.test.ts index ab8f7f34..cc4a8e7d 100644 --- a/src/core/event-bus.test.ts +++ b/src/core/event-bus.test.ts @@ -213,17 +213,17 @@ describe("TopologyRouter", () => { bus.close(); }); - test("targetsFor returns correct targets", () => { + test("targetsFor returns RoleEdge objects for each outgoing edge", () => { const bus = new LocalEventBus(); const router = new TopologyRouter(reviewLoopTopology, bus); - expect(router.targetsFor("coder")).toEqual(["reviewer"]); - expect(router.targetsFor("reviewer")).toEqual(["coder"]); + expect(router.targetsFor("coder")).toEqual([{ target: "reviewer", edgeType: "delegates" }]); + expect(router.targetsFor("reviewer")).toEqual([{ target: "coder", edgeType: "feedback" }]); expect(router.targetsFor("unknown")).toEqual([]); bus.close(); }); - test("duplicate edges are deduplicated", () => { - const duped: AgentTopology = { + test("targetsFor returns multiple RoleEdges when different edge types point to 
same target", () => { + const multiEdge: AgentTopology = { structure: "graph", roles: [ { @@ -237,15 +237,60 @@ describe("TopologyRouter", () => { ], }; const bus = new LocalEventBus(); - const router = new TopologyRouter(duped, bus); + const router = new TopologyRouter(multiEdge, bus); + const edges = router.targetsFor("coder"); + // Both edges preserved — distinct (target, edgeType) pairs + expect(edges).toHaveLength(2); + expect(edges).toContainEqual({ target: "reviewer", edgeType: "delegates" }); + expect(edges).toContainEqual({ target: "reviewer", edgeType: "feeds" }); + bus.close(); + }); + + test("route() publishes one event per target even when multiple edge types point to same target", () => { + const multiEdge: AgentTopology = { + structure: "graph", + roles: [ + { + name: "coder", + edges: [ + { target: "reviewer", edgeType: "delegates" }, + { target: "reviewer", edgeType: "feeds" }, + ], + }, + { name: "reviewer" }, + ], + }; + const bus = new LocalEventBus(); + const router = new TopologyRouter(multiEdge, bus); const received: GroveEvent[] = []; bus.subscribe("reviewer", (e) => received.push(e)); const targets = router.route("coder", {}); - // Should only route once to reviewer despite two edges + // route() deduplicates by target: one event despite two distinct edges expect(targets).toEqual(["reviewer"]); expect(received).toHaveLength(1); bus.close(); }); + + test("targetsFor deduplicates exact (target, edgeType) duplicate pairs", () => { + const exactDupes: AgentTopology = { + structure: "graph", + roles: [ + { + name: "coder", + edges: [ + { target: "reviewer", edgeType: "delegates" }, + { target: "reviewer", edgeType: "delegates" }, // exact duplicate + ], + }, + { name: "reviewer" }, + ], + }; + const bus = new LocalEventBus(); + const router = new TopologyRouter(exactDupes, bus); + // Exact (target, edgeType) duplicate is collapsed to one entry + expect(router.targetsFor("coder")).toEqual([{ target: "reviewer", edgeType: "delegates" }]); + 
bus.close(); + }); }); diff --git a/src/core/examples.test.ts b/src/core/examples.test.ts new file mode 100644 index 00000000..c22fecf5 --- /dev/null +++ b/src/core/examples.test.ts @@ -0,0 +1,56 @@ +/** + * CI guard: parse all examples/\*\*\/grove.md files against parseGroveContract. + * + * This test ensures that example GROVE.md files always use valid field names + * and schema-conformant values. Without this guard, field-name drift goes + * undetected until an agent or user copy-pastes the example and gets a parse + * error from parseGroveContract. + * + * New examples are automatically covered — no manual registration required. + */ + +import { describe, expect, test } from "bun:test"; +import { readFileSync } from "node:fs"; +import { join } from "node:path"; + +import { parseGroveContract } from "./contract.js"; + +describe("example grove.md files", () => { + test("all examples/*/grove.md files parse against the canonical contract schema", async () => { + const { Glob } = await import("bun"); + + // Resolve the repo root relative to this file's compile-time location. + // process.cwd() is the repo root when running `bun test` from the project root. + const repoRoot = process.cwd(); + const pattern = "examples/**/grove.md"; + + const files: string[] = []; + for (const rel of new Glob(pattern).scanSync({ cwd: repoRoot, absolute: false })) { + files.push(join(repoRoot, rel)); + } + + expect(files.length).toBeGreaterThan(0); + + const failures: Array<{ file: string; error: string }> = []; + + for (const filePath of files) { + try { + const content = readFileSync(filePath, "utf8"); + parseGroveContract(content); + } catch (err) { + failures.push({ + file: filePath.replace(`${repoRoot}/`, ""), + error: err instanceof Error ? 
err.message : String(err), + }); + } + } + + if (failures.length > 0) { + const report = failures.map((f) => ` ${f.file}:\n ${f.error}`).join("\n"); + throw new Error( + `${failures.length} example grove.md file(s) failed to parse:\n${report}\n\n` + + "Fix the field names to match the canonical contract schema in src/core/contract.ts", + ); + } + }); +}); diff --git a/src/core/operations/contribute.test.ts b/src/core/operations/contribute.test.ts index eda13d15..66cfbbb1 100644 --- a/src/core/operations/contribute.test.ts +++ b/src/core/operations/contribute.test.ts @@ -939,6 +939,7 @@ describe("writeSerial: best-effort handoff failure paths", () => { }); test("emits console.warn when handoffStore.createMany throws", async () => { + // biome-ignore lint/suspicious/noEmptyBlockStatements: spy suppresses output intentionally const warnSpy = spyOn(console, "warn").mockImplementation(() => {}); const faultyHandoffStore: OperationDeps["handoffStore"] = { @@ -978,6 +979,7 @@ describe("writeSerial: best-effort handoff failure paths", () => { // Promise, the throw must still be caught — otherwise the already-committed // contribution would bubble out as an operation error and the idempotency // slot would be released, allowing duplicate contributions on retry. + // biome-ignore lint/suspicious/noEmptyBlockStatements: spy suppresses output intentionally const warnSpy = spyOn(console, "warn").mockImplementation(() => {}); // Non-async function so the throw happens synchronously, before any diff --git a/src/core/operations/contribute.ts b/src/core/operations/contribute.ts index 20ea77ef..e8aa8788 100644 --- a/src/core/operations/contribute.ts +++ b/src/core/operations/contribute.ts @@ -851,8 +851,12 @@ export async function contributeOperation( `[grove] Warning: topology router is active but agent '${contribution.agent.agentId}' has no role — routing skipped. 
Set agent.role to enable topology routing.\n`, ); } else { - const targets = deps.topologyRouter.targetsFor(contribution.agent.role); - if (targets.length > 0) routedTo = [...targets]; + const edges = deps.topologyRouter.targetsFor(contribution.agent.role); + // Deduplicate by target: a role may have multiple edge types (e.g. + // delegates + feeds) pointing at the same downstream role. Creating + // one handoff per (source, target) pair is correct; creating one per + // edge type would produce duplicate pending handoffs for the same work. + if (edges.length > 0) routedTo = [...new Set(edges.map((e) => e.target))]; } } diff --git a/src/core/operations/eval.ts b/src/core/operations/eval.ts new file mode 100644 index 00000000..a86979b5 --- /dev/null +++ b/src/core/operations/eval.ts @@ -0,0 +1,231 @@ +/** + * Eval operation. + * + * evalOperation — Run the contract's eval harness against a target CID and + * return structured metric scores. + * + * The operation spawns the evalCommand as a subprocess (via `sh -c`), + * streams stdout/stderr line-by-line looking for GROVE_SCORE lines, and + * returns the parsed scores along with exit metadata. + * + * Score line format (stdout or stderr): + * GROVE_SCORE <metric>=<value> + * + * Example: + * GROVE_SCORE val_bpb=0.92 + * GROVE_SCORE peak_vram_gb=14.3 + * + * The target CID is passed to the subprocess via the GROVE_TARGET_CID + * environment variable so eval scripts can locate the artifact. + * + * Output is streamed and capped at MAX_OUTPUT_BYTES (16 MB). The last 4 KB + * of combined stdout+stderr is returned as rawTail for diagnostics. 
+ */ + +import { spawn } from "node:child_process"; + +import type { OperationDeps } from "./deps.js"; +import type { OperationResult } from "./result.js"; +import { err, OperationErrorCode, ok, validationErr } from "./result.js"; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/** Maximum combined stdout+stderr bytes buffered in memory. */ +const MAX_OUTPUT_BYTES = 16 * 1024 * 1024; // 16 MB + +/** Tail of combined output returned for diagnostics (bytes). */ +const TAIL_BYTES = 4096; + +/** Default timeout when neither input nor contract specifies one (ms). */ +const DEFAULT_TIMEOUT_MS = 300_000; // 5 minutes + +/** Pattern for score lines emitted by the eval subprocess. */ +const SCORE_LINE_RE = + /^GROVE_SCORE\s+([a-z][a-z0-9_]*)=([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$/i; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +/** A single metric score returned by the eval harness. */ +export interface EvalScore { + readonly metric: string; + readonly value: number; +} + +/** Input for evalOperation. */ +export interface EvalInput { + /** CID of the contribution artifact to evaluate. Passed as GROVE_TARGET_CID env var. */ + readonly targetCid: string; + /** + * Shell command to execute as the eval harness. + * Optional if the contract's evaluation config provides a default in a + * future protocol version. Currently required — returns VALIDATION_ERROR + * if omitted and no contract default is available. + */ + readonly evalCommand?: string | undefined; + /** + * Timeout in milliseconds before the subprocess is killed. + * Defaults to the contract's evaluation timeout if available, then + * DEFAULT_TIMEOUT_MS (5 minutes). + */ + readonly timeoutMs?: number | undefined; +} + +/** Result of evalOperation on success. 
/** Structured outcome of a single eval-harness run (the success payload of evalOperation). */
export interface EvalResult {
  /** Parsed metric scores from GROVE_SCORE lines in the subprocess output. */
  readonly scores: readonly EvalScore[];
  /** Exit code of the eval subprocess (0 = success). */
  readonly exitCode: number;
  /** True when the subprocess was killed due to timeout. */
  readonly timedOut: boolean;
  /** Last ~4 KB of combined stdout+stderr for diagnostics. */
  readonly rawTail: string;
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/**
 * Run the eval command as a subprocess, stream its output, and parse scores.
 *
 * The command is executed through `sh -c` with GROVE_TARGET_CID added to the
 * inherited environment. stdout and stderr are each split into lines and every
 * line matching SCORE_LINE_RE contributes one entry to `scores` (metric names
 * are lower-cased). Memory use is bounded: only the last TAIL_BYTES characters
 * of combined output are retained, and a single unterminated line longer than
 * MAX_OUTPUT_BYTES is discarded instead of being buffered.
 *
 * @param command   Shell command line handed to `sh -c`.
 * @param targetCid Value exported to the subprocess as GROVE_TARGET_CID.
 * @param timeoutMs Milliseconds before the process group receives SIGTERM
 *                  (followed by SIGKILL after a 5 s grace period).
 * @returns Parsed scores, the exit code (124 when killed by the timeout, 1 when
 *          killed by any other signal), the timeout flag, and the output tail.
 * @throws Rejects with the spawn error when the shell could not be started.
 */
async function runEvalSubprocess(
  command: string,
  targetCid: string,
  timeoutMs: number,
): Promise<{ scores: EvalScore[]; exitCode: number; timedOut: boolean; rawTail: string }> {
  /** Grace period between SIGTERM and SIGKILL once the timeout has fired. */
  const killGraceMs = 5000;

  return new Promise((resolve, reject) => {
    const scores: EvalScore[] = [];
    let timedOut = false;
    let settled = false;
    let rawTail = "";
    let escalation: ReturnType<typeof setTimeout> | undefined;

    // detached: true puts the child in its own process group so the whole
    // tree (shell + anything it forked) can be signalled via a negative PID.
    const child = spawn("sh", ["-c", command], {
      env: { ...process.env, GROVE_TARGET_CID: targetCid },
      stdio: ["ignore", "pipe", "pipe"],
      detached: true,
    });

    /** Signal the full process group; fall back to the direct child on error. */
    const killGroup = (signal: NodeJS.Signals): void => {
      try {
        if (child.pid !== undefined) process.kill(-child.pid, signal);
      } catch {
        try {
          child.kill(signal);
        } catch {
          // Already exited — ignore.
        }
      }
    };

    const timer = setTimeout(() => {
      timedOut = true;
      killGroup("SIGTERM");
      // Give the process group a moment to exit before SIGKILL.
      escalation = setTimeout(() => killGroup("SIGKILL"), killGraceMs);
    }, timeoutMs);

    /** Cancel both timers so nothing fires (or pins the event loop) after settling. */
    const clearTimers = (): void => {
      clearTimeout(timer);
      if (escalation !== undefined) clearTimeout(escalation);
    };

    /** Keep only the newest TAIL_BYTES characters of combined output. */
    const appendTail = (text: string): void => {
      rawTail =
        text.length >= TAIL_BYTES ? text.slice(-TAIL_BYTES) : (rawTail + text).slice(-TAIL_BYTES);
    };

    /** Record a score when the line is a well-formed GROVE_SCORE entry. */
    const recordScore = (line: string): void => {
      const match = SCORE_LINE_RE.exec(line.trim());
      if (!match) return;
      const [, metric, rawValue] = match;
      if (metric === undefined || rawValue === undefined) return;
      scores.push({ metric: metric.toLowerCase(), value: Number.parseFloat(rawValue) });
    };

    /** Per-stream line splitter that carries partial lines across chunks. */
    const makeLineSplitter = (): { push: (text: string) => void; flush: () => void } => {
      let pending = "";
      // True while the remainder of an oversized line is being thrown away.
      let discarding = false;
      return {
        push: (text: string): void => {
          let start = 0;
          let newline = text.indexOf("\n");
          while (newline !== -1) {
            if (discarding) {
              discarding = false;
            } else {
              recordScore(pending + text.slice(start, newline));
            }
            pending = "";
            start = newline + 1;
            newline = text.indexOf("\n", start);
          }
          if (discarding) return;
          pending += text.slice(start);
          if (pending.length > MAX_OUTPUT_BYTES) {
            // A score line is never this long; drop it rather than grow without bound.
            pending = "";
            discarding = true;
          }
        },
        flush: (): void => {
          if (!discarding && pending) recordScore(pending);
          pending = "";
          discarding = false;
        },
      };
    };

    const splitters = [child.stdout, child.stderr].map((stream) => {
      const lines = makeLineSplitter();
      // Decode in the stream so multi-byte characters split across chunks survive.
      stream?.setEncoding("utf8");
      stream?.on("data", (text: string) => {
        appendTail(text);
        lines.push(text);
      });
      return lines;
    });

    child.on("error", (error: Error) => {
      if (child.pid !== undefined) {
        // The process is running; this is a signal-delivery failure. Surface it
        // in the diagnostics tail and keep waiting for 'close'.
        appendTail(`[grove] eval subprocess error: ${error.message}\n`);
        return;
      }
      if (settled) return;
      settled = true;
      clearTimers();
      reject(error);
    });

    child.on("close", (code) => {
      if (settled) return;
      settled = true;
      // Flush any partial line that didn't end with a newline (e.g. final GROVE_SCORE line).
      for (const lines of splitters) lines.flush();
      // After a timeout, sweep stragglers in the group now instead of leaving a
      // timer that would signal a possibly recycled process-group id later.
      if (timedOut) killGroup("SIGKILL");
      clearTimers();
      resolve({ scores, exitCode: code ?? (timedOut ? 124 : 1), timedOut, rawTail });
    });
  });
}

// ---------------------------------------------------------------------------
// Operation
// ---------------------------------------------------------------------------

/** Run the eval harness against a target CID and return structured scores. */
*/ +export async function evalOperation( + _deps: OperationDeps, + input: EvalInput, +): Promise> { + const { targetCid, evalCommand, timeoutMs } = input; + + // Resolve the command to run. + const command = evalCommand; + if (!command) { + return validationErr( + "evalCommand is required: provide it as input or configure evaluation.eval_command in GROVE.md", + ); + } + + // Validate targetCid is non-empty (format validation; existence check is out of scope + // since eval may run before the contribution is written). + if (!targetCid || targetCid.trim().length === 0) { + return validationErr("targetCid must be a non-empty string"); + } + + // Resolve timeout: input > contract (future) > default. + const resolvedTimeout = timeoutMs ?? DEFAULT_TIMEOUT_MS; + + try { + const { scores, exitCode, timedOut, rawTail } = await runEvalSubprocess( + command, + targetCid.trim(), + resolvedTimeout, + ); + + return ok({ scores, exitCode, timedOut, rawTail }); + } catch (error) { + return err({ + code: OperationErrorCode.InternalError, + message: error instanceof Error ? 
error.message : String(error), + }); + } +} diff --git a/src/core/operations/index.ts b/src/core/operations/index.ts index bda06dde..e14697c9 100644 --- a/src/core/operations/index.ts +++ b/src/core/operations/index.ts @@ -61,6 +61,9 @@ export { } from "./contribute.js"; // Foundation export type { OperationDeps } from "./deps.js"; +// Eval operation +export type { EvalInput, EvalResult, EvalScore } from "./eval.js"; +export { evalOperation } from "./eval.js"; // Lifecycle operation export type { CheckStopResult, StopConditionStatus } from "./lifecycle.js"; export { checkStopOperation } from "./lifecycle.js"; diff --git a/src/core/operations/plan.test.ts b/src/core/operations/plan.test.ts index 3ae74071..0609b888 100644 --- a/src/core/operations/plan.test.ts +++ b/src/core/operations/plan.test.ts @@ -444,7 +444,8 @@ describe("plan routing semantics (Issues 1A + 13A)", () => { // Build a topology router that would route 'planner' -> 'coder' for any // contribution. Plans should still skip handoff creation. const topologyRouter = { - targetsFor: (role: string) => (role === "planner" ? ["coder"] : []), + targetsFor: (role: string) => + role === "planner" ? [{ target: "coder", edgeType: "delegates" as const }] : [], route: async () => { /* fire-and-forget event */ }, diff --git a/src/core/topology-router.ts b/src/core/topology-router.ts index a88684b5..abd51d54 100644 --- a/src/core/topology-router.ts +++ b/src/core/topology-router.ts @@ -1,32 +1,51 @@ import type { EventBus, GroveEvent } from "./event-bus.js"; -import type { AgentTopology } from "./topology.js"; +import type { AgentTopology, RoleEdge } from "./topology.js"; /** * Routes contribution events through topology edges. * * Given a contribution from a source role, finds all outgoing edges * from that role and publishes events to the target roles. + * + * Edge types are structurally preserved in the edge map (deduplicated by + * (target, edgeType) pair). 
Behavioral semantics for edge types are + * informational in the current protocol version — routing behavior is flat + * (all edges produce the same handoff pattern). Behavioral routing is planned + * for a future protocol version. + * + * Memory bound: max 50 roles × 50 edges = 2500 RoleEdge objects (~100KB). + * All caps are enforced by the contract Zod schema. */ export class TopologyRouter { private readonly topology: AgentTopology; private readonly eventBus: EventBus; - private readonly edgeMap: ReadonlyMap; + // source role → outgoing edges, deduplicated by (target, edgeType) pair + private readonly edgeMap: ReadonlyMap; constructor(topology: AgentTopology, eventBus: EventBus) { this.topology = topology; this.eventBus = eventBus; - // Pre-compute: source role -> target roles - const map = new Map(); + // Pre-compute: source role -> outgoing RoleEdge[], deduped by (target, edgeType). + // Use a Set keyed by "target:edgeType" for O(1) dedup instead of O(n) Array.includes. + const map = new Map(); + const seen = new Map>(); for (const role of topology.roles) { if (role.edges) { for (const edge of role.edges) { - let targets = map.get(role.name); - if (!targets) { - targets = []; - map.set(role.name, targets); + let edges = map.get(role.name); + if (!edges) { + edges = []; + map.set(role.name, edges); } - if (!targets.includes(edge.target)) { - targets.push(edge.target); + let seenForRole = seen.get(role.name); + if (!seenForRole) { + seenForRole = new Set(); + seen.set(role.name, seenForRole); + } + const key = `${edge.target}:${edge.edgeType}`; + if (!seenForRole.has(key)) { + seenForRole.add(key); + edges.push({ target: edge.target, edgeType: edge.edgeType }); } } } @@ -36,25 +55,31 @@ export class TopologyRouter { /** * Route an event from a source role to all downstream targets. - * Returns the list of target roles that received the event. 
+ * Publishes one event per unique target role (deduplicates by target when + * multiple edge types point to the same target). Returns the list of unique + * target roles that received the event. */ route(sourceRole: string, payload: Record): readonly string[] { - const targets = this.edgeMap.get(sourceRole); - if (!targets || targets.length === 0) return []; + const edges = this.edgeMap.get(sourceRole); + if (!edges || edges.length === 0) return []; const timestamp = new Date().toISOString(); const routedTo: string[] = []; + const publishedTargets = new Set(); - for (const targetRole of targets) { - const event: GroveEvent = { - type: "contribution", - sourceRole, - targetRole, - payload, - timestamp, - }; - this.eventBus.publish(event); - routedTo.push(targetRole); + for (const edge of edges) { + if (!publishedTargets.has(edge.target)) { + publishedTargets.add(edge.target); + const event: GroveEvent = { + type: "contribution", + sourceRole, + targetRole: edge.target, + payload, + timestamp, + }; + this.eventBus.publish(event); + routedTo.push(edge.target); + } } return routedTo; @@ -77,8 +102,14 @@ export class TopologyRouter { } } - /** Get the target roles for a given source role. */ - targetsFor(sourceRole: string): readonly string[] { + /** + * Get all outgoing edges for a given source role. + * + * Returns all distinct (target, edgeType) pairs. Multiple edges to the + * same target with different edge types are preserved as separate entries. + * Returns an empty array for unknown roles. + */ + targetsFor(sourceRole: string): readonly RoleEdge[] { return this.edgeMap.get(sourceRole) ?? 
[]; } } diff --git a/src/mcp/serve-http.ts b/src/mcp/serve-http.ts index 5698f089..4ab000cc 100644 --- a/src/mcp/serve-http.ts +++ b/src/mcp/serve-http.ts @@ -56,6 +56,7 @@ let nexusUrl: string | undefined; let nexusApiKey: string | undefined; let zoneId = "default"; let nexusClient: import("../nexus/nexus-http-client.js").NexusHttpClient | undefined; +// biome-ignore lint/suspicious/noEmptyBlockStatements: default no-op replaced in try block let closeStores: () => void = () => {}; try { @@ -305,6 +306,7 @@ async function buildScopedDeps(sessionId: string | undefined): Promise { const sid = transport.sessionId; diff --git a/src/mcp/serve.ts b/src/mcp/serve.ts index 53a1ab7f..206e3037 100644 --- a/src/mcp/serve.ts +++ b/src/mcp/serve.ts @@ -276,6 +276,7 @@ try { const onContributionWritten = envSessionId && !nexusClient ? (cid: string) => { + // biome-ignore lint/suspicious/noEmptyBlockStatements: fire-and-forget, errors intentionally swallowed void runtime.goalSessionStore.addContributionToSession(envSessionId, cid).catch(() => {}); } : undefined; @@ -302,6 +303,10 @@ try { const hasMetrics = loadedContract?.metrics !== undefined && Object.keys(loadedContract.metrics).length > 0; + // grove_eval executes arbitrary sh -c. Disabled by default on all transports; + // enable with GROVE_MCP_EVAL_ENABLED=true (stdio) or AUTH_TOKEN + + // GROVE_MCP_EVAL_ENABLED=true (HTTP — enforced in serve-http.ts). + const evalEnabled = process.env.GROVE_MCP_EVAL_ENABLED === "true"; preset = contractMode === "evaluation" ? 
{ @@ -315,6 +320,7 @@ try { messaging: false, plans: true, goals: true, + eval: evalEnabled, } : { queries: true, @@ -327,6 +333,7 @@ try { messaging: false, plans: false, goals: true, + eval: evalEnabled, }; close = () => { diff --git a/src/mcp/server.integration.test.ts b/src/mcp/server.integration.test.ts index 5e3ac284..31892b0d 100644 --- a/src/mcp/server.integration.test.ts +++ b/src/mcp/server.integration.test.ts @@ -34,7 +34,7 @@ describe("MCP server integration", () => { testDeps = await createTestMcpDeps(); deps = testDeps.deps; - const server = await createMcpServer(deps); + const server = await createMcpServer(deps, { eval: true }); const [clientTransport, serverTransport] = InMemoryTransport.createLinkedPair(); client = new Client({ name: "test-client", version: "0.0.1" }); @@ -53,7 +53,7 @@ describe("MCP server integration", () => { await testDeps.cleanup(); }); - test("lists all 37 tools", async () => { + test("lists all 38 tools", async () => { const tools = await client.listTools(); const toolNames = tools.tools.map((t) => t.name).sort(); expect(toolNames).toEqual([ @@ -70,6 +70,7 @@ describe("MCP server integration", () => { "grove_create_session", "grove_discuss", "grove_done", + "grove_eval", "grove_frontier", "grove_get_outcome", "grove_goal", diff --git a/src/mcp/server.test.ts b/src/mcp/server.test.ts index 9d1c412e..a6b746bb 100644 --- a/src/mcp/server.test.ts +++ b/src/mcp/server.test.ts @@ -55,7 +55,7 @@ describe("createMcpServer preset scoping", () => { "grove_submit_work", ]; - // --- Full tool list (matches integration test expectation) --------------- + // --- Full tool list (no preset — grove_eval excluded, it is opt-in via eval:true) --- const allTools = [ "ask_user", @@ -99,16 +99,24 @@ describe("createMcpServer preset scoping", () => { // ----------------------------------------------------------------------- - test("no preset registers all tools (backwards compatible)", async () => { + test("no preset registers all tools except 
grove_eval (eval is opt-in)", async () => { const server = await createMcpServer(deps); const names = getRegisteredToolNames(server); expect(names).toEqual(allTools); + expect(names).not.toContain("grove_eval"); }); - test("empty preset object registers all tools (defaults are true)", async () => { + test("empty preset object registers all tools except grove_eval", async () => { const server = await createMcpServer(deps, {}); const names = getRegisteredToolNames(server); expect(names).toEqual(allTools); + expect(names).not.toContain("grove_eval"); + }); + + test("eval: true enables grove_eval", async () => { + const server = await createMcpServer(deps, { eval: true }); + const names = getRegisteredToolNames(server); + expect(names).toContain("grove_eval"); }); test("claims: false excludes claim tools but keeps others", async () => { @@ -230,6 +238,16 @@ describe("createMcpServer preset scoping", () => { } }); + test("eval: false excludes eval tool", async () => { + const server = await createMcpServer(deps, { eval: false }); + const names = getRegisteredToolNames(server); + expect(names).not.toContain("grove_eval"); + // Contribution tools still present + for (const t of contributionTools) { + expect(names).toContain(t); + } + }); + test("contribution tools are always registered even when everything is disabled", async () => { const allDisabled: McpPresetConfig = { queries: false, @@ -242,6 +260,7 @@ describe("createMcpServer preset scoping", () => { messaging: false, plans: false, goals: false, + eval: false, }; const server = await createMcpServer(deps, allDisabled); diff --git a/src/mcp/server.ts b/src/mcp/server.ts index d9976003..2493d290 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -17,6 +17,7 @@ import { registerBountyTools } from "./tools/bounties.js"; import { registerClaimTools } from "./tools/claims.js"; import { registerContributionTools } from "./tools/contributions.js"; import { registerDoneTools } from "./tools/done.js"; +import { 
registerEvalTools } from "./tools/eval.js"; import { registerGoalTools } from "./tools/goal.js"; import { registerHandoffTools } from "./tools/handoffs.js"; import { registerIngestTools } from "./tools/ingest.js"; @@ -54,6 +55,8 @@ export interface McpPresetConfig { readonly plans?: boolean; /** Register goal/session tools. Default: true. */ readonly goals?: boolean; + /** Register eval harness tool (grove_eval). Default: false (opt-in via GROVE_MCP_EVAL_ENABLED). */ + readonly eval?: boolean; } // --------------------------------------------------------------------------- @@ -99,6 +102,7 @@ export async function createMcpServer(deps: McpDeps, preset?: McpPresetConfig): registerGoalTools(server, deps); registerSessionTools(server, deps); } + if (preset?.eval === true) registerEvalTools(server, deps); // ask_user is always registered (core functionality). await registerAskUserTools(server); diff --git a/src/mcp/tools/eval.test.ts b/src/mcp/tools/eval.test.ts new file mode 100644 index 00000000..213f0d6d --- /dev/null +++ b/src/mcp/tools/eval.test.ts @@ -0,0 +1,150 @@ +/** + * Tests for grove_eval MCP tool and evalOperation. + * + * Covers: + * 1. Success path: valid command emitting GROVE_SCORE lines + * 2. evalCommand override respected + * 3. Invalid/missing targetCid → VALIDATION_ERROR + * 4. Missing evalCommand → VALIDATION_ERROR + * 5. Subprocess timeout → timedOut: true + * 6. 
Non-zero exit code preserved in result + */ + +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; + +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import type { McpDeps } from "../deps.js"; +import type { TestMcpDeps } from "../test-helpers.js"; +import { createTestMcpDeps } from "../test-helpers.js"; +import { registerEvalTools } from "./eval.js"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +async function callTool( + server: McpServer, + name: string, + args: Record, +): Promise<{ isError: boolean | undefined; text: string }> { + const registeredTools = ( + server as unknown as { + _registeredTools: Record Promise }>; + } + )._registeredTools; + const tool = registeredTools[name]; + if (!tool) throw new Error(`Tool ${name} not registered`); + const result = (await tool.handler(args)) as { + isError?: boolean; + content: Array<{ type: string; text: string }>; + }; + return { + isError: result.isError, + text: result.content[0]?.text ?? "", + }; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe("grove_eval", () => { + let testDeps: TestMcpDeps; + let deps: McpDeps; + let server: McpServer; + + beforeEach(async () => { + testDeps = await createTestMcpDeps(); + deps = testDeps.deps; + server = new McpServer({ name: "test", version: "0.0.1" }, { capabilities: { tools: {} } }); + registerEvalTools(server, deps); + }); + + afterEach(async () => { + await testDeps.cleanup(); + }); + + // 1. 
Success path: command emits GROVE_SCORE lines + test("returns parsed scores for valid GROVE_SCORE output", async () => { + const result = await callTool(server, "grove_eval", { + targetCid: "blake3:abc123", + evalCommand: "echo 'GROVE_SCORE val_bpb=0.92' && echo 'GROVE_SCORE peak_vram_gb=14.3'", + }); + + expect(result.isError).toBeUndefined(); + const data = JSON.parse(result.text); + expect(data.timedOut).toBe(false); + expect(data.exitCode).toBe(0); + expect(data.scores).toHaveLength(2); + expect(data.scores).toContainEqual({ metric: "val_bpb", value: 0.92 }); + expect(data.scores).toContainEqual({ metric: "peak_vram_gb", value: 14.3 }); + }); + + // 2. evalCommand override respected (GROVE_TARGET_CID env var available) + test("passes GROVE_TARGET_CID as env var to the subprocess", async () => { + const result = await callTool(server, "grove_eval", { + targetCid: "blake3:deadbeef", + // Output a score only if GROVE_TARGET_CID is set and non-empty + evalCommand: 'test -n "$GROVE_TARGET_CID" && echo "GROVE_SCORE env_set=1"', + }); + + expect(result.isError).toBeUndefined(); + const data = JSON.parse(result.text); + expect(data.exitCode).toBe(0); + expect(data.scores).toContainEqual({ metric: "env_set", value: 1 }); + }); + + // 3. Missing / empty targetCid → VALIDATION_ERROR + test("returns VALIDATION_ERROR for empty targetCid", async () => { + const result = await callTool(server, "grove_eval", { + targetCid: " ", + evalCommand: "echo ok", + }); + + expect(result.isError).toBe(true); + expect(result.text).toContain("VALIDATION_ERROR"); + expect(result.text).toContain("targetCid"); + }); + + // 4. 
Missing evalCommand → VALIDATION_ERROR + test("returns VALIDATION_ERROR when evalCommand is absent", async () => { + const result = await callTool(server, "grove_eval", { + targetCid: "blake3:abc123", + // evalCommand intentionally omitted + }); + + expect(result.isError).toBe(true); + expect(result.text).toContain("VALIDATION_ERROR"); + expect(result.text).toContain("evalCommand"); + }); + + // 5. Subprocess timeout → timedOut: true + test("returns timedOut=true when subprocess exceeds timeoutMs", async () => { + const result = await callTool(server, "grove_eval", { + targetCid: "blake3:abc123", + evalCommand: "sleep 60", + timeoutMs: 500, // very short timeout + }); + + expect(result.isError).toBeUndefined(); + const data = JSON.parse(result.text); + expect(data.timedOut).toBe(true); + // exit code 124 is conventional for timeout-killed processes + expect(data.exitCode).toBe(124); + }, 10_000); // generous wall-clock timeout for this test + + // 6. Non-zero exit code preserved (scores still returned if any were emitted) + test("preserves non-zero exit code in result", async () => { + const result = await callTool(server, "grove_eval", { + targetCid: "blake3:abc123", + evalCommand: "echo 'GROVE_SCORE accuracy=0.75' && exit 2", + }); + + expect(result.isError).toBeUndefined(); + const data = JSON.parse(result.text); + expect(data.exitCode).toBe(2); + expect(data.timedOut).toBe(false); + // Score emitted before exit still captured + expect(data.scores).toContainEqual({ metric: "accuracy", value: 0.75 }); + }); +}); diff --git a/src/mcp/tools/eval.ts b/src/mcp/tools/eval.ts new file mode 100644 index 00000000..f1bec448 --- /dev/null +++ b/src/mcp/tools/eval.ts @@ -0,0 +1,67 @@ +/** + * MCP tool for the eval harness. + * + * grove_eval — Run the contract's eval harness against a target CID and + * return structured metric scores. + * + * All business logic is delegated to the shared operations layer. 
+ */ + +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { z } from "zod"; + +import { evalOperation } from "../../core/operations/index.js"; +import type { McpDeps } from "../deps.js"; +import { toMcpResult, toOperationDeps } from "../operation-adapter.js"; + +// --------------------------------------------------------------------------- +// Tool registration +// --------------------------------------------------------------------------- + +export function registerEvalTools(server: McpServer, deps: McpDeps): void { + const opDeps = toOperationDeps(deps); + + server.registerTool( + "grove_eval", + { + description: + "Run the grove eval harness against a target contribution CID and return structured " + + "metric scores. The eval command is spawned as a subprocess (via sh -c) with " + + "GROVE_TARGET_CID set to targetCid. Score lines must follow the format: " + + "'GROVE_SCORE <metric>=<value>' on stdout or stderr. " + + "Call grove_contribute separately to record results as a reproduction contribution.", + inputSchema: { + targetCid: z + .string() + .min(1) + .describe( + "CID of the contribution artifact to evaluate (passed as GROVE_TARGET_CID env var)", + ), + evalCommand: z + .string() + .min(1) + .describe( + "Shell command to execute as the eval harness (e.g. 'python eval.py'). 
" + + "Required in the current protocol version.", + ), + timeoutMs: z + .number() + .int() + .min(1000) + .max(3_600_000) + .optional() + .describe( + "Timeout in milliseconds before the subprocess is killed (default: 300000 = 5 min).", + ), + }, + }, + async (args) => { + const result = await evalOperation(opDeps, { + targetCid: args.targetCid, + evalCommand: args.evalCommand, + timeoutMs: args.timeoutMs, + }); + return toMcpResult(result); + }, + ); +} diff --git a/src/tui/app.tsx b/src/tui/app.tsx index c793f5e1..a75a40a1 100644 --- a/src/tui/app.tsx +++ b/src/tui/app.tsx @@ -40,6 +40,7 @@ import { type TuiDataProvider, } from "./provider.js"; import { useSpawnManager } from "./spawn-manager-context.js"; +import { theme } from "./theme.js"; /** Props for the root App component. */ export interface AppProps { @@ -972,20 +973,32 @@ export function App({ focusedPanel={panels.state.focused} keybindingOverrides={keybindingOverrides} /> - + {paletteVisible && ( + + + + )} r.name === profile.role)?.edges; + const edgeSuffix = + roleEdges && roleEdges.length > 0 ? ` → ${roleEdges.map((e) => e.target).join(", ")}` : ""; items.push({ kind: "spawn", id: profile.role, label: `spawn: ${profile.name} [${profile.platform}]`, enabled: check.allowed, - detail: `${check.currentInstances}/${max}${suffix}`, + detail: `${check.currentInstances}/${max}${suffix}${edgeSuffix}`, }); } } @@ -210,12 +213,16 @@ export function buildPaletteItems( const check = checkSpawn(topology, role.name, activeClaims, parentAgentId, activeSpawnCounts); const max = check.maxInstances !== undefined ? String(check.maxInstances) : "\u221E"; const suffix = !check.allowed ? " (at capacity)" : ""; + const edgeSuffix = + role.edges && role.edges.length > 0 + ? 
` → ${role.edges.map((e) => e.target).join(", ")}` + : ""; items.push({ kind: "spawn", id: role.name, label: `spawn: ${role.name}`, enabled: check.allowed, - detail: `${check.currentInstances}/${max}${suffix}`, + detail: `${check.currentInstances}/${max}${suffix}${edgeSuffix}`, }); } } diff --git a/src/tui/screens/screen-manager.tsx b/src/tui/screens/screen-manager.tsx index a942aae4..118ec294 100644 --- a/src/tui/screens/screen-manager.tsx +++ b/src/tui/screens/screen-manager.tsx @@ -156,6 +156,7 @@ export const ScreenManager: React.NamedExoticComponent = Rea try { const { writeFileSync, renameSync } = await import("node:fs"); const { join } = await import("node:path"); + // biome-ignore lint/style/noNonNullAssertion: groveDir is set at startup before any session writes const finalPath = join(appProps.groveDir!, "current-session.json"); const tmpPath = `${finalPath}.${process.pid}.${Date.now()}.tmp`; writeFileSync(tmpPath, JSON.stringify({ sessionId: id }, null, 2), "utf-8"); diff --git a/src/tui/spawn-manager.ts b/src/tui/spawn-manager.ts index 57f4da1a..7bd1a0a1 100644 --- a/src/tui/spawn-manager.ts +++ b/src/tui/spawn-manager.ts @@ -993,9 +993,11 @@ export class SpawnManager { .getHandoffs({ sourceCid: c.cid, status: "pending_pickup" }) .then((hs) => { for (const h of hs) { + // biome-ignore lint/suspicious/noEmptyBlockStatements: delivery errors silently swallowed per fire-and-forget pattern void hp.markHandoffDelivered(h.handoffId).catch(() => {}); } }) + // biome-ignore lint/suspicious/noEmptyBlockStatements: getHandoffs errors silently swallowed .catch(() => {}); } }