From e820e59cbd430cd308a4734747b381996c573996 Mon Sep 17 00:00:00 2001 From: Edward Irby Date: Tue, 10 Mar 2026 21:22:21 -0700 Subject: [PATCH 1/7] feat: replace pipeline with agent-build trial tooling Full replacement of src/ and bin/ with trial runner, schemas, and CLI utilities ported from agent-build. Strips BP-specific types (no RISK_TAG, SelectionBid, TOOL_STATUS). Inlines simplified TrajectoryStepSchema with 3 step types. BREAKING: New CLI entry point, new exports structure. Co-Authored-By: Claude Opus 4.6 --- .agents/skills/compare-trials/SKILL.md | 96 ++ .../compare-trials/references/bootstrap.ts | 58 + .../compare-trials/references/compare.ts | 195 +++ .agents/skills/trial-adapters/SKILL.md | 258 ++++ .agents/skills/trial-runner/SKILL.md | 182 +++ bin/cli.ts | 160 --- bin/tests/cli.spec.ts | 529 -------- bun.lock | 5 - package.json | 23 +- src/cli.ts | 22 + src/cli.utils.ts | 99 ++ src/commands.ts | 33 - src/commands/balance.ts | 245 ---- src/commands/calibrate.ts | 304 ----- src/commands/capture.ts | 391 ------ src/commands/execution.ts | 245 ---- src/commands/summarize.ts | 226 ---- src/commands/tests/balance-helpers.spec.ts | 279 ----- src/commands/tests/calibrate-helpers.spec.ts | 226 ---- src/commands/tests/capture-cli.spec.ts | 274 ----- src/commands/tests/capture-helpers.spec.ts | 634 ---------- src/commands/tests/summarize-helpers.spec.ts | 339 ------ .../tests/trials-calculations.spec.ts | 209 ---- src/commands/tests/trials-cli.spec.ts | 215 ---- src/commands/trials.ts | 377 ------ src/commands/validate-refs.ts | 171 --- src/core.ts | 46 - src/core/core.ts | 51 - src/core/loading.ts | 207 ---- src/core/output.ts | 120 -- src/core/streaming.ts | 172 --- src/core/tests/core.spec.ts | 310 ----- src/core/tests/streaming.spec.ts | 399 ------ src/core/tests/worker-pool.spec.ts | 377 ------ src/core/trajectory.ts | 172 --- src/core/worker-pool.ts | 220 ---- src/graders.ts | 39 - src/graders/bootstrap.ts | 135 --- src/graders/compare-statistical.ts | 115 
-- src/graders/compare-weighted.ts | 112 -- src/graders/tests/bootstrap.spec.ts | 169 --- src/graders/tests/compare-graders.spec.ts | 293 ----- .../tests/trials-compare-graders.spec.ts | 358 ------ src/graders/trials-compare-statistical.ts | 183 --- src/graders/trials-compare-weighted.ts | 128 -- src/harness.ts | 46 - src/headless.ts | 72 -- src/headless/headless-cli.ts | 428 ------- src/headless/headless-history-builder.ts | 141 --- src/headless/headless-output-parser.ts | 388 ------ src/headless/headless-session-manager.ts | 590 --------- src/headless/headless.schemas.ts | 321 ----- src/headless/headless.types.ts | 19 - .../tests/fixtures/claude-headless.json | 40 - .../tests/fixtures/gemini-headless.json | 37 - src/headless/tests/headless.spec.ts | 873 -------------- src/integration_tests/claude.spec.ts | 157 --- src/integration_tests/gemini.spec.ts | 139 --- src/pipeline.ts | 34 - src/pipeline/compare-format-detection.ts | 100 -- src/pipeline/compare-trials.ts | 800 ------------ src/pipeline/compare-utils.ts | 85 -- src/pipeline/compare.ts | 818 ------------- src/pipeline/extract.ts | 241 ---- src/pipeline/format.ts | 291 ----- src/pipeline/grade.ts | 175 --- src/pipeline/pipeline.ts | 42 - src/pipeline/pipeline.types.ts | 325 ----- src/pipeline/run.ts | 414 ------- .../tests/compare-format-detection.spec.ts | 142 --- .../tests/compare-statistical.spec.ts | 289 ----- src/pipeline/tests/compare-trials.spec.ts | 592 --------- src/pipeline/tests/compare-utils.spec.ts | 128 -- src/pipeline/tests/pipeline.spec.ts | 356 ------ src/schemas.ts | 134 -- src/schemas/constants.ts | 94 -- src/schemas/grader-loader.ts | 203 ---- src/schemas/schemas-cli.ts | 227 ---- src/schemas/schemas.ts | 1073 ----------------- src/schemas/tests/constants.spec.ts | 121 -- .../tests/fixtures/grader-bad-module.ts | 5 - .../tests/fixtures/grader-exec-fail.py | 9 - .../tests/fixtures/grader-exec-invalid.py | 6 - src/schemas/tests/fixtures/grader-exec.py | 29 - 
src/schemas/tests/fixtures/grader-git.ts | 116 -- src/schemas/tests/fixtures/grader-module.ts | 14 - src/schemas/tests/grader-git.spec.ts | 222 ---- src/schemas/tests/grader-loader.spec.ts | 153 --- src/schemas/tests/schemas-cli.spec.ts | 142 --- src/schemas/tests/schemas.spec.ts | 606 ---------- src/tests/trial.spec.ts | 723 +++++++++++ src/trial.constants.ts | 11 + src/trial.schemas.ts | 288 +++++ src/trial.ts | 319 +++++ src/trial.utils.ts | 444 +++++++ tsconfig.json | 2 +- 96 files changed, 2703 insertions(+), 19722 deletions(-) create mode 100644 .agents/skills/compare-trials/SKILL.md create mode 100644 .agents/skills/compare-trials/references/bootstrap.ts create mode 100644 .agents/skills/compare-trials/references/compare.ts create mode 100644 .agents/skills/trial-adapters/SKILL.md create mode 100644 .agents/skills/trial-runner/SKILL.md delete mode 100644 bin/cli.ts delete mode 100644 bin/tests/cli.spec.ts create mode 100644 src/cli.ts create mode 100644 src/cli.utils.ts delete mode 100644 src/commands.ts delete mode 100644 src/commands/balance.ts delete mode 100644 src/commands/calibrate.ts delete mode 100644 src/commands/capture.ts delete mode 100644 src/commands/execution.ts delete mode 100644 src/commands/summarize.ts delete mode 100644 src/commands/tests/balance-helpers.spec.ts delete mode 100644 src/commands/tests/calibrate-helpers.spec.ts delete mode 100644 src/commands/tests/capture-cli.spec.ts delete mode 100644 src/commands/tests/capture-helpers.spec.ts delete mode 100644 src/commands/tests/summarize-helpers.spec.ts delete mode 100644 src/commands/tests/trials-calculations.spec.ts delete mode 100644 src/commands/tests/trials-cli.spec.ts delete mode 100644 src/commands/trials.ts delete mode 100644 src/commands/validate-refs.ts delete mode 100644 src/core.ts delete mode 100644 src/core/core.ts delete mode 100644 src/core/loading.ts delete mode 100644 src/core/output.ts delete mode 100644 src/core/streaming.ts delete mode 100644 
src/core/tests/core.spec.ts delete mode 100644 src/core/tests/streaming.spec.ts delete mode 100644 src/core/tests/worker-pool.spec.ts delete mode 100644 src/core/trajectory.ts delete mode 100644 src/core/worker-pool.ts delete mode 100644 src/graders.ts delete mode 100644 src/graders/bootstrap.ts delete mode 100644 src/graders/compare-statistical.ts delete mode 100644 src/graders/compare-weighted.ts delete mode 100644 src/graders/tests/bootstrap.spec.ts delete mode 100644 src/graders/tests/compare-graders.spec.ts delete mode 100644 src/graders/tests/trials-compare-graders.spec.ts delete mode 100644 src/graders/trials-compare-statistical.ts delete mode 100644 src/graders/trials-compare-weighted.ts delete mode 100644 src/harness.ts delete mode 100644 src/headless.ts delete mode 100644 src/headless/headless-cli.ts delete mode 100644 src/headless/headless-history-builder.ts delete mode 100644 src/headless/headless-output-parser.ts delete mode 100644 src/headless/headless-session-manager.ts delete mode 100644 src/headless/headless.schemas.ts delete mode 100644 src/headless/headless.types.ts delete mode 100644 src/headless/tests/fixtures/claude-headless.json delete mode 100644 src/headless/tests/fixtures/gemini-headless.json delete mode 100644 src/headless/tests/headless.spec.ts delete mode 100644 src/integration_tests/claude.spec.ts delete mode 100644 src/integration_tests/gemini.spec.ts delete mode 100644 src/pipeline.ts delete mode 100644 src/pipeline/compare-format-detection.ts delete mode 100644 src/pipeline/compare-trials.ts delete mode 100644 src/pipeline/compare-utils.ts delete mode 100644 src/pipeline/compare.ts delete mode 100644 src/pipeline/extract.ts delete mode 100644 src/pipeline/format.ts delete mode 100644 src/pipeline/grade.ts delete mode 100644 src/pipeline/pipeline.ts delete mode 100644 src/pipeline/pipeline.types.ts delete mode 100644 src/pipeline/run.ts delete mode 100644 src/pipeline/tests/compare-format-detection.spec.ts delete mode 100644 
src/pipeline/tests/compare-statistical.spec.ts delete mode 100644 src/pipeline/tests/compare-trials.spec.ts delete mode 100644 src/pipeline/tests/compare-utils.spec.ts delete mode 100644 src/pipeline/tests/pipeline.spec.ts delete mode 100644 src/schemas.ts delete mode 100644 src/schemas/constants.ts delete mode 100644 src/schemas/grader-loader.ts delete mode 100644 src/schemas/schemas-cli.ts delete mode 100644 src/schemas/schemas.ts delete mode 100644 src/schemas/tests/constants.spec.ts delete mode 100644 src/schemas/tests/fixtures/grader-bad-module.ts delete mode 100755 src/schemas/tests/fixtures/grader-exec-fail.py delete mode 100755 src/schemas/tests/fixtures/grader-exec-invalid.py delete mode 100755 src/schemas/tests/fixtures/grader-exec.py delete mode 100644 src/schemas/tests/fixtures/grader-git.ts delete mode 100644 src/schemas/tests/fixtures/grader-module.ts delete mode 100644 src/schemas/tests/grader-git.spec.ts delete mode 100644 src/schemas/tests/grader-loader.spec.ts delete mode 100644 src/schemas/tests/schemas-cli.spec.ts delete mode 100644 src/schemas/tests/schemas.spec.ts create mode 100644 src/tests/trial.spec.ts create mode 100644 src/trial.constants.ts create mode 100644 src/trial.schemas.ts create mode 100644 src/trial.ts create mode 100644 src/trial.utils.ts diff --git a/.agents/skills/compare-trials/SKILL.md b/.agents/skills/compare-trials/SKILL.md new file mode 100644 index 0000000..9ac63be --- /dev/null +++ b/.agents/skills/compare-trials/SKILL.md @@ -0,0 +1,96 @@ +--- +name: compare-trials +description: Compare trial results from the trial runner. Teaches agents to write comparison and analysis scripts against TrialResult JSONL files for pass@k reliability analysis, bootstrap confidence intervals, and flakiness detection. +license: ISC +--- + +# Compare Trials + +## Purpose + +This skill teaches agents how to analyze and compare `TrialResult` JSONL output from the `trial` runner. 
Instead of a built-in comparison command, agents write scripts directly — the analysis is domain-specific and benefits from code-level flexibility. + +**Use this when:** +- Comparing trial results from multiple adapter runs +- Computing statistical metrics (bootstrap confidence intervals, effect sizes) +- Analyzing flakiness (pass@k vs pass^k gap) +- Generating comparison reports + +## TrialResult Schema + +Each line in a trial JSONL file matches this shape: + +```typescript +type TrialResult = { + id: string // Prompt identifier + input: string | string[] // Original prompt + hint?: string // Grader context + k: number // Trials per prompt + passRate?: number // passes / k + passAtK?: number // 1 - (1 - passRate)^k + passExpK?: number // passRate^k + trials: TrialEntry[] // Individual trial data + metadata?: Record // Custom metadata +} + +type TrialEntry = { + trialNum: number + output: string + trajectory?: TrajectoryStep[] + duration: number // Wall-clock ms + timing?: { total?: number; inputTokens?: number; outputTokens?: number } + exitCode?: number | null + timedOut?: boolean + pass?: boolean + score?: number + reasoning?: string + outcome?: Record +} +``` + +## Key Metrics + +| Metric | Formula | Meaning | +|--------|---------|---------| +| `passRate` | passes / k | Raw success rate | +| `passAtK` | 1 - (1 - passRate)^k | Capability — can it solve this at all? | +| `passExpK` | passRate^k | Reliability — does it solve this every time? | +| `flakiness` | passAtK - passExpK | Gap between capability and reliability | + +## How to Compare + +1. Load two (or more) JSONL files +2. Index results by prompt `id` +3. Compute aggregate metrics per run +4. Bootstrap for confidence intervals +5. 
Output comparison as structured JSON + +## Reference Implementation + +**[compare.ts](references/compare.ts)** — Complete comparison script + +Takes two JSONL file paths as arguments, loads and indexes them, computes per-run and per-prompt metrics, runs bootstrap resampling for confidence intervals, and outputs a structured comparison report. + +**[bootstrap.ts](references/bootstrap.ts)** — Bootstrap sampling utility + +Reusable bootstrap function for computing confidence intervals on any metric. Used by the comparison script for reliable statistical comparisons. + +## Usage Pattern + +```bash +# Agent writes and runs a comparison script +bun run compare.ts baseline.jsonl challenger.jsonl > report.json + +# Or inline in the trial runner +const results = await runTrial({ adapter, prompts, k: 10, grader }) +// Agent analyzes results array directly — no file round-trip needed +``` + +## Key Points for Agents + +- `TrialResult` files are JSONL (one JSON object per line) +- Always match results by `id` — prompts may arrive in different order +- Bootstrap needs at least 30 samples for reliable CIs (use 1000+ resamples) +- Flakiness = passAtK - passExpK measures inconsistency +- Token usage is optional — only present if the adapter reports it +- Comparison is agent-written code, not a built-in command diff --git a/.agents/skills/compare-trials/references/bootstrap.ts b/.agents/skills/compare-trials/references/bootstrap.ts new file mode 100644 index 0000000..dbc9a19 --- /dev/null +++ b/.agents/skills/compare-trials/references/bootstrap.ts @@ -0,0 +1,58 @@ +/** + * Bootstrap sampling utility for confidence intervals. + * + * @remarks + * Reusable bootstrap function — computes confidence intervals for any + * numeric metric by resampling with replacement. + */ + +/** + * Compute bootstrap confidence interval for a metric. 
/**
 * Compute a percentile-bootstrap confidence interval for a metric.
 *
 * @remarks
 * Resamples `values` with replacement `resamples` times, applies `statFn`
 * to every resample, sorts the resulting statistics and reads the interval
 * bounds off the sorted list. Resampling uses `Math.random`, so the bounds
 * vary slightly between calls.
 *
 * @param values - Array of observed metric values
 * @param statFn - Function to compute the statistic (default: mean)
 * @param options - Bootstrap configuration (`resamples` defaults to 1000,
 *   `confidence` defaults to 0.95)
 * @returns [lower, upper] confidence interval bounds; `[0, 0]` for empty input
 */
export const bootstrap = (
  values: number[],
  statFn: (samples: number[]) => number = mean,
  options: { resamples?: number; confidence?: number } = {},
): [number, number] => {
  const { resamples = 1000, confidence = 0.95 } = options

  if (values.length === 0) return [0, 0]

  if (values.length === 1) {
    // Every resample of a single observation is that observation, so the
    // interval collapses to the statistic of the one-element sample.
    const point = statFn([values[0] ?? 0])
    return [point, point]
  }

  const stats: number[] = []

  for (let i = 0; i < resamples; i++) {
    const sample = Array.from({ length: values.length }, () => {
      const v = values[Math.floor(Math.random() * values.length)]
      return v ?? 0
    })
    stats.push(statFn(sample))
  }

  stats.sort((a, b) => a - b)

  // Clamp percentile positions into the valid index range. Without this the
  // upper index equals stats.length when confidence is 1 (alpha = 0), which
  // reads past the end of the array and silently yields 0.
  const lastIndex = stats.length - 1
  const clampIndex = (index: number): number => Math.min(lastIndex, Math.max(0, index))

  const alpha = (1 - confidence) / 2
  const lower = stats[clampIndex(Math.floor(alpha * stats.length))] ?? 0
  const upper = stats[clampIndex(Math.floor((1 - alpha) * stats.length))] ?? 0

  return [lower, upper]
}

/** Compute the arithmetic mean of an array (0 for an empty array). */
export const mean = (values: number[]): number => {
  if (values.length === 0) return 0
  return values.reduce((sum, v) => sum + v, 0) / values.length
}

/**
 * Compute the median of an array (0 for an empty array).
 *
 * The input does not need to be sorted: a sorted copy is made internally and
 * the caller's array is left untouched.
 */
export const median = (values: number[]): number => {
  if (values.length === 0) return 0
  const sorted = [...values].sort((a, b) => a - b)
  const mid = Math.floor(sorted.length / 2)
  return sorted.length % 2 === 0 ? ((sorted[mid - 1] ?? 0) + (sorted[mid] ?? 0)) / 2 : (sorted[mid] ?? 0)
}
/**
 * Load a JSONL file of trial results.
 *
 * @remarks
 * Reads the whole file via `Bun.file`, splits it on newlines and parses each
 * non-blank line as one JSON object. Lines are trimmed before parsing so that
 * whitespace-only lines and trailing carriage returns do not break parsing.
 * Parsed objects are cast to `TrialResult` without schema validation.
 *
 * @param path - Path to the JSONL file
 * @returns One parsed `TrialResult` per non-blank line, in file order
 * @throws SyntaxError when a line is not valid JSON; the message names the
 *   file and the 1-based line number
 */
const loadJsonl = async (path: string): Promise<TrialResult[]> => {
  const content = await Bun.file(path).text()
  const results: TrialResult[] = []

  content.split('\n').forEach((raw, index) => {
    const line = raw.trim()
    if (line === '') return

    try {
      results.push(JSON.parse(line) as TrialResult)
    } catch (error) {
      // Keep the SyntaxError type JSON.parse throws, but say where it happened.
      const reason = error instanceof Error ? error.message : String(error)
      throw new SyntaxError(`${path}:${index + 1}: invalid JSON (${reason})`)
    }
  })

  return results
}
/**
 * Aggregate the results of one run into summary metrics.
 *
 * @remarks
 * Missing `passRate`, `passAtK` and `passExpK` values are counted as 0.
 * Flakiness per prompt is `passAtK - passExpK`. Durations are pooled across
 * every trial of every prompt before averaging.
 *
 * @param label - Name to attach to the run in the report
 * @param results - Trial results belonging to the run
 * @returns Averages, median duration and bootstrap intervals for the run
 */
const computeRunMetrics = (label: string, results: TrialResult[]): RunMetrics => {
  const rates: number[] = []
  const capabilities: number[] = []
  const reliabilities: number[] = []
  const gaps: number[] = []
  const wallClock: number[] = []

  for (const result of results) {
    const capability = result.passAtK ?? 0
    const reliability = result.passExpK ?? 0

    rates.push(result.passRate ?? 0)
    capabilities.push(capability)
    reliabilities.push(reliability)
    gaps.push(capability - reliability)

    for (const trial of result.trials) wallClock.push(trial.duration)
  }

  const metrics: RunMetrics = {
    label,
    promptCount: results.length,
    avgPassRate: mean(rates),
    avgPassAtK: mean(capabilities),
    avgPassExpK: mean(reliabilities),
    avgFlakiness: mean(gaps),
    avgDuration: mean(wallClock),
    medianDuration: median(wallClock),
    passRateCI: bootstrap(rates),
    passAtKCI: bootstrap(capabilities),
  }

  return metrics
}
/**
 * CLI entry point: compare two TrialResult JSONL files.
 *
 * @remarks
 * Reads the baseline and challenger paths from the first two command-line
 * arguments, loads both files, and prints the comparison report to stdout as
 * pretty-printed JSON. Prints a usage line to stderr and exits with code 1
 * when either path is missing.
 */
const main = async (): Promise<void> => {
  const [baselinePath, challengerPath] = process.argv.slice(2)

  if (!baselinePath || !challengerPath) {
    // Name both expected arguments, matching the usage line in the file header.
    console.error('Usage: bun run compare.ts baseline.jsonl challenger.jsonl')
    process.exit(1)
  }

  // The two files are independent, so load them concurrently.
  const [baseline, challenger] = await Promise.all([loadJsonl(baselinePath), loadJsonl(challengerPath)])

  const report = compare(baseline, challenger)

  // biome-ignore lint/suspicious/noConsole: CLI stdout output
  console.log(JSON.stringify(report, null, 2))
}
Adapters follow the polyglot pattern — TypeScript modules or executables with stdin/stdout JSON. + +**Use this when:** +- Wrapping a new CLI agent for evaluation +- Creating adapters for different agent configurations +- Building adapters that capture rich trajectory data +- Integrating external tools (Gemini CLI, local models, A2A agents) + +## Adapter Contract + +### TypeScript Module + +Export an `adapt` function matching the `Adapter` type: + +```typescript +import type { Adapter } from './src/trial.schemas.ts' + +export const adapt: Adapter = async ({ prompt, cwd }) => { + const text = Array.isArray(prompt) ? prompt.join('\n') : prompt + const proc = Bun.spawn(['my-agent', '--prompt', text], { cwd, stdout: 'pipe', stderr: 'pipe' }) + const output = await new Response(proc.stdout).text() + const exitCode = await proc.exited + return { output: output.trim(), exitCode } +} +``` + +### Executable Script + +Any executable — reads `AdapterInput` from stdin, writes `AdapterResult` to stdout: + +```python +#!/usr/bin/env python3 +import json, sys, subprocess + +data = json.load(sys.stdin) +prompt = data["prompt"] +cwd = data.get("cwd") + +result = subprocess.run( + ["my-agent", prompt], + capture_output=True, text=True, cwd=cwd +) + +print(json.dumps({ + "output": result.stdout.strip(), + "exitCode": result.returncode +})) +``` + +### Input Type + +```typescript +type AdapterInput = { + prompt: string | string[] // Single or multi-turn + cwd?: string // Working directory +} +``` + +### Output Type + +```typescript +type AdapterResult = { + output: string // Final agent response (required) + trajectory?: TrajectoryStep[] // Optional structured trajectory + timing?: { + total?: number // Adapter-measured duration (ms) + inputTokens?: number // Input tokens consumed + outputTokens?: number // Output tokens generated + } + exitCode?: number | null // Process exit code (null if signaled) + timedOut?: boolean // Whether the adapter timed out +} +``` + +## Loading + +The 
trial runner loads adapters via `loadAdapter()`: + +```typescript +import { loadAdapter } from './src/trial.utils.ts' + +// TS module: imports and extracts 'adapt' export +const adapter = await loadAdapter('./my-adapter.ts') + +// Executable: wraps as stdin/stdout JSON subprocess +const adapter = await loadAdapter('./my-adapter.py') +``` + +Detection is by file extension: `.ts`, `.js`, `.mjs`, `.cjs` are imported as ES modules. Everything else is spawned as a subprocess. + +## Patterns + +### Minimal Adapter (Output Only) + +Simplest possible adapter — just captures text output: + +```typescript +import type { Adapter } from './src/trial.schemas.ts' + +export const adapt: Adapter = async ({ prompt }) => { + const text = Array.isArray(prompt) ? prompt.join('\n') : prompt + const result = await Bun.$`echo ${text} | my-agent`.text() + return { output: result.trim() } +} +``` + +### Rich Adapter (Trajectory + Timing) + +Captures structured trajectory for detailed analysis: + +```typescript +import type { Adapter } from './src/trial.schemas.ts' +import type { TrajectoryStep } from './src/trial.schemas.ts' + +export const adapt: Adapter = async ({ prompt, cwd }) => { + const text = Array.isArray(prompt) ? 
prompt.join('\n') : prompt + const start = Date.now() + + const proc = Bun.spawn( + ['my-agent', '--prompt', text, '--output-format', 'json'], + { cwd, stdout: 'pipe', stderr: 'pipe' }, + ) + + const raw = await new Response(proc.stdout).text() + const exitCode = await proc.exited + const elapsed = Date.now() - start + + // Parse agent's JSON output into trajectory + const events = raw.trim().split('\n').filter(Boolean).map((l) => JSON.parse(l)) + const trajectory: TrajectoryStep[] = [] + let output = '' + + for (const event of events) { + if (event.type === 'thinking') { + trajectory.push({ type: 'thought', content: event.text, timestamp: Date.now() }) + } else if (event.type === 'tool_use') { + trajectory.push({ + type: 'tool_call', + name: event.name, + status: 'completed', + input: event.input, + output: event.result, + timestamp: Date.now(), + }) + } else if (event.type === 'text') { + output += event.text + trajectory.push({ type: 'message', content: event.text, timestamp: Date.now() }) + } + } + + return { + output, + trajectory, + timing: { + total: elapsed, + inputTokens: events.find((e) => e.usage)?.usage?.input_tokens, + outputTokens: events.find((e) => e.usage)?.usage?.output_tokens, + }, + exitCode, + } +} +``` + +### Multi-Turn Adapter + +Handles `prompt: string[]` by sending each turn sequentially: + +```typescript +import type { Adapter } from './src/trial.schemas.ts' + +export const adapt: Adapter = async ({ prompt, cwd }) => { + const turns = Array.isArray(prompt) ? prompt : [prompt] + const outputs: string[] = [] + + for (const turn of turns) { + const result = await Bun.$`my-agent --prompt ${turn} --cwd ${cwd ?? 
'.'}`.text() + outputs.push(result.trim()) + } + + return { output: outputs[outputs.length - 1] } +} +``` + +### Timeout-Aware Adapter + +Reports its own timeout detection: + +```typescript +import type { Adapter } from './src/trial.schemas.ts' + +export const adapt: Adapter = async ({ prompt, cwd }) => { + const text = Array.isArray(prompt) ? prompt.join('\n') : prompt + const timeout = 30_000 + + const proc = Bun.spawn(['my-agent', '--prompt', text], { cwd, stdout: 'pipe' }) + const timer = setTimeout(() => proc.kill(), timeout) + + const output = await new Response(proc.stdout).text() + const exitCode = await proc.exited + clearTimeout(timer) + + const timedOut = exitCode === null // null = killed by signal + return { output: timedOut ? '' : output.trim(), exitCode, timedOut } +} +``` + +### In-Process Adapter (No Subprocess) + +For agents with a library API — no process spawning needed: + +```typescript +import type { Adapter } from './src/trial.schemas.ts' + +export const adapt: Adapter = async ({ prompt, cwd }) => { + // Call agent library directly + const agent = createAgent({ workspace: cwd }) + const response = await agent.run(Array.isArray(prompt) ? 
prompt.join('\n') : prompt) + return { + output: response.text, + trajectory: response.steps, + timing: { inputTokens: response.usage.input, outputTokens: response.usage.output }, + } +} +``` + +## Usage with Trial Runner + +```bash +# CLI: path-based loading +agent-eval-harness trials '{"adapterPath":"./my-adapter.ts","promptsPath":"prompts.jsonl","k":5}' + +# Library: function-based (primary) +import { runTrial } from './src/trial.ts' +const results = await runTrial({ adapter: myAdapter, prompts, k: 5 }) +``` + +## Tips + +- Return `trajectory` for richer analysis (thought steps, tool calls) +- Return `timing.inputTokens`/`outputTokens` if the agent exposes usage +- Set `timedOut: true` if the adapter detects its own timeout +- Use `cwd` for workspace-isolated code generation tasks +- For multi-turn, the runner sends the full `prompt: string[]` — the adapter decides how to sequence turns + +## Related + +- **[trial-runner](../trial-runner/SKILL.md)** — Running trials with adapters +- **[compare-trials](../compare-trials/SKILL.md)** — Comparing trial results diff --git a/.agents/skills/trial-runner/SKILL.md b/.agents/skills/trial-runner/SKILL.md new file mode 100644 index 0000000..c1c87ef --- /dev/null +++ b/.agents/skills/trial-runner/SKILL.md @@ -0,0 +1,182 @@ +--- +name: trial-runner +description: Run trials against adapters (any CLI/agent), capture trajectories, and optionally grade results. Library-first API with CLI secondary. Supports pass@k reliability analysis, workspace isolation, and polyglot graders. +license: ISC +--- + +# Trial Runner + +## Purpose + +Run prompts against any adapter, capture structured results, and optionally grade them. The fundamental operation is a **trial** — running k attempts per prompt and measuring pass@k reliability. + +**The runner executes trials. 
You provide adapters and graders.** + +| Runner Provides | You Provide | +|-----------------|-------------| +| Trial execution (k runs per prompt) | Adapter script (wraps your agent CLI) | +| Structured JSONL output | Grader script (scores output) | +| pass@k/pass^k metrics | Prompts (JSONL) | +| Concurrent execution + workspace isolation | Comparison analysis scripts | + +**Use this when:** +- Evaluating agent quality with pass@k reliability metrics +- Capturing trajectories for downstream scoring or training +- Comparing agents across configurations (via `compare-trials` skill) +- Orchestrating distillation pipelines + +## Library API (Primary) + +The in-process API is the primary interface. Agents call `runTrial()` directly: + +```typescript +import { runTrial } from './src/trial.ts' +import type { Adapter, Grader } from './src/trial.schemas.ts' + +const adapter: Adapter = async ({ prompt, cwd }) => { + const proc = Bun.spawn(['my-agent', '--prompt', prompt], { cwd }) + const output = await new Response(proc.stdout).text() + return { output } +} + +const results = await runTrial({ + adapter, + prompts: [{ id: 'p1', input: 'Create a button component' }], + grader: async ({ output, cwd }) => { + const tests = await Bun.$`cd ${cwd} && bun test`.nothrow() + return { pass: tests.exitCode === 0, score: tests.exitCode === 0 ? 
1 : 0 } + }, + k: 10, + concurrency: 4, + workspaceDir: './workspaces', +}) +// results[0].passRate, results[0].passAtK, results[0].passExpK +``` + +### runTrial Config + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `adapter` | `Adapter` | required | Function or loaded from path | +| `prompts` | `PromptCase[]` | required | Prompt cases to run | +| `grader` | `Grader` | none | Optional grading function | +| `k` | `number` | 1 | Trials per prompt | +| `outputPath` | `string` | none | JSONL output file (stdout if absent) | +| `cwd` | `string` | none | Working directory for adapter | +| `timeout` | `number` | 60000 | Timeout per prompt in ms | +| `concurrency` | `number` | 1 | Concurrent workers | +| `workspaceDir` | `string` | none | Per-prompt workspace isolation base dir | +| `progress` | `boolean` | false | Show progress to stderr | +| `append` | `boolean` | false | Append to output file | + +## CLI (Secondary) + +The CLI resolves file paths to functions, then delegates to `runTrial`: + +```bash +# Basic trial +agent-eval-harness trials '{"adapterPath":"./adapter.ts","promptsPath":"prompts.jsonl","k":5}' + +# With grader and progress +agent-eval-harness trials '{"adapterPath":"./adapter.ts","promptsPath":"prompts.jsonl","k":10,"graderPath":"./grader.ts","concurrency":4,"progress":true}' + +# Schema discovery +agent-eval-harness trials --schema input +agent-eval-harness trials --schema output +agent-eval-harness trials --help +``` + +## Input Format (prompts.jsonl) + +```jsonl +{"id":"test-001","input":"Create a button","hint":"should contain "},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-headless"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false} -{"id":"test-002","input":"Fix the bug","output":"I fixed 
the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-headless"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}` - -// ============================================================================ -// Downstream Pattern Tests -// ============================================================================ - -describe('downstream patterns: summary JSONL', () => { - const parseResults = (jsonl: string) => - jsonl - .trim() - .split('\n') - .map((line) => JSON.parse(line)) - - test('parses summary JSONL correctly', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - - expect(results).toHaveLength(3) - for (const result of results) { - expect(() => SummaryResultSchema.parse(result)).not.toThrow() - } - }) - - test('filters by output presence (jq pattern)', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const withOutput = results.filter((r) => r.output.length > 0) - - expect(withOutput).toHaveLength(2) - }) - - test('calculates average duration (jq pattern)', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const avg = results.reduce((sum, r) => sum + r.duration, 0) / results.length - - expect(avg).toBeCloseTo(1433.67, 0) - }) - - test('counts tool usage (jq pattern)', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const allTools = results.flatMap((r) => r.toolCalls) - const toolCounts = allTools.reduce>((acc, tool) => { - acc[tool] = (acc[tool] ?? 
0) + 1 - return acc - }, {}) - - expect(toolCounts).toEqual({ Write: 1, Read: 1, Edit: 1 }) - }) - - test('calculates success rate by output presence', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const withOutput = results.filter((r) => r.output.length > 0).length - const total = results.length - - expect(withOutput).toBe(2) - expect(total).toBe(3) - expect(withOutput / total).toBeCloseTo(0.667, 2) - }) -}) - -describe('downstream patterns: capture JSONL', () => { - const parseResults = (jsonl: string) => - jsonl - .trim() - .split('\n') - .map((line) => JSON.parse(line)) - - test('parses capture JSONL with trajectories', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - - expect(results).toHaveLength(2) - for (const result of results) { - expect(() => CaptureResultSchema.parse(result)).not.toThrow() - } - }) - - test('step IDs follow expected format', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - - for (const result of results) { - for (const step of result.trajectory) { - expect(step.stepId).toMatch(new RegExp(`^${result.id}-step-\\d+$`)) - } - } - }) - - test('step-level retrieval pattern works', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - - // Build step index (pattern from downstream.md) - const stepIndex = new Map() - for (const result of results) { - for (const step of result.trajectory) { - stepIndex.set(step.stepId, step) - } - } - - // Retrieve specific step by ID - const step = stepIndex.get('test-001-step-2') as { name: string; input: { file_path: string } } - expect(step).toBeDefined() - expect(step.name).toBe('Write') - expect(step.input.file_path).toBe('src/button.tsx') - }) - - test('extracts tool calls from trajectory', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - const result = results[1] // test-002 - - const toolCalls = result.trajectory.filter((s: { type: string }) => s.type === 'tool_call') - expect(toolCalls).toHaveLength(2) - expect(toolCalls.map((t: { 
name: string }) => t.name)).toEqual(['Read', 'Edit']) - }) - - test('filters by metadata category', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - const uiResults = results.filter((r) => r.metadata.category === 'ui') - - expect(uiResults).toHaveLength(1) - expect(uiResults[0]?.id).toBe('test-001') - }) - - test('identifies results with tool errors', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - const withErrors = results.filter((r) => r.toolErrors) - - expect(withErrors).toHaveLength(0) // Sample data has no errors - }) -}) - -describe('downstream patterns: advanced filtering', () => { - const parseResults = (jsonl: string) => - jsonl - .trim() - .split('\n') - .map((line) => JSON.parse(line)) - - test('filters by tool usage (jq contains pattern)', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const withWrite = results.filter((r) => r.toolCalls.includes('Write')) - - expect(withWrite).toHaveLength(1) - expect(withWrite[0]?.id).toBe('test-001') - }) - - test('filters by duration threshold (slow evaluations)', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const slow = results.filter((r) => r.duration > 2000) - - expect(slow).toHaveLength(1) - expect(slow[0]?.id).toBe('test-002') - }) - - test('finds slowest evaluations (sorted)', () => { - const results = parseResults(SAMPLE_SUMMARY_JSONL) - const sorted = [...results].sort((a, b) => b.duration - a.duration) - const top2 = sorted.slice(0, 2) - - expect(top2[0]?.id).toBe('test-002') - expect(top2[1]?.id).toBe('test-001') - }) - - test('deduplicates by ID keeping latest (merge pattern)', () => { - const combinedJsonl = `${SAMPLE_SUMMARY_JSONL} -{"id":"test-001","input":"Create a button v2","output":"I created the button v2","toolCalls":["Write","Edit"],"duration":1500}` - - const results = parseResults(combinedJsonl) - - // Group by ID and keep last occurrence (simulates jq group_by + last) - const byId = new Map() - for (const result of results) { - 
byId.set(result.id, result) - } - const deduped = Array.from(byId.values()) - - expect(deduped).toHaveLength(3) // test-001, test-002, test-003 - const test001 = deduped.find((r) => (r as { id: string }).id === 'test-001') as { input: string } - expect(test001?.input).toBe('Create a button v2') - }) - - test('groups by category and counts', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - - // Group by category (simulates jq group_by pattern) - const grouped = results.reduce>((acc, r) => { - const cat = r.metadata.category as string - acc[cat] = (acc[cat] ?? 0) + 1 - return acc - }, {}) - - expect(grouped).toEqual({ ui: 1, bugfix: 1 }) - }) - - test('extracts timing information', () => { - const results = parseResults(SAMPLE_CAPTURE_JSONL) - const result = results[0] - - expect(result.timing.start).toBe(1704067200000) - expect(result.timing.end).toBe(1704067201234) - expect(result.timing.firstResponse).toBe(100) - expect(result.timing.end - result.timing.start).toBe(1234) // matches duration - }) -}) - -// ============================================================================ -// MCP Server Config Parsing Tests -// ============================================================================ - -describe('MCP server config parsing', () => { - test('parses stdio MCP server config', () => { - const json = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":["/data"],"env":[]}' - const proc = Bun.spawn( - ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - // If it doesn't crash, the parsing worked - expect(proc.exited).resolves.toBeDefined() - }) - - test('parses http MCP server config', () => { - const json = - '{"type":"http","name":"api","url":"https://example.com/mcp","headers":[{"name":"Authorization","value":"Bearer token"}]}' - const proc = Bun.spawn( - ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', 
'./test-schema.json', '--mcp-server', json, '--help'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - // If it doesn't crash, the parsing worked - expect(proc.exited).resolves.toBeDefined() - }) - - test('accepts multiple MCP servers', () => { - const json1 = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":[],"env":[]}' - const json2 = '{"type":"http","name":"api","url":"https://example.com","headers":[]}' - const proc = Bun.spawn( - [ - 'bun', - CLI_PATH, - 'capture', - '/tmp/test.jsonl', - '--schema', - './test-schema.json', - '--mcp-server', - json1, - '--mcp-server', - json2, - '--help', - ], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - // If it doesn't crash, the parsing worked - expect(proc.exited).resolves.toBeDefined() - }) -}) - -// ============================================================================ -// Error Handling Tests -// ============================================================================ - -describe('error handling', () => { - test('fails when schema file does not exist', async () => { - const tmpFile = `/tmp/invalid-${Date.now()}.jsonl` - await Bun.write(tmpFile, '{"id": "t1", "input": "test"}\n') - - const proc = Bun.spawn(['bun', CLI_PATH, 'capture', tmpFile, '--schema', 'nonexistent-schema.json'], { - stdout: 'pipe', - stderr: 'pipe', - }) - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('Schema file not found') - }) - - test('capture command requires prompts path', async () => { - const proc = Bun.spawn(['bun', CLI_PATH, 'capture'], { - stdout: 'pipe', - stderr: 'pipe', - }) - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).toBe(1) - expect(stderr).toContain('prompts.jsonl path is required') - }) - - test('summarize command requires input path', async () => { - const proc = Bun.spawn(['bun', CLI_PATH, 'summarize'], { - stdout: 
'pipe', - stderr: 'pipe', - }) - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).toBe(1) - expect(stderr).toContain('results.jsonl path is required') - }) -}) diff --git a/bun.lock b/bun.lock index 983a915..5737ff7 100644 --- a/bun.lock +++ b/bun.lock @@ -5,7 +5,6 @@ "": { "name": "@plaited/acp", "dependencies": { - "@plaited/development-skills": "0.8.0", "zod": "^4.3.6", }, "devDependencies": { @@ -52,8 +51,6 @@ "@nodelib/fs.walk": ["@nodelib/fs.walk@1.2.8", "", { "dependencies": { "@nodelib/fs.scandir": "2.1.5", "fastq": "^1.6.0" } }, "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg=="], - "@plaited/development-skills": ["@plaited/development-skills@0.8.0", "", { "peerDependencies": { "typescript-language-server": "^5.1.3" }, "bin": { "development-skills": "bin/cli.ts" } }, "sha512-1dXWKPco9fkFLJhFuuOJB1aktF1qY97v/evthfNsun11VG1gA8efD44s3txxUIPs4FcL5KSvcvvlaqbmHQk7ew=="], - "@types/bun": ["@types/bun@1.3.9", "", { "dependencies": { "bun-types": "1.3.9" } }, "sha512-KQ571yULOdWJiMH+RIWIOZ7B2RXQGpL1YQrBtLIV3FqDcCu6FsbFUBwhdKUlCKUpS3PJDsHlJ1QKlpxoVR+xtw=="], "@types/node": ["@types/node@25.0.9", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-/rpCXHlCWeqClNBwUhDcusJxXYDjZTyE8v5oTO7WbL8eij2nKhUeU89/6xgjU7N4/Vh3He0BtyhJdQbDyhiXAw=="], @@ -218,8 +215,6 @@ "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], - "typescript-language-server": ["typescript-language-server@5.1.3", "", { "bin": { "typescript-language-server": "lib/cli.mjs" } }, "sha512-r+pAcYtWdN8tKlYZPwiiHNA2QPjXnI02NrW5Sf2cVM3TRtuQ3V9EKKwOxqwaQ0krsaEXk/CbN90I5erBuf84Vg=="], - "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="], "universalify": 
["universalify@2.0.1", "", {}, "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw=="], diff --git a/package.json b/package.json index 352c0cb..af150e1 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@plaited/agent-eval-harness", - "version": "0.13.0", - "description": "CLI tool for capturing agent trajectories from headless CLI agents", + "version": "1.0.0", + "description": "General-purpose eval harness for running trials against CLI agents", "license": "ISC", "engines": { "bun": ">= v1.2.9" @@ -15,22 +15,17 @@ }, "homepage": "https://github.com/plaited/agent-eval-harness/tree/main#readme", "bin": { - "agent-eval-harness": "./bin/cli.ts" + "agent-eval-harness": "./src/cli.ts" }, "type": "module", "exports": { - ".": "./src/harness.ts", - "./schemas": "./src/schemas.ts", - "./headless": "./src/headless.ts", - "./pipeline": "./src/pipeline.ts" + ".": "./src/trial.ts", + "./schemas": "./src/trial.schemas.ts" }, "files": [ "./src/**", - "./bin/**", "!./src/**/tests/*", - "!./src/**/*.spec.ts", - "!./bin/**/tests/*", - "!./bin/**/*.spec.ts" + "!./src/**/*.spec.ts" ], "publishConfig": { "access": "public" @@ -42,10 +37,7 @@ "check:types": "tsc --noEmit", "check:write": "biome check --write && format-package --write", "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true", - "test": "bun run test:bin && bun test:src", - "test:bin": "bun test bin/tests/*.spec.ts", - "test:integration": "bun test ./**/integration_tests/*.spec.ts", - "test:src": "bun test src/**/tests/*.spec.ts" + "test": "bun test src/" }, "lint-staged": { "*.{js,cjs,jsx,tsx,ts}": [ @@ -56,7 +48,6 @@ ] }, "dependencies": { - "@plaited/development-skills": "0.8.0", "zod": "^4.3.6" }, "devDependencies": { diff --git a/src/cli.ts b/src/cli.ts new file mode 100644 index 0000000..7821670 --- /dev/null +++ b/src/cli.ts @@ -0,0 +1,22 @@ +#!/usr/bin/env bun +import { trialCli } from './trial.ts' + +const 
[command, ...args] = process.argv.slice(2) + +switch (command) { + case 'trials': + await trialCli(args) + break + case 'compare': + console.error('compare command not yet implemented') + process.exit(1) + break + case 'calibrate': + console.error('calibrate command not yet implemented') + process.exit(1) + break + default: + console.error(`Unknown command: ${command}`) + console.error('Available commands: trials, compare, calibrate') + process.exit(1) +} diff --git a/src/cli.utils.ts b/src/cli.utils.ts new file mode 100644 index 0000000..6567d8e --- /dev/null +++ b/src/cli.utils.ts @@ -0,0 +1,99 @@ +/** + * Shared CLI utilities for the eval harness. + * + * @remarks + * Implements the CLI tool pattern: JSON positional arg or stdin pipe, + * `--schema input|output` for discovery, `--help` for usage, exit codes 0/1/2. + * + * @internal + */ + +import * as z from 'zod' + +// ============================================================================ +// Raw Input Extraction (shared plumbing) +// ============================================================================ + +/** + * Extract and parse raw JSON from CLI args or stdin. + * + * @remarks + * Handles `--help`, `--schema`, positional JSON arg, and stdin pipe. + * Calls `process.exit()` on meta flags and bad input — only returns + * on valid JSON. 
+ * + * @internal + */ +const parseRawCliInput = async ( + args: string[], + schema: z.ZodSchema, + options: { name: string; outputSchema?: z.ZodSchema }, +): Promise => { + if (args.includes('--help') || args.includes('-h')) { + console.error(`Usage: agent-eval-harness ${options.name} '' | --schema input`) + process.exit(0) + } + + const schemaIdx = args.indexOf('--schema') + if (schemaIdx !== -1) { + const target = args[schemaIdx + 1] + if (target === 'output' && options.outputSchema) { + console.log(JSON.stringify(z.toJSONSchema(options.outputSchema), null, 2)) + } else { + console.log(JSON.stringify(z.toJSONSchema(schema), null, 2)) + } + process.exit(0) + } + + const positionals = args.filter((arg) => !arg.startsWith('--')) + let rawInput: string | undefined + + if (positionals.length > 0) { + rawInput = positionals[0] + } else if (!process.stdin.isTTY) { + const stdinData = await Bun.stdin.text() + if (stdinData.trim()) rawInput = stdinData.trim() + } + + if (!rawInput) { + console.error(`Usage: agent-eval-harness ${options.name} '' | --schema input`) + process.exit(2) + } + + try { + return JSON.parse(rawInput) + } catch { + console.error('Invalid JSON input') + process.exit(2) + } +} + +// ============================================================================ +// CLI Input Parser +// ============================================================================ + +/** + * Parse CLI input following the CLI tool pattern. 
+ * + * @remarks + * - `--help` / `-h`: prints usage, exits 0 + * - `--schema input`: emits input JSON Schema, exits 0 + * - `--schema output`: emits output JSON Schema (if provided), exits 0 + * - First positional arg or stdin pipe: JSON validated with Zod + * - Exit 2 on bad input + * + * @internal + */ +export const parseCli = async ( + args: string[], + schema: T, + options: { name: string; outputSchema?: z.ZodSchema }, +): Promise> => { + const raw = await parseRawCliInput(args, schema, options) + const parsed = schema.safeParse(raw) + if (!parsed.success) { + console.error(JSON.stringify(parsed.error.issues, null, 2)) + process.exit(2) + } + return parsed.data +} diff --git a/src/commands.ts b/src/commands.ts deleted file mode 100644 index cd1969c..0000000 --- a/src/commands.ts +++ /dev/null @@ -1,33 +0,0 @@ -/** - * CLI command implementations for agent evaluation harness. - * - * @remarks - * Re-exports all CLI commands for programmatic use. - * For CLI usage, run `agent-eval-harness --help`. 
- * - * @packageDocumentation - */ - -// Balance command -export type { BalanceConfig } from './commands/balance.ts' -export { balance, runBalance } from './commands/balance.ts' - -// Calibrate command -export type { CalibrateConfig } from './commands/calibrate.ts' -export { calibrate, runCalibrate } from './commands/calibrate.ts' - -// Capture command -export type { CaptureConfig } from './commands/capture.ts' -export { capture, runCapture } from './commands/capture.ts' - -// Summarize command -export type { SummarizeConfig } from './commands/summarize.ts' -export { runSummarize, summarize } from './commands/summarize.ts' - -// Trials command -export type { TrialsConfig } from './commands/trials.ts' -export { runTrials, trials } from './commands/trials.ts' - -// Validate-refs command -export type { ValidateRefsConfig } from './commands/validate-refs.ts' -export { runValidateRefs, validateRefs } from './commands/validate-refs.ts' diff --git a/src/commands/balance.ts b/src/commands/balance.ts deleted file mode 100644 index 115cfd2..0000000 --- a/src/commands/balance.ts +++ /dev/null @@ -1,245 +0,0 @@ -/** - * Balance command - analyze test set coverage. - * - * @remarks - * Analyzes the distribution of test cases by metadata categories. - * Identifies underrepresented categories and suggests improvements. 
- * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadPrompts, resolvePath } from '../core.ts' -import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Configuration for balance command */ -export type BalanceConfig = { - /** Path to prompts.jsonl file */ - promptsPath: string - /** Output file path */ - outputPath?: string - /** Metadata key to analyze (default: 'category') */ - key?: string - /** Threshold for underrepresentation (percentage) */ - threshold?: number -} - -/** - * Analyze category distribution across prompts. - * - * @param prompts - Array of prompt cases - * @param key - Metadata key to analyze - * @returns Array of category distributions sorted by count descending - * - * @public - */ -export const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => { - const counts = new Map() - - for (const prompt of prompts) { - const value = prompt.metadata?.[key] - const category = value !== undefined ? String(value) : '(uncategorized)' - counts.set(category, (counts.get(category) ?? 0) + 1) - } - - const total = prompts.length - const distributions: CategoryDistribution[] = [] - - for (const [name, count] of counts) { - distributions.push({ - name, - count, - percentage: Math.round((count / total) * 100), - }) - } - - // Sort by count descending - distributions.sort((a, b) => b.count - a.count) - - return distributions -} - -/** - * Identify underrepresented categories. 
- * - * @param distributions - Array of category distributions - * @param threshold - Percentage threshold relative to even distribution - * @returns Array of underrepresented category names - * - * @public - */ -export const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => { - // Expected percentage if evenly distributed - const evenPercentage = 100 / distributions.length - - return distributions.filter((d) => d.percentage < evenPercentage * (threshold / 100)).map((d) => d.name) -} - -/** - * Generate suggestions for improving test set balance. - * - * @param distributions - Array of category distributions - * @param underrepresented - Array of underrepresented category names - * @param total - Total number of test cases - * @returns Array of suggestion strings - * - * @public - */ -export const generateSuggestions = ( - distributions: CategoryDistribution[], - underrepresented: string[], - total: number, -): string[] => { - const suggestions: string[] = [] - - if (underrepresented.length > 0) { - suggestions.push(`Consider adding more test cases for: ${underrepresented.join(', ')}`) - } - - // Check for category with > 50% of cases - const dominant = distributions.find((d) => d.percentage > 50) - if (dominant) { - suggestions.push(`Category '${dominant.name}' has ${dominant.percentage}% of cases - consider diversifying`) - } - - // Check for very small categories - const tiny = distributions.filter((d) => d.count < 3) - if (tiny.length > 0) { - suggestions.push(`Categories with < 3 cases may not be reliable: ${tiny.map((d) => d.name).join(', ')}`) - } - - // Check total test count - if (total < 20) { - suggestions.push(`Consider expanding test set (currently ${total} cases) for more statistical significance`) - } - - if (suggestions.length === 0) { - suggestions.push('Test set appears well-balanced') - } - - return suggestions -} - -// ============================================================================ -// 
Balance Implementation -// ============================================================================ - -/** - * Execute balance analysis with configuration object. - * - * @param config - Balance configuration - * @returns Balance analysis result - */ -export const runBalance = async (config: BalanceConfig): Promise => { - const { promptsPath, outputPath, key = 'category', threshold = 50 } = config - - // Load prompts - const prompts = await loadPrompts(promptsPath) - - console.error(`Analyzing ${prompts.length} prompts by '${key}' metadata...`) - - // Analyze distribution - const categories = analyzeCategories(prompts, key) - const underrepresented = findUnderrepresented(categories, threshold) - const suggestions = generateSuggestions(categories, underrepresented, prompts.length) - - const analysis: BalanceAnalysis = { - totalCases: prompts.length, - categories, - underrepresented, - suggestions, - } - - // Format output - const output = JSON.stringify(analysis, null, 2) - - // Write output - if (outputPath) { - await Bun.write(resolvePath(outputPath), output) - } else { - console.log(output) - } - - // Summary to stderr - console.error('\nCategory Distribution:') - for (const cat of categories) { - const bar = '█'.repeat(Math.round(cat.percentage / 5)) - console.error(` ${cat.name}: ${cat.count} (${cat.percentage}%) ${bar}`) - } - - if (underrepresented.length > 0) { - console.error(`\nUnderrepresented: ${underrepresented.join(', ')}`) - } - - console.error('\nSuggestions:') - for (const suggestion of suggestions) { - console.error(` - ${suggestion}`) - } - - return analysis -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Balance command CLI handler. 
- * - * @param args - Command line arguments (after 'balance') - */ -export const balance = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - output: { type: 'string', short: 'o' }, - key: { type: 'string', short: 'k', default: 'category' }, - threshold: { type: 'string', short: 't', default: '50' }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness balance [options] - -Arguments: - prompts.jsonl Input file with prompts - -Options: - -o, --output Output file (default: stdout) - -k, --key Metadata key to analyze (default: 'category') - -t, --threshold Underrepresentation threshold % (default: 50) - -h, --help Show this help message - -Output: - JSON with category distribution, underrepresented categories, and suggestions. - -Examples: - # Analyze by default 'category' key - agent-eval-harness balance prompts.jsonl -o balance.json - - # Analyze by custom metadata key - agent-eval-harness balance prompts.jsonl --key difficulty -o balance.json -`) - return - } - - const promptsPath = positionals[0] - if (!promptsPath) { - console.error('Error: prompts.jsonl path is required') - process.exit(1) - } - - await runBalance({ - promptsPath, - outputPath: values.output, - key: values.key ?? 'category', - threshold: Number.parseInt(values.threshold ?? '50', 10), - }) -} diff --git a/src/commands/calibrate.ts b/src/commands/calibrate.ts deleted file mode 100644 index 4bd9466..0000000 --- a/src/commands/calibrate.ts +++ /dev/null @@ -1,304 +0,0 @@ -/** - * Calibrate command - sample failures for grader review. - * - * @remarks - * Helps identify grader bugs by sampling failures for human review. - * Can optionally re-score with a different grader for comparison. 
- * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadResults, resolvePath } from '../core.ts' -import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts' -import { loadGraderOrExit } from '../schemas/grader-loader.ts' -import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Configuration for calibrate command */ -export type CalibrateConfig = { - /** Path to results.jsonl file */ - resultsPath: string - /** Output file path */ - outputPath?: string - /** Number of samples to include */ - sample?: number - /** Optional grader for re-scoring */ - grader?: Grader -} - -/** - * Randomly sample n elements from an array using Fisher-Yates shuffle. - * - * @param arr - Array to sample from - * @param n - Number of samples to take - * @returns Array of sampled elements in random order - * - * @remarks - * Uses Fisher-Yates (Knuth) shuffle for uniform distribution. - * Creates a copy to avoid mutating the input array. - * O(n) time complexity with O(n) space for the copy. - * Not cryptographically secure (uses Math.random). - * - * @public - */ -export const sampleArray = (arr: T[], n: number): T[] => { - if (n <= 0) return [] - if (n >= arr.length) return [...arr] - - const copy = [...arr] - - // Fisher-Yates shuffle working backwards through array - // Only shuffle enough elements to get n samples - const limit = copy.length - n - for (let i = copy.length - 1; i >= limit && i > 0; i--) { - // Random index from 0 to i (inclusive) - const j = Math.floor(Math.random() * (i + 1)) - // Swap elements - ;[copy[i], copy[j]] = [copy[j]!, copy[i]!] - } - - return copy.slice(-n) -} - -/** - * Get snippet of trajectory for review. - * - * @remarks - * Includes first 2 steps, middle step, and last 2 steps. 
- * - * @param trajectory - Full trajectory - * @param maxSteps - Maximum number of steps to include - * @returns Trajectory snippet - * - * @public - */ -export const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => { - // Include first and last steps, plus some from the middle - if (trajectory.length <= maxSteps) return trajectory - - const result: TrajectoryStep[] = [] - - // First 2 steps - result.push(...trajectory.slice(0, 2)) - - // Middle step - const mid = Math.floor(trajectory.length / 2) - result.push(trajectory[mid] as TrajectoryStep) - - // Last 2 steps - result.push(...trajectory.slice(-2)) - - return result -} - -/** Format calibration sample as markdown */ -const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => { - const lines: string[] = [ - '# Grader Calibration Report', - '', - `Generated: ${new Date().toISOString()}`, - `Samples: ${samples.length}`, - '', - '## Instructions', - '', - 'Review each failure below and mark whether:', - '- [ ] **Valid failure** - Grader correctly identified a problem', - '- [ ] **Grader bug** - Output was actually correct, grader was wrong', - '- [ ] **Ambiguous** - Unclear if the output is correct or not', - '', - '---', - '', - ] - - for (let i = 0; i < samples.length; i++) { - const sample = samples[i] - if (!sample) continue - - lines.push(`## Sample ${i + 1}: ${sample.id}`) - lines.push('') - lines.push(`**Input:** ${sample.input}`) - lines.push('') - - if (sample.hint) { - lines.push(`**Hint:** ${sample.hint}`) - lines.push('') - } - - lines.push(`**Output:** ${sample.output.slice(0, 500)}${sample.output.length > 500 ? '...' : ''}`) - lines.push('') - - lines.push(`**Original Score:** ${sample.originalScore.pass ? 
'PASS' : 'FAIL'} (${sample.originalScore.score})`) - if (sample.originalScore.reasoning) { - lines.push(`**Reasoning:** ${sample.originalScore.reasoning}`) - } - lines.push('') - - if (sample.rescoredResult) { - lines.push(`**Re-scored:** ${sample.rescoredResult.pass ? 'PASS' : 'FAIL'} (${sample.rescoredResult.score})`) - if (sample.rescoredResult.reasoning) { - lines.push(`**Re-score Reasoning:** ${sample.rescoredResult.reasoning}`) - } - lines.push('') - } - - lines.push('**Trajectory Snippet:**') - lines.push('```') - for (const step of sample.trajectorySnippet) { - if (step.type === 'tool_call') { - lines.push(`[${step.type}] ${step.name}: ${step.status}`) - } else if (step.type === 'message' || step.type === 'thought') { - lines.push(`[${step.type}] ${step.content.slice(0, 100)}...`) - } else if (step.type === 'plan') { - lines.push(`[${step.type}] ${(step.entries as Array<{ content: string }>).length} entries`) - } - } - lines.push('```') - lines.push('') - - lines.push('**Review:**') - lines.push('- [ ] Valid failure') - lines.push('- [ ] Grader bug') - lines.push('- [ ] Ambiguous') - lines.push('') - lines.push('---') - lines.push('') - } - - return lines.join('\n') -} - -// ============================================================================ -// Calibrate Implementation -// ============================================================================ - -/** - * Execute calibrate with configuration object. 
- * - * @param config - Calibrate configuration - * @returns Calibration samples - */ -export const runCalibrate = async (config: CalibrateConfig): Promise => { - const { resultsPath, outputPath, sample = DEFAULT_CALIBRATION_SAMPLE_SIZE, grader } = config - - // Load results - const results = await loadResults(resultsPath) - - // Filter to failures (or results without scores) - const failures = results.filter((r) => r.score && !r.score.pass) - - if (failures.length === 0) { - console.error('No failures found in results') - return [] - } - - // Sample failures - const sampled = sampleArray(failures, Math.min(sample, failures.length)) - - // Build calibration samples - const samples: CalibrationSample[] = [] - - for (const result of sampled) { - const calibrationSample: CalibrationSample = { - id: result.id, - input: result.input, - output: result.output, - hint: result.hint, - originalScore: result.score as GraderResult, - trajectorySnippet: getTrajectorySnippet(result.trajectory), - } - - // Re-score with different grader if provided - if (grader) { - calibrationSample.rescoredResult = await grader({ - input: result.input, - output: result.output, - hint: result.hint, - trajectory: result.trajectory, - metadata: result.metadata, - }) - } - - samples.push(calibrationSample) - } - - // Format as markdown - const markdown = formatCalibrationMarkdown(samples) - - // Write output - if (outputPath) { - await Bun.write(resolvePath(outputPath), markdown) - } else { - console.log(markdown) - } - - return samples -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Calibrate command CLI handler. 
- * - * @param args - Command line arguments (after 'calibrate') - */ -export const calibrate = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - output: { type: 'string', short: 'o' }, - sample: { type: 'string', short: 's', default: String(DEFAULT_CALIBRATION_SAMPLE_SIZE) }, - grader: { type: 'string', short: 'g' }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness calibrate [options] - -Arguments: - results.jsonl Input file with scored capture results - -Options: - -o, --output Output file (default: stdout) - -s, --sample Number of failures to sample (default: ${DEFAULT_CALIBRATION_SAMPLE_SIZE}) - -g, --grader Path to alternative grader (.ts/.js module or executable script) - -h, --help Show this help message - -Output: - Markdown report with sampled failures for human review. - Includes checkboxes for labeling (valid failure / grader bug / ambiguous). - -Examples: - # Sample failures for review - agent-eval-harness calibrate results.jsonl --sample 10 -o calibration.md - - # Re-score with different grader to compare - agent-eval-harness calibrate results.jsonl --grader ./loose-grader.ts -o comparison.md -`) - return - } - - const resultsPath = positionals[0] - if (!resultsPath) { - console.error('Error: results.jsonl path is required') - process.exit(1) - } - - // Load grader if specified - const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined - - await runCalibrate({ - resultsPath, - outputPath: values.output, - sample: Number.parseInt(values.sample ?? String(DEFAULT_CALIBRATION_SAMPLE_SIZE), 10), - grader, - }) -} diff --git a/src/commands/capture.ts b/src/commands/capture.ts deleted file mode 100644 index c78f863..0000000 --- a/src/commands/capture.ts +++ /dev/null @@ -1,391 +0,0 @@ -/** - * Core trajectory capture command. 
- * - * @remarks - * Executes prompts against a CLI agent and captures full trajectories. - * This is the foundational command - all other views derive from its output. - * - * Output format is always full trajectory JSONL (`CaptureResultSchema`). - * Use `summarize` command to derive compact views. - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { - createWorkspaceDir, - detectTrajectoryRichness, - extractOutput, - extractTrajectory, - getInputPreview, - hasToolErrors, - logProgress, - readStdinPrompts, -} from '../core.ts' -import type { ParsedUpdate } from '../headless/headless-output-parser.ts' -import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts' -import { loadGraderOrExit } from '../schemas/grader-loader.ts' -import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts' -import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts' - -// ============================================================================ -// Re-exports for backward compatibility -// ============================================================================ - -// These functions are now in core/ but re-exported here for existing consumers -export { - detectTrajectoryRichness, - extractContent, - extractFilePath, - extractOutput, - extractTrajectory, - hasToolErrors, - headTailPreview, - loadPrompts, -} from '../core.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Configuration for capture command */ -export type CaptureConfig = BaseExecutionConfig - -// ============================================================================ -// Capture Implementation -// ============================================================================ - -/** - * Execute capture with configuration object. 
- * - * @remarks - * Creates a fresh session for each JSONL entry to ensure isolation. - * Supports multi-turn conversations via `input: string[]`. - * - * @param config - Capture configuration - * @returns Array of capture results - */ -export const runCapture = async (config: CaptureConfig): Promise => { - const ctx = await prepareExecution(config) - const { - schema, - prompts, - sessions, - resolvedOutputPath, - resolvedWorkspaceDir, - defaultWorkingDir, - progress, - grader, - debug, - } = ctx - - // Log progress info - logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress) - logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress) - logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress) - if (ctx.concurrency > 1) { - logProgress(`Concurrency: ${ctx.concurrency} workers`, progress) - } - if (resolvedWorkspaceDir) { - logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress) - } - if (resolvedOutputPath) { - logProgress(`Output: ${resolvedOutputPath}`, progress) - } - if (debug) { - logProgress(`Debug mode: enabled`, progress) - } - - // Process a single prompt (used by worker pool) - const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise => { - // Determine working directory (per-prompt workspace or default) - const workingDir = resolvedWorkspaceDir - ? 
await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id) - : defaultWorkingDir - - logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress) - - const startTime = Date.now() - let result: CaptureResult - let sessionId: string | undefined - - try { - // Create fresh session for each entry (ensures isolation) - const sessionStart = Date.now() - const session = await sessions.create(workingDir) - sessionId = session.id - const sessionCreation = Date.now() - sessionStart - logProgress(` Session: ${session.id}`, progress) - - // Handle string or array input - const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input] - const turnCount = inputs.length - - // Collect all updates from all turns - const allUpdates: ParsedUpdate[] = [] - let lastExitInfo: ProcessExitInfo | undefined - let lastOutput = '' - - // Execute each turn sequentially in the same session - for (const turnInput of inputs) { - const turnResult: PromptResult = await sessions.prompt(session.id, turnInput) - allUpdates.push(...turnResult.updates) - lastExitInfo = turnResult.exitInfo - lastOutput = turnResult.output - } - - const endTime = Date.now() - const trajectory = extractTrajectory(allUpdates, startTime) - - // Use last turn's output or extract from trajectory - const output = lastOutput || extractOutput(trajectory) - const toolErrors = hasToolErrors(trajectory) || (lastExitInfo?.timedOut ?? 
false) - const trajectoryRichness = detectTrajectoryRichness(trajectory) - - result = { - id: promptCase.id, - input: promptCase.input, - output, - ...(promptCase.hint && { hint: promptCase.hint }), - trajectory, - metadata: { - ...promptCase.metadata, - agent: schema.name, - trajectoryRichness, - turnCount, - ...(resolvedWorkspaceDir && { workspaceDir: workingDir }), - ...(lastExitInfo && { - exitCode: lastExitInfo.exitCode, - signal: lastExitInfo.signal, - timedOut: lastExitInfo.timedOut, - }), - }, - timing: { - start: startTime, - end: endTime, - firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined, - sessionCreation, - total: endTime - startTime, - }, - toolErrors, - } - - // Apply grader if provided - if (grader) { - const graderResult = await grader({ - input: promptCase.input, - output, - hint: promptCase.hint, - trajectory, - metadata: promptCase.metadata, - cwd: session.cwd, - }) - - result.score = graderResult - - if (graderResult.outcome) { - result.outcome = graderResult.outcome - } - } - } catch (error) { - const endTime = Date.now() - const message = error instanceof Error ? error.message : String(error) - const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input] - - result = { - id: promptCase.id, - input: promptCase.input, - output: '', - trajectory: [], - metadata: { - ...promptCase.metadata, - agent: schema.name, - trajectoryRichness: 'minimal' as TrajectoryRichness, - turnCount: inputs.length, - ...(resolvedWorkspaceDir && { workspaceDir: workingDir }), - }, - timing: { - start: startTime, - end: endTime, - sessionCreation: 0, - total: endTime - startTime, - }, - toolErrors: true, - errors: [message], - } - } finally { - // Always clean up session if it was created - if (sessionId) { - sessions.destroy(sessionId) - } - } - - // Write result immediately (coordinated via mutex for concurrent writes) - await ctx.writeResult(result) - - const statusIcon = result.toolErrors ? '!' 
: '✓' - const exitInfo = result.metadata?.timedOut - ? ' - TIMEOUT' - : result.metadata?.exitCode && result.metadata.exitCode !== 0 - ? ` - exit ${result.metadata.exitCode}` - : '' - logProgress(` ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress) - - return result - } - - // Run with worker pool - return executePrompts(ctx, processPrompt) -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Capture command CLI handler. - * - * @param args - Command line arguments (after 'capture') - */ -export const capture = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - schema: { type: 'string', short: 's' }, - output: { type: 'string', short: 'o' }, - cwd: { type: 'string', short: 'c' }, - timeout: { type: 'string', short: 't' }, - progress: { type: 'boolean', default: false }, - append: { type: 'boolean', default: false }, - grader: { type: 'string', short: 'g' }, - debug: { type: 'boolean', default: false }, - stdin: { type: 'boolean', default: false }, - concurrency: { type: 'string', short: 'j' }, - 'workspace-dir': { type: 'string' }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness capture --schema [options] - cat prompts.jsonl | agent-eval-harness capture --stdin --schema [options] - -Arguments: - prompts.jsonl Input file with evaluation prompts - -Options: - -s, --schema Path to agent schema JSON file (required) - -o, --output Output file (default: stdout) - -c, --cwd Working directory for agent - -t, --timeout Request timeout in ms (overrides schema default) - -j, --concurrency Number of concurrent workers (default: 1) - --stdin Read prompts from stdin (mutually exclusive with file arg) - --workspace-dir Base directory for per-prompt workspace 
isolation - --progress Show progress to stderr - --append Append to output file instead of overwriting - -g, --grader Path to grader (.ts/.js module or executable script) - --debug Enable debug mode (shows raw output, JSONPath matching) - -h, --help Show this help message - -Output Format: - Full trajectory JSONL with toolErrors indicator. - Use 'agent-eval-harness summarize' to derive compact views. - -Exit Info (in metadata): - exitCode Process exit code (null if killed/timed out) - signal Signal that killed process (if any) - timedOut true if process was killed due to timeout - -Graders: - TS/JS modules must export a 'grade' function. - Executable scripts (Python, etc.) use stdin/stdout JSON protocol. - -Parallelization: - Use -j/--concurrency to run multiple prompts in parallel. - Each prompt gets its own agent session for isolation. - Results are written as they complete (order may differ from input). - - Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses - at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory. - In memory-constrained environments (Docker, CI) this can cause OOM kills. - Use --stdin to pipe prompts for container-level orchestration. - -Workspace Isolation: - Use --workspace-dir to create per-prompt directories. - Each prompt runs in {workspace-dir}/prompt-{id}/. - Useful for code generation tasks requiring filesystem isolation. 
- -Examples: - # Basic capture with schema - agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl - - # Run 4 prompts in parallel - agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl - - # With workspace isolation for code generation - agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\ - --workspace-dir ./workspaces -o results.jsonl - - # With TypeScript grader - agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl - - # With debug mode - agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl - - # Read prompts from stdin (container orchestration) - cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl -`) - return - } - - const promptsPath = positionals[0] - const useStdin = values.stdin ?? false - - // Mutual exclusivity: --stdin and positional file - if (useStdin && promptsPath) { - console.error('Error: --stdin and prompts file argument are mutually exclusive') - process.exit(1) - } - - if (!useStdin && !promptsPath) { - console.error('Error: prompts.jsonl path is required (or use --stdin)') - process.exit(1) - } - - if (!values.schema) { - console.error('Error: --schema is required') - console.error('Example: agent-eval-harness capture prompts.jsonl --schema ./claude.json') - process.exit(1) - } - - // Read prompts from stdin if requested - let prompts: PromptCase[] | undefined - if (useStdin) { - const stdinPrompts = await readStdinPrompts() - if (!stdinPrompts || stdinPrompts.length === 0) { - console.error('Error: no prompts received on stdin') - process.exit(1) - } - prompts = stdinPrompts - } - - // Load grader if specified - const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined - - await runCapture({ - promptsPath: promptsPath ?? undefined, - prompts, - schemaPath: values.schema, - outputPath: values.output, - cwd: values.cwd, - timeout: values.timeout ? 
Number.parseInt(values.timeout, 10) : undefined, - progress: values.progress ?? false, - append: values.append ?? false, - grader, - debug: values.debug ?? false, - concurrency: parseConcurrency(values.concurrency), - workspaceDir: values['workspace-dir'], - }) -} diff --git a/src/commands/execution.ts b/src/commands/execution.ts deleted file mode 100644 index 4a565fe..0000000 --- a/src/commands/execution.ts +++ /dev/null @@ -1,245 +0,0 @@ -/** - * Shared execution utilities for capture and trials commands. - * - * @remarks - * Extracts common setup logic: schema loading, prompt loading, path resolution, - * session manager creation, output initialization, and worker pool execution. - * - * @packageDocumentation - */ - -import { mkdir } from 'node:fs/promises' -import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts' -import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts' -import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts' -import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts' -import type { Grader, PromptCase } from '../schemas.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Base configuration shared by capture and trials commands */ -export type BaseExecutionConfig = { - /** Path to prompts.jsonl file (required unless prompts provided) */ - promptsPath?: string - /** Path to agent schema JSON file */ - schemaPath: string - /** Pre-loaded prompt cases (from stdin); skips file loading when set */ - prompts?: PromptCase[] - /** Output file path (undefined for stdout) */ - outputPath?: string - /** Working directory for agent */ - cwd?: string - /** Timeout per prompt in milliseconds (overrides schema default) */ - timeout?: number - /** Show progress to stderr */ - 
progress?: boolean - /** Append to output file instead of overwriting */ - append?: boolean - /** Optional grader function */ - grader?: Grader - /** Enable debug mode */ - debug?: boolean - /** Number of concurrent workers (default: 1 for sequential) */ - concurrency?: number - /** Base directory for per-prompt workspace isolation */ - workspaceDir?: string -} - -/** Prepared execution context returned by prepareExecution */ -export type ExecutionContext = { - /** Parsed and validated headless adapter schema */ - schema: HeadlessAdapterConfig - /** Loaded and validated prompt cases */ - prompts: PromptCase[] - /** Session manager for creating/destroying agent sessions */ - sessions: SessionManager - /** Resolved absolute output path (undefined for stdout) */ - resolvedOutputPath?: string - /** Resolved absolute workspace directory path */ - resolvedWorkspaceDir?: string - /** Effective timeout in milliseconds */ - effectiveTimeout: number - /** Default working directory for agent sessions */ - defaultWorkingDir: string - /** Number of concurrent workers */ - concurrency: number - /** Whether to show progress output */ - progress: boolean - /** Optional grader function */ - grader?: Grader - /** Whether debug mode is enabled */ - debug: boolean - /** Write a result object as JSONL, coordinated via mutex */ - writeResult: (result: unknown) => Promise -} - -// ============================================================================ -// Execution Setup -// ============================================================================ - -/** - * Prepare execution context from base configuration. - * - * @remarks - * Handles all shared setup: schema loading/validation, prompt loading, - * path resolution, session manager creation, output file initialization, - * workspace directory creation, and write mutex coordination. 
- * - * @param config - Base execution configuration - * @returns Prepared execution context - * @throws Error if schema file not found, invalid, or prompts missing - * - * @public - */ -export const prepareExecution = async (config: BaseExecutionConfig): Promise => { - const { - promptsPath, - schemaPath, - outputPath, - cwd, - timeout, - progress = false, - append = false, - grader, - debug = false, - concurrency = 1, - workspaceDir, - } = config - - // Validate prompt source - if (!config.prompts && !promptsPath) { - throw new Error('Either promptsPath or prompts must be provided') - } - - // Load and validate schema - const schemaFile = Bun.file(schemaPath) - if (!(await schemaFile.exists())) { - throw new Error(`Schema file not found: ${schemaPath}`) - } - - let schema: HeadlessAdapterConfig - try { - const rawSchema = await schemaFile.json() - schema = parseHeadlessConfig(rawSchema) - } catch (error) { - throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`) - } - - // Load prompts - const prompts = config.prompts ?? (await loadPrompts(promptsPath!)) - - // Resolve paths - const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined - const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined - - // Determine effective timeout (CLI flag > schema default > harness default) - const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined - const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT - - // Create session manager - const sessions = createSessionManager({ - schema, - timeout: effectiveTimeout, - verbose: progress, - debug, - }) - - // Initialize output file (clear if not appending) - if (resolvedOutputPath && !append) { - await Bun.write(resolvedOutputPath, '') - } - - // Create workspace base directory if specified - if (resolvedWorkspaceDir) { - await mkdir(resolvedWorkspaceDir, { recursive: true }) - } - - const defaultWorkingDir = cwd ?? 
process.cwd() - - // Create write mutex with closure for coordinated result writing - const writeMutex = createWriteMutex() - let isFirstOutput = true - - const writeResult = async (result: unknown) => { - await writeMutex.write(async () => { - const formatted = JSON.stringify(result) - await writeOutput(formatted, resolvedOutputPath, !isFirstOutput) - isFirstOutput = false - }) - } - - return { - schema, - prompts, - sessions, - resolvedOutputPath, - resolvedWorkspaceDir, - effectiveTimeout, - defaultWorkingDir, - concurrency, - progress, - grader, - debug, - writeResult, - } -} - -// ============================================================================ -// Worker Pool Execution -// ============================================================================ - -/** - * Execute prompts through a worker pool with progress logging. - * - * @remarks - * Common wrapper for the runWorkerPool pattern used by both capture and trials. - * Handles progress callbacks, error logging, and completion logging. - * - * @param ctx - Execution context from prepareExecution - * @param processFn - Function to process each prompt - * @returns Array of results - * - * @public - */ -export const executePrompts = async ( - ctx: ExecutionContext, - processFn: (promptCase: PromptCase, index: number) => Promise, -): Promise => { - const { results, errors } = await runWorkerPool(ctx.prompts, processFn, { - concurrency: ctx.concurrency, - onProgress: (completed, total) => { - logProgress(`Progress: ${completed}/${total} prompts completed`, ctx.progress) - }, - }) - - if (errors.length > 0) { - logProgress(`Completed with ${errors.length} error(s)`, ctx.progress) - } - - logProgress('Done!', ctx.progress) - return results -} - -// ============================================================================ -// CLI Helpers -// ============================================================================ - -/** - * Parse and validate concurrency CLI argument. 
- * - * @param value - Raw string value from parseArgs - * @returns Validated positive integer (default: 1) - * - * @public - */ -export const parseConcurrency = (value: string | undefined): number => { - if (!value) return 1 - const parsed = Number.parseInt(value, 10) - if (Number.isNaN(parsed) || parsed < 1) { - console.error('Error: --concurrency must be a positive integer') - process.exit(1) - } - return parsed -} diff --git a/src/commands/summarize.ts b/src/commands/summarize.ts deleted file mode 100644 index 81499f7..0000000 --- a/src/commands/summarize.ts +++ /dev/null @@ -1,226 +0,0 @@ -/** - * Summarize command - derive compact views from full trajectory results. - * - * @remarks - * Transforms full trajectory JSONL into: - * - Summary JSONL: Compact format for jq analysis - * - Markdown: Human-readable format for LLM-as-judge workflows - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { extractContent, extractFilePath, headTailPreview, loadResults, resolvePath } from '../core.ts' -import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from '../schemas/constants.ts' -import type { CaptureResult, SummaryResult } from '../schemas.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Configuration for summarize command */ -export type SummarizeConfig = { - /** Path to results.jsonl file */ - resultsPath: string - /** Output file path */ - outputPath?: string - /** Output as markdown instead of JSONL */ - markdown?: boolean -} - -/** - * Format capture result as compact summary. - * - * @param result - Full capture result - * @returns Compact summary result - * - * @public - */ -export const formatSummary = (result: CaptureResult): SummaryResult => { - const inputText = Array.isArray(result.input) ? 
result.input.join('\n') : result.input - return { - id: result.id, - input: inputText, - output: result.output, - toolCalls: result.trajectory.flatMap((s) => (s.type === 'tool_call' ? [s.name] : [])), - duration: result.timing.end - result.timing.start, - } -} - -/** - * Format capture result as markdown with step IDs. - * - * @param result - Full capture result - * @returns Markdown formatted string - * - * @public - */ -export const formatMarkdown = (result: CaptureResult): string => { - const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input - const lines: string[] = [`## Evaluation Record: ${result.id}`, '', `**Input:** ${inputText}`, '', '**Trajectory:**'] - - let stepNum = 1 - for (const step of result.trajectory) { - const stepId = `${result.id}-step-${stepNum}` - - if (step.type === 'thought') { - const preview = step.content.slice(0, 100) - const truncated = step.content.length > 100 ? '...' : '' - lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`) - stepNum++ - } else if (step.type === 'tool_call') { - const duration = step.duration ? ` (${step.duration}ms)` : '' - const filePath = extractFilePath(step.input) - const content = extractContent(step.input) - - lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`) - - // Add file path if present - if (filePath) { - const charCount = content?.length ?? 0 - lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`) - } - - // Add head/tail preview for content-producing tools - if (content && content.length > 0) { - const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content - // Detect file extension for syntax highlighting - const ext = filePath?.split('.').pop() ?? 
'typescript' - lines.push(` \`\`\`${ext}`) - lines.push(` ${preview.split('\n').join('\n ')}`) - lines.push(' ```') - } - stepNum++ - } else if (step.type === 'plan') { - const entries = step.entries as Array<{ content: string; status: string }> - const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ') - const truncated = planSummary.length > 80 ? '...' : '' - lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`) - stepNum++ - } else if (step.type === 'message') { - const preview = step.content.slice(0, 100) - const truncated = step.content.length > 100 ? '...' : '' - lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`) - stepNum++ - } - } - - lines.push('') - const outputPreview = result.output.slice(0, 200) - const outputTruncated = result.output.length > 200 ? '...' : '' - lines.push(`**Output:** ${outputPreview}${outputTruncated}`) - lines.push('') - - const metadataStr = Object.entries(result.metadata) - .map(([k, v]) => `${k}=${v}`) - .join(', ') - lines.push(`**Metadata:** ${metadataStr}`) - lines.push(`**Tool Errors:** ${result.toolErrors}`) - lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`) - - if (result.score) { - lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`) - if (result.score.reasoning) { - lines.push(`**Reasoning:** ${result.score.reasoning}`) - } - } - - lines.push('') - lines.push('---') - lines.push('') - - return lines.join('\n') -} - -// ============================================================================ -// Summarize Implementation -// ============================================================================ - -/** - * Execute summarize with configuration object. 
- * - * @param config - Summarize configuration - * @returns Formatted output string - */ -export const runSummarize = async (config: SummarizeConfig): Promise => { - const { resultsPath, outputPath, markdown = false } = config - - // Load results - const results = await loadResults(resultsPath) - - // Format output - let output: string - if (markdown) { - output = results.map(formatMarkdown).join('\n') - } else { - output = results.map((r) => JSON.stringify(formatSummary(r))).join('\n') - } - - // Write output - if (outputPath) { - await Bun.write(resolvePath(outputPath), output) - } else { - console.log(output) - } - - return output -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Summarize command CLI handler. - * - * @param args - Command line arguments (after 'summarize') - */ -export const summarize = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - output: { type: 'string', short: 'o' }, - markdown: { type: 'boolean', short: 'm', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness summarize [options] - -Arguments: - results.jsonl Input file with capture results - -Options: - -o, --output Output file (default: stdout) - -m, --markdown Output as markdown instead of JSONL - -h, --help Show this help message - -Output Formats: - JSONL (default): Compact summary with id, input, output, toolCalls, duration - Markdown (-m): Human-readable format with step IDs for LLM-as-judge - -Examples: - # Summary JSONL for jq analysis - agent-eval-harness summarize results.jsonl -o summary.jsonl - - # Markdown for LLM evaluation - agent-eval-harness summarize results.jsonl --markdown -o results.md -`) - return - } - - const resultsPath = positionals[0] - if 
(!resultsPath) { - console.error('Error: results.jsonl path is required') - process.exit(1) - } - - await runSummarize({ - resultsPath, - outputPath: values.output, - markdown: values.markdown ?? false, - }) -} diff --git a/src/commands/tests/balance-helpers.spec.ts b/src/commands/tests/balance-helpers.spec.ts deleted file mode 100644 index 6641c34..0000000 --- a/src/commands/tests/balance-helpers.spec.ts +++ /dev/null @@ -1,279 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import type { CategoryDistribution, PromptCase } from '../../schemas.ts' -import { analyzeCategories, findUnderrepresented, generateSuggestions } from '../balance.ts' - -// ============================================================================ -// analyzeCategories -// ============================================================================ - -describe('analyzeCategories', () => { - test('counts prompts by category', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 'test', metadata: { category: 'math' } }, - { id: '2', input: 'test', metadata: { category: 'math' } }, - { id: '3', input: 'test', metadata: { category: 'code' } }, - ] - - const result = analyzeCategories(prompts, 'category') - - expect(result).toHaveLength(2) - const math = result.find((d) => d.name === 'math') - const code = result.find((d) => d.name === 'code') - - expect(math?.count).toBe(2) - expect(code?.count).toBe(1) - }) - - test('calculates percentages correctly', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 'test', metadata: { category: 'a' } }, - { id: '2', input: 'test', metadata: { category: 'a' } }, - { id: '3', input: 'test', metadata: { category: 'b' } }, - { id: '4', input: 'test', metadata: { category: 'b' } }, - ] - - const result = analyzeCategories(prompts, 'category') - - expect(result[0]?.percentage).toBe(50) - expect(result[1]?.percentage).toBe(50) - }) - - test('sorts by count descending', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 
'test', metadata: { category: 'small' } }, - { id: '2', input: 'test', metadata: { category: 'large' } }, - { id: '3', input: 'test', metadata: { category: 'large' } }, - { id: '4', input: 'test', metadata: { category: 'large' } }, - { id: '5', input: 'test', metadata: { category: 'medium' } }, - { id: '6', input: 'test', metadata: { category: 'medium' } }, - ] - - const result = analyzeCategories(prompts, 'category') - - expect(result[0]?.name).toBe('large') - expect(result[1]?.name).toBe('medium') - expect(result[2]?.name).toBe('small') - }) - - test('handles missing metadata as (uncategorized)', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 'test', metadata: { category: 'known' } }, - { id: '2', input: 'test' }, // No metadata - { id: '3', input: 'test', metadata: {} }, // Empty metadata - ] - - const result = analyzeCategories(prompts, 'category') - - const uncategorized = result.find((d) => d.name === '(uncategorized)') - expect(uncategorized?.count).toBe(2) - }) - - test('handles different metadata keys', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 'test', metadata: { difficulty: 'easy', category: 'math' } }, - { id: '2', input: 'test', metadata: { difficulty: 'hard', category: 'math' } }, - { id: '3', input: 'test', metadata: { difficulty: 'easy', category: 'code' } }, - ] - - const byDifficulty = analyzeCategories(prompts, 'difficulty') - const byCategory = analyzeCategories(prompts, 'category') - - expect(byDifficulty.find((d) => d.name === 'easy')?.count).toBe(2) - expect(byCategory.find((d) => d.name === 'math')?.count).toBe(2) - }) - - test('converts non-string metadata values to strings', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 'test', metadata: { level: 1 } }, - { id: '2', input: 'test', metadata: { level: 1 } }, - { id: '3', input: 'test', metadata: { level: 2 } }, - ] - - const result = analyzeCategories(prompts, 'level') - - expect(result.find((d) => d.name === '1')?.count).toBe(2) - 
expect(result.find((d) => d.name === '2')?.count).toBe(1) - }) - - test('handles empty prompts array', () => { - const result = analyzeCategories([], 'category') - expect(result).toEqual([]) - }) - - test('rounds percentages to integers', () => { - const prompts: PromptCase[] = [ - { id: '1', input: 'test', metadata: { category: 'a' } }, - { id: '2', input: 'test', metadata: { category: 'b' } }, - { id: '3', input: 'test', metadata: { category: 'c' } }, - ] - - const result = analyzeCategories(prompts, 'category') - - // 1/3 = 33.33... should round to 33 - for (const dist of result) { - expect(Number.isInteger(dist.percentage)).toBe(true) - } - }) -}) - -// ============================================================================ -// findUnderrepresented -// ============================================================================ - -describe('findUnderrepresented', () => { - test('identifies categories below threshold', () => { - const distributions: CategoryDistribution[] = [ - { name: 'large', count: 50, percentage: 50 }, - { name: 'medium', count: 30, percentage: 30 }, - { name: 'small', count: 20, percentage: 20 }, - ] - - // Even distribution would be 33.3% each - // With 50% threshold, anything below 16.65% is underrepresented - const result = findUnderrepresented(distributions, 50) - - // At 50% threshold, 20% is above 16.65%, so nothing should be underrepresented - expect(result).toEqual([]) - }) - - test('returns underrepresented categories at stricter threshold', () => { - const distributions: CategoryDistribution[] = [ - { name: 'large', count: 80, percentage: 80 }, - { name: 'small', count: 20, percentage: 20 }, - ] - - // Even distribution would be 50% each - // With 50% threshold, anything below 25% is underrepresented - const result = findUnderrepresented(distributions, 50) - - expect(result).toContain('small') - expect(result).not.toContain('large') - }) - - test('handles even distribution (no underrepresentation)', () => { - const 
distributions: CategoryDistribution[] = [ - { name: 'a', count: 25, percentage: 25 }, - { name: 'b', count: 25, percentage: 25 }, - { name: 'c', count: 25, percentage: 25 }, - { name: 'd', count: 25, percentage: 25 }, - ] - - const result = findUnderrepresented(distributions, 50) - expect(result).toEqual([]) - }) - - test('handles single category (never underrepresented)', () => { - const distributions: CategoryDistribution[] = [{ name: 'only', count: 100, percentage: 100 }] - - const result = findUnderrepresented(distributions, 50) - expect(result).toEqual([]) - }) - - test('threshold affects sensitivity', () => { - const distributions: CategoryDistribution[] = [ - { name: 'large', count: 70, percentage: 70 }, - { name: 'small', count: 30, percentage: 30 }, - ] - - // Even = 50%, at 50% threshold: below 25% is underrepresented - const strict = findUnderrepresented(distributions, 50) - expect(strict).toEqual([]) - - // At 80% threshold: below 40% is underrepresented - const lenient = findUnderrepresented(distributions, 80) - expect(lenient).toContain('small') - }) - - test('handles empty distributions', () => { - const result = findUnderrepresented([], 50) - expect(result).toEqual([]) - }) -}) - -// ============================================================================ -// generateSuggestions -// ============================================================================ - -describe('generateSuggestions', () => { - test('suggests adding cases for underrepresented categories', () => { - const distributions: CategoryDistribution[] = [ - { name: 'math', count: 80, percentage: 80 }, - { name: 'code', count: 20, percentage: 20 }, - ] - const underrepresented = ['code'] - - const suggestions = generateSuggestions(distributions, underrepresented, 100) - - expect(suggestions.some((s) => s.includes('code'))).toBe(true) - expect(suggestions.some((s) => s.toLowerCase().includes('add'))).toBe(true) - }) - - test('warns about dominant category (>50%)', () => { - const 
distributions: CategoryDistribution[] = [ - { name: 'dominant', count: 60, percentage: 60 }, - { name: 'other', count: 40, percentage: 40 }, - ] - - const suggestions = generateSuggestions(distributions, [], 100) - - expect(suggestions.some((s) => s.includes('dominant') && s.includes('60%'))).toBe(true) - expect(suggestions.some((s) => s.toLowerCase().includes('diversify'))).toBe(true) - }) - - test('warns about tiny categories (<3 cases)', () => { - const distributions: CategoryDistribution[] = [ - { name: 'large', count: 97, percentage: 97 }, - { name: 'tiny', count: 2, percentage: 2 }, - { name: 'also_tiny', count: 1, percentage: 1 }, - ] - - const suggestions = generateSuggestions(distributions, [], 100) - - expect(suggestions.some((s) => s.includes('tiny') || s.includes('also_tiny'))).toBe(true) - expect(suggestions.some((s) => s.includes('< 3 cases'))).toBe(true) - }) - - test('suggests expanding small test sets (<20 cases)', () => { - const distributions: CategoryDistribution[] = [ - { name: 'a', count: 5, percentage: 50 }, - { name: 'b', count: 5, percentage: 50 }, - ] - - const suggestions = generateSuggestions(distributions, [], 10) - - expect(suggestions.some((s) => s.includes('10 cases') && s.toLowerCase().includes('expand'))).toBe(true) - }) - - test('returns "well-balanced" when no issues found', () => { - const distributions: CategoryDistribution[] = [ - { name: 'a', count: 25, percentage: 25 }, - { name: 'b', count: 25, percentage: 25 }, - { name: 'c', count: 25, percentage: 25 }, - { name: 'd', count: 25, percentage: 25 }, - ] - - const suggestions = generateSuggestions(distributions, [], 100) - - expect(suggestions.some((s) => s.toLowerCase().includes('well-balanced'))).toBe(true) - }) - - test('combines multiple suggestions', () => { - const distributions: CategoryDistribution[] = [ - { name: 'huge', count: 8, percentage: 80 }, - { name: 'tiny', count: 2, percentage: 20 }, - ] - const underrepresented = ['tiny'] - - const suggestions = 
generateSuggestions(distributions, underrepresented, 10) - - // Should have multiple suggestions: underrepresented, dominant, tiny count, small test set - expect(suggestions.length).toBeGreaterThanOrEqual(2) - }) - - test('handles empty distributions', () => { - const suggestions = generateSuggestions([], [], 0) - - // Should suggest expanding (0 cases) - expect(suggestions.some((s) => s.includes('0 cases'))).toBe(true) - }) -}) diff --git a/src/commands/tests/calibrate-helpers.spec.ts b/src/commands/tests/calibrate-helpers.spec.ts deleted file mode 100644 index becdff1..0000000 --- a/src/commands/tests/calibrate-helpers.spec.ts +++ /dev/null @@ -1,226 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import type { TrajectoryStep } from '../../schemas.ts' -import { getTrajectorySnippet, sampleArray } from '../calibrate.ts' - -// ============================================================================ -// sampleArray -// ============================================================================ - -describe('sampleArray', () => { - test('returns n elements from array', () => { - const arr = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - const result = sampleArray(arr, 3) - - expect(result).toHaveLength(3) - }) - - test('returns all elements when n >= array length', () => { - const arr = [1, 2, 3] - const result = sampleArray(arr, 5) - - expect(result).toHaveLength(3) - expect(new Set(result)).toEqual(new Set(arr)) - }) - - test('returns empty array for empty input', () => { - const result = sampleArray([], 5) - expect(result).toEqual([]) - }) - - test('returns empty array when n is 0', () => { - const arr = [1, 2, 3] - const result = sampleArray(arr, 0) - - expect(result).toEqual([]) - }) - - test('does not modify original array', () => { - const arr = [1, 2, 3, 4, 5] - const original = [...arr] - sampleArray(arr, 3) - - expect(arr).toEqual(original) - }) - - test('returns unique elements (no duplicates)', () => { - const arr = [1, 2, 3, 4, 5] - const result = 
sampleArray(arr, 3) - - const uniqueResult = new Set(result) - expect(uniqueResult.size).toBe(result.length) - }) - - test('all returned elements exist in original array', () => { - const arr = ['a', 'b', 'c', 'd', 'e'] - const result = sampleArray(arr, 3) - - for (const item of result) { - expect(arr).toContain(item) - } - }) - - test('works with objects', () => { - const arr = [{ id: 1 }, { id: 2 }, { id: 3 }, { id: 4 }] - const result = sampleArray(arr, 2) - - expect(result).toHaveLength(2) - for (const item of result) { - expect(arr).toContainEqual(item) - } - }) - - test('produces different results on multiple calls (randomness)', () => { - const arr = Array.from({ length: 100 }, (_, i) => i) - const results = new Set() - - // Run multiple times and check we get different orderings - for (let i = 0; i < 10; i++) { - const sample = sampleArray(arr, 10) - results.add(sample.join(',')) - } - - // With 100 elements, sampling 10, we should get different results - // This is probabilistic but extremely unlikely to fail - expect(results.size).toBeGreaterThan(1) - }) -}) - -// ============================================================================ -// getTrajectorySnippet -// ============================================================================ - -describe('getTrajectorySnippet', () => { - const createStep = (index: number): TrajectoryStep => ({ - type: 'message', - content: `Step ${index}`, - timestamp: index * 100, - }) - - test('returns full trajectory when length <= maxSteps', () => { - const trajectory: TrajectoryStep[] = [createStep(1), createStep(2), createStep(3)] - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result).toHaveLength(3) - expect(result).toEqual(trajectory) - }) - - test('returns maxSteps elements for longer trajectories', () => { - const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1)) - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result).toHaveLength(5) - }) 
- - test('includes first two steps', () => { - const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1)) - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result[0]).toEqual(createStep(1)) - expect(result[1]).toEqual(createStep(2)) - }) - - test('includes last two steps', () => { - const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1)) - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result[3]).toEqual(createStep(9)) - expect(result[4]).toEqual(createStep(10)) - }) - - test('includes middle step', () => { - const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1)) - - const result = getTrajectorySnippet(trajectory, 5) - - // Middle of 10 is index 5 (0-indexed), which is Step 6 - expect(result[2]).toEqual(createStep(6)) - }) - - test('handles empty trajectory', () => { - const result = getTrajectorySnippet([], 5) - expect(result).toEqual([]) - }) - - test('handles single element trajectory', () => { - const trajectory: TrajectoryStep[] = [createStep(1)] - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result).toEqual([createStep(1)]) - }) - - test('handles trajectory exactly at maxSteps boundary', () => { - const trajectory: TrajectoryStep[] = Array.from({ length: 5 }, (_, i) => createStep(i + 1)) - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result).toHaveLength(5) - expect(result).toEqual(trajectory) - }) - - test('respects custom maxSteps parameter', () => { - const trajectory: TrajectoryStep[] = Array.from({ length: 20 }, (_, i) => createStep(i + 1)) - - const result3 = getTrajectorySnippet(trajectory, 3) - const result7 = getTrajectorySnippet(trajectory, 7) - - // With maxSteps=3, should still return 5 (first 2 + middle + last 2) - // because the algorithm always takes first 2, middle 1, last 2 - // But the function returns full trajectory if <= maxSteps - 
expect(result3.length).toBeLessThanOrEqual(trajectory.length) - expect(result7.length).toBeLessThanOrEqual(trajectory.length) - }) - - test('works with different step types', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'Thinking...', timestamp: 0 }, - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 }, - { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 200 }, - { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 300 }, - { type: 'tool_call', name: 'Grep', status: 'completed', timestamp: 400 }, - { type: 'tool_call', name: 'Glob', status: 'completed', timestamp: 500 }, - { type: 'plan', entries: [{ content: 'Plan', status: 'done' }], timestamp: 600 }, - { type: 'message', content: 'Done!', timestamp: 700 }, - ] - - const result = getTrajectorySnippet(trajectory, 5) - - expect(result).toHaveLength(5) - // First two - expect(result[0]?.type).toBe('thought') - expect(result[1]?.type).toBe('tool_call') - // Last two - expect(result[3]?.type).toBe('plan') - expect(result[4]?.type).toBe('message') - }) - - test('preserves step content when extracting', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'First thought', timestamp: 0 }, - { type: 'message', content: 'First message', timestamp: 100 }, - { type: 'tool_call', name: 'Read', status: 'completed', input: { file_path: '/test.ts' }, timestamp: 200 }, - { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 300 }, - { type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 400 }, - { type: 'message', content: 'Last message', timestamp: 500 }, - ] - - const result = getTrajectorySnippet(trajectory, 5) - - // First step should preserve all properties - const firstStep = result[0] - if (firstStep?.type === 'thought') { - expect(firstStep.content).toBe('First thought') - expect(firstStep.timestamp).toBe(0) - } - - // Last step should preserve all properties - const lastStep = 
result[result.length - 1] - if (lastStep?.type === 'message') { - expect(lastStep.content).toBe('Last message') - } - }) -}) diff --git a/src/commands/tests/capture-cli.spec.ts b/src/commands/tests/capture-cli.spec.ts deleted file mode 100644 index 2e6e76d..0000000 --- a/src/commands/tests/capture-cli.spec.ts +++ /dev/null @@ -1,274 +0,0 @@ -import { afterEach, beforeEach, describe, expect, test } from 'bun:test' -import type { CaptureConfig } from '../capture.ts' -import { loadPrompts } from '../capture.ts' - -// ============================================================================ -// loadPrompts -// ============================================================================ - -describe('loadPrompts', () => { - const testPromptFile = '/tmp/agent-eval-harness-test-prompts.jsonl' - - beforeEach(async () => { - await Bun.$`rm -f ${testPromptFile}`.nothrow() - }) - - afterEach(async () => { - await Bun.$`rm -f ${testPromptFile}`.nothrow() - }) - - test('loads single-turn prompts', async () => { - await Bun.write( - testPromptFile, - `{"id": "t1", "input": "Hello"} -{"id": "t2", "input": "World"}`, - ) - - const prompts = await loadPrompts(testPromptFile) - - expect(prompts).toHaveLength(2) - expect(prompts[0]?.id).toBe('t1') - expect(prompts[0]?.input).toBe('Hello') - expect(prompts[1]?.id).toBe('t2') - expect(prompts[1]?.input).toBe('World') - }) - - test('loads multi-turn prompts', async () => { - await Bun.write(testPromptFile, `{"id": "conv1", "input": ["Hi", "How are you?", "Bye"]}`) - - const prompts = await loadPrompts(testPromptFile) - - expect(prompts).toHaveLength(1) - expect(prompts[0]?.id).toBe('conv1') - expect(Array.isArray(prompts[0]?.input)).toBe(true) - expect(prompts[0]?.input).toEqual(['Hi', 'How are you?', 'Bye']) - }) - - test('loads prompts with hint field', async () => { - await Bun.write(testPromptFile, `{"id": "t1", "input": "2+2?", "hint": "4"}`) - - const prompts = await loadPrompts(testPromptFile) - - 
expect(prompts).toHaveLength(1) - expect(prompts[0]?.hint).toBe('4') - }) - - test('loads prompts with metadata', async () => { - await Bun.write( - testPromptFile, - `{"id": "t1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`, - ) - - const prompts = await loadPrompts(testPromptFile) - - expect(prompts).toHaveLength(1) - expect(prompts[0]?.metadata).toEqual({ category: 'math', difficulty: 'easy' }) - }) - - test('loads prompts with timeout override', async () => { - await Bun.write(testPromptFile, `{"id": "t1", "input": "Slow task", "timeout": 120000}`) - - const prompts = await loadPrompts(testPromptFile) - - expect(prompts).toHaveLength(1) - expect(prompts[0]?.timeout).toBe(120000) - }) - - test('skips empty lines', async () => { - await Bun.write( - testPromptFile, - `{"id": "t1", "input": "First"} - -{"id": "t2", "input": "Second"} -`, - ) - - const prompts = await loadPrompts(testPromptFile) - - expect(prompts).toHaveLength(2) - }) - - test('throws on invalid JSON', async () => { - await Bun.write(testPromptFile, 'not valid json') - - await expect(loadPrompts(testPromptFile)).rejects.toThrow() - }) - - test('throws on missing required fields', async () => { - await Bun.write(testPromptFile, `{"id": "t1"}`) // missing input - - await expect(loadPrompts(testPromptFile)).rejects.toThrow() - }) -}) - -// ============================================================================ -// runCapture configuration -// ============================================================================ - -describe('runCapture configuration', () => { - test('CaptureConfig type accepts valid configuration', () => { - // Type-level test - if this compiles, the types are correct - const config: CaptureConfig = { - promptsPath: '/tmp/prompts.jsonl', - schemaPath: './schemas/claude-headless.json', - outputPath: '/tmp/output.jsonl', - cwd: '/tmp', - timeout: 30000, - progress: true, - append: false, - debug: false, - concurrency: 4, - workspaceDir: 
'/tmp/workspaces', - } - - expect(config.promptsPath).toBe('/tmp/prompts.jsonl') - expect(config.schemaPath).toBe('./schemas/claude-headless.json') - expect(config.concurrency).toBe(4) - expect(config.workspaceDir).toBe('/tmp/workspaces') - }) - - test('CaptureConfig allows minimal configuration', () => { - const config: CaptureConfig = { - promptsPath: '/tmp/prompts.jsonl', - schemaPath: './test-schema.json', - } - - expect(config.outputPath).toBeUndefined() - expect(config.cwd).toBeUndefined() - expect(config.timeout).toBeUndefined() - expect(config.progress).toBeUndefined() - expect(config.append).toBeUndefined() - expect(config.grader).toBeUndefined() - expect(config.concurrency).toBeUndefined() - expect(config.workspaceDir).toBeUndefined() - }) - - test('CaptureConfig accepts prompts without promptsPath', () => { - const config: CaptureConfig = { - schemaPath: './test-schema.json', - prompts: [{ id: 't1', input: 'hello' }], - } - - expect(config.promptsPath).toBeUndefined() - expect(config.prompts).toHaveLength(1) - }) -}) - -// ============================================================================ -// CLI Help Output -// ============================================================================ - -describe('capture CLI', () => { - test('displays help with --help flag', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '--help'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stdout = await new Response(proc.stdout).text() - await proc.exited - - expect(stdout).toContain('Usage: agent-eval-harness capture') - expect(stdout).toContain('prompts.jsonl') - expect(stdout).toContain('-o, --output') - expect(stdout).toContain('-c, --cwd') - expect(stdout).toContain('-t, --timeout') - expect(stdout).toContain('--progress') - expect(stdout).toContain('-g, --grader') - expect(stdout).toContain('-s, --schema') - expect(stdout).toContain('-j, --concurrency') - expect(stdout).toContain('--workspace-dir') - 
expect(stdout).toContain('--stdin') - }) - - test('shows error for --stdin with positional file', async () => { - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive') - }) - - test('shows error for missing prompts file argument', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('prompts.jsonl path is required') - }) - - test('shows error for missing schema argument', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--schema is required') - }) - - test('shows error for invalid concurrency value', async () => { - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--concurrency must be a positive integer') - }) - - test('shows error for zero concurrency', async () => { - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const 
exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--concurrency must be a positive integer') - }) - - test('shows error for negative concurrency', async () => { - // Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--concurrency must be a positive integer') - }) -}) diff --git a/src/commands/tests/capture-helpers.spec.ts b/src/commands/tests/capture-helpers.spec.ts deleted file mode 100644 index b772291..0000000 --- a/src/commands/tests/capture-helpers.spec.ts +++ /dev/null @@ -1,634 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import type { ParsedUpdate } from '../../headless/headless-output-parser.ts' -import type { TrajectoryStep } from '../../schemas.ts' -import { - detectTrajectoryRichness, - extractContent, - extractFilePath, - extractOutput, - extractTrajectory, - hasToolErrors, - headTailPreview, - loadPrompts, -} from '../capture.ts' - -// ============================================================================ -// loadPrompts -// ============================================================================ - -describe('loadPrompts', () => { - test('parses valid JSONL file with string input', async () => { - // Create a temporary test file - const testPath = '/tmp/test-prompts-valid.jsonl' - await Bun.write( - testPath, - `{"id": "test-1", "input": "What is 2+2?"} -{"id": "test-2", "input": "Hello world", "hint": "greeting"}`, - ) - - const prompts = await loadPrompts(testPath) - - expect(prompts).toHaveLength(2) - expect(prompts[0]?.id).toBe('test-1') - expect(prompts[0]?.input).toBe('What is 2+2?') - expect(prompts[1]?.id).toBe('test-2') - 
expect(prompts[1]?.hint).toBe('greeting') - }) - - test('parses multi-turn input (string array)', async () => { - const testPath = '/tmp/test-prompts-multiturn.jsonl' - await Bun.write(testPath, `{"id": "test-1", "input": ["Hello", "How are you?", "Goodbye"], "hint": "farewell"}`) - - const prompts = await loadPrompts(testPath) - - expect(prompts).toHaveLength(1) - expect(prompts[0]?.id).toBe('test-1') - expect(Array.isArray(prompts[0]?.input)).toBe(true) - expect(prompts[0]?.input).toEqual(['Hello', 'How are you?', 'Goodbye']) - expect(prompts[0]?.hint).toBe('farewell') - }) - - test('parses prompts with metadata', async () => { - const testPath = '/tmp/test-prompts-metadata.jsonl' - await Bun.write( - testPath, - `{"id": "test-1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`, - ) - - const prompts = await loadPrompts(testPath) - - expect(prompts).toHaveLength(1) - expect(prompts[0]?.metadata?.category).toBe('math') - expect(prompts[0]?.metadata?.difficulty).toBe('easy') - }) - - test('throws on invalid JSON at specific line', async () => { - const testPath = '/tmp/test-prompts-invalid.jsonl' - await Bun.write( - testPath, - `{"id": "test-1", "input": "Valid"} -{invalid json here} -{"id": "test-3", "input": "Also valid"}`, - ) - - await expect(loadPrompts(testPath)).rejects.toThrow('Invalid prompt at line 2') - }) - - test('throws on missing required fields', async () => { - const testPath = '/tmp/test-prompts-missing.jsonl' - await Bun.write(testPath, `{"id": "test-1"}`) - - await expect(loadPrompts(testPath)).rejects.toThrow('Invalid prompt at line 1') - }) - - test('handles empty lines gracefully', async () => { - const testPath = '/tmp/test-prompts-empty-lines.jsonl' - await Bun.write( - testPath, - `{"id": "test-1", "input": "First"} - -{"id": "test-2", "input": "Second"} -`, - ) - - const prompts = await loadPrompts(testPath) - expect(prompts).toHaveLength(2) - }) -}) - -// 
============================================================================ -// extractTrajectory -// ============================================================================ - -describe('extractTrajectory', () => { - const baseTime = 0 - - test('extracts thoughts from thought type updates', () => { - const updates: ParsedUpdate[] = [ - { - type: 'thought', - content: 'Let me think about this...', - timestamp: 100, - raw: { type: 'thought', text: 'Let me think about this...' }, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - expect(trajectory).toHaveLength(1) - expect(trajectory[0]?.type).toBe('thought') - const step = trajectory[0]! - expect(step.type === 'thought' && step.content).toBe('Let me think about this...') - }) - - test('extracts messages from message type updates', () => { - const updates: ParsedUpdate[] = [ - { - type: 'message', - content: 'Here is my answer.', - timestamp: 200, - raw: { type: 'message', text: 'Here is my answer.' }, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - expect(trajectory).toHaveLength(1) - expect(trajectory[0]?.type).toBe('message') - const step = trajectory[0]! - expect(step.type === 'message' && step.content).toBe('Here is my answer.') - }) - - test('extracts tool calls with title and status', () => { - const updates: ParsedUpdate[] = [ - { - type: 'tool_call', - title: 'Read', - status: 'pending', - timestamp: 300, - raw: { tool: 'Read', input: { file_path: '/test.ts' } }, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - expect(trajectory).toHaveLength(1) - expect(trajectory[0]?.type).toBe('tool_call') - const step = trajectory[0]! 
- expect(step.type === 'tool_call' && step.name).toBe('Read') - expect(step.type === 'tool_call' && step.status).toBe('pending') - }) - - test('extracts plan type updates', () => { - const updates: ParsedUpdate[] = [ - { - type: 'plan', - timestamp: 400, - raw: { - entries: [ - { content: 'Step 1', status: 'completed' }, - { content: 'Step 2', status: 'in_progress' }, - ], - }, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - expect(trajectory).toHaveLength(1) - expect(trajectory[0]?.type).toBe('plan') - // Note: extractTrajectory creates plan entries from the update type - // but doesn't extract entries from raw (they are captured via output parser mappings) - const step = trajectory[0]! - expect(step.type === 'plan').toBe(true) - }) - - test('handles empty updates', () => { - const trajectory = extractTrajectory([], baseTime) - expect(trajectory).toEqual([]) - }) - - test('assigns timestamps relative to start time', () => { - const startTime = 1000 - const updates: ParsedUpdate[] = [ - { - type: 'message', - content: 'First', - timestamp: 1500, - raw: { type: 'message', text: 'First' }, - }, - { - type: 'message', - content: 'Second', - timestamp: 2000, - raw: { type: 'message', text: 'Second' }, - }, - ] - - const trajectory = extractTrajectory(updates, startTime) - - expect(trajectory[0]?.timestamp).toBe(500) - expect(trajectory[1]?.timestamp).toBe(1000) - }) - - test('handles updates without content for message/thought types', () => { - const updates: ParsedUpdate[] = [ - { - type: 'message', - content: undefined, // No content - will have empty string - timestamp: 100, - raw: { type: 'message' }, - }, - { - type: 'message', - content: 'Has content', - timestamp: 200, - raw: { type: 'message', text: 'Has content' }, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - // Both messages are included - ones without content get empty string - expect(trajectory).toHaveLength(2) - 
expect(trajectory[0]?.type).toBe('message') - expect(trajectory[1]?.type).toBe('message') - }) - - test('attaches input to new tool call from update', () => { - const updates: ParsedUpdate[] = [ - { - type: 'tool_call', - title: 'Read', - status: 'pending', - input: { file_path: '/src/main.ts' }, - timestamp: 500, - raw: {}, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - expect(trajectory).toHaveLength(1) - const step = trajectory[0]! - expect(step.type === 'tool_call' && step.input).toEqual({ file_path: '/src/main.ts' }) - }) - - test('attaches output to tool call on completion', () => { - const updates: ParsedUpdate[] = [ - { - type: 'tool_call', - title: 'Read', - status: 'pending', - input: { file_path: '/src/main.ts' }, - timestamp: 500, - raw: {}, - }, - { - type: 'tool_call', - title: 'Read', - status: 'completed', - output: 'file contents here', - timestamp: 800, - raw: {}, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - expect(trajectory).toHaveLength(1) - const step = trajectory[0]! 
- expect(step.type).toBe('tool_call') - if (step.type === 'tool_call') { - expect(step.input).toEqual({ file_path: '/src/main.ts' }) - expect(step.output).toBe('file contents here') - expect(step.status).toBe('completed') - expect(step.duration).toBe(300) - } - }) - - test('handles sequential same-named tool calls independently', () => { - const updates: ParsedUpdate[] = [ - // First Read: pending → completed - { - type: 'tool_call', - title: 'Read', - status: 'pending', - input: { file_path: '/src/a.ts' }, - timestamp: 100, - raw: {}, - }, - { - type: 'tool_call', - title: 'Read', - status: 'completed', - output: 'contents of a.ts', - timestamp: 300, - raw: {}, - }, - // Second Read: pending → completed (same tool name, different args) - { - type: 'tool_call', - title: 'Read', - status: 'pending', - input: { file_path: '/src/b.ts' }, - timestamp: 500, - raw: {}, - }, - { - type: 'tool_call', - title: 'Read', - status: 'completed', - output: 'contents of b.ts', - timestamp: 700, - raw: {}, - }, - ] - - const trajectory = extractTrajectory(updates, baseTime) - - // Both calls should appear as separate trajectory steps - const toolCalls = trajectory.filter((s) => s.type === 'tool_call') - expect(toolCalls).toHaveLength(2) - - const first = toolCalls[0]! - expect(first.type === 'tool_call' && first.input).toEqual({ file_path: '/src/a.ts' }) - expect(first.type === 'tool_call' && first.output).toBe('contents of a.ts') - expect(first.type === 'tool_call' && first.status).toBe('completed') - - const second = toolCalls[1]! 
- expect(second.type === 'tool_call' && second.input).toEqual({ file_path: '/src/b.ts' }) - expect(second.type === 'tool_call' && second.output).toBe('contents of b.ts') - expect(second.type === 'tool_call' && second.status).toBe('completed') - }) -}) - -// ============================================================================ -// extractOutput -// ============================================================================ - -describe('extractOutput', () => { - test('joins message contents with newlines', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'message', content: 'First line', timestamp: 0 }, - { type: 'message', content: 'Second line', timestamp: 100 }, - ] - - expect(extractOutput(trajectory)).toBe('First line\nSecond line') - }) - - test('filters out non-message steps', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'Thinking...', timestamp: 0 }, - { type: 'message', content: 'Answer', timestamp: 100 }, - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 200 }, - { type: 'message', content: 'Done', timestamp: 300 }, - ] - - expect(extractOutput(trajectory)).toBe('Answer\nDone') - }) - - test('returns empty string for empty trajectory', () => { - expect(extractOutput([])).toBe('') - }) - - test('returns empty string when no messages', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'Just thinking', timestamp: 0 }, - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 }, - ] - - expect(extractOutput(trajectory)).toBe('') - }) - - test('handles single message', () => { - const trajectory: TrajectoryStep[] = [{ type: 'message', content: 'Only message', timestamp: 0 }] - - expect(extractOutput(trajectory)).toBe('Only message') - }) -}) - -// ============================================================================ -// hasToolErrors -// ============================================================================ - -describe('hasToolErrors', 
() => { - test('returns false when no tool calls', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'Thinking', timestamp: 0 }, - { type: 'message', content: 'Done', timestamp: 100 }, - ] - - expect(hasToolErrors(trajectory)).toBe(false) - }) - - test('returns false when all tool calls succeeded', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 }, - { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 100 }, - ] - - expect(hasToolErrors(trajectory)).toBe(false) - }) - - test('returns true when any tool call failed', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 }, - { type: 'tool_call', name: 'Write', status: 'failed', timestamp: 100 }, - { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 200 }, - ] - - expect(hasToolErrors(trajectory)).toBe(true) - }) - - test('returns false for empty trajectory', () => { - expect(hasToolErrors([])).toBe(false) - }) - - test('returns true when only tool call failed', () => { - const trajectory: TrajectoryStep[] = [{ type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 0 }] - - expect(hasToolErrors(trajectory)).toBe(true) - }) -}) - -// ============================================================================ -// headTailPreview -// ============================================================================ - -describe('headTailPreview', () => { - test('returns full content when under limit', () => { - const content = 'line1\nline2\nline3' - expect(headTailPreview(content, 5, 5)).toBe(content) - }) - - test('truncates with omitted count for long content', () => { - const lines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`) - const content = lines.join('\n') - - const result = headTailPreview(content, 3, 3) - - expect(result).toContain('line1') - expect(result).toContain('line2') - 
expect(result).toContain('line3') - expect(result).toContain('line18') - expect(result).toContain('line19') - expect(result).toContain('line20') - expect(result).toContain('14 lines omitted') - }) - - test('respects custom head line count', () => { - const lines = Array.from({ length: 10 }, (_, i) => `line${i + 1}`) - const content = lines.join('\n') - - const result = headTailPreview(content, 2, 2) - - expect(result).toContain('line1') - expect(result).toContain('line2') - expect(result).not.toContain('line3') - expect(result).toContain('6 lines omitted') - }) - - test('respects custom tail line count', () => { - const lines = Array.from({ length: 10 }, (_, i) => `line${i + 1}`) - const content = lines.join('\n') - - const result = headTailPreview(content, 1, 4) - - expect(result).toContain('line1') - expect(result).toContain('line7') - expect(result).toContain('line10') - expect(result).toContain('5 lines omitted') - }) - - test('handles content exactly at boundary', () => { - const content = 'line1\nline2\nline3\nline4\nline5\nline6' - // 6 lines, head=3, tail=3 means no truncation needed - expect(headTailPreview(content, 3, 3)).toBe(content) - }) - - test('handles single line content', () => { - const content = 'single line' - expect(headTailPreview(content, 3, 3)).toBe(content) - }) - - test('handles empty content', () => { - expect(headTailPreview('', 3, 3)).toBe('') - }) -}) - -// ============================================================================ -// extractFilePath -// ============================================================================ - -describe('extractFilePath', () => { - test('extracts file_path field', () => { - const input = { file_path: '/path/to/file.ts' } - expect(extractFilePath(input)).toBe('/path/to/file.ts') - }) - - test('extracts path field as fallback', () => { - const input = { path: '/another/path.js' } - expect(extractFilePath(input)).toBe('/another/path.js') - }) - - test('prefers file_path over path', () => { - const 
input = { file_path: '/preferred.ts', path: '/fallback.ts' } - expect(extractFilePath(input)).toBe('/preferred.ts') - }) - - test('returns undefined for invalid input', () => { - expect(extractFilePath(null)).toBeUndefined() - expect(extractFilePath(undefined)).toBeUndefined() - expect(extractFilePath('string')).toBeUndefined() - expect(extractFilePath(123)).toBeUndefined() - }) - - test('returns undefined when no path fields present', () => { - const input = { content: 'some content' } - expect(extractFilePath(input)).toBeUndefined() - }) - - test('handles empty object', () => { - expect(extractFilePath({})).toBeUndefined() - }) -}) - -// ============================================================================ -// extractContent -// ============================================================================ - -describe('extractContent', () => { - test('extracts content field', () => { - const input = { content: 'const x = 1;' } - expect(extractContent(input)).toBe('const x = 1;') - }) - - test('extracts new_string field as fallback', () => { - const input = { new_string: 'const y = 2;' } - expect(extractContent(input)).toBe('const y = 2;') - }) - - test('prefers content over new_string', () => { - const input = { content: 'preferred', new_string: 'fallback' } - expect(extractContent(input)).toBe('preferred') - }) - - test('returns undefined for invalid input', () => { - expect(extractContent(null)).toBeUndefined() - expect(extractContent(undefined)).toBeUndefined() - expect(extractContent('string')).toBeUndefined() - expect(extractContent(123)).toBeUndefined() - }) - - test('returns undefined when no content fields present', () => { - const input = { file_path: '/some/path.ts' } - expect(extractContent(input)).toBeUndefined() - }) - - test('handles empty object', () => { - expect(extractContent({})).toBeUndefined() - }) - - test('handles multiline content', () => { - const input = { content: 'line1\nline2\nline3' } - 
expect(extractContent(input)).toBe('line1\nline2\nline3') - }) -}) - -// ============================================================================ -// detectTrajectoryRichness -// ============================================================================ - -describe('detectTrajectoryRichness', () => { - test('returns "full" when trajectory has thoughts', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'Let me think...', timestamp: 0 }, - { type: 'message', content: 'Answer', timestamp: 100 }, - ] - - expect(detectTrajectoryRichness(trajectory)).toBe('full') - }) - - test('returns "full" when trajectory has tool calls', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 }, - { type: 'message', content: 'Answer', timestamp: 100 }, - ] - - expect(detectTrajectoryRichness(trajectory)).toBe('full') - }) - - test('returns "full" when trajectory has plans', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'plan', entries: [{ content: 'Step 1', status: 'completed' }], timestamp: 0 }, - { type: 'message', content: 'Answer', timestamp: 100 }, - ] - - expect(detectTrajectoryRichness(trajectory)).toBe('full') - }) - - test('returns "messages-only" when trajectory only has messages', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'message', content: 'First', timestamp: 0 }, - { type: 'message', content: 'Second', timestamp: 100 }, - ] - - expect(detectTrajectoryRichness(trajectory)).toBe('messages-only') - }) - - test('returns "minimal" when trajectory is empty', () => { - expect(detectTrajectoryRichness([])).toBe('minimal') - }) - - test('returns "full" when trajectory has mixed rich content', () => { - const trajectory: TrajectoryStep[] = [ - { type: 'thought', content: 'Thinking...', timestamp: 0 }, - { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 50 }, - { type: 'plan', entries: [], timestamp: 100 }, - { type: 'message', content: 
'Done', timestamp: 150 }, - ] - - expect(detectTrajectoryRichness(trajectory)).toBe('full') - }) -}) diff --git a/src/commands/tests/summarize-helpers.spec.ts b/src/commands/tests/summarize-helpers.spec.ts deleted file mode 100644 index 9df86d1..0000000 --- a/src/commands/tests/summarize-helpers.spec.ts +++ /dev/null @@ -1,339 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import type { CaptureResult } from '../../schemas.ts' -import { formatMarkdown, formatSummary } from '../summarize.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const createBasicResult = (overrides?: Partial): CaptureResult => ({ - id: 'test-001', - input: 'What is 2+2?', - output: 'The answer is 4.', - trajectory: [ - { type: 'thought', content: 'Let me think about this...', timestamp: 0 }, - { type: 'message', content: 'The answer is 4.', timestamp: 100 }, - ], - metadata: { category: 'math', agent: 'test-agent' }, - timing: { start: 1000, end: 2000, sessionCreation: 0, total: 1000 }, - toolErrors: false, - ...overrides, -}) - -const createResultWithToolCalls = (): CaptureResult => ({ - id: 'test-002', - input: 'Read and summarize file.txt', - output: 'File contains important data.', - trajectory: [ - { type: 'thought', content: 'I will read the file first.', timestamp: 0 }, - { - type: 'tool_call', - name: 'Read', - status: 'completed', - input: { file_path: '/path/to/file.txt' }, - output: 'file contents here', - duration: 50, - timestamp: 100, - }, - { - type: 'tool_call', - name: 'Write', - status: 'completed', - input: { file_path: '/output.md', content: 'Summary here' }, - duration: 30, - timestamp: 200, - }, - { type: 'message', content: 'File contains important data.', timestamp: 300 }, - ], - metadata: { agent: 'test-agent' }, - timing: { start: 1000, end: 1500, sessionCreation: 0, total: 500 }, - toolErrors: false, -}) - -// 
============================================================================ -// formatSummary -// ============================================================================ - -describe('formatSummary', () => { - test('extracts id, input, and output', () => { - const result = createBasicResult() - const summary = formatSummary(result) - - expect(summary.id).toBe('test-001') - expect(summary.input).toBe('What is 2+2?') - expect(summary.output).toBe('The answer is 4.') - }) - - test('extracts tool call names', () => { - const result = createResultWithToolCalls() - const summary = formatSummary(result) - - expect(summary.toolCalls).toEqual(['Read', 'Write']) - }) - - test('calculates duration from timing', () => { - const result = createBasicResult() - const summary = formatSummary(result) - - expect(summary.duration).toBe(1000) // 2000 - 1000 - }) - - test('handles empty trajectory', () => { - const result = createBasicResult({ trajectory: [] }) - const summary = formatSummary(result) - - expect(summary.toolCalls).toEqual([]) - }) - - test('filters only tool_call steps for toolCalls list', () => { - const result = createBasicResult() - const summary = formatSummary(result) - - // trajectory has thought and message, but no tool_call - expect(summary.toolCalls).toEqual([]) - }) - - test('handles trajectory with only messages', () => { - const result = createBasicResult({ - trajectory: [ - { type: 'message', content: 'First message', timestamp: 0 }, - { type: 'message', content: 'Second message', timestamp: 100 }, - ], - }) - const summary = formatSummary(result) - - expect(summary.toolCalls).toEqual([]) - }) - - test('preserves original input/output exactly', () => { - const result = createBasicResult({ - input: 'Input with\nnewlines and "quotes"', - output: 'Output with\ttabs', - }) - const summary = formatSummary(result) - - expect(summary.input).toBe('Input with\nnewlines and "quotes"') - expect(summary.output).toBe('Output with\ttabs') - }) -}) - -// 
============================================================================ -// formatMarkdown -// ============================================================================ - -describe('formatMarkdown', () => { - test('includes evaluation record header with id', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('## Evaluation Record: test-001') - }) - - test('includes input field', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Input:** What is 2+2?') - }) - - test('includes trajectory section', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Trajectory:**') - }) - - test('formats thought steps with truncation', () => { - const result = createBasicResult({ - trajectory: [ - { type: 'thought', content: 'Short thought', timestamp: 0 }, - { type: 'thought', content: 'A'.repeat(150), timestamp: 100 }, - ], - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('[THOUGHT] Short thought') - expect(markdown).toContain(`[THOUGHT] ${'A'.repeat(100)}...`) - }) - - test('formats tool calls with status and duration', () => { - const result = createResultWithToolCalls() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('[TOOL:Read] -> completed (50ms)') - expect(markdown).toContain('[TOOL:Write] -> completed (30ms)') - }) - - test('includes file path for tool calls', () => { - const result = createResultWithToolCalls() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('File: /path/to/file.txt') - expect(markdown).toContain('File: /output.md') - }) - - test('includes step IDs for reference', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('[→test-001-step-1]') - expect(markdown).toContain('[→test-001-step-2]') - }) - - 
test('formats plan steps', () => { - const result = createBasicResult({ - trajectory: [ - { - type: 'plan', - entries: [ - { content: 'Step 1', status: 'completed' }, - { content: 'Step 2', status: 'in_progress' }, - ], - timestamp: 0, - }, - ], - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('[PLAN]') - expect(markdown).toContain('Step 1: completed') - expect(markdown).toContain('Step 2: in_progress') - }) - - test('truncates long plan summaries', () => { - const result = createBasicResult({ - trajectory: [ - { - type: 'plan', - entries: [ - { content: 'A very long step description that goes on and on', status: 'completed' }, - { content: 'Another very long step description', status: 'pending' }, - { content: 'Yet another step', status: 'pending' }, - ], - timestamp: 0, - }, - ], - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('...') - }) - - test('formats message steps', () => { - const result = createBasicResult({ - trajectory: [{ type: 'message', content: 'Here is my response to your question.', timestamp: 0 }], - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('[MESSAGE] Here is my response') - }) - - test('includes output preview', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Output:** The answer is 4.') - }) - - test('truncates long output', () => { - const result = createBasicResult({ - output: 'X'.repeat(300), - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain(`${'X'.repeat(200)}...`) - }) - - test('includes metadata', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Metadata:**') - expect(markdown).toContain('category=math') - expect(markdown).toContain('agent=test-agent') - }) - - test('includes tool errors status', () => { - const result = createBasicResult({ toolErrors: true }) - const markdown 
= formatMarkdown(result) - - expect(markdown).toContain('**Tool Errors:** true') - }) - - test('includes duration', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Duration:** 1000ms') - }) - - test('includes score when present', () => { - const result = createBasicResult({ - score: { - pass: true, - score: 0.95, - reasoning: 'Correct answer provided', - }, - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Score:** PASS (0.95)') - expect(markdown).toContain('**Reasoning:** Correct answer provided') - }) - - test('handles failed score', () => { - const result = createBasicResult({ - score: { - pass: false, - score: 0.2, - reasoning: 'Incorrect answer', - }, - }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Score:** FAIL (0.2)') - }) - - test('includes content preview with syntax highlighting', () => { - const result: CaptureResult = { - id: 'test-003', - input: 'Write a function', - output: 'Done', - trajectory: [ - { - type: 'tool_call', - name: 'Write', - status: 'completed', - input: { - file_path: '/src/utils.ts', - content: 'export const add = (a: number, b: number) => a + b;', - }, - duration: 20, - timestamp: 0, - }, - ], - metadata: { agent: 'test' }, - timing: { start: 0, end: 100, sessionCreation: 0, total: 100 }, - toolErrors: false, - } - const markdown = formatMarkdown(result) - - expect(markdown).toContain('```ts') - expect(markdown).toContain('export const add') - }) - - test('ends with horizontal rule separator', () => { - const result = createBasicResult() - const markdown = formatMarkdown(result) - - expect(markdown).toContain('---') - }) - - test('handles empty trajectory', () => { - const result = createBasicResult({ trajectory: [] }) - const markdown = formatMarkdown(result) - - expect(markdown).toContain('**Trajectory:**') - expect(markdown).toContain('**Output:**') - }) -}) diff --git 
a/src/commands/tests/trials-calculations.spec.ts b/src/commands/tests/trials-calculations.spec.ts deleted file mode 100644 index 30ce9ae..0000000 --- a/src/commands/tests/trials-calculations.spec.ts +++ /dev/null @@ -1,209 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import { calculatePassAtK, calculatePassExpK } from '../trials.ts' - -// ============================================================================ -// calculatePassAtK -// ============================================================================ - -describe('calculatePassAtK', () => { - test('returns 1 when all trials pass', () => { - expect(calculatePassAtK(5, 5)).toBe(1) - expect(calculatePassAtK(10, 10)).toBe(1) - expect(calculatePassAtK(1, 1)).toBe(1) - }) - - test('returns 0 when no trials pass', () => { - expect(calculatePassAtK(0, 5)).toBe(0) - expect(calculatePassAtK(0, 10)).toBe(0) - expect(calculatePassAtK(0, 1)).toBe(0) - }) - - test('calculates probability correctly for partial passes', () => { - // pass@k = 1 - (1 - passRate)^k - // For 3 passes out of 5: passRate = 0.6 - // pass@5 = 1 - (0.4)^5 = 1 - 0.01024 = 0.98976 - const result = calculatePassAtK(3, 5) - expect(result).toBeCloseTo(0.98976, 5) - }) - - test('k=1 equals the pass rate', () => { - // For k=1, pass@1 = 1 - (1 - p)^1 = p - expect(calculatePassAtK(1, 1)).toBe(1) - - // More interesting: 0 passes, 1 trial - expect(calculatePassAtK(0, 1)).toBe(0) - }) - - test('higher pass rate yields higher pass@k', () => { - const lowPassRate = calculatePassAtK(1, 5) // 20% pass rate - const highPassRate = calculatePassAtK(4, 5) // 80% pass rate - - expect(highPassRate).toBeGreaterThan(lowPassRate) - }) - - test('larger k amplifies probability of at least one pass', () => { - // With 50% pass rate, larger k means higher chance of at least one pass - // k=2: 1 - (0.5)^2 = 0.75 - // k=4: 1 - (0.5)^4 = 0.9375 - - const k2 = calculatePassAtK(1, 2) // 50% pass rate - const k4 = calculatePassAtK(2, 4) // Also 50% pass rate - 
- expect(k4).toBeGreaterThan(k2) - }) - - test('handles edge case where passes equals k', () => { - expect(calculatePassAtK(3, 3)).toBe(1) - }) - - test('handles passes greater than k (returns 1)', () => { - // This shouldn't happen in practice, but the function handles it - expect(calculatePassAtK(10, 5)).toBe(1) - }) - - test('mathematical verification with known values', () => { - // 1 out of 3 passes: passRate = 1/3 - // pass@3 = 1 - (2/3)^3 = 1 - 8/27 = 19/27 ≈ 0.7037 - const result = calculatePassAtK(1, 3) - expect(result).toBeCloseTo(19 / 27, 5) - - // 2 out of 4 passes: passRate = 0.5 - // pass@4 = 1 - (0.5)^4 = 1 - 0.0625 = 0.9375 - const result2 = calculatePassAtK(2, 4) - expect(result2).toBeCloseTo(0.9375, 5) - }) -}) - -// ============================================================================ -// calculatePassExpK -// ============================================================================ - -describe('calculatePassExpK', () => { - test('returns 1 when all trials pass', () => { - expect(calculatePassExpK(5, 5)).toBe(1) - expect(calculatePassExpK(10, 10)).toBe(1) - expect(calculatePassExpK(1, 1)).toBe(1) - }) - - test('returns 0 when no trials pass', () => { - expect(calculatePassExpK(0, 5)).toBe(0) - expect(calculatePassExpK(0, 10)).toBe(0) - expect(calculatePassExpK(0, 1)).toBe(0) - }) - - test('calculates probability correctly', () => { - // pass^k = passRate^k - // For 3 passes out of 5: passRate = 0.6 - // pass^5 = (0.6)^5 = 0.07776 - const result = calculatePassExpK(3, 5) - expect(result).toBeCloseTo(0.07776, 5) - }) - - test('k=1 equals the pass rate', () => { - // For k=1, pass^1 = p^1 = p - expect(calculatePassExpK(1, 1)).toBe(1) - }) - - test('higher pass rate yields higher pass^k', () => { - const lowPassRate = calculatePassExpK(1, 5) // 20% pass rate - const highPassRate = calculatePassExpK(4, 5) // 80% pass rate - - expect(highPassRate).toBeGreaterThan(lowPassRate) - }) - - test('larger k reduces probability of all passing (for 
non-100% rates)', () => { - // With 80% pass rate: - // k=2: (0.8)^2 = 0.64 - // k=5: (0.8)^5 = 0.32768 - - // Mathematical verification using known formulas - const k2_fair = 0.8 ** 2 // = 0.64 - const k5_fair = 0.8 ** 5 // = 0.32768 - - expect(k5_fair).toBeLessThan(k2_fair) - - // Also verify our function produces consistent results - // 4 out of 5 gives 80% pass rate - const result = calculatePassExpK(4, 5) - expect(result).toBeCloseTo(k5_fair, 5) - }) - - test('handles edge case where passes equals k', () => { - expect(calculatePassExpK(3, 3)).toBe(1) - }) - - test('mathematical verification with known values', () => { - // 1 out of 3 passes: passRate = 1/3 - // pass^3 = (1/3)^3 = 1/27 ≈ 0.037 - const result = calculatePassExpK(1, 3) - expect(result).toBeCloseTo(1 / 27, 5) - - // 2 out of 4 passes: passRate = 0.5 - // pass^4 = (0.5)^4 = 0.0625 - const result2 = calculatePassExpK(2, 4) - expect(result2).toBeCloseTo(0.0625, 5) - - // 3 out of 4 passes: passRate = 0.75 - // pass^4 = (0.75)^4 = 0.31640625 - const result3 = calculatePassExpK(3, 4) - expect(result3).toBeCloseTo(0.31640625, 5) - }) - - test('pass^k is always less than or equal to pass@k', () => { - // For any pass rate < 100%, pass^k <= pass@k - // This is because "all pass" is a subset of "at least one passes" - - const testCases = [ - { passes: 1, k: 5 }, - { passes: 2, k: 5 }, - { passes: 3, k: 5 }, - { passes: 4, k: 5 }, - { passes: 1, k: 3 }, - { passes: 2, k: 4 }, - ] - - for (const { passes, k } of testCases) { - const passExpK = calculatePassExpK(passes, k) - const passAtK = calculatePassAtK(passes, k) - expect(passExpK).toBeLessThanOrEqual(passAtK) - } - }) -}) - -// ============================================================================ -// Combined behavior tests -// ============================================================================ - -describe('pass@k and pass^k relationship', () => { - test('100% pass rate: both metrics equal 1', () => { - expect(calculatePassAtK(5, 
5)).toBe(1) - expect(calculatePassExpK(5, 5)).toBe(1) - }) - - test('0% pass rate: both metrics equal 0', () => { - expect(calculatePassAtK(0, 5)).toBe(0) - expect(calculatePassExpK(0, 5)).toBe(0) - }) - - test('gap between metrics varies with pass rate', () => { - // At 50% pass rate, the gap is maximized - // At extreme pass rates (0% or 100%), the gap is 0 - - // 50% pass rate with k=4 - const midAtK = calculatePassAtK(2, 4) // 0.9375 - const midExpK = calculatePassExpK(2, 4) // 0.0625 - const midGap = midAtK - midExpK // 0.875 - - // 80% pass rate with k=5 - const highAtK = calculatePassAtK(4, 5) - const highExpK = calculatePassExpK(4, 5) - const highGap = highAtK - highExpK - - // Both gaps should be positive (pass@k > pass^k for partial pass rates) - expect(midGap).toBeGreaterThan(0) - expect(highGap).toBeGreaterThan(0) - - // Mid-range pass rate has larger gap than high pass rate - expect(midGap).toBeGreaterThan(highGap) - }) -}) diff --git a/src/commands/tests/trials-cli.spec.ts b/src/commands/tests/trials-cli.spec.ts deleted file mode 100644 index 8005755..0000000 --- a/src/commands/tests/trials-cli.spec.ts +++ /dev/null @@ -1,215 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import type { TrialsConfig } from '../trials.ts' - -// ============================================================================ -// TrialsConfig type -// ============================================================================ - -describe('TrialsConfig configuration', () => { - test('TrialsConfig type accepts valid configuration', () => { - const config: TrialsConfig = { - promptsPath: '/tmp/prompts.jsonl', - schemaPath: './schemas/claude-headless.json', - k: 5, - outputPath: '/tmp/output.jsonl', - cwd: '/tmp', - timeout: 30000, - progress: true, - append: false, - debug: false, - concurrency: 4, - workspaceDir: '/tmp/workspaces', - } - - expect(config.promptsPath).toBe('/tmp/prompts.jsonl') - expect(config.schemaPath).toBe('./schemas/claude-headless.json') - 
expect(config.k).toBe(5) - expect(config.concurrency).toBe(4) - expect(config.workspaceDir).toBe('/tmp/workspaces') - }) - - test('TrialsConfig allows minimal configuration', () => { - const config: TrialsConfig = { - promptsPath: '/tmp/prompts.jsonl', - schemaPath: './test-schema.json', - k: 3, - } - - expect(config.outputPath).toBeUndefined() - expect(config.cwd).toBeUndefined() - expect(config.timeout).toBeUndefined() - expect(config.progress).toBeUndefined() - expect(config.append).toBeUndefined() - expect(config.grader).toBeUndefined() - expect(config.concurrency).toBeUndefined() - expect(config.workspaceDir).toBeUndefined() - }) - - test('TrialsConfig accepts prompts without promptsPath', () => { - const config: TrialsConfig = { - schemaPath: './test-schema.json', - k: 3, - prompts: [{ id: 't1', input: 'hello' }], - } - - expect(config.promptsPath).toBeUndefined() - expect(config.prompts).toHaveLength(1) - }) -}) - -// ============================================================================ -// CLI Help Output -// ============================================================================ - -describe('trials CLI', () => { - test('displays help with --help flag', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '--help'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stdout = await new Response(proc.stdout).text() - await proc.exited - - expect(stdout).toContain('Usage: agent-eval-harness trials') - expect(stdout).toContain('prompts.jsonl') - expect(stdout).toContain('-o, --output') - expect(stdout).toContain('-k') - expect(stdout).toContain('-c, --cwd') - expect(stdout).toContain('-t, --timeout') - expect(stdout).toContain('--progress') - expect(stdout).toContain('-g, --grader') - expect(stdout).toContain('-s, --schema') - expect(stdout).toContain('pass@k') - expect(stdout).toContain('-j, --concurrency') - expect(stdout).toContain('--workspace-dir') - expect(stdout).toContain('--stdin') - }) - - test('shows error for 
--stdin with positional file', async () => { - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive') - }) - - test('shows error for missing prompts file argument', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('prompts.jsonl path is required') - }) - - test('shows error for missing schema argument', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--schema is required') - }) - - test('shows error for invalid concurrency value', async () => { - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - expect(stderr).toContain('--concurrency must be a positive integer') - }) - - test('shows error for zero concurrency', async () => { - const proc = Bun.spawn( - ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'], - { - stdout: 'pipe', - stderr: 'pipe', - }, - ) - - const stderr = await new Response(proc.stderr).text() - const exitCode = await proc.exited - - expect(exitCode).not.toBe(0) - 
expect(stderr).toContain('--concurrency must be a positive integer') - }) -}) - -// ============================================================================ -// Schemas CLI -// ============================================================================ - -describe('schemas CLI', () => { - test('displays help with --help flag', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--help'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stdout = await new Response(proc.stdout).text() - await proc.exited - - expect(stdout).toContain('Usage: agent-eval-harness schemas') - expect(stdout).toContain('-o, --output') - expect(stdout).toContain('-j, --json') - expect(stdout).toContain('-s, --split') - expect(stdout).toContain('-l, --list') - expect(stdout).toContain('Available Schemas') - }) - - test('lists schemas with --list flag', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--list'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stdout = await new Response(proc.stdout).text() - await proc.exited - - expect(stdout).toContain('Available schemas') - expect(stdout).toContain('PromptCase') - expect(stdout).toContain('CaptureResult') - expect(stdout).toContain('GraderResult') - }) - - test('exports schema as JSON', async () => { - const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', 'PromptCase', '--json'], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const stdout = await new Response(proc.stdout).text() - await proc.exited - - const schema = JSON.parse(stdout) - expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema') - expect(schema.title).toBe('PromptCase') - expect(schema.type).toBe('object') - }) -}) diff --git a/src/commands/trials.ts b/src/commands/trials.ts deleted file mode 100644 index 9d4adb4..0000000 --- a/src/commands/trials.ts +++ /dev/null @@ -1,377 +0,0 @@ -/** - * Multi-run trials command for pass@k/pass^k analysis. 
- * - * @remarks - * Runs each prompt k times to measure non-determinism. - * Without a grader, captures raw trials. With a grader, computes: - * - passRate: Simple pass rate (passes / k) - * - passAtK: Probability of at least one pass in k samples - * - passExpK: Probability of all k samples passing - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts' -import type { ParsedUpdate } from '../headless/headless-output-parser.ts' -import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts' -import { loadGraderOrExit } from '../schemas/grader-loader.ts' -import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts' -import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts' - -// ============================================================================ -// Pass@k/Pass^k Calculation -// ============================================================================ - -/** - * Calculate pass@k: probability of at least one pass in k samples. - * - * @remarks - * Uses the unbiased estimator: 1 - C(n-c, k) / C(n, k) - * where n = total samples, c = correct samples, k = samples per trial - * - * For our case where n = k (we run exactly k trials per prompt): - * pass@k = 1 - (1 - passRate)^k (simplified) - * - * @param passes - Number of passing trials - * @param k - Total number of trials - * @returns Probability of at least one pass - * - * @public - */ -export const calculatePassAtK = (passes: number, k: number): number => { - if (passes >= k) return 1 - if (passes === 0) return 0 - - // Simplified formula when n = k - const passRate = passes / k - return 1 - (1 - passRate) ** k -} - -/** - * Calculate pass^k: probability of all k samples passing. 
- * - * @remarks - * This is simply passRate^k - * - * @param passes - Number of passing trials - * @param k - Total number of trials - * @returns Probability of all k samples passing - * - * @public - */ -export const calculatePassExpK = (passes: number, k: number): number => { - if (passes === k) return 1 - if (passes === 0) return 0 - - const passRate = passes / k - return passRate ** k -} - -// ============================================================================ -// Types -// ============================================================================ - -/** Configuration for trials command */ -export type TrialsConfig = BaseExecutionConfig & { - /** Number of trials per prompt */ - k: number -} - -// ============================================================================ -// Trials Implementation -// ============================================================================ - -/** - * Execute trials with configuration object. - * - * @param config - Trials configuration - * @returns Array of trial results - */ -export const runTrials = async (config: TrialsConfig): Promise => { - const { k } = config - const ctx = await prepareExecution(config) - const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx - - // Log progress info - logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 
'stdin'}`, progress) - logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress) - logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress) - logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress) - if (ctx.concurrency > 1) { - logProgress(`Concurrency: ${ctx.concurrency} workers`, progress) - } - if (resolvedWorkspaceDir) { - logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress) - } - if (grader) { - logProgress('Grader: enabled (will compute pass@k metrics)', progress) - } - - // Process all trials for a single prompt - const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise => { - logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress) - - const trialEntries: TrialEntry[] = [] - - for (let trialNum = 1; trialNum <= k; trialNum++) { - // Determine working directory (per-prompt workspace or default) - // For trials, include trial number in workspace path for isolation - const workingDir = resolvedWorkspaceDir - ? await createWorkspaceDir(resolvedWorkspaceDir, `${promptCase.id}-trial-${trialNum}`) - : defaultWorkingDir - - // Create fresh session for each trial - const session = await sessions.create(workingDir) - const startTime = Date.now() - - try { - // Handle string or array input - const inputs = Array.isArray(promptCase.input) ? 
promptCase.input : [promptCase.input] - const allUpdates: ParsedUpdate[] = [] - - // Execute each turn sequentially - for (const turnInput of inputs) { - const turnResult = await sessions.prompt(session.id, turnInput) - allUpdates.push(...turnResult.updates) - } - - const endTime = Date.now() - const trajectory = extractTrajectory(allUpdates, startTime) - const output = extractOutput(trajectory) - - const entry: TrialEntry = { - trialNum, - output, - trajectory, - duration: endTime - startTime, - } - - // Apply grader if provided - if (grader) { - const graderResult = await grader({ - input: promptCase.input, - output, - hint: promptCase.hint, - trajectory, - metadata: promptCase.metadata, - cwd: session.cwd, - }) - entry.pass = graderResult.pass - entry.score = graderResult.score - entry.reasoning = graderResult.reasoning - - if (graderResult.outcome) { - entry.outcome = graderResult.outcome - } - } - - trialEntries.push(entry) - logProgress( - ` Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`, - progress, - ) - } catch (error) { - const endTime = Date.now() - const message = error instanceof Error ? error.message : String(error) - - trialEntries.push({ - trialNum, - output: '', - trajectory: [], - duration: endTime - startTime, - pass: false, - reasoning: `Error: ${message}`, - }) - logProgress(` Trial ${trialNum}/${k}: ! 
(error)`, progress) - } finally { - // Always clean up session - sessions.destroy(session.id) - } - } - - // Build result - const result: TrialResult = { - id: promptCase.id, - input: promptCase.input, - ...(promptCase.hint && { hint: promptCase.hint }), - k, - trials: trialEntries, - metadata: { - ...promptCase.metadata, - agent: schema.name, - ...(resolvedWorkspaceDir && { workspaceDir: resolvedWorkspaceDir }), - }, - } - - // Calculate metrics if grader was used - if (grader) { - const passes = trialEntries.filter((t) => t.pass).length - result.passRate = passes / k - result.passAtK = calculatePassAtK(passes, k) - result.passExpK = calculatePassExpK(passes, k) - } - - // Write result immediately (coordinated via mutex for concurrent writes) - await ctx.writeResult(result) - - if (grader) { - logProgress( - ` → ${promptCase.id}: passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`, - progress, - ) - } - - return result - } - - // Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially) - return executePrompts(ctx, processPromptTrials) -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Trials command CLI handler. 
- * - * @param args - Command line arguments (after 'trials') - */ -export const trials = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - schema: { type: 'string', short: 's' }, - output: { type: 'string', short: 'o' }, - k: { type: 'string', short: 'k', default: String(DEFAULT_TRIAL_COUNT) }, - cwd: { type: 'string', short: 'c' }, - timeout: { type: 'string', short: 't' }, - progress: { type: 'boolean', default: false }, - append: { type: 'boolean', default: false }, - grader: { type: 'string', short: 'g' }, - debug: { type: 'boolean', default: false }, - stdin: { type: 'boolean', default: false }, - concurrency: { type: 'string', short: 'j' }, - 'workspace-dir': { type: 'string' }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness trials --schema [options] - cat prompts.jsonl | agent-eval-harness trials --stdin --schema [options] - -Arguments: - prompts.jsonl Input file with evaluation prompts - -Options: - -s, --schema Path to agent schema JSON file (required) - -o, --output Output file (default: stdout) - -k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT}) - -c, --cwd Working directory for agent - -t, --timeout Request timeout in ms (overrides schema default) - -j, --concurrency Number of concurrent workers (default: 1) - --stdin Read prompts from stdin (mutually exclusive with file arg) - --workspace-dir Base directory for per-trial workspace isolation - --progress Show progress to stderr - --append Append to output file - -g, --grader Path to grader (.ts/.js module or executable script) - --debug Enable debug mode - -h, --help Show this help message - -Output Format: - Without grader: Raw trials with trajectories - With grader: Trials plus pass@k metrics (passRate, passAtK, passExpK) - -Graders: - TS/JS modules must export a 'grade' function. - Executable scripts (Python, etc.) 
use stdin/stdout JSON protocol. - -Parallelization: - Use -j/--concurrency to run multiple prompts' trials in parallel. - Each prompt's k trials still run sequentially (required for aggregation). - With 151 prompts and -j 4, you get 4 prompts running trials concurrently. - - Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses - at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory. - In memory-constrained environments (Docker, CI) this can cause OOM kills. - Use --stdin to pipe prompts for container-level orchestration. - -Workspace Isolation: - Use --workspace-dir to create per-trial directories. - Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/. - Useful for code generation tasks requiring filesystem isolation. - -Examples: - # Basic trials - agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl - - # Run 4 prompts' trials in parallel (4x faster for 151 prompts) - agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 -o trials.jsonl - - # With workspace isolation for code generation - agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 \\ - --workspace-dir ./workspaces -o trials.jsonl - - # With TypeScript grader - agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl - - # Read prompts from stdin (container orchestration) - cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl -`) - return - } - - const promptsPath = positionals[0] - const useStdin = values.stdin ?? 
false - - // Mutual exclusivity: --stdin and positional file - if (useStdin && promptsPath) { - console.error('Error: --stdin and prompts file argument are mutually exclusive') - process.exit(1) - } - - if (!useStdin && !promptsPath) { - console.error('Error: prompts.jsonl path is required (or use --stdin)') - process.exit(1) - } - - if (!values.schema) { - console.error('Error: --schema is required') - console.error('Example: agent-eval-harness trials prompts.jsonl --schema ./claude.json') - process.exit(1) - } - - // Read prompts from stdin if requested - let prompts: PromptCase[] | undefined - if (useStdin) { - const stdinPrompts = await readStdinPrompts() - if (!stdinPrompts || stdinPrompts.length === 0) { - console.error('Error: no prompts received on stdin') - process.exit(1) - } - prompts = stdinPrompts - } - - // Load grader if specified - const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined - - await runTrials({ - promptsPath: promptsPath ?? undefined, - prompts, - schemaPath: values.schema, - k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10), - outputPath: values.output, - cwd: values.cwd, - timeout: values.timeout ? Number.parseInt(values.timeout, 10) : undefined, - progress: values.progress ?? false, - append: values.append ?? false, - grader, - debug: values.debug ?? false, - concurrency: parseConcurrency(values.concurrency), - workspaceDir: values['workspace-dir'], - }) -} diff --git a/src/commands/validate-refs.ts b/src/commands/validate-refs.ts deleted file mode 100644 index 003790d..0000000 --- a/src/commands/validate-refs.ts +++ /dev/null @@ -1,171 +0,0 @@ -/** - * Validate-refs command - check reference solutions against grader. - * - * @remarks - * Validates that reference solutions in prompts.jsonl pass the grader. - * Helps identify prompts with broken or incorrect reference solutions. 
- * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadPrompts, resolvePath } from '../core.ts' -import { loadGraderOrExit } from '../schemas/grader-loader.ts' -import type { Grader, ValidationResult } from '../schemas.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Configuration for validate-refs command */ -export type ValidateRefsConfig = { - /** Path to prompts.jsonl file */ - promptsPath: string - /** Output file path */ - outputPath?: string - /** Grader function */ - grader: Grader -} - -// ============================================================================ -// Validate-Refs Implementation -// ============================================================================ - -/** - * Execute validate-refs with configuration object. - * - * @param config - Validate-refs configuration - * @returns Array of validation results - */ -export const runValidateRefs = async (config: ValidateRefsConfig): Promise => { - const { promptsPath, outputPath, grader } = config - - // Load prompts - const prompts = await loadPrompts(promptsPath) - - // Filter to prompts with reference solutions - const promptsWithRefs = prompts.filter((p) => p.reference !== undefined) - - if (promptsWithRefs.length === 0) { - console.error('No prompts with reference solutions found') - return [] - } - - console.error(`Validating ${promptsWithRefs.length} reference solutions...`) - - const results: ValidationResult[] = [] - - for (const prompt of promptsWithRefs) { - const graderResult = await grader({ - input: prompt.input, - output: prompt.reference as string, - hint: prompt.hint, - trajectory: [], // No trajectory for reference validation - metadata: prompt.metadata, - }) - - results.push({ - id: prompt.id, - reference: prompt.reference as string, - passes: graderResult.pass, - graderResult, - }) - - const icon = 
graderResult.pass ? '✓' : '✗' - console.error(` ${icon} ${prompt.id}`) - } - - // Format output - const output = results.map((r) => JSON.stringify(r)).join('\n') - - // Write output - if (outputPath) { - await Bun.write(resolvePath(outputPath), output) - } else { - console.log(output) - } - - // Summary - const passed = results.filter((r) => r.passes).length - const failed = results.length - passed - console.error(`\nResults: ${passed} passed, ${failed} failed`) - - if (failed > 0) { - console.error('\nFailing references:') - for (const result of results.filter((r) => !r.passes)) { - console.error(` - ${result.id}: ${result.graderResult.reasoning ?? 'No reasoning'}`) - } - } - - return results -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Validate-refs command CLI handler. - * - * @param args - Command line arguments (after 'validate-refs') - */ -export const validateRefs = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - output: { type: 'string', short: 'o' }, - grader: { type: 'string', short: 'g' }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness validate-refs --grader [options] - -Arguments: - prompts.jsonl Input file with prompts (must have 'reference' field) - -Options: - -o, --output Output file (default: stdout) - -g, --grader Path to grader (.ts/.js module or executable script, required) - -h, --help Show this help message - -Output: - JSONL with validation results for each reference solution. - -Prompt Format: - { - "id": "test-001", - "input": "What is 2+2?", - "expected": "4", - "reference": "The answer is 4." 
- } - -Examples: - agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl -`) - return - } - - const promptsPath = positionals[0] - if (!promptsPath) { - console.error('Error: prompts.jsonl path is required') - process.exit(1) - } - - if (!values.grader) { - console.error('Error: --grader is required for validate-refs') - process.exit(1) - } - - // Load grader - const grader = await loadGraderOrExit(values.grader) - - await runValidateRefs({ - promptsPath, - outputPath: values.output, - grader, - }) -} diff --git a/src/core.ts b/src/core.ts deleted file mode 100644 index 5d1df05..0000000 --- a/src/core.ts +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Core utilities re-export. - * - * @remarks - * Public API for core utilities. Import from here for external use. - * - * @packageDocumentation - */ - -export { - // Loading - buildResultsIndex, - countLines, - // Native streaming - countLinesStreaming, - // Worker pool - createWorkspaceDir, - createWriteMutex, - // Trajectory - detectTrajectoryRichness, - extractContent, - extractFilePath, - extractOutput, - extractTrajectory, - // Output - getInputPreview, - hasToolErrors, - headTailPreview, - loadJsonl, - loadPrompts, - loadResults, - logProgress, - type ProgressCallback, - readStdinPrompts, - resolvePath, - runWorkerPool, - streamJsonl, - streamPrompts, - streamResults, - streamResultsNative, - streamTrialResults, - type WorkerPoolOptions, - type WorkerPoolResult, - type WriteMutex, - writeOutput, -} from './core/core.ts' diff --git a/src/core/core.ts b/src/core/core.ts deleted file mode 100644 index a36b7bd..0000000 --- a/src/core/core.ts +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Core utilities for agent-eval-harness. 
- * - * @remarks - * Re-exports shared utilities used across all commands: - * - Loading: JSONL file parsing for prompts and results - * - Trajectory: Extraction and analysis of agent trajectories - * - Output: Writing results, progress logging, path resolution - * - * @packageDocumentation - */ - -// Loading utilities -export { - buildResultsIndex, - countLines, - loadJsonl, - loadPrompts, - loadResults, - readStdinPrompts, - streamResults, -} from './loading.ts' -// Output utilities -export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts' -// Native streaming utilities -export { - countLinesStreaming, - streamJsonl, - streamPrompts, - streamResultsNative, - streamTrialResults, -} from './streaming.ts' -// Trajectory utilities -export { - detectTrajectoryRichness, - extractContent, - extractFilePath, - extractOutput, - extractTrajectory, - hasToolErrors, -} from './trajectory.ts' -// Worker pool utilities -export { - createWorkspaceDir, - createWriteMutex, - type ProgressCallback, - runWorkerPool, - type WorkerPoolOptions, - type WorkerPoolResult, - type WriteMutex, -} from './worker-pool.ts' diff --git a/src/core/loading.ts b/src/core/loading.ts deleted file mode 100644 index 1bb5b41..0000000 --- a/src/core/loading.ts +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Shared loading utilities for JSONL files. - * - * @remarks - * Provides consistent loading and parsing of prompts and results files. - * Used by capture, trials, summarize, calibrate, and pipeline commands. - * - * @packageDocumentation - */ - -import type { CaptureResult, PromptCase } from '../schemas.ts' -import { CaptureResultSchema, PromptCaseSchema } from '../schemas.ts' - -/** - * Load prompts from a JSONL file. - * - * @remarks - * Each line in the file should be a valid JSON object matching PromptCaseSchema. - * Supports both single-turn (string input) and multi-turn (string[] input) formats. 
- * - * @param path - Path to the prompts.jsonl file - * @returns Parsed and validated prompt cases - * @throws Error if file cannot be read or any line is invalid - * - * @public - */ -export const loadPrompts = async (path: string): Promise => { - const content = await Bun.file(path).text() - return content - .trim() - .split('\n') - .filter(Boolean) - .map((line, index) => { - try { - return PromptCaseSchema.parse(JSON.parse(line)) - } catch (error) { - throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`) - } - }) -} - -/** - * Read prompts from stdin as JSONL. - * - * @remarks - * Reads all data from stdin, parses each line as JSON, and validates against - * PromptCaseSchema. Returns null when stdin is a TTY (no piped input). - * Uses chunked Buffer reads matching the pattern in pipeline/run.ts. - * - * @returns Parsed and validated prompt cases, or null if stdin is a TTY - * @throws Error if any line is invalid JSON or fails schema validation - * - * @public - */ -export const readStdinPrompts = async (): Promise => { - if (process.stdin.isTTY) { - return null - } - - const chunks: Buffer[] = [] - for await (const chunk of process.stdin) { - chunks.push(chunk) - } - - const content = Buffer.concat(chunks).toString('utf-8').trim() - if (!content) return null - - return content - .split('\n') - .filter(Boolean) - .map((line, index) => { - try { - return PromptCaseSchema.parse(JSON.parse(line)) - } catch (error) { - throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`) - } - }) -} - -/** - * Load capture results from a JSONL file. - * - * @remarks - * Each line should be a valid JSON object matching CaptureResultSchema. - * Used by summarize, calibrate, and compare commands. 
- * - * @param path - Path to the results.jsonl file - * @returns Parsed and validated capture results - * @throws Error if file cannot be read or any line is invalid - * - * @public - */ -export const loadResults = async (path: string): Promise => { - const content = await Bun.file(path).text() - return content - .trim() - .split('\n') - .filter(Boolean) - .map((line, index) => { - try { - return CaptureResultSchema.parse(JSON.parse(line)) - } catch (error) { - throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`) - } - }) -} - -/** - * Load raw JSONL file as parsed JSON objects. - * - * @remarks - * Lower-level loading without schema validation. - * Useful for pipeline commands that need flexible input handling. - * - * @param path - Path to JSONL file - * @returns Array of parsed JSON objects - * @throws Error if file cannot be read or any line is invalid JSON - * - * @public - */ -export const loadJsonl = async (path: string): Promise => { - const content = await Bun.file(path).text() - return content - .trim() - .split('\n') - .filter(Boolean) - .map((line, index) => { - try { - return JSON.parse(line) as T - } catch (error) { - throw new Error(`Invalid JSON at line ${index + 1}: ${error instanceof Error ? error.message : error}`) - } - }) -} - -// ============================================================================ -// Streaming Loading -// ============================================================================ - -// Re-export native streaming functions for backward compatibility -export { - countLinesStreaming, - streamJsonl, - streamPrompts, - streamResultsNative, - streamTrialResults, -} from './streaming.ts' - -/** - * Stream capture results from a JSONL file. - * - * @remarks - * Memory-efficient alternative to loadResults for large files. - * Uses native streaming via Bun.file().stream() for O(1) memory usage. 
- * - * @param path - Path to the results.jsonl file - * @yields Parsed and validated capture results - * @throws Error if file cannot be read or any line is invalid - * - * @public - */ -export async function* streamResults(path: string): AsyncGenerator { - const { streamResultsNative } = await import('./streaming.ts') - yield* streamResultsNative(path) -} - -/** - * Build an indexed map of results by ID using streaming. - * - * @remarks - * Memory-efficient for the compare command. Loads results into a Map - * keyed by ID for O(1) lookups without holding raw file content. - * - * For very large files (10k+ results), this is more memory-efficient than - * loading everything into an array and then building an index. - * - * @param path - Path to the results.jsonl file - * @returns Map of result ID to CaptureResult - * - * @public - */ -export const buildResultsIndex = async (path: string): Promise> => { - const index = new Map() - - for await (const result of streamResults(path)) { - index.set(result.id, result) - } - - return index -} - -/** - * Count lines in a JSONL file without loading content. - * - * @remarks - * Useful for detecting large files that should use streaming mode. - * Uses native streaming for O(1) memory usage. - * - * @param path - Path to the JSONL file - * @returns Number of non-empty lines - * - * @public - */ -export const countLines = async (path: string): Promise => { - const { countLinesStreaming } = await import('./streaming.ts') - return countLinesStreaming(path) -} diff --git a/src/core/output.ts b/src/core/output.ts deleted file mode 100644 index c3fdd9f..0000000 --- a/src/core/output.ts +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Shared output utilities for writing results and logging. 
- * - * @remarks - * Provides consistent output handling across all commands: - * - Writing to stdout or files - * - Progress logging to stderr - * - Path resolution - * - Content preview (head/tail) - * - * @packageDocumentation - */ - -import { appendFile } from 'node:fs/promises' -import { HEAD_LINES, TAIL_LINES } from '../schemas/constants.ts' - -/** - * Write output line to stdout or file. - * - * @remarks - * When writing to a file, supports both overwrite and append modes. - * When writing to stdout, uses console.log. - * - * @param line - Content to write (without trailing newline) - * @param outputPath - Optional file path (stdout if undefined) - * @param append - If true, append to file instead of overwrite - * - * @public - */ -export const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise => { - if (outputPath) { - if (append) { - await appendFile(outputPath, `${line}\n`) - } else { - await Bun.write(outputPath, `${line}\n`) - } - } else { - console.log(line) - } -} - -/** - * Log progress message to stderr. - * - * @remarks - * Progress output goes to stderr to avoid polluting stdout - * when piping command output. - * - * @param message - Progress message to display - * @param showProgress - If false, message is suppressed - * - * @public - */ -export const logProgress = (message: string, showProgress: boolean): void => { - if (showProgress) { - console.error(message) - } -} - -/** - * Resolve path relative to process.cwd(). - * - * @remarks - * Absolute paths (starting with /) are returned as-is. - * Relative paths are joined with current working directory. - * - * @param path - Path to resolve - * @returns Absolute path - * - * @public - */ -export const resolvePath = (path: string): string => { - if (path.startsWith('/')) return path - return `${process.cwd()}/${path}` -} - -/** - * Create head/tail preview of content. - * - * @remarks - * Shows first N and last M lines with omission indicator in between. 
- * Useful for large files/content in markdown output. - * - * @param content - Full content string - * @param headLines - Number of lines from start (default from constants) - * @param tailLines - Number of lines from end (default from constants) - * @returns Truncated content with omission indicator - * - * @public - */ -export const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => { - const lines = content.split('\n') - if (lines.length <= headLines + tailLines) { - return content - } - const head = lines.slice(0, headLines).join('\n') - const tail = lines.slice(-tailLines).join('\n') - const omitted = lines.length - headLines - tailLines - return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}` -} - -/** - * Get preview text for input (handles string or array). - * - * @remarks - * For arrays (multi-turn), shows turn count and preview of first turn. - * For strings, shows first 50 characters. - * - * @param input - String or array input - * @returns Preview text suitable for progress display - * - * @public - */ -export const getInputPreview = (input: string | string[]): string => { - if (Array.isArray(input)) { - const first = input[0] ?? '' - return `[${input.length} turns] ${first.slice(0, 40)}...` - } - return input.slice(0, 50) -} diff --git a/src/core/streaming.ts b/src/core/streaming.ts deleted file mode 100644 index c1ec47a..0000000 --- a/src/core/streaming.ts +++ /dev/null @@ -1,172 +0,0 @@ -/** - * Native streaming utilities for JSONL files. - * - * @remarks - * Provides true memory-efficient streaming using Bun.file().stream(). - * Unlike the batch-then-yield approach in loading.ts, these functions - * process data chunk-by-chunk, maintaining O(1) memory usage regardless - * of file size. 
- * - * @packageDocumentation - */ - -import type { ZodSchema } from 'zod' -import type { CaptureResult, PromptCase, TrialResult } from '../schemas.ts' -import { CaptureResultSchema, PromptCaseSchema, TrialResultSchema } from '../schemas.ts' - -/** - * Stream JSONL file entries with optional schema validation. - * - * @remarks - * Uses Bun's native ReadableStream for true streaming - only holds one - * chunk in memory at a time. For files with 10k+ results, this provides - * constant memory usage vs O(file size) for batch loading. - * - * @typeParam T - The expected type of each JSON line - * @param path - Path to the JSONL file - * @param schema - Optional Zod schema for validation - * @yields Parsed (and optionally validated) JSON objects - * @throws Error with line number if JSON parsing or validation fails - * - * @public - */ -export async function* streamJsonl(path: string, schema?: ZodSchema): AsyncGenerator { - const file = Bun.file(path) - const stream = file.stream() - const decoder = new TextDecoder() - - let buffer = '' - let lineNum = 0 - - /** - * Process a single line of JSON. - */ - const processLine = (line: string): T => { - const parsed = JSON.parse(line) - return schema ? schema.parse(parsed) : (parsed as T) - } - - for await (const chunk of stream) { - buffer += decoder.decode(chunk, { stream: true }) - - let newlineIndex = buffer.indexOf('\n') - while (newlineIndex !== -1) { - const line = buffer.slice(0, newlineIndex).trim() - buffer = buffer.slice(newlineIndex + 1) - lineNum++ - - if (line) { - try { - yield processLine(line) - } catch (error) { - throw new Error(`Invalid JSON at line ${lineNum}: ${error instanceof Error ? 
error.message : error}`) - } - } - - newlineIndex = buffer.indexOf('\n') - } - } - - // Flush remaining buffer content (handles files without trailing newline) - buffer += decoder.decode() - - const finalLine = buffer.trim() - if (finalLine) { - lineNum++ - try { - yield processLine(finalLine) - } catch (error) { - throw new Error(`Invalid JSON at line ${lineNum}: ${error instanceof Error ? error.message : error}`) - } - } -} - -/** - * Stream prompt cases from a JSONL file. - * - * @remarks - * Memory-efficient streaming with PromptCaseSchema validation. - * Use this for large prompt files when you don't need random access. - * - * @param path - Path to the prompts.jsonl file - * @yields Validated PromptCase objects - * @throws Error with line number if validation fails - * - * @public - */ -export async function* streamPrompts(path: string): AsyncGenerator { - yield* streamJsonl(path, PromptCaseSchema) -} - -/** - * Stream capture results from a JSONL file using native streaming. - * - * @remarks - * True streaming alternative to the batch-then-yield streamResults in loading.ts. - * Maintains O(1) memory usage regardless of file size. - * - * @param path - Path to the results.jsonl file - * @yields Validated CaptureResult objects - * @throws Error with line number if validation fails - * - * @public - */ -export async function* streamResultsNative(path: string): AsyncGenerator { - yield* streamJsonl(path, CaptureResultSchema) -} - -/** - * Stream trial results from a JSONL file. - * - * @remarks - * Memory-efficient streaming with TrialResultSchema validation. - * Use for large trial result files from the trials command. 
- * - * @param path - Path to the trial results JSONL file - * @yields Validated TrialResult objects - * @throws Error with line number if validation fails - * - * @public - */ -export async function* streamTrialResults(path: string): AsyncGenerator { - yield* streamJsonl(path, TrialResultSchema) -} - -/** - * Count lines in a JSONL file using streaming. - * - * @remarks - * Counts non-empty lines without loading the entire file into memory. - * Uses byte-level newline scanning for efficiency. - * - * @param path - Path to the JSONL file - * @returns Number of non-empty lines - * - * @public - */ -export const countLinesStreaming = async (path: string): Promise => { - const file = Bun.file(path) - const stream = file.stream() - const decoder = new TextDecoder() - - let count = 0 - let buffer = '' - - for await (const chunk of stream) { - buffer += decoder.decode(chunk, { stream: true }) - - let newlineIndex = buffer.indexOf('\n') - while (newlineIndex !== -1) { - const line = buffer.slice(0, newlineIndex).trim() - buffer = buffer.slice(newlineIndex + 1) - if (line) count++ - newlineIndex = buffer.indexOf('\n') - } - } - - // Flush and check final line - buffer += decoder.decode() - if (buffer.trim()) count++ - - return count -} diff --git a/src/core/tests/core.spec.ts b/src/core/tests/core.spec.ts deleted file mode 100644 index 1d97f61..0000000 --- a/src/core/tests/core.spec.ts +++ /dev/null @@ -1,310 +0,0 @@ -/** - * Unit tests for core utilities. 
- * - * @remarks - * Tests for shared utility functions in the core module: - * - loading: loadPrompts, loadResults, loadJsonl - * - trajectory: extractTrajectory, extractOutput, hasToolErrors - * - output: writeOutput, logProgress, headTailPreview - * - * @packageDocumentation - */ - -import { afterEach, describe, expect, test } from 'bun:test' -import { unlink, writeFile } from 'node:fs/promises' -import type { ParsedUpdate } from '../../headless/headless-output-parser.ts' -import { loadJsonl, loadPrompts, loadResults } from '../loading.ts' -import { headTailPreview, resolvePath } from '../output.ts' -import { detectTrajectoryRichness, extractOutput, extractTrajectory, hasToolErrors } from '../trajectory.ts' - -// ============================================================================ -// Loading Tests -// ============================================================================ - -describe('loadJsonl', () => { - const testFile = '/tmp/core-test-jsonl.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore if file doesn't exist - } - }) - - test('loads and parses JSONL file', async () => { - await writeFile(testFile, '{"a":1}\n{"a":2}\n{"a":3}') - const results = await loadJsonl<{ a: number }>(testFile) - expect(results.length).toBe(3) - expect(results[0]?.a).toBe(1) - expect(results[2]?.a).toBe(3) - }) - - test('skips empty lines', async () => { - await writeFile(testFile, '{"a":1}\n\n{"a":2}\n') - const results = await loadJsonl<{ a: number }>(testFile) - expect(results.length).toBe(2) - }) - - test('handles empty file', async () => { - await writeFile(testFile, '') - const results = await loadJsonl(testFile) - expect(results.length).toBe(0) - }) -}) - -describe('loadPrompts', () => { - const testFile = '/tmp/core-test-prompts.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore - } - }) - - test('loads valid prompts', async () => { - await writeFile(testFile, 
'{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}') - const prompts = await loadPrompts(testFile) - expect(prompts.length).toBe(2) - expect(prompts[0]?.id).toBe('p1') - expect(prompts[0]?.input).toBe('hello') - }) - - test('loads multi-turn prompts', async () => { - await writeFile(testFile, '{"id":"m1","input":["turn1","turn2"]}') - const prompts = await loadPrompts(testFile) - expect(prompts.length).toBe(1) - expect(Array.isArray(prompts[0]?.input)).toBe(true) - expect((prompts[0]?.input as string[]).length).toBe(2) - }) -}) - -describe('loadResults', () => { - const testFile = '/tmp/core-test-results.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore - } - }) - - test('loads capture results with full schema', async () => { - const result = { - id: 'r1', - input: 'test', - output: 'result', - trajectory: [], - metadata: {}, - toolErrors: false, - timing: { - start: 0, - end: 100, - total: 100, - sessionCreation: 10, - }, - } - await writeFile(testFile, JSON.stringify(result)) - const results = await loadResults(testFile) - expect(results.length).toBe(1) - expect(results[0]?.id).toBe('r1') - expect(results[0]?.output).toBe('result') - }) -}) - -// ============================================================================ -// Trajectory Tests -// ============================================================================ - -describe('extractTrajectory', () => { - const startTime = 1000 - - test('extracts message updates', () => { - const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', timestamp: 1100, raw: {} }] - const trajectory = extractTrajectory(updates, startTime) - expect(trajectory.length).toBe(1) - expect(trajectory[0]?.type).toBe('message') - expect(trajectory[0]?.type === 'message' && trajectory[0]?.content).toBe('Hello') - }) - - test('extracts thought updates', () => { - const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', timestamp: 1200, raw: {} }] - const 
trajectory = extractTrajectory(updates, startTime) - expect(trajectory.length).toBe(1) - expect(trajectory[0]?.type).toBe('thought') - }) - - test('extracts tool_call with title', () => { - const updates: ParsedUpdate[] = [ - { - type: 'tool_call', - title: 'Read', - status: 'completed', - timestamp: 1300, - raw: {}, - }, - ] - const trajectory = extractTrajectory(updates, startTime) - expect(trajectory.length).toBe(1) - expect(trajectory[0]?.type).toBe('tool_call') - const step = trajectory[0] - if (step?.type === 'tool_call') { - expect(step.name).toBe('Read') - } - }) - - test('handles empty updates', () => { - const trajectory = extractTrajectory([], startTime) - expect(trajectory.length).toBe(0) - }) -}) - -describe('extractOutput', () => { - test('concatenates all message content', () => { - const trajectory = [ - { type: 'thought' as const, content: 'Thinking', timestamp: 50 }, - { type: 'message' as const, content: 'First message', timestamp: 100 }, - { type: 'message' as const, content: 'Final answer', timestamp: 150 }, - ] - const output = extractOutput(trajectory) - // extractOutput joins all messages with newline - expect(output).toBe('First message\nFinal answer') - }) - - test('returns empty string when no messages', () => { - const trajectory = [{ type: 'thought' as const, content: 'Thinking only', timestamp: 50 }] - const output = extractOutput(trajectory) - expect(output).toBe('') - }) - - test('handles empty trajectory', () => { - const output = extractOutput([]) - expect(output).toBe('') - }) -}) - -describe('hasToolErrors', () => { - test('returns false for successful tool calls', () => { - const trajectory = [ - { - type: 'tool_call' as const, - name: 'Read', - status: 'completed', - timestamp: 100, - }, - ] - expect(hasToolErrors(trajectory)).toBe(false) - }) - - test('returns true for failed status', () => { - const trajectory = [ - { - type: 'tool_call' as const, - name: 'Read', - status: 'failed', - timestamp: 100, - }, - ] - // 
hasToolErrors checks for status === 'failed' - expect(hasToolErrors(trajectory)).toBe(true) - }) - - test('returns false for error status (not failed)', () => { - // The implementation checks for 'failed', not 'error' - const trajectory = [ - { - type: 'tool_call' as const, - name: 'Read', - status: 'error', - timestamp: 100, - }, - ] - expect(hasToolErrors(trajectory)).toBe(false) - }) - - test('returns false for empty trajectory', () => { - expect(hasToolErrors([])).toBe(false) - }) -}) - -describe('detectTrajectoryRichness', () => { - test('returns full when has thoughts', () => { - const trajectory = [ - { type: 'thought' as const, content: 'Let me think', timestamp: 50 }, - { type: 'message' as const, content: 'Done', timestamp: 150 }, - ] - expect(detectTrajectoryRichness(trajectory)).toBe('full') - }) - - test('returns full when has tool_calls', () => { - const trajectory = [ - { - type: 'tool_call' as const, - name: 'Read', - status: 'completed', - timestamp: 100, - }, - { type: 'message' as const, content: 'Done', timestamp: 150 }, - ] - // Any tool_call means 'full' - expect(detectTrajectoryRichness(trajectory)).toBe('full') - }) - - test('returns messages-only when only messages', () => { - const trajectory = [{ type: 'message' as const, content: 'Just a message', timestamp: 100 }] - expect(detectTrajectoryRichness(trajectory)).toBe('messages-only') - }) - - test('returns minimal for empty trajectory', () => { - // Empty trajectory returns 'minimal', not 'messages-only' - expect(detectTrajectoryRichness([])).toBe('minimal') - }) -}) - -// ============================================================================ -// Output Tests -// ============================================================================ - -describe('headTailPreview', () => { - test('returns full content when short', () => { - const content = 'line1\nline2\nline3' - const preview = headTailPreview(content, 5, 3) - expect(preview).toBe(content) - }) - - test('truncates long content 
with omission indicator', () => { - const lines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`).join('\n') - const preview = headTailPreview(lines, 3, 2) - - expect(preview).toContain('line1') - expect(preview).toContain('line2') - expect(preview).toContain('line3') - // Actual format uses "// ... N lines omitted ..." - expect(preview).toContain('// ... 15 lines omitted ...') - expect(preview).toContain('line19') - expect(preview).toContain('line20') - }) - - test('handles exact boundary', () => { - const lines = 'line1\nline2\nline3\nline4\nline5' - const preview = headTailPreview(lines, 3, 2) - // 5 lines is exactly head(3) + tail(2), no truncation needed - expect(preview).toBe(lines) - }) -}) - -describe('resolvePath', () => { - test('resolves relative path from cwd', () => { - const resolved = resolvePath('./test.txt') - expect(resolved.endsWith('test.txt')).toBe(true) - expect(resolved.startsWith('/')).toBe(true) - }) - - test('returns absolute path unchanged', () => { - const path = '/absolute/path/file.txt' - expect(resolvePath(path)).toBe(path) - }) -}) diff --git a/src/core/tests/streaming.spec.ts b/src/core/tests/streaming.spec.ts deleted file mode 100644 index 633254c..0000000 --- a/src/core/tests/streaming.spec.ts +++ /dev/null @@ -1,399 +0,0 @@ -/** - * Unit tests for native streaming utilities. 
- * - * @remarks - * Tests for memory-efficient streaming functions in streaming.ts: - * - streamJsonl: Generic JSONL streaming with optional schema validation - * - streamPrompts: PromptCase streaming - * - streamResultsNative: CaptureResult streaming - * - streamTrialResults: TrialResult streaming - * - countLinesStreaming: Line counting without full file load - * - * @packageDocumentation - */ - -import { afterEach, describe, expect, test } from 'bun:test' -import { unlink } from 'node:fs/promises' -import { z } from 'zod' -import { - countLinesStreaming, - streamJsonl, - streamPrompts, - streamResultsNative, - streamTrialResults, -} from '../streaming.ts' - -// ============================================================================ -// streamJsonl Tests -// ============================================================================ - -describe('streamJsonl', () => { - const testFile = '/tmp/streaming-test-jsonl.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore if file doesn't exist - } - }) - - test('streams items one at a time', async () => { - await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}') - - const items: Array<{ a: number }> = [] - for await (const item of streamJsonl<{ a: number }>(testFile)) { - items.push(item) - } - - expect(items.length).toBe(3) - expect(items[0]?.a).toBe(1) - expect(items[1]?.a).toBe(2) - expect(items[2]?.a).toBe(3) - }) - - test('handles files without trailing newline', async () => { - await Bun.write(testFile, '{"a":1}\n{"a":2}') - - const items: Array<{ a: number }> = [] - for await (const item of streamJsonl<{ a: number }>(testFile)) { - items.push(item) - } - - expect(items.length).toBe(2) - expect(items[1]?.a).toBe(2) - }) - - test('validates with schema when provided', async () => { - const schema = z.object({ id: z.string(), value: z.number() }) - await Bun.write(testFile, '{"id":"a","value":1}\n{"id":"b","value":2}') - - const items: Array<{ id: string; value: number }> 
= [] - for await (const item of streamJsonl(testFile, schema)) { - items.push(item) - } - - expect(items.length).toBe(2) - expect(items[0]?.id).toBe('a') - expect(items[0]?.value).toBe(1) - }) - - test('throws with line number on invalid JSON', async () => { - await Bun.write(testFile, '{"a":1}\ninvalid json\n{"a":3}') - - const items: unknown[] = [] - let error: Error | undefined - - try { - for await (const item of streamJsonl(testFile)) { - items.push(item) - } - } catch (e) { - error = e as Error - } - - expect(error).toBeDefined() - expect(error?.message).toContain('line 2') - }) - - test('throws with line number on schema validation failure', async () => { - const schema = z.object({ id: z.string(), required: z.number() }) - await Bun.write(testFile, '{"id":"a","required":1}\n{"id":"b"}') - - const items: unknown[] = [] - let error: Error | undefined - - try { - for await (const item of streamJsonl(testFile, schema)) { - items.push(item) - } - } catch (e) { - error = e as Error - } - - expect(error).toBeDefined() - expect(error?.message).toContain('line 2') - }) - - test('handles empty files', async () => { - await Bun.write(testFile, '') - - const items: unknown[] = [] - for await (const item of streamJsonl(testFile)) { - items.push(item) - } - - expect(items.length).toBe(0) - }) - - test('handles single-line files', async () => { - await Bun.write(testFile, '{"single":true}') - - const items: Array<{ single: boolean }> = [] - for await (const item of streamJsonl<{ single: boolean }>(testFile)) { - items.push(item) - } - - expect(items.length).toBe(1) - expect(items[0]?.single).toBe(true) - }) - - test('skips empty lines', async () => { - await Bun.write(testFile, '{"a":1}\n\n\n{"a":2}\n') - - const items: Array<{ a: number }> = [] - for await (const item of streamJsonl<{ a: number }>(testFile)) { - items.push(item) - } - - expect(items.length).toBe(2) - }) - - test('handles whitespace-only lines', async () => { - await Bun.write(testFile, '{"a":1}\n 
\n{"a":2}') - - const items: Array<{ a: number }> = [] - for await (const item of streamJsonl<{ a: number }>(testFile)) { - items.push(item) - } - - expect(items.length).toBe(2) - }) -}) - -// ============================================================================ -// streamPrompts Tests -// ============================================================================ - -describe('streamPrompts', () => { - const testFile = '/tmp/streaming-test-prompts.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore - } - }) - - test('yields validated PromptCase objects', async () => { - await Bun.write(testFile, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}') - - const prompts = [] - for await (const prompt of streamPrompts(testFile)) { - prompts.push(prompt) - } - - expect(prompts.length).toBe(2) - expect(prompts[0]?.id).toBe('p1') - expect(prompts[0]?.input).toBe('hello') - }) - - test('handles multi-turn prompts', async () => { - await Bun.write(testFile, '{"id":"m1","input":["turn1","turn2"]}') - - const prompts = [] - for await (const prompt of streamPrompts(testFile)) { - prompts.push(prompt) - } - - expect(prompts.length).toBe(1) - expect(Array.isArray(prompts[0]?.input)).toBe(true) - }) - - test('throws on schema validation failure', async () => { - // Missing required 'id' field - await Bun.write(testFile, '{"input":"hello"}') - - let error: Error | undefined - try { - for await (const _ of streamPrompts(testFile)) { - // Consume - } - } catch (e) { - error = e as Error - } - - expect(error).toBeDefined() - expect(error?.message).toContain('line 1') - }) -}) - -// ============================================================================ -// streamResultsNative Tests -// ============================================================================ - -describe('streamResultsNative', () => { - const testFile = '/tmp/streaming-test-results.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch 
{ - // Ignore - } - }) - - test('yields validated CaptureResult objects', async () => { - const result = { - id: 'r1', - input: 'test', - output: 'result', - trajectory: [], - metadata: {}, - toolErrors: false, - timing: { - start: 0, - end: 100, - total: 100, - sessionCreation: 10, - }, - } - await Bun.write(testFile, JSON.stringify(result)) - - const results = [] - for await (const r of streamResultsNative(testFile)) { - results.push(r) - } - - expect(results.length).toBe(1) - expect(results[0]?.id).toBe('r1') - expect(results[0]?.output).toBe('result') - }) - - test('streams multiple results', async () => { - const makeResult = (id: string) => ({ - id, - input: 'test', - output: 'result', - trajectory: [], - metadata: {}, - toolErrors: false, - timing: { start: 0, end: 100, total: 100, sessionCreation: 10 }, - }) - - await Bun.write( - testFile, - `${JSON.stringify(makeResult('r1'))}\n${JSON.stringify(makeResult('r2'))}\n${JSON.stringify(makeResult('r3'))}`, - ) - - const results = [] - for await (const r of streamResultsNative(testFile)) { - results.push(r) - } - - expect(results.length).toBe(3) - expect(results.map((r) => r.id)).toEqual(['r1', 'r2', 'r3']) - }) -}) - -// ============================================================================ -// streamTrialResults Tests -// ============================================================================ - -describe('streamTrialResults', () => { - const testFile = '/tmp/streaming-test-trials.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore - } - }) - - test('yields validated TrialResult objects', async () => { - const trialResult = { - id: 't1', - input: 'test prompt', - k: 3, - passRate: 0.67, - passAtK: 1, - passExpK: 0.7, - trials: [ - { trialNum: 1, output: 'output1', trajectory: [], duration: 100, pass: true }, - { trialNum: 2, output: 'output2', trajectory: [], duration: 150, pass: true }, - { trialNum: 3, output: 'output3', trajectory: [], duration: 120, pass: 
false }, - ], - } - await Bun.write(testFile, JSON.stringify(trialResult)) - - const results = [] - for await (const r of streamTrialResults(testFile)) { - results.push(r) - } - - expect(results.length).toBe(1) - expect(results[0]?.id).toBe('t1') - expect(results[0]?.k).toBe(3) - expect(results[0]?.passRate).toBe(0.67) - }) - - test('throws on invalid trial result', async () => { - // Missing required 'k' field - await Bun.write(testFile, '{"id":"t1","input":"test","trials":[]}') - - let error: Error | undefined - try { - for await (const _ of streamTrialResults(testFile)) { - // Consume - } - } catch (e) { - error = e as Error - } - - expect(error).toBeDefined() - expect(error?.message).toContain('line 1') - }) -}) - -// ============================================================================ -// countLinesStreaming Tests -// ============================================================================ - -describe('countLinesStreaming', () => { - const testFile = '/tmp/streaming-test-count.jsonl' - - afterEach(async () => { - try { - await unlink(testFile) - } catch { - // Ignore - } - }) - - test('counts lines without loading full file', async () => { - await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}') - - const count = await countLinesStreaming(testFile) - expect(count).toBe(3) - }) - - test('handles empty file', async () => { - await Bun.write(testFile, '') - - const count = await countLinesStreaming(testFile) - expect(count).toBe(0) - }) - - test('handles file without trailing newline', async () => { - await Bun.write(testFile, '{"a":1}\n{"a":2}') - - const count = await countLinesStreaming(testFile) - expect(count).toBe(2) - }) - - test('skips empty lines', async () => { - await Bun.write(testFile, '{"a":1}\n\n{"a":2}\n\n') - - const count = await countLinesStreaming(testFile) - expect(count).toBe(2) - }) - - test('handles single-line file', async () => { - await Bun.write(testFile, '{"single":true}') - - const count = await 
countLinesStreaming(testFile) - expect(count).toBe(1) - }) -}) diff --git a/src/core/tests/worker-pool.spec.ts b/src/core/tests/worker-pool.spec.ts deleted file mode 100644 index d17ae9c..0000000 --- a/src/core/tests/worker-pool.spec.ts +++ /dev/null @@ -1,377 +0,0 @@ -/** - * Unit tests for worker pool utilities. - * - * @remarks - * Tests for parallel execution utilities: - * - runWorkerPool: Promise-based worker pool with concurrency limit - * - createWriteMutex: Coordinates concurrent file writes - * - createWorkspaceDir: Creates per-prompt workspace directories - * - * @packageDocumentation - */ - -import { afterEach, describe, expect, test } from 'bun:test' -import { rm, stat } from 'node:fs/promises' -import { createWorkspaceDir, createWriteMutex, runWorkerPool } from '../worker-pool.ts' - -// Helper to check if a directory exists -const dirExists = async (path: string): Promise => { - try { - const s = await stat(path) - return s.isDirectory() - } catch { - return false - } -} - -// ============================================================================ -// runWorkerPool Tests -// ============================================================================ - -describe('runWorkerPool', () => { - test('processes items sequentially with concurrency 1', async () => { - const order: number[] = [] - const items = [1, 2, 3, 4, 5] - - const { results } = await runWorkerPool( - items, - async (item) => { - order.push(item) - return item * 2 - }, - { concurrency: 1 }, - ) - - // With concurrency 1, order should be preserved - expect(order).toEqual([1, 2, 3, 4, 5]) - expect(results).toEqual([2, 4, 6, 8, 10]) - }) - - test('processes items in parallel with concurrency > 1', async () => { - const items = [1, 2, 3, 4] - const startTimes: number[] = [] - - const { results } = await runWorkerPool( - items, - async (item, index) => { - startTimes[index] = Date.now() - await Bun.sleep(50) // Simulate work - return item * 2 - }, - { concurrency: 4 }, - ) - - // All 
results should be correct - expect(results.sort((a, b) => a - b)).toEqual([2, 4, 6, 8]) - - // With concurrency 4, all items should start nearly simultaneously - const maxDiff = Math.max(...startTimes) - Math.min(...startTimes) - expect(maxDiff).toBeLessThan(30) // Should all start within 30ms - }) - - test('limits concurrency correctly', async () => { - let activeCount = 0 - let maxActive = 0 - const items = [1, 2, 3, 4, 5, 6] - - await runWorkerPool( - items, - async (item) => { - activeCount++ - maxActive = Math.max(maxActive, activeCount) - await Bun.sleep(20) // Simulate work - activeCount-- - return item - }, - { concurrency: 2 }, - ) - - // Should never exceed concurrency limit - expect(maxActive).toBeLessThanOrEqual(2) - }) - - test('collects errors without stopping other workers', async () => { - const items = [1, 2, 3, 4, 5] - - const { results, errors } = await runWorkerPool( - items, - async (item) => { - if (item === 3) { - throw new Error('Item 3 failed') - } - return item * 2 - }, - { concurrency: 2 }, - ) - - // Should have 4 results and 1 error - expect(results.length).toBe(4) - expect(errors.length).toBe(1) - expect(errors[0]?.index).toBe(2) // Index of item 3 - expect(errors[0]?.error.message).toBe('Item 3 failed') - }) - - test('calls onProgress callback', async () => { - const progressCalls: Array<{ completed: number; total: number }> = [] - const items = [1, 2, 3] - - await runWorkerPool(items, async (item) => item * 2, { - concurrency: 1, - onProgress: (completed, total) => { - progressCalls.push({ completed, total }) - }, - }) - - expect(progressCalls.length).toBe(3) - expect(progressCalls[0]).toEqual({ completed: 1, total: 3 }) - expect(progressCalls[1]).toEqual({ completed: 2, total: 3 }) - expect(progressCalls[2]).toEqual({ completed: 3, total: 3 }) - }) - - test('handles empty items array', async () => { - const { results, errors } = await runWorkerPool([] as number[], async (item) => item * 2, { concurrency: 4 }) - - 
expect(results).toEqual([]) - expect(errors).toEqual([]) - }) - - test('skips undefined items in array', async () => { - // Create a sparse array with holes - const items: (number | undefined)[] = [1, undefined, 3, undefined, 5] - - const { results } = await runWorkerPool( - items, - async (item) => { - if (item === undefined) throw new Error('Should not process undefined') - return item * 2 - }, - { concurrency: 2 }, - ) - - // Should only process defined items - expect(results.sort((a, b) => a - b)).toEqual([2, 6, 10]) - }) - - test('handles concurrency greater than items count', async () => { - const items = [1, 2] - const { results } = await runWorkerPool(items, async (item) => item * 2, { concurrency: 10 }) - - expect(results.sort((a, b) => a - b)).toEqual([2, 4]) - }) -}) - -// ============================================================================ -// createWriteMutex Tests -// ============================================================================ - -describe('createWriteMutex', () => { - test('serializes concurrent writes', async () => { - const mutex = createWriteMutex() - const order: number[] = [] - - // Start multiple writes concurrently - const promises = [1, 2, 3, 4, 5].map((n) => - mutex.write(async () => { - await Bun.sleep(10) // Simulate write delay - order.push(n) - }), - ) - - await Promise.all(promises) - - // All writes should complete in order - expect(order).toEqual([1, 2, 3, 4, 5]) - }) - - test('continues after failed write', async () => { - const mutex = createWriteMutex() - const order: number[] = [] - - const promise1 = mutex.write(async () => { - order.push(1) - }) - - const promise2 = mutex.write(async () => { - order.push(2) - throw new Error('Write 2 failed') - }) - - const promise3 = mutex.write(async () => { - order.push(3) - }) - - await promise1 - await promise2.catch(() => {}) // Ignore error - await promise3 - - // All writes should execute in order, even after failure - expect(order).toEqual([1, 2, 3]) - }) - - 
test('returns promise that resolves when write completes', async () => { - const mutex = createWriteMutex() - let writeCompleted = false - - const promise = mutex.write(async () => { - await Bun.sleep(10) - writeCompleted = true - }) - - expect(writeCompleted).toBe(false) - await promise - expect(writeCompleted).toBe(true) - }) -}) - -// ============================================================================ -// createWorkspaceDir Tests -// ============================================================================ - -describe('createWorkspaceDir', () => { - const testBaseDir = '/tmp/worker-pool-test-workspaces' - - afterEach(async () => { - try { - await rm(testBaseDir, { recursive: true, force: true }) - } catch { - // Ignore if doesn't exist - } - }) - - test('creates workspace directory', async () => { - const workspaceDir = await createWorkspaceDir(testBaseDir, 'test-prompt-1') - - expect(workspaceDir).toBe(`${testBaseDir}/prompt-test-prompt-1`) - expect(await dirExists(workspaceDir)).toBe(true) - }) - - test('sanitizes invalid filesystem characters', async () => { - const workspaceDir = await createWorkspaceDir(testBaseDir, 'test<>:"/\\|?*prompt') - - // Invalid characters should be replaced with underscore - expect(workspaceDir).toBe(`${testBaseDir}/prompt-test_________prompt`) - expect(await dirExists(workspaceDir)).toBe(true) - }) - - test('handles existing directory', async () => { - // Create first - const dir1 = await createWorkspaceDir(testBaseDir, 'existing') - // Create same again - const dir2 = await createWorkspaceDir(testBaseDir, 'existing') - - expect(dir1).toBe(dir2) - expect(await dirExists(dir1)).toBe(true) - }) - - test('creates nested base directory', async () => { - const nestedBase = `${testBaseDir}/deep/nested/path` - const workspaceDir = await createWorkspaceDir(nestedBase, 'prompt-1') - - expect(workspaceDir).toBe(`${nestedBase}/prompt-prompt-1`) - expect(await dirExists(workspaceDir)).toBe(true) - }) -}) - -// 
============================================================================ -// Integration Tests -// ============================================================================ - -describe('worker pool with write mutex integration', () => { - test('coordinates writes from concurrent workers', async () => { - const mutex = createWriteMutex() - const writeOrder: string[] = [] - const items = ['a', 'b', 'c', 'd', 'e'] - - await runWorkerPool( - items, - async (item) => { - // Simulate variable processing time - await Bun.sleep(Math.random() * 20) - - // Write with mutex coordination - await mutex.write(async () => { - writeOrder.push(item) - }) - - return item - }, - { concurrency: 3 }, - ) - - // All items should be written exactly once - expect(writeOrder.sort()).toEqual(['a', 'b', 'c', 'd', 'e']) - // Order depends on which worker finishes first, but all should be present - expect(writeOrder.length).toBe(5) - }) - - test('produces valid JSONL with concurrent writes to file', async () => { - const mutex = createWriteMutex() - const items = Array.from({ length: 10 }, (_, i) => ({ id: `test-${i}`, value: i })) - - // Collect lines in memory, then verify structure - const lines: string[] = [] - - await runWorkerPool( - items, - async (item) => { - // Simulate variable processing time - await Bun.sleep(Math.random() * 30) - - // Write JSONL line with mutex coordination (same pattern as capture.ts) - await mutex.write(async () => { - const line = JSON.stringify(item) - lines.push(line) - }) - - return item - }, - { concurrency: 4 }, - ) - - // Should have all 10 items - expect(lines.length).toBe(10) - - // Each line should be valid JSON - const parsed = lines.map((line) => JSON.parse(line)) - const ids = parsed.map((p) => p.id).sort() - - // All items present (order may vary) - expect(ids).toEqual(items.map((i) => i.id).sort()) - }) - - test('creates workspace directories concurrently without collision', async () => { - const testBase = 
'/tmp/worker-pool-workspace-test' - const items = ['prompt-1', 'prompt-2', 'prompt-3', 'prompt-4', 'prompt-5'] - - // Clean up first - try { - await rm(testBase, { recursive: true, force: true }) - } catch { - // Ignore - } - - const createdDirs: string[] = [] - - await runWorkerPool( - items, - async (promptId) => { - const dir = await createWorkspaceDir(testBase, promptId) - createdDirs.push(dir) - return dir - }, - { concurrency: 5 }, - ) - - // All directories created - expect(createdDirs.length).toBe(5) - - // Verify each directory exists - for (const dir of createdDirs) { - const exists = await dirExists(dir) - expect(exists).toBe(true) - } - - // Cleanup - await rm(testBase, { recursive: true, force: true }) - }) -}) diff --git a/src/core/trajectory.ts b/src/core/trajectory.ts deleted file mode 100644 index a448c00..0000000 --- a/src/core/trajectory.ts +++ /dev/null @@ -1,172 +0,0 @@ -/** - * Shared trajectory utilities for extraction and analysis. - * - * @remarks - * Provides functions for extracting trajectory data from parsed updates, - * detecting richness levels, and checking for tool errors. - * - * @packageDocumentation - */ - -import type { ParsedUpdate } from '../headless/headless-output-parser.ts' -import type { TrajectoryRichness, TrajectoryStep } from '../schemas.ts' -import { ToolInputSchema } from '../schemas.ts' - -/** - * Extract trajectory from parsed updates. - * - * @remarks - * Converts ParsedUpdate stream into TrajectoryStep array. - * Handles tool call deduplication (start/completion events). 
- * - * @param updates - Parsed updates from output parser - * @param startTime - Reference time for timestamp calculation - * @returns Array of trajectory steps with relative timestamps - * - * @public - */ -export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): TrajectoryStep[] => { - const trajectory: TrajectoryStep[] = [] - const toolCallMap = new Map() - - for (const update of updates) { - const timestamp = update.timestamp - startTime - - if (update.type === 'thought') { - trajectory.push({ - type: 'thought', - content: update.content ?? '', - timestamp, - }) - } else if (update.type === 'message') { - trajectory.push({ - type: 'message', - content: update.content ?? '', - timestamp, - }) - } else if (update.type === 'tool_call') { - const toolCallId = update.title ?? `tool_${timestamp}` - const existing = toolCallMap.get(toolCallId) - - if (existing && update.status === 'completed') { - // Update existing tool call with completion info - existing.step.status = update.status - existing.step.duration = timestamp - existing.start - if (update.output !== undefined) { - existing.step.output = update.output - } - // Remove from map so a subsequent call with the same name starts fresh - toolCallMap.delete(toolCallId) - } else if (!existing) { - // New tool call - const step: TrajectoryStep & { type: 'tool_call' } = { - type: 'tool_call', - name: update.title ?? 'unknown', - status: update.status ?? 'pending', - ...(update.input !== undefined && { input: update.input }), - timestamp, - } - toolCallMap.set(toolCallId, { start: timestamp, step }) - trajectory.push(step) - } - } else if (update.type === 'plan') { - trajectory.push({ - type: 'plan', - entries: [], - timestamp, - }) - } - } - - return trajectory -} - -/** - * Extract final text output from trajectory. - * - * @remarks - * Concatenates all message step content to produce final output string. 
- * - * @param trajectory - Trajectory steps from capture - * @returns Concatenated message content - * - * @public - */ -export const extractOutput = (trajectory: TrajectoryStep[]): string => { - return trajectory - .filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message') - .map((step) => step.content) - .join('\n') -} - -/** - * Check if any tool calls failed in trajectory. - * - * @param trajectory - Trajectory steps from capture - * @returns True if any tool call has 'failed' status - * - * @public - */ -export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => { - return trajectory.some((step) => step.type === 'tool_call' && step.status === 'failed') -} - -/** - * Detect trajectory richness level from captured steps. - * - * @remarks - * Different adapters provide varying levels of detail: - * - `full`: Has thoughts, tool calls, or plans (e.g., Claude Code) - * - `messages-only`: Only message steps present - * - `minimal`: Empty or unknown content - * - * Uses single-pass iteration with early exit for efficiency. - * - * @param trajectory - Trajectory steps from capture - * @returns Detected richness level - * - * @public - */ -export const detectTrajectoryRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => { - let hasMessages = false - - for (const step of trajectory) { - // Early exit: any of these means 'full' richness - if (step.type === 'thought' || step.type === 'tool_call' || step.type === 'plan') { - return 'full' - } - if (step.type === 'message') { - hasMessages = true - } - } - - return hasMessages ? 'messages-only' : 'minimal' -} - -/** - * Extract file path from tool input if present. - * - * @param input - Tool call input object - * @returns File path string or undefined - * - * @public - */ -export const extractFilePath = (input: unknown): string | undefined => { - const result = ToolInputSchema.safeParse(input) - if (!result.success) return undefined - return result.data.file_path ?? 
result.data.path -} - -/** - * Extract content from tool input if present. - * - * @param input - Tool call input object - * @returns Content string or undefined - * - * @public - */ -export const extractContent = (input: unknown): string | undefined => { - const result = ToolInputSchema.safeParse(input) - if (!result.success) return undefined - return result.data.content ?? result.data.new_string -} diff --git a/src/core/worker-pool.ts b/src/core/worker-pool.ts deleted file mode 100644 index d851380..0000000 --- a/src/core/worker-pool.ts +++ /dev/null @@ -1,220 +0,0 @@ -/** - * Promise-based worker pool for parallel task execution. - * - * @remarks - * Implements a p-limit style concurrency limiter that: - * - Processes items with configurable concurrency - * - Maintains order-independent result collection - * - Supports progress callbacks - * - Coordinates file writes via mutex - * - * @packageDocumentation - */ - -import { mkdir } from 'node:fs/promises' - -// ============================================================================ -// Types -// ============================================================================ - -/** - * Progress callback for worker pool. - * - * @param completed - Number of completed tasks - * @param total - Total number of tasks - * @param result - Result of the just-completed task (if successful) - * @param error - Error from the just-completed task (if failed) - */ -export type ProgressCallback = (completed: number, total: number, result?: T, error?: Error) => void - -/** - * Options for worker pool execution. - */ -export type WorkerPoolOptions = { - /** Maximum concurrent workers (default: 1) */ - concurrency: number - /** Progress callback called after each task completes */ - onProgress?: ProgressCallback -} - -/** - * Result of worker pool execution. 
- */ -export type WorkerPoolResult = { - /** Successfully completed results (in completion order, not input order) */ - results: T[] - /** Errors encountered during execution */ - errors: Array<{ index: number; error: Error }> -} - -// ============================================================================ -// Write Mutex for JSONL Coordination -// ============================================================================ - -/** - * Simple mutex for coordinating file writes. - * - * @remarks - * Uses a promise chain to ensure only one write happens at a time. - * This prevents data corruption when multiple workers complete simultaneously. - */ -export type WriteMutex = { - /** Acquire lock, execute write, release lock */ - write: (fn: () => Promise) => Promise -} - -/** - * Create a write mutex for coordinating file output. - * - * @returns WriteMutex instance - */ -export const createWriteMutex = (): WriteMutex => { - let chain = Promise.resolve() - - return { - write: (fn: () => Promise): Promise => { - // Chain this write after all previous writes - chain = chain.then(fn, fn) // Continue even if previous failed - return chain - }, - } -} - -// ============================================================================ -// Worker Pool Implementation -// ============================================================================ - -/** - * Execute tasks in parallel with concurrency limit. - * - * @remarks - * Uses a semaphore-style approach where workers grab the next available - * task from a shared queue. Results are collected as tasks complete - * (order may differ from input order). 
- * - * @param items - Array of items to process - * @param worker - Async function to process each item - * @param options - Pool configuration - * @returns Results and any errors encountered - * - * @public - */ -export const runWorkerPool = async ( - items: TItem[], - worker: (item: TItem, index: number) => Promise, - options: WorkerPoolOptions, -): Promise> => { - const { concurrency, onProgress } = options - const results: TResult[] = [] - const errors: Array<{ index: number; error: Error }> = [] - - // Fast path: if concurrency is 1, process sequentially - if (concurrency === 1) { - for (let i = 0; i < items.length; i++) { - const item = items[i] - if (item === undefined) continue - - try { - const result = await worker(item, i) - results.push(result) - onProgress?.(results.length + errors.length, items.length, result) - } catch (err) { - const error = err instanceof Error ? err : new Error(String(err)) - errors.push({ index: i, error }) - onProgress?.(results.length + errors.length, items.length, undefined, error) - } - } - return { results, errors } - } - - // Shared state for work distribution - let nextIndex = 0 - let completed = 0 - const mutex = { lock: Promise.resolve() } - - // Get next work item (thread-safe via single-threaded JS) - // Uses iterative loop instead of recursion to avoid stack overflow with sparse arrays - const getNextItem = (): { item: TItem; index: number } | undefined => { - while (nextIndex < items.length) { - const index = nextIndex++ - const item = items[index] - if (item !== undefined) { - return { item, index } - } - // Skip undefined items and continue to next - } - return undefined - } - - // Worker function that processes items until none remain - const runWorker = async (): Promise => { - let work = getNextItem() - while (work) { - const { item, index } = work - try { - const result = await worker(item, index) - - // Coordinate result collection - await new Promise((resolve) => { - mutex.lock = mutex.lock.then(() => { - 
results.push(result) - completed++ - onProgress?.(completed, items.length, result) - resolve() - }) - }) - } catch (err) { - const error = err instanceof Error ? err : new Error(String(err)) - - // Coordinate error collection - await new Promise((resolve) => { - mutex.lock = mutex.lock.then(() => { - errors.push({ index, error }) - completed++ - onProgress?.(completed, items.length, undefined, error) - resolve() - }) - }) - } - - work = getNextItem() - } - } - - // Start N workers - const workers = Array.from({ length: Math.min(concurrency, items.length) }, () => runWorker()) - await Promise.all(workers) - - return { results, errors } -} - -// ============================================================================ -// Workspace Directory Management -// ============================================================================ - -/** - * Create a workspace directory for a prompt. - * - * @remarks - * Creates an isolated directory for each prompt execution. - * Directory is created if it doesn't exist. Directories persist - * after completion for debugging/inspection - clean up manually - * or via CI scripts if disk space is a concern. 
- * - * @param baseDir - Base workspace directory - * @param promptId - Unique prompt identifier - * @returns Absolute path to the workspace directory - * - * @public - */ -export const createWorkspaceDir = async (baseDir: string, promptId: string): Promise => { - // Sanitize promptId for filesystem (replace invalid chars with underscore) - const sanitizedId = promptId.replace(/[<>:"/\\|?*]/g, '_') - const workspaceDir = `${baseDir}/prompt-${sanitizedId}` - - // Create directory (recursive, no error if exists) - // Uses fs.mkdir instead of shell to prevent command injection - await mkdir(workspaceDir, { recursive: true }) - - return workspaceDir -} diff --git a/src/graders.ts b/src/graders.ts deleted file mode 100644 index f795ceb..0000000 --- a/src/graders.ts +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Built-in comparison graders for the agent eval harness. - * - * @remarks - * Provides built-in strategies for comparing multiple runs: - * - * **For CaptureResult (single-run) data:** - * - **weighted**: Configurable weights for quality, latency, reliability - * - **statistical**: Bootstrap sampling for confidence intervals - * - * **For TrialResult (multi-run reliability) data:** - * - **trialsWeighted**: Configurable weights for capability, reliability, consistency - * - **trialsStatistical**: Bootstrap sampling for passAtK confidence intervals - * - * @packageDocumentation - */ - -// CaptureResult graders -export { createStatisticalGrader, grade as statisticalGrade } from './graders/compare-statistical.ts' -export { - createWeightedGrader, - DEFAULT_WEIGHTS, - getWeightsFromEnv, - grade as weightedGrade, - type Weights, -} from './graders/compare-weighted.ts' - -// TrialResult graders -export { - createTrialsStatisticalGrader, - grade as trialsStatisticalGrade, -} from './graders/trials-compare-statistical.ts' -export { - createTrialsWeightedGrader, - DEFAULT_TRIALS_WEIGHTS, - getTrialsWeightsFromEnv, - grade as trialsWeightedGrade, - type TrialsWeights, -} from 
'./graders/trials-compare-weighted.ts' diff --git a/src/graders/bootstrap.ts b/src/graders/bootstrap.ts deleted file mode 100644 index afcdff5..0000000 --- a/src/graders/bootstrap.ts +++ /dev/null @@ -1,135 +0,0 @@ -/** - * Shared bootstrap sampling utilities for confidence interval computation. - * - * @remarks - * Bootstrap resampling provides robust confidence intervals without - * assuming a specific distribution. For small samples, it's more - * reliable than parametric methods. - * - * Environment variable configuration: - * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000) - * - * @packageDocumentation - */ - -/** Default number of bootstrap iterations */ -export const DEFAULT_ITERATIONS = 1000 - -/** Default confidence level (95%) */ -export const DEFAULT_CONFIDENCE_LEVEL = 0.95 - -/** - * Confidence interval as [lower, upper] bounds. - */ -export type ConfidenceInterval = [number, number] - -/** - * Bootstrap confidence interval result. - */ -export type BootstrapResult = { - /** Median of bootstrap sample means (50th percentile) */ - median: number - /** Confidence interval [lower, upper] */ - ci: ConfidenceInterval -} - -/** - * Configuration for bootstrap sampling. - */ -export type BootstrapConfig = { - /** Number of bootstrap iterations (default: 1000) */ - iterations?: number - /** Confidence level between 0 and 1 (default: 0.95) */ - confidenceLevel?: number -} - -/** - * Compute bootstrap confidence interval for sample mean. - * - * @remarks - * Bootstrap resampling provides robust confidence intervals without - * assuming a specific distribution. For small samples, it's more - * reliable than parametric methods. - * - * @param samples - Array of numeric samples - * @param config - Optional bootstrap configuration - * @returns Bootstrap median and confidence interval - * - * @public - */ -export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => { - const iterations = config?.iterations ?? 
DEFAULT_ITERATIONS - const confidenceLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL - - if (samples.length === 0) { - return { median: 0, ci: [0, 0] } - } - - if (samples.length === 1) { - const value = samples[0] ?? 0 - return { median: value, ci: [value, value] } - } - - const means: number[] = [] - - for (let i = 0; i < iterations; i++) { - // Resample with replacement - we know samples.length > 1 at this point - const resampled = Array.from( - { length: samples.length }, - () => samples[Math.floor(Math.random() * samples.length)] as number, - ) - - // Compute mean of resampled data - const sum = resampled.reduce((acc, val) => acc + val, 0) - means.push(sum / resampled.length) - } - - // Sort means for percentile calculation - means.sort((a, b) => a - b) - - // Compute percentile indices based on confidence level - // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile - const alpha = (1 - confidenceLevel) / 2 - const lowerIdx = Math.floor(iterations * alpha) - const upperIdx = Math.floor(iterations * (1 - alpha)) - - return { - median: means[Math.floor(iterations / 2)] ?? 0, - ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0], - } -} - -/** - * Format confidence interval as string. - * - * @param ci - Confidence interval [lower, upper] - * @param decimals - Number of decimal places (default: 3) - * @returns Formatted CI string or empty string if undefined - * - * @public - */ -export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => { - if (!ci) return '' - return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]` -} - -/** - * Get bootstrap configuration from environment variables. 
- * - * @remarks - * Reads configuration from: - * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100) - * - * @returns Bootstrap configuration - * - * @public - */ -export const getBootstrapConfigFromEnv = (): BootstrapConfig => { - const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS - if (!envValue) return { iterations: DEFAULT_ITERATIONS } - - const parsed = Number.parseInt(envValue, 10) - const iterations = Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed - - return { iterations } -} diff --git a/src/graders/compare-statistical.ts b/src/graders/compare-statistical.ts deleted file mode 100644 index 9465277..0000000 --- a/src/graders/compare-statistical.ts +++ /dev/null @@ -1,115 +0,0 @@ -/** - * Built-in statistical significance comparison grader. - * - * @remarks - * Uses bootstrap sampling to compute confidence intervals for score estimates. - * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs). - * - * Bootstrap iterations can be customized via environment variable: - * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000) - * - * @packageDocumentation - */ - -import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts' -import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts' - -/** - * Statistical significance comparison grader. - * - * @remarks - * Compares runs using bootstrap sampling to determine if differences - * are statistically significant. When confidence intervals don't overlap, - * the difference is flagged as significant (p<0.05). - * - * **Single-sample limitation:** When comparing individual prompts, each run - * provides only one score sample. Bootstrap with a single sample yields a - * degenerate CI of `[value, value]`. 
This grader is most useful when: - * - Aggregating results across multiple prompts - * - Using with the full comparison report (which combines per-prompt comparisons) - * - * For single-prompt comparisons, consider the weighted grader instead. - * - * @public - */ -export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise => { - const config = getBootstrapConfigFromEnv() - - // Collect scores for each run - const runStats = Object.entries(runs).map(([label, run]) => { - // Use grader score if available, otherwise 0 - const score = run.score?.score ?? 0 - - // For single-prompt comparison, we only have one sample - // In practice, this grader is most useful when aggregating across prompts - const stats = bootstrap([score], config) - - return { label, score, stats } - }) - - // Sort by bootstrap median descending - const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median) - - // Check if winner is statistically significant - // CIs don't overlap = significant difference (approximately p<0.05) - let isSignificant = false - const first = sorted[0] - const second = sorted[1] - if (first && second) { - // Non-overlapping: first's lower bound > second's upper bound - isSignificant = first.stats.ci[0] > second.stats.ci[1] - } - - const reasoning = isSignificant - ? `Winner "${first?.label}" is statistically significant (p<0.05, non-overlapping 95% CIs)` - : 'No statistically significant difference between top runs (overlapping 95% CIs)' - - return { - rankings: sorted.map((s, i) => ({ - run: s.label, - rank: i + 1, - score: s.stats.median, - })), - reasoning, - } -} - -/** - * Create a statistical grader with custom iteration count. - * - * @param iterations - Number of bootstrap iterations - * @returns Comparison grader function - * - * @public - */ -export const createStatisticalGrader = (iterations?: number): ComparisonGrader => { - const config = iterations ? 
{ iterations } : getBootstrapConfigFromEnv() - - return async ({ runs }: ComparisonGraderInput): Promise => { - const runStats = Object.entries(runs).map(([label, run]) => { - const score = run.score?.score ?? 0 - const stats = bootstrap([score], config) - return { label, score, stats } - }) - - const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median) - - let isSignificant = false - const first = sorted[0] - const second = sorted[1] - if (first && second) { - isSignificant = first.stats.ci[0] > second.stats.ci[1] - } - - return { - rankings: sorted.map((s, i) => ({ - run: s.label, - rank: i + 1, - score: s.stats.median, - })), - reasoning: isSignificant - ? `Winner "${first?.label}" is statistically significant (p<0.05)` - : 'No statistically significant difference between top runs', - } - } -} diff --git a/src/graders/compare-weighted.ts b/src/graders/compare-weighted.ts deleted file mode 100644 index 5b05e1f..0000000 --- a/src/graders/compare-weighted.ts +++ /dev/null @@ -1,112 +0,0 @@ -/** - * Built-in weighted multi-dimensional comparison grader. - * - * @remarks - * Configurable weights for quality, latency, and reliability. - * Default strategy when no `--grader` is specified for the compare command. - * - * Weights can be customized via environment variables: - * - `COMPARE_QUALITY` (default: 0.5) - * - `COMPARE_LATENCY` (default: 0.3) - * - `COMPARE_RELIABILITY` (default: 0.2) - * - * @packageDocumentation - */ - -import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts' - -/** - * Weight configuration for comparison dimensions. 
- */ -export type Weights = { - /** Weight for quality (pass/score) - how much correctness matters */ - quality: number - /** Weight for latency - how much speed matters */ - latency: number - /** Weight for reliability - how much error-free execution matters */ - reliability: number -} - -/** Default weights: quality=0.5, latency=0.3, reliability=0.2 */ -export const DEFAULT_WEIGHTS: Weights = { - quality: 0.5, - latency: 0.3, - reliability: 0.2, -} - -/** - * Read weights from environment variables with fallback to defaults. - * - * @returns Weights configuration - */ -export const getWeightsFromEnv = (): Weights => { - const quality = Number.parseFloat(process.env.COMPARE_QUALITY ?? String(DEFAULT_WEIGHTS.quality)) - const latency = Number.parseFloat(process.env.COMPARE_LATENCY ?? String(DEFAULT_WEIGHTS.latency)) - const reliability = Number.parseFloat(process.env.COMPARE_RELIABILITY ?? String(DEFAULT_WEIGHTS.reliability)) - - return { - quality: Number.isNaN(quality) ? DEFAULT_WEIGHTS.quality : quality, - latency: Number.isNaN(latency) ? DEFAULT_WEIGHTS.latency : latency, - reliability: Number.isNaN(reliability) ? DEFAULT_WEIGHTS.reliability : reliability, - } -} - -/** - * Create a weighted comparison grader with custom weights. - * - * @param weights - Weight configuration for comparison dimensions - * @returns Comparison grader function - * - * @public - */ -export const createWeightedGrader = (weights: Weights = DEFAULT_WEIGHTS): ComparisonGrader => { - return async ({ runs }: ComparisonGraderInput): Promise => { - const scores = Object.entries(runs).map(([label, run]) => { - // Quality score: use grader score if available, otherwise 0 - // Note: run.score is only present if the result was graded - const qualityScore = run.score?.score ?? 0 - - // Latency score: inverse relationship (faster = better) - // Normalize: 1 / (1 + duration/1000) gives ~0.5 at 1s, ~0.1 at 10s - const duration = run.duration ?? 
10000 - const latencyScore = 1 / (1 + duration / 1000) - - // Reliability score: 1 if no errors, 0 if errors - const hasErrors = run.toolErrors ?? false - const reliabilityScore = hasErrors ? 0 : 1 - - // Weighted combination - const weighted = - qualityScore * weights.quality + latencyScore * weights.latency + reliabilityScore * weights.reliability - - return { label, weighted, qualityScore, latencyScore, reliabilityScore } - }) - - // Sort by weighted score descending (highest = best) - const sorted = scores.sort((a, b) => b.weighted - a.weighted) - - return { - rankings: sorted.map((s, i) => ({ - run: s.label, - rank: i + 1, - score: s.weighted, - })), - reasoning: `Weighted: quality=${weights.quality}, latency=${weights.latency}, reliability=${weights.reliability}`, - } - } -} - -/** - * Default weighted comparison grader using environment or default weights. - * - * @remarks - * This is the default grader used when `--strategy weighted` is specified - * or when no strategy is specified for the compare command. - * - * @public - */ -export const grade: ComparisonGrader = async (input: ComparisonGraderInput): Promise => { - const weights = getWeightsFromEnv() - const grader = createWeightedGrader(weights) - return grader(input) -} diff --git a/src/graders/tests/bootstrap.spec.ts b/src/graders/tests/bootstrap.spec.ts deleted file mode 100644 index 83eecec..0000000 --- a/src/graders/tests/bootstrap.spec.ts +++ /dev/null @@ -1,169 +0,0 @@ -/** - * Unit tests for bootstrap sampling utilities. 
- */ - -import { afterEach, describe, expect, test } from 'bun:test' -import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts' - -describe('bootstrap', () => { - describe('edge cases', () => { - test('returns {median: 0, ci: [0, 0]} for empty array', () => { - const result = bootstrap([]) - expect(result.median).toBe(0) - expect(result.ci).toEqual([0, 0]) - }) - - test('returns {median: value, ci: [value, value]} for single sample', () => { - const result = bootstrap([0.75]) - expect(result.median).toBe(0.75) - expect(result.ci).toEqual([0.75, 0.75]) - }) - - test('handles single sample of 0', () => { - const result = bootstrap([0]) - expect(result.median).toBe(0) - expect(result.ci).toEqual([0, 0]) - }) - - test('handles single sample of 1', () => { - const result = bootstrap([1]) - expect(result.median).toBe(1) - expect(result.ci).toEqual([1, 1]) - }) - }) - - describe('confidence interval bounds', () => { - test('CI lower bound <= median <= CI upper bound', () => { - const samples = [0.5, 0.6, 0.7, 0.8, 0.9] - const result = bootstrap(samples, { iterations: 1000 }) - - expect(result.ci[0]).toBeLessThanOrEqual(result.median) - expect(result.median).toBeLessThanOrEqual(result.ci[1]) - }) - - test('CI contains the true median for uniform samples', () => { - // For identical samples, CI should collapse to the value - const samples = [0.5, 0.5, 0.5, 0.5, 0.5] - const result = bootstrap(samples, { iterations: 1000 }) - - expect(result.median).toBeCloseTo(0.5, 2) - expect(result.ci[0]).toBeCloseTo(0.5, 2) - expect(result.ci[1]).toBeCloseTo(0.5, 2) - }) - - test('CI widens with more variance in samples', () => { - const lowVariance = [0.49, 0.5, 0.51] - const highVariance = [0.1, 0.5, 0.9] - - const lowResult = bootstrap(lowVariance, { iterations: 1000 }) - const highResult = bootstrap(highVariance, { iterations: 1000 }) - - const lowWidth = lowResult.ci[1] - lowResult.ci[0] - const highWidth = highResult.ci[1] 
- highResult.ci[0] - - expect(highWidth).toBeGreaterThan(lowWidth) - }) - }) - - describe('configuration', () => { - test('uses default iterations when not specified', () => { - // Just verify it runs without error with defaults - const result = bootstrap([0.5, 0.6, 0.7]) - expect(result.median).toBeGreaterThan(0) - }) - - test('accepts custom iteration count', () => { - const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 }) - expect(result.median).toBeGreaterThan(0) - }) - - test('accepts custom confidence level', () => { - const samples = [0.3, 0.4, 0.5, 0.6, 0.7] - - // 90% CI should be narrower than 95% CI - const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 }) - const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 }) - - const width90 = ci90.ci[1] - ci90.ci[0] - const width95 = ci95.ci[1] - ci95.ci[0] - - // 95% CI should generally be wider than 90% CI - // Allow some tolerance due to randomness - expect(width95).toBeGreaterThanOrEqual(width90 * 0.8) - }) - }) - - describe('statistical properties', () => { - test('median is close to sample mean', () => { - const samples = [0.2, 0.4, 0.6, 0.8, 1.0] - const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length - - const result = bootstrap(samples, { iterations: 10000 }) - - // Bootstrap median should be close to sample mean for symmetric distributions - expect(result.median).toBeCloseTo(sampleMean, 1) - }) - - test('is deterministic-ish for large iteration counts', () => { - const samples = [0.3, 0.5, 0.7] - - // With many iterations, results should be similar across runs - const result1 = bootstrap(samples, { iterations: 10000 }) - const result2 = bootstrap(samples, { iterations: 10000 }) - - expect(result1.median).toBeCloseTo(result2.median, 1) - }) - }) -}) - -describe('getBootstrapConfigFromEnv', () => { - const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS - - afterEach(() => { - if (originalEnv === undefined) { - delete 
process.env.COMPARE_BOOTSTRAP_ITERATIONS - } else { - process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv - } - }) - - test('returns default iterations when env var not set', () => { - delete process.env.COMPARE_BOOTSTRAP_ITERATIONS - const config = getBootstrapConfigFromEnv() - expect(config.iterations).toBe(DEFAULT_ITERATIONS) - }) - - test('parses valid iteration count from env', () => { - process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000' - const config = getBootstrapConfigFromEnv() - expect(config.iterations).toBe(5000) - }) - - test('returns default for invalid (non-numeric) env value', () => { - process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid' - const config = getBootstrapConfigFromEnv() - expect(config.iterations).toBe(DEFAULT_ITERATIONS) - }) - - test('returns default for iteration count below minimum (100)', () => { - process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50' - const config = getBootstrapConfigFromEnv() - expect(config.iterations).toBe(DEFAULT_ITERATIONS) - }) - - test('accepts iteration count at minimum (100)', () => { - process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100' - const config = getBootstrapConfigFromEnv() - expect(config.iterations).toBe(100) - }) -}) - -describe('constants', () => { - test('DEFAULT_ITERATIONS is 1000', () => { - expect(DEFAULT_ITERATIONS).toBe(1000) - }) - - test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => { - expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95) - }) -}) diff --git a/src/graders/tests/compare-graders.spec.ts b/src/graders/tests/compare-graders.spec.ts deleted file mode 100644 index 1827420..0000000 --- a/src/graders/tests/compare-graders.spec.ts +++ /dev/null @@ -1,293 +0,0 @@ -/** - * Unit tests for built-in comparison graders. 
- * - * @remarks - * Tests for: - * - compare-weighted: Configurable weight grader - * - compare-statistical: Bootstrap confidence interval grader - * - * @packageDocumentation - */ - -import { describe, expect, test } from 'bun:test' -import type { ComparisonGraderInput, ComparisonRunData } from '../../pipeline/pipeline.types.ts' -import { createStatisticalGrader, grade as statisticalGrade } from '../compare-statistical.ts' -import { createWeightedGrader, DEFAULT_WEIGHTS, type Weights } from '../compare-weighted.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const createMockRuns = ( - overrides: Partial>> = {}, -): Record => ({ - baseline: { - output: 'Result A', - score: { pass: true, score: 0.8 }, - duration: 1000, - toolErrors: false, - ...overrides.baseline, - }, - variant: { - output: 'Result B', - score: { pass: true, score: 0.9 }, - duration: 1500, - toolErrors: false, - ...overrides.variant, - }, -}) - -const createMockInput = (runs: Record): ComparisonGraderInput => ({ - id: 'test-001', - input: 'Test prompt', - hint: 'Expected output', - runs, -}) - -// ============================================================================ -// Weighted Grader Tests -// ============================================================================ - -describe('compare-weighted grader', () => { - describe('DEFAULT_WEIGHTS', () => { - test('has expected default values', () => { - expect(DEFAULT_WEIGHTS.quality).toBe(0.5) - expect(DEFAULT_WEIGHTS.latency).toBe(0.3) - expect(DEFAULT_WEIGHTS.reliability).toBe(0.2) - }) - - test('weights sum to 1.0', () => { - const sum = DEFAULT_WEIGHTS.quality + DEFAULT_WEIGHTS.latency + DEFAULT_WEIGHTS.reliability - expect(sum).toBe(1.0) - }) - }) - - describe('createWeightedGrader', () => { - test('returns higher rank for better quality score', async () => { - const grader = createWeightedGrader({ 
quality: 1.0, latency: 0.0, reliability: 0.0 }) - const runs = createMockRuns({ - baseline: { score: { pass: true, score: 0.7 } }, - variant: { score: { pass: true, score: 0.9 } }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.run).toBe('variant') - expect(result.rankings[0]?.rank).toBe(1) - expect(result.rankings[1]?.run).toBe('baseline') - expect(result.rankings[1]?.rank).toBe(2) - }) - - test('returns higher rank for lower latency when latency weight is high', async () => { - const grader = createWeightedGrader({ quality: 0.0, latency: 1.0, reliability: 0.0 }) - const runs = createMockRuns({ - baseline: { duration: 500, score: { pass: true, score: 0.5 } }, - variant: { duration: 2000, score: { pass: true, score: 0.9 } }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - // Faster run should win when latency is all that matters - expect(result.rankings[0]?.run).toBe('baseline') - }) - - test('penalizes runs with tool errors when reliability weight is high', async () => { - const grader = createWeightedGrader({ quality: 0.0, latency: 0.0, reliability: 1.0 }) - const runs = createMockRuns({ - baseline: { toolErrors: false }, - variant: { toolErrors: true }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings[0]?.run).toBe('baseline') - expect(result.rankings[1]?.run).toBe('variant') - }) - - test('includes weights in reasoning', async () => { - const weights: Weights = { quality: 0.6, latency: 0.3, reliability: 0.1 } - const grader = createWeightedGrader(weights) - const input = createMockInput(createMockRuns()) - - const result = await grader(input) - - expect(result.reasoning).toContain('quality=0.6') - expect(result.reasoning).toContain('latency=0.3') - expect(result.reasoning).toContain('reliability=0.1') - }) - - test('handles missing score gracefully', async () 
=> { - const grader = createWeightedGrader() - const runs: Record = { - baseline: { output: 'A' }, - variant: { output: 'B', score: { pass: true, score: 0.8 } }, - } - const input = createMockInput(runs) - - const result = await grader(input) - - // Should not throw, variant should rank higher due to having a score - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('handles three or more runs', async () => { - const grader = createWeightedGrader() - const runs: Record = { - a: { output: 'A', score: { pass: true, score: 0.9 }, duration: 1000, toolErrors: false }, - b: { output: 'B', score: { pass: true, score: 0.7 }, duration: 800, toolErrors: false }, - c: { output: 'C', score: { pass: false, score: 0.5 }, duration: 500, toolErrors: true }, - } - const input = createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(3) - // Ranks should be 1, 2, 3 - expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3]) - }) - }) -}) - -// ============================================================================ -// Statistical Grader Tests -// ============================================================================ - -describe('compare-statistical grader', () => { - describe('createStatisticalGrader', () => { - test('returns rankings based on score means', async () => { - const grader = createStatisticalGrader(100) - const runs = createMockRuns({ - baseline: { score: { pass: true, score: 0.6 } }, - variant: { score: { pass: true, score: 0.9 } }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('handles missing scores as zero', async () => { - const grader = createStatisticalGrader(100) - const runs: Record = { - baseline: { output: 'A' }, - variant: { output: 'B', score: { pass: true, score: 0.8 } }, - } - const input = 
createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('indicates significance when scores differ (single samples have no variance)', async () => { - const grader = createStatisticalGrader(100) - const runs = createMockRuns({ - baseline: { score: { pass: true, score: 0.8 } }, - variant: { score: { pass: true, score: 0.81 } }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - // Note: With single samples, bootstrap has no variance. - // CIs are [0.8, 0.8] and [0.81, 0.81] - non-overlapping. - // Statistical significance test is most meaningful with multiple samples. - expect(result.reasoning).toContain('statistically significant') - }) - - test('indicates non-significance when scores are identical', async () => { - const grader = createStatisticalGrader(100) - const runs = createMockRuns({ - baseline: { score: { pass: true, score: 0.8 } }, - variant: { score: { pass: true, score: 0.8 } }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - // Identical scores = overlapping CIs = not significant - expect(result.reasoning).toContain('No statistically significant difference') - }) - }) - - describe('grade function', () => { - test('works with default iterations', async () => { - const runs = createMockRuns() - const input = createMockInput(runs) - - const result = await statisticalGrade(input) - - expect(result.rankings).toBeDefined() - expect(result.rankings.length).toBe(2) - }) - }) -}) - -// ============================================================================ -// Edge Case Tests -// ============================================================================ - -describe('comparison grader edge cases', () => { - test('handles single run gracefully', async () => { - const grader = createWeightedGrader() - const runs: Record = { - only: { output: 'Only run', score: { pass: true, score: 1.0 } }, - } - const input = 
createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(1) - expect(result.rankings[0]?.rank).toBe(1) - }) - - test('handles empty trajectory', async () => { - const grader = createWeightedGrader() - const runs = createMockRuns({ - baseline: { trajectory: [] }, - variant: { trajectory: undefined }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(2) - }) - - test('handles zero duration', async () => { - const grader = createWeightedGrader({ quality: 0.0, latency: 1.0, reliability: 0.0 }) - const runs = createMockRuns({ - baseline: { duration: 0 }, - variant: { duration: 1000 }, - }) - const input = createMockInput(runs) - - const result = await grader(input) - - // Zero duration should get highest latency score - expect(result.rankings[0]?.run).toBe('baseline') - }) - - test('deterministic ordering for equal scores', async () => { - const grader = createWeightedGrader() - const runs = createMockRuns({ - baseline: { score: { pass: true, score: 0.8 }, duration: 1000, toolErrors: false }, - variant: { score: { pass: true, score: 0.8 }, duration: 1000, toolErrors: false }, - }) - const input = createMockInput(runs) - - // Run multiple times to check stability - const results = await Promise.all([grader(input), grader(input), grader(input)]) - - // All should have same ordering - const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(',')) - expect(new Set(orders).size).toBe(1) - }) -}) diff --git a/src/graders/tests/trials-compare-graders.spec.ts b/src/graders/tests/trials-compare-graders.spec.ts deleted file mode 100644 index 38f096a..0000000 --- a/src/graders/tests/trials-compare-graders.spec.ts +++ /dev/null @@ -1,358 +0,0 @@ -/** - * Unit tests for built-in trials comparison graders. 
- * - * @remarks - * Tests for: - * - trials-compare-weighted: Configurable weight grader for trials - * - trials-compare-statistical: Bootstrap confidence interval grader for trials - * - * @packageDocumentation - */ - -import { describe, expect, test } from 'bun:test' -import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts' -import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts' -import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const createMockTrialRuns = ( - overrides: Partial>> = {}, -): Record => ({ - baseline: { - passRate: 0.67, - passAtK: 0.9, - passExpK: 0.3, - k: 3, - trials: [ - { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 }, - { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 }, - { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 }, - ], - ...overrides.baseline, - }, - variant: { - passRate: 1.0, - passAtK: 1.0, - passExpK: 1.0, - k: 3, - trials: [ - { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 }, - { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 }, - { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 }, - ], - ...overrides.variant, - }, -}) - -const createMockTrialInput = (runs: Record): TrialsComparisonGraderInput => ({ - id: 'test-001', - input: 'Test prompt', - hint: 'Expected output', - runs, -}) - -// ============================================================================ -// Weighted Grader Tests -// ============================================================================ - 
-describe('trials-compare-weighted grader', () => { - describe('DEFAULT_TRIALS_WEIGHTS', () => { - test('has expected default values', () => { - expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4) - expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4) - expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2) - }) - - test('weights sum to 1.0', () => { - const sum = - DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency - expect(sum).toBe(1.0) - }) - }) - - describe('createTrialsWeightedGrader', () => { - test('returns higher rank for better passAtK when capability weight is high', async () => { - const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 }) - const runs = createMockTrialRuns({ - baseline: { passAtK: 0.7 }, - variant: { passAtK: 0.95 }, - }) - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.run).toBe('variant') - expect(result.rankings[0]?.rank).toBe(1) - }) - - test('returns higher rank for better passExpK when reliability weight is high', async () => { - const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 }) - const runs = createMockTrialRuns({ - baseline: { passExpK: 0.9 }, - variant: { passExpK: 0.3 }, - }) - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.rankings[0]?.run).toBe('baseline') - }) - - test('penalizes flaky runs when consistency weight is high', async () => { - const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 }) - const runs = createMockTrialRuns({ - // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6 - baseline: { passAtK: 0.9, passExpK: 0.3 }, - // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0 - variant: { passAtK: 0.8, passExpK: 0.8 }, - }) - const input = createMockTrialInput(runs) - - const 
result = await grader(input) - - // Variant should win due to lower flakiness (higher consistency) - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('includes weights in reasoning', async () => { - const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 } - const grader = createTrialsWeightedGrader(weights) - const input = createMockTrialInput(createMockTrialRuns()) - - const result = await grader(input) - - expect(result.reasoning).toContain('capability=0.5') - expect(result.reasoning).toContain('reliability=0.3') - expect(result.reasoning).toContain('consistency=0.2') - }) - - test('handles missing passAtK gracefully (treats as 0)', async () => { - const grader = createTrialsWeightedGrader() - const runs: Record = { - baseline: { - k: 3, - trials: [], - }, - variant: { - passAtK: 0.8, - passExpK: 0.5, - k: 3, - trials: [], - }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - // Should not throw, variant should rank higher - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('handles three or more runs', async () => { - const grader = createTrialsWeightedGrader() - const runs: Record = { - a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] }, - b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] }, - c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(3) - // Ranks should be 1, 2, 3 - expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3]) - }) - }) -}) - -// ============================================================================ -// Statistical Grader Tests -// ============================================================================ - -describe('trials-compare-statistical grader', () => { - describe('createTrialsStatisticalGrader', () => { - test('returns rankings based on bootstrapped 
passAtK', async () => { - const grader = createTrialsStatisticalGrader(100) - const runs = createMockTrialRuns({ - baseline: { passAtK: 0.6 }, - variant: { passAtK: 0.95 }, - }) - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('uses trial outcomes for bootstrap variance estimation', async () => { - const grader = createTrialsStatisticalGrader(100) - // All trials pass for variant, mixed for baseline - const runs: Record = { - baseline: { - passAtK: 0.9, - passExpK: 0.3, - k: 5, - trials: [ - { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true }, - { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true }, - { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false }, - { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true }, - { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false }, - ], - }, - variant: { - passAtK: 1.0, - passExpK: 1.0, - k: 5, - trials: [ - { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }, - { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true }, - { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true }, - { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true }, - { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true }, - ], - }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - // Variant with 100% pass rate should rank higher - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('indicates significance when passAtK differs substantially', async () => { - const grader = createTrialsStatisticalGrader(500) - // Strong difference: all pass vs all fail - const runs: Record = { - baseline: { - passAtK: 0, - k: 5, - trials: [ - { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false }, - 
{ trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false }, - { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false }, - { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false }, - { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false }, - ], - }, - variant: { - passAtK: 1.0, - k: 5, - trials: [ - { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }, - { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true }, - { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true }, - { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true }, - { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true }, - ], - }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.reasoning).toContain('clear separation') - }) - - test('handles empty trials array', async () => { - const grader = createTrialsStatisticalGrader(100) - const runs: Record = { - baseline: { k: 3, trials: [] }, - variant: { - k: 3, - trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }], - }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - // Should not throw - expect(result.rankings.length).toBe(2) - }) - }) - - describe('grade function', () => { - test('works with default iterations', async () => { - const runs = createMockTrialRuns() - const input = createMockTrialInput(runs) - - const result = await statisticalGrade(input) - - expect(result.rankings).toBeDefined() - expect(result.rankings.length).toBe(2) - }) - }) -}) - -// ============================================================================ -// Edge Case Tests -// ============================================================================ - -describe('trials comparison grader edge cases', () => { - test('handles single run gracefully', async () => { - const grader = createTrialsWeightedGrader() - 
const runs: Record = { - only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.rankings.length).toBe(1) - expect(result.rankings[0]?.rank).toBe(1) - }) - - test('handles zero passAtK and passExpK', async () => { - const grader = createTrialsWeightedGrader() - const runs: Record = { - baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] }, - variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - expect(result.rankings[0]?.run).toBe('variant') - }) - - test('deterministic ordering for equal scores', async () => { - const grader = createTrialsWeightedGrader() - const runs = createMockTrialRuns({ - baseline: { passAtK: 0.8, passExpK: 0.6 }, - variant: { passAtK: 0.8, passExpK: 0.6 }, - }) - const input = createMockTrialInput(runs) - - // Run multiple times to check stability - const results = await Promise.all([grader(input), grader(input), grader(input)]) - - // All should have same ordering - const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(',')) - expect(new Set(orders).size).toBe(1) - }) - - test('flakiness is clamped to non-negative', async () => { - // Edge case: passExpK > passAtK shouldn't happen but handle gracefully - const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 }) - const runs: Record = { - baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work - variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] }, - } - const input = createMockTrialInput(runs) - - const result = await grader(input) - - // Both should have flakiness 0, so consistency score should be 1.0 for both - // Variant has higher capability/reliability so it wins on tiebreaker - expect(result.rankings).toBeDefined() - }) -}) diff --git a/src/graders/trials-compare-statistical.ts 
b/src/graders/trials-compare-statistical.ts deleted file mode 100644 index c146c39..0000000 --- a/src/graders/trials-compare-statistical.ts +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Built-in statistical significance comparison grader for trials data. - * - * @remarks - * Uses bootstrap sampling to compute confidence intervals for passAtK and passExpK. - * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs). - * - * Unlike the capture statistical grader which only has one score per prompt, - * trials data has multiple trial results per prompt, enabling proper bootstrap - * variance estimation. - * - * Bootstrap iterations can be customized via environment variable: - * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000) - * - * @packageDocumentation - */ - -import type { - ComparisonGraderResult, - TrialsComparisonGrader, - TrialsComparisonGraderInput, -} from '../pipeline/pipeline.types.ts' -import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts' - -/** - * Bootstrap confidence interval result. - */ -type BootstrapResult = { - /** Median estimate from bootstrap samples (more robust than mean) */ - median: number - /** 95% confidence interval [lower, upper] */ - ci95: [number, number] -} - -/** - * Compute passAtK estimate from trial pass/fail samples via bootstrap. - * - * @remarks - * passAtK = 1 - (1 - p)^k where p is estimated pass rate. - * We bootstrap the pass rate and compute passAtK from each bootstrap sample. 
- * - * @param trials - Array of 0/1 values (0=fail, 1=pass) - * @param k - Number of trials - * @param iterations - Number of bootstrap iterations - * @returns Bootstrap estimate and CI for passAtK - */ -const bootstrapPassAtK = (trials: number[], k: number, iterations: number): BootstrapResult => { - if (trials.length === 0) { - return { median: 0, ci95: [0, 0] } - } - - const passAtKValues: number[] = [] - - for (let i = 0; i < iterations; i++) { - // Resample with replacement - const resampled = Array.from( - { length: trials.length }, - () => trials[Math.floor(Math.random() * trials.length)] as number, - ) - - // Compute pass rate from resample - const passRate = resampled.reduce((acc, val) => acc + val, 0) / resampled.length - - // Compute passAtK: probability of at least one pass in k samples - // passAtK = 1 - (1 - p)^k - const passAtK = 1 - (1 - passRate) ** k - passAtKValues.push(passAtK) - } - - // Sort for percentile calculation - passAtKValues.sort((a, b) => a - b) - - const lowerIdx = Math.floor(iterations * 0.025) - const upperIdx = Math.floor(iterations * 0.975) - - return { - median: passAtKValues[Math.floor(iterations / 2)] ?? 0, - ci95: [passAtKValues[lowerIdx] ?? 0, passAtKValues[upperIdx] ?? 0], - } -} - -/** - * Get bootstrap iterations from environment or use default. - * - * @returns Number of bootstrap iterations - */ -const getIterations = (): number => { - const config = getBootstrapConfigFromEnv() - return config.iterations ?? DEFAULT_ITERATIONS -} - -/** - * Statistical significance trials comparison grader. - * - * @remarks - * Compares runs using bootstrap sampling on trial outcomes to determine - * if differences in passAtK are statistically significant. - * - * Unlike single-sample comparisons, trials data provides multiple samples - * per prompt (k trials), enabling meaningful variance estimation. 
- * - * @public - */ -export const grade: TrialsComparisonGrader = async ({ - runs, -}: TrialsComparisonGraderInput): Promise => { - const iterations = getIterations() - - // Collect pass/fail outcomes for each run - const runStats = Object.entries(runs).map(([label, run]) => { - // Convert trials to 0/1 array - const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0)) - - // Bootstrap passAtK estimate - const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations) - - return { label, passAtK: run.passAtK ?? 0, stats } - }) - - // Sort by bootstrap median passAtK descending - const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median) - - // Check if winner is statistically significant - // CIs don't overlap = significant difference (approximately p<0.05) - let isSignificant = false - const first = sorted[0] - const second = sorted[1] - if (first && second) { - // Non-overlapping: first's lower bound > second's upper bound - isSignificant = first.stats.ci95[0] > second.stats.ci95[1] - } - - const reasoning = isSignificant - ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs for passAtK)` - : 'No clear winner - confidence intervals overlap between top runs' - - return { - rankings: sorted.map((s, i) => ({ - run: s.label, - rank: i + 1, - score: s.stats.median, - })), - reasoning, - } -} - -/** - * Create a statistical grader with custom iteration count. - * - * @param iterations - Number of bootstrap iterations - * @returns Trials comparison grader function - * - * @public - */ -export const createTrialsStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): TrialsComparisonGrader => { - return async ({ runs }: TrialsComparisonGraderInput): Promise => { - const runStats = Object.entries(runs).map(([label, run]) => { - const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0)) - const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations) - return { label, passAtK: run.passAtK ?? 
0, stats } - }) - - const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median) - - let isSignificant = false - const first = sorted[0] - const second = sorted[1] - if (first && second) { - isSignificant = first.stats.ci95[0] > second.stats.ci95[1] - } - - return { - rankings: sorted.map((s, i) => ({ - run: s.label, - rank: i + 1, - score: s.stats.median, - })), - reasoning: isSignificant - ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs)` - : 'No clear winner - confidence intervals overlap between top runs', - } - } -} diff --git a/src/graders/trials-compare-weighted.ts b/src/graders/trials-compare-weighted.ts deleted file mode 100644 index 552957b..0000000 --- a/src/graders/trials-compare-weighted.ts +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Built-in weighted comparison grader for trials data. - * - * @remarks - * Configurable weights for capability (passAtK), reliability (passExpK), - * and consistency (1 - flakiness) dimensions. - * - * Weights can be customized via environment variables: - * - `COMPARE_CAPABILITY` (default: 0.4) - * - `COMPARE_RELIABILITY` (default: 0.4) - * - `COMPARE_CONSISTENCY` (default: 0.2) - * - * @packageDocumentation - */ - -import type { - ComparisonGraderResult, - TrialsComparisonGrader, - TrialsComparisonGraderInput, -} from '../pipeline/pipeline.types.ts' - -/** - * Weight configuration for trials comparison dimensions. - */ -export type TrialsWeights = { - /** Weight for capability (passAtK) - can the agent solve this at least once? */ - capability: number - /** Weight for reliability (passExpK) - does the agent solve this consistently? 
*/ - reliability: number - /** Weight for consistency (1 - flakiness) - low gap between capability and reliability */ - consistency: number -} - -/** Default weights: capability=0.4, reliability=0.4, consistency=0.2 */ -export const DEFAULT_TRIALS_WEIGHTS: TrialsWeights = { - capability: 0.4, - reliability: 0.4, - consistency: 0.2, -} - -/** - * Read weights from environment variables with fallback to defaults. - * - * @remarks - * Validates that weights are non-negative. Invalid or negative values - * fall back to defaults. - * - * @returns TrialsWeights configuration - * - * @public - */ -export const getTrialsWeightsFromEnv = (): TrialsWeights => { - const parseWeight = (envVar: string | undefined, defaultValue: number): number => { - if (!envVar) return defaultValue - const parsed = Number.parseFloat(envVar) - // Must be a valid non-negative number - if (Number.isNaN(parsed) || parsed < 0) return defaultValue - return parsed - } - - return { - capability: parseWeight(process.env.COMPARE_CAPABILITY, DEFAULT_TRIALS_WEIGHTS.capability), - reliability: parseWeight(process.env.COMPARE_RELIABILITY, DEFAULT_TRIALS_WEIGHTS.reliability), - consistency: parseWeight(process.env.COMPARE_CONSISTENCY, DEFAULT_TRIALS_WEIGHTS.consistency), - } -} - -/** - * Create a weighted trials comparison grader with custom weights. - * - * @param weights - Weight configuration for comparison dimensions - * @returns Trials comparison grader function - * - * @public - */ -export const createTrialsWeightedGrader = (weights: TrialsWeights = DEFAULT_TRIALS_WEIGHTS): TrialsComparisonGrader => { - return async ({ runs }: TrialsComparisonGraderInput): Promise => { - const scores = Object.entries(runs).map(([label, run]) => { - // Capability score: passAtK (0-1) - const capabilityScore = run.passAtK ?? 0 - - // Reliability score: passExpK (0-1) - const reliabilityScore = run.passExpK ?? 
0 - - // Consistency score: 1 - flakiness - // Flakiness = passAtK - passExpK (how much gap between capability and reliability) - const flakiness = Math.max(0, capabilityScore - reliabilityScore) - const consistencyScore = 1 - flakiness - - // Weighted combination - const weighted = - capabilityScore * weights.capability + - reliabilityScore * weights.reliability + - consistencyScore * weights.consistency - - return { label, weighted, capabilityScore, reliabilityScore, consistencyScore, flakiness } - }) - - // Sort by weighted score descending (highest = best) - const sorted = scores.sort((a, b) => b.weighted - a.weighted) - - return { - rankings: sorted.map((s, i) => ({ - run: s.label, - rank: i + 1, - score: s.weighted, - })), - reasoning: `Weighted trials: capability=${weights.capability}, reliability=${weights.reliability}, consistency=${weights.consistency}`, - } - } -} - -/** - * Default weighted trials comparison grader using environment or default weights. - * - * @remarks - * This is the default grader used when `--strategy weighted` is specified - * for trials format comparison. - * - * @public - */ -export const grade: TrialsComparisonGrader = async ( - input: TrialsComparisonGraderInput, -): Promise => { - const weights = getTrialsWeightsFromEnv() - const grader = createTrialsWeightedGrader(weights) - return grader(input) -} diff --git a/src/harness.ts b/src/harness.ts deleted file mode 100644 index 042208d..0000000 --- a/src/harness.ts +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Harness commands for agent evaluation. - * - * @remarks - * Re-exports all harness command modules for programmatic use. - * For CLI usage, run `agent-eval-harness --help`. 
- * - * **Commands:** - * - `capture` - Core trajectory capture - * - `trials` - Multi-run pass@k/pass^k analysis - * - `summarize` - Derive compact views from results - * - `calibrate` - Sample failures for grader review - * - `validateRefs` - Check reference solutions - * - `balance` - Analyze test set coverage - * - `schemasCli` - Export JSON schemas - * - `headless` - Schema-driven adapter for headless CLI agents - * - * @packageDocumentation - */ - -export type { BalanceConfig } from './commands/balance.ts' -export { balance, runBalance } from './commands/balance.ts' -export type { CalibrateConfig } from './commands/calibrate.ts' -export { calibrate, runCalibrate } from './commands/calibrate.ts' -// Config types -export type { CaptureConfig } from './commands/capture.ts' -// Command implementations (for programmatic use) -export { - capture, - extractOutput, - extractTrajectory, - hasToolErrors, - loadPrompts, - runCapture, -} from './commands/capture.ts' -export type { SummarizeConfig } from './commands/summarize.ts' -export { runSummarize, summarize } from './commands/summarize.ts' -export type { TrialsConfig } from './commands/trials.ts' -export { runTrials, trials } from './commands/trials.ts' -export type { ValidateRefsConfig } from './commands/validate-refs.ts' -export { runValidateRefs, validateRefs } from './commands/validate-refs.ts' -export type { HeadlessAdapterConfig } from './headless.ts' -// Headless adapter factory -export { headless } from './headless.ts' -export type { SchemasConfig } from './schemas/schemas-cli.ts' -export { runSchemas, schemasCli } from './schemas/schemas-cli.ts' diff --git a/src/headless.ts b/src/headless.ts deleted file mode 100644 index 02530b5..0000000 --- a/src/headless.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Headless adapter factory - schema-driven adapter for any CLI agent. - * - * @remarks - * Re-exports public API from the headless module. 
The headless adapter enables - * capturing trajectories from ANY headless CLI agent by defining a schema - * that describes how to interact with the CLI. - * - * **CLI Usage:** - * ```bash - * agent-eval-harness headless --schema ./my-agent.json - * ``` - * - * **Programmatic Usage:** - * ```typescript - * import { parseHeadlessConfig, createSessionManager } from '@plaited/agent-eval-harness/headless' - * - * const schema = parseHeadlessConfig(jsonConfig) - * const sessions = createSessionManager({ schema }) - * ``` - * - * @packageDocumentation - */ - -// Schema definitions and parsing -export { - HeadlessAdapterSchema, - OutputConfigSchema, - OutputEventExtractSchema, - OutputEventMappingSchema, - OutputEventMatchSchema, - PromptConfigSchema, - parseHeadlessConfig, - ResultConfigSchema, - ResumeConfigSchema, - safeParseHeadlessConfig, -} from './headless/headless.schemas.ts' -// Types -export type { - HeadlessAdapterConfig, - OutputConfig, - OutputEventExtract, - OutputEventMapping, - OutputEventMatch, - PromptConfig, - ResultConfig, - ResumeConfig, -} from './headless/headless.types.ts' -// CLI entry point -export { headless } from './headless/headless-cli.ts' -export type { HistoryBuilder, HistoryBuilderConfig, HistoryTurn } from './headless/headless-history-builder.ts' -// History builder -export { createHistoryBuilder } from './headless/headless-history-builder.ts' -export type { - OutputParser, - ParsedResult, - ParsedUpdate, - ResultParseResult, - SessionUpdateType, -} from './headless/headless-output-parser.ts' -// Output parser -export { createOutputParser, jsonPath, jsonPathString } from './headless/headless-output-parser.ts' -export type { - ProcessExitInfo, - PromptResult, - Session, - SessionManager, - SessionManagerConfig, - UpdateCallback, -} from './headless/headless-session-manager.ts' -// Session manager -export { createSessionManager } from './headless/headless-session-manager.ts' diff --git a/src/headless/headless-cli.ts 
b/src/headless/headless-cli.ts deleted file mode 100644 index 07ab7d4..0000000 --- a/src/headless/headless-cli.ts +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/env bun -/** - * Headless adapter factory CLI entry point. - * - * @remarks - * This module implements a schema-driven adapter that can interact with - * ANY headless CLI agent. The adapter: - * - * 1. Reads a JSON schema defining how to interact with the CLI - * 2. Spawns the CLI process per schema's command + flags - * 3. Parses stdout using schema's outputEvents mappings - * 4. Emits session update notifications - * 5. Manages session state for multi-turn (stream or iterative mode) - * - * @packageDocumentation - */ - -import { createInterface } from 'node:readline' -import { parseArgs } from 'node:util' -import { PROTOCOL_VERSION } from '../schemas/constants.ts' -import { type HeadlessAdapterConfig, parseHeadlessConfig } from './headless.schemas.ts' -import { createSessionManager, type SessionManager } from './headless-session-manager.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** JSON-RPC 2.0 request */ -type JsonRpcRequest = { - jsonrpc: '2.0' - id: string | number - method: string - params?: unknown -} - -/** JSON-RPC 2.0 notification */ -type JsonRpcNotification = { - jsonrpc: '2.0' - method: string - params?: unknown -} - -/** JSON-RPC 2.0 success response */ -type JsonRpcSuccessResponse = { - jsonrpc: '2.0' - id: string | number - result: unknown -} - -/** JSON-RPC 2.0 error response */ -type JsonRpcErrorResponse = { - jsonrpc: '2.0' - id: string | number | null - error: { - code: number - message: string - data?: unknown - } -} - -/** JSON-RPC 2.0 response */ -type JsonRpcResponse = JsonRpcSuccessResponse | JsonRpcErrorResponse - -/** Content block for prompts */ -type ContentBlock = { type: 'text'; text: string } | { type: 'image'; source: unknown } - -// 
============================================================================ -// Message Sending -// ============================================================================ - -/** - * Sends a JSON-RPC message to stdout. - */ -const sendMessage = (message: JsonRpcResponse | JsonRpcNotification): void => { - console.log(JSON.stringify(message)) -} - -/** - * Sends a session update notification. - */ -const sendSessionUpdate = (sessionId: string, update: unknown): void => { - sendMessage({ - jsonrpc: '2.0', - method: 'session/update', - params: { sessionId, update }, - }) -} - -// ============================================================================ -// Request Handlers -// ============================================================================ - -/** - * Creates request handlers for the headless adapter. - * - * @param schema - Headless adapter configuration - * @param sessions - Session manager instance - */ -const createHandlers = (schema: HeadlessAdapterConfig, sessions: SessionManager) => { - /** - * Handle initialize request. - */ - const handleInitialize = async (params: unknown): Promise => { - const { protocolVersion } = params as { protocolVersion: number } - - if (protocolVersion !== PROTOCOL_VERSION) { - throw new Error(`Unsupported protocol version: ${protocolVersion}`) - } - - return { - protocolVersion: PROTOCOL_VERSION, - agentInfo: { - name: schema.name, - version: '1.0.0', - }, - agentCapabilities: { - loadSession: !!schema.resume, - promptCapabilities: { - image: false, - }, - }, - } - } - - /** - * Handle session/new request. - */ - const handleSessionNew = async (params: unknown): Promise => { - const { cwd } = params as { cwd: string } - const session = await sessions.create(cwd) - return { sessionId: session.id } - } - - /** - * Handle session/load request. 
- */ - const handleSessionLoad = async (params: unknown): Promise => { - const { sessionId } = params as { sessionId: string } - const session = sessions.get(sessionId) - - if (!session) { - throw new Error(`Session not found: ${sessionId}`) - } - - return { sessionId } - } - - /** - * Handle session/prompt request. - */ - const handleSessionPrompt = async (params: unknown): Promise => { - const { sessionId, prompt } = params as { sessionId: string; prompt: ContentBlock[] } - - // Extract text from content blocks - const promptText = prompt - .filter((block): block is ContentBlock & { type: 'text' } => block.type === 'text') - .map((block) => block.text) - .join('\n') - - // Execute prompt and stream updates - const result = await sessions.prompt(sessionId, promptText, (update) => { - // Map parsed update to session update format - const sessionUpdate = mapToSessionUpdate(update) - sendSessionUpdate(sessionId, sessionUpdate) - }) - - return { - content: [{ type: 'text', text: result.output }], - } - } - - /** - * Handle session/cancel notification. - */ - const handleSessionCancel = async (params: unknown): Promise => { - const { sessionId } = params as { sessionId: string } - sessions.cancel(sessionId) - } - - return { - handleInitialize, - handleSessionNew, - handleSessionLoad, - handleSessionPrompt, - handleSessionCancel, - } -} - -/** - * Maps a parsed update to session update format. - */ -const mapToSessionUpdate = (update: { type: string; content?: string; title?: string; status?: string }): unknown => { - switch (update.type) { - case 'thought': - return { - sessionUpdate: 'agent_thought_chunk', - content: { type: 'text', text: update.content ?? '' }, - } - - case 'message': - return { - sessionUpdate: 'agent_message_chunk', - content: { type: 'text', text: update.content ?? '' }, - } - - case 'tool_call': - return { - sessionUpdate: 'agent_tool_call', - toolCall: { - name: update.title ?? 'unknown', - status: update.status ?? 
'pending', - }, - } - - case 'plan': - return { - sessionUpdate: 'agent_plan', - content: { type: 'text', text: update.content ?? '' }, - } - - default: - return { - sessionUpdate: 'agent_message_chunk', - content: { type: 'text', text: update.content ?? '' }, - } - } -} - -// ============================================================================ -// Main Loop -// ============================================================================ - -/** - * Runs the headless adapter main loop. - * - * @param schema - Headless adapter configuration - * @param verbose - Whether to show debug output - */ -const runAdapter = async (schema: HeadlessAdapterConfig, verbose = false): Promise => { - const sessions = createSessionManager({ schema, verbose }) - const handlers = createHandlers(schema, sessions) - - // Method handlers (requests expect responses) - const methodHandlers: Record Promise> = { - initialize: handlers.handleInitialize, - 'session/new': handlers.handleSessionNew, - 'session/load': handlers.handleSessionLoad, - 'session/prompt': handlers.handleSessionPrompt, - } - - // Notification handlers (no response expected) - const notificationHandlers: Record Promise> = { - 'session/cancel': handlers.handleSessionCancel, - } - - /** - * Process incoming JSON-RPC message. 
- */ - const processMessage = async (line: string): Promise => { - let request: JsonRpcRequest | JsonRpcNotification - - try { - request = JSON.parse(line) - } catch { - sendMessage({ - jsonrpc: '2.0', - id: null, - error: { code: -32700, message: 'Parse error' }, - }) - return - } - - // Check if it's a notification (no id) - const isNotification = !('id' in request) - - if (isNotification) { - const handler = notificationHandlers[request.method] - if (handler) { - await handler(request.params) - } - // No response for notifications - return - } - - // It's a request - send response - const reqWithId = request as JsonRpcRequest - const handler = methodHandlers[reqWithId.method] - - if (!handler) { - sendMessage({ - jsonrpc: '2.0', - id: reqWithId.id, - error: { code: -32601, message: `Method not found: ${reqWithId.method}` }, - }) - return - } - - try { - const result = await handler(reqWithId.params) - sendMessage({ - jsonrpc: '2.0', - id: reqWithId.id, - result, - }) - } catch (error) { - sendMessage({ - jsonrpc: '2.0', - id: reqWithId.id, - error: { - code: -32603, - message: error instanceof Error ? error.message : 'Internal error', - }, - }) - } - } - - // Main loop: read lines from stdin - const rl = createInterface({ - input: process.stdin, - output: process.stdout, - terminal: false, - }) - - rl.on('line', processMessage) - - // Handle clean shutdown - process.on('SIGTERM', () => { - rl.close() - process.exit(0) - }) - - process.on('SIGINT', () => { - rl.close() - process.exit(0) - }) -} - -// ============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Headless adapter CLI entry point. 
- * - * @param args - Command line arguments - */ -export const headless = async (args: string[]): Promise => { - const { values } = parseArgs({ - args, - options: { - schema: { type: 'string', short: 's' }, - verbose: { type: 'boolean', short: 'v' }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: false, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness headless --schema [--verbose] - -Arguments: - -s, --schema Path to headless adapter schema (JSON) - -v, --verbose Show constructed commands (for debugging) - -h, --help Show this help message - -Description: - Schema-driven adapter for ANY headless CLI agent. The adapter reads - a JSON schema defining how to interact with the CLI and translates between - protocol and CLI stdio. - -Schema Format: - { - "version": 1, - "name": "my-agent", - "command": ["my-agent-cli"], - "sessionMode": "stream" | "iterative", - "prompt": { "flag": "-p" }, - "output": { "flag": "--output-format", "value": "stream-json" }, - "outputEvents": [...], - "result": { "matchPath": "$.type", "matchValue": "result", "contentPath": "$.content" } - } - -Examples: - # Run with Claude headless schema - agent-eval-harness headless --schema ./claude-headless.json - - # Use in capture pipeline - agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json -o results.jsonl -`) - return - } - - if (!values.schema) { - console.error('Error: --schema is required') - console.error('Example: agent-eval-harness headless --schema ./my-agent.json') - process.exit(1) - } - - // Load and validate schema - const schemaPath = values.schema - const schemaFile = Bun.file(schemaPath) - - if (!(await schemaFile.exists())) { - console.error(`Error: schema file not found: ${schemaPath}`) - process.exit(1) - } - - let schema: HeadlessAdapterConfig - try { - const rawSchema = await schemaFile.json() - schema = parseHeadlessConfig(rawSchema) - } catch (error) { - console.error(`Error: invalid schema: ${error instanceof 
Error ? error.message : String(error)}`) - process.exit(1) - } - - // Run the adapter - await runAdapter(schema, values.verbose ?? false) -} - -// Allow direct execution -if (import.meta.main) { - headless(Bun.argv.slice(2)).catch((error) => { - console.error('Error:', error instanceof Error ? error.message : error) - process.exit(1) - }) -} diff --git a/src/headless/headless-history-builder.ts b/src/headless/headless-history-builder.ts deleted file mode 100644 index 28e04c9..0000000 --- a/src/headless/headless-history-builder.ts +++ /dev/null @@ -1,141 +0,0 @@ -/** - * History builder for iterative mode sessions. - * - * @remarks - * In iterative mode, each prompt spawns a new process. The history builder - * accumulates conversation context and formats it using the schema's - * historyTemplate for inclusion in subsequent prompts. - * - * @packageDocumentation - */ - -// ============================================================================ -// Types -// ============================================================================ - -/** A single turn in conversation history */ -export type HistoryTurn = { - /** User input */ - input: string - /** Agent output */ - output: string -} - -/** History builder configuration */ -export type HistoryBuilderConfig = { - /** Template for formatting history (e.g., "User: {{input}}\nAssistant: {{output}}") */ - template?: string -} - -// ============================================================================ -// Default Template -// ============================================================================ - -const DEFAULT_TEMPLATE = 'User: {{input}}\nAssistant: {{output}}' - -// ============================================================================ -// History Builder Factory -// ============================================================================ - -/** - * Creates a history builder for iterative mode sessions. - * - * @remarks - * The history builder: - * 1. Stores conversation turns - * 2. 
Formats history using the template - * 3. Builds complete prompts with context - * - * @param config - History builder configuration - * @returns History builder with add, format, and build methods - */ -export const createHistoryBuilder = (config: HistoryBuilderConfig = {}) => { - const template = config.template ?? DEFAULT_TEMPLATE - const history: HistoryTurn[] = [] - - /** - * Adds a turn to history. - * - * @param input - User input - * @param output - Agent output - */ - const addTurn = (input: string, output: string): void => { - history.push({ input, output }) - } - - /** - * Formats the current history as a string. - * - * @returns Formatted history string - */ - const formatHistory = (): string => { - return history.map((turn) => formatTurn(turn, template)).join('\n\n') - } - - /** - * Builds a prompt with history context. - * - * @remarks - * For the first turn, returns just the input. - * For subsequent turns, prepends formatted history. - * - * @param newInput - The new user input - * @returns Full prompt including history context - */ - const buildPrompt = (newInput: string): string => { - if (history.length === 0) { - return newInput - } - - const formattedHistory = formatHistory() - return `${formattedHistory}\n\nUser: ${newInput}` - } - - /** - * Gets the number of turns in history. - */ - const getLength = (): number => { - return history.length - } - - /** - * Clears all history. - */ - const clear = (): void => { - history.length = 0 - } - - /** - * Gets a copy of the history. - */ - const getHistory = (): HistoryTurn[] => { - return [...history] - } - - return { - addTurn, - formatHistory, - buildPrompt, - getLength, - clear, - getHistory, - } -} - -// ============================================================================ -// Helper Functions -// ============================================================================ - -/** - * Formats a single turn using the template. 
- * - * @param turn - History turn - * @param template - Template string with {{input}} and {{output}} placeholders - * @returns Formatted turn string - */ -const formatTurn = (turn: HistoryTurn, template: string): string => { - return template.replace('{{input}}', turn.input).replace('{{output}}', turn.output) -} - -/** History builder type */ -export type HistoryBuilder = ReturnType diff --git a/src/headless/headless-output-parser.ts b/src/headless/headless-output-parser.ts deleted file mode 100644 index cbf374f..0000000 --- a/src/headless/headless-output-parser.ts +++ /dev/null @@ -1,388 +0,0 @@ -/** - * Generic output parser for headless CLI agents. - * - * @remarks - * Uses schema-defined mappings to convert CLI JSON output into session updates. - * Supports JSONPath-like expressions for matching and extraction. - * - * @packageDocumentation - */ - -import type { HeadlessAdapterConfig, OutputEventMapping, PassthroughTypeMap } from './headless.schemas.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** session update types */ -export type SessionUpdateType = 'thought' | 'tool_call' | 'message' | 'plan' - -/** Parsed session update from CLI output */ -export type ParsedUpdate = { - type: SessionUpdateType - content?: string - title?: string - status?: string - input?: unknown - output?: unknown - timestamp: number - raw: unknown -} - -/** Result extraction from CLI output */ -export type ParsedResult = { - isResult: true - content: string - raw: unknown -} - -/** Not a result */ -export type NotResult = { - isResult: false -} - -/** Parse result for final output */ -export type ResultParseResult = ParsedResult | NotResult - -// ============================================================================ -// JSONPath Implementation -// ============================================================================ - -/** - * Extracts a 
value from an object using a simple JSONPath expression. - * - * @remarks - * Supports: - * - `$.field` - Root field access - * - `$.nested.field` - Nested field access - * - `$.array[0]` - Array index access - * - `$.array[*]` - Array wildcard (returns all items) - * - `$.array[0].field` - Combined array and field access - * - `'literal'` - Literal string values (single quotes) - * - * @param obj - Object to extract from - * @param path - JSONPath expression - * @returns Extracted value, array of values (for wildcard), or undefined - */ -export const jsonPath = (obj: unknown, path: string): unknown => { - // Handle literal strings (e.g., "'pending'") - if (path.startsWith("'") && path.endsWith("'")) { - return path.slice(1, -1) - } - - // Handle JSONPath expressions (e.g., "$.type", "$.message.content[0].text") - if (!path.startsWith('$.')) { - return undefined - } - - // Parse path into segments, handling both dot notation and array indices - // e.g., "message.content[0].text" -> ["message", "content", 0, "text"] - // e.g., "message.content[*].type" -> ["message", "content", "*", "type"] - const segments: (string | number | '*')[] = [] - const pathBody = path.slice(2) // Remove "$." 
- - // Split by dots first, then handle array indices within each part - for (const part of pathBody.split('.')) { - if (!part) continue - - // Check for array wildcard: "content[*]" - const wildcardMatch = part.match(/^([^[]*)\[\*\]$/) - if (wildcardMatch) { - const propName = wildcardMatch[1] - if (propName) { - segments.push(propName) - } - segments.push('*') - continue - } - - // Check for array index: "content[0]" or just "[0]" - const arrayMatch = part.match(/^([^[]*)\[(\d+)\]$/) - if (arrayMatch) { - const propName = arrayMatch[1] - const indexStr = arrayMatch[2] - if (propName) { - segments.push(propName) - } - if (indexStr) { - segments.push(parseInt(indexStr, 10)) - } - } else { - segments.push(part) - } - } - - let current: unknown = obj - - for (const segment of segments) { - if (current === null || current === undefined) { - return undefined - } - - if (segment === '*') { - // Array wildcard - return array as-is for further processing - if (!Array.isArray(current)) { - return undefined - } - return current - } else if (typeof segment === 'number') { - // Array index access - if (!Array.isArray(current)) { - return undefined - } - current = current[segment] - } else { - // Property access - if (typeof current !== 'object') { - return undefined - } - current = (current as Record)[segment] - } - } - - return current -} - -/** - * Extracts a string value from an object using JSONPath. - * - * @param obj - Object to extract from - * @param path - JSONPath expression - * @returns String value or undefined - */ -export const jsonPathString = (obj: unknown, path: string): string | undefined => { - const value = jsonPath(obj, path) - if (value === undefined || value === null) { - return undefined - } - return String(value) -} - -// ============================================================================ -// Output Parser Factory -// ============================================================================ - -/** - * Parse line using passthrough mode. 
- * - * @remarks - * Passthrough mode directly maps the agent's type field to session update types. - * Simpler than JSONPath for agents with well-structured output. - * - * @param line - JSON string from CLI stdout - * @param typeMap - Passthrough type mapping configuration - * @returns Parsed update or null if no mapping matches - */ -const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpdate | null => { - let event: Record - try { - event = JSON.parse(line) as Record - } catch { - return null - } - - const typeField = typeMap.typeField ?? 'type' - const eventType = event[typeField] - - if (typeof eventType !== 'string') { - return null - } - - // Check if this type has a mapping - const typeValues = typeMap.typeValues as Record | undefined - const mappedType = typeValues?.[eventType] - if (!mappedType) { - // No explicit mapping - try direct match if it's a valid session type - const validTypes = ['thought', 'tool_call', 'message', 'plan'] as const - if (!validTypes.includes(eventType as (typeof validTypes)[number])) { - return null - } - // Use the event type directly if it's already a valid session type - return { - type: eventType as SessionUpdateType, - content: typeof event.content === 'string' ? event.content : undefined, - title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined, - status: typeof event.status === 'string' ? event.status : undefined, - input: event.input, - output: event.output, - timestamp: Date.now(), - raw: event, - } - } - - // Use mapped type - return { - type: mappedType, - content: typeof event.content === 'string' ? event.content : undefined, - title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined, - status: typeof event.status === 'string' ? 
event.status : undefined, - input: event.input, - output: event.output, - timestamp: Date.now(), - raw: event, - } -} - -/** - * Creates an output parser from adapter configuration. - * - * @remarks - * The parser uses the schema's outputEvents mappings to: - * 1. Match incoming JSON lines against patterns - * 2. Extract content using JSONPath expressions - * 3. Emit session update objects - * - * Supports two modes: - * - 'jsonpath' (default): Uses outputEvents for complex pattern matching - * - 'passthrough': Direct type mapping for well-structured output - * - * @param config - Headless adapter configuration - * @returns Parser function for individual lines - */ -export const createOutputParser = (config: HeadlessAdapterConfig) => { - const { result, outputMode = 'jsonpath', outputEvents = [], passthroughTypeMap } = config - - /** - * Parses a single JSON line from CLI output. - * - * @param line - JSON string from CLI stdout - * @returns Parsed update, array of updates (for wildcard matches), or null if no mapping matches - */ - const parseLine = (line: string): ParsedUpdate | ParsedUpdate[] | null => { - // Use passthrough mode if configured - if (outputMode === 'passthrough' && passthroughTypeMap) { - return parsePassthrough(line, passthroughTypeMap) - } - - // JSONPath mode (default) - if (!outputEvents || outputEvents.length === 0) { - return null - } - - let event: unknown - try { - event = JSON.parse(line) - } catch { - // Not valid JSON, skip - return null - } - - // Try each mapping until one matches - for (const mapping of outputEvents) { - const matchValue = jsonPath(event, mapping.match.path) - - // Handle array results from wildcard paths (e.g., $.message.content[*]) - if (Array.isArray(matchValue)) { - const updates: ParsedUpdate[] = [] - for (const item of matchValue) { - // Check if this array item matches the expected value - if (mapping.match.value === '*') { - // Wildcard: match any non-null item - if (item !== undefined && item !== null) { - 
updates.push(createUpdate(item, mapping)) - } - } else if (typeof item === 'object' && item !== null && 'type' in item) { - // For objects with 'type' property, check nested match - const itemType = (item as Record).type - if (itemType === mapping.match.value) { - updates.push(createUpdate(item, mapping)) - } - } else if (item === mapping.match.value) { - // For primitives, direct match - updates.push(createUpdate(item, mapping)) - } - } - if (updates.length > 0) { - return updates - } - } else { - // Single value matching (original behavior) - if (mapping.match.value === '*') { - if (matchValue !== undefined && matchValue !== null) { - return createUpdate(event, mapping) - } - } else if (matchValue === mapping.match.value) { - return createUpdate(event, mapping) - } - } - } - - return null - } - - /** - * Creates a ParsedUpdate from a matched event. - */ - const createUpdate = (event: unknown, mapping: OutputEventMapping): ParsedUpdate => { - const update: ParsedUpdate = { - type: mapping.emitAs, - timestamp: Date.now(), - raw: event, - } - - if (mapping.extract) { - if (mapping.extract.content) { - update.content = jsonPathString(event, mapping.extract.content) - } - if (mapping.extract.title) { - update.title = jsonPathString(event, mapping.extract.title) - } - if (mapping.extract.status) { - update.status = jsonPathString(event, mapping.extract.status) - } - if (mapping.extract.input) { - const value = jsonPath(event, mapping.extract.input) - if (value !== undefined) { - update.input = value - } - } - if (mapping.extract.output) { - const value = jsonPath(event, mapping.extract.output) - if (value !== undefined) { - update.output = value - } - } - } - - return update - } - - /** - * Checks if a JSON line represents the final result. 
- * - * @param line - JSON string from CLI stdout - * @returns Result extraction or indication that it's not a result - */ - const parseResult = (line: string): ResultParseResult => { - let event: unknown - try { - event = JSON.parse(line) - } catch { - return { isResult: false } - } - - const matchValue = jsonPath(event, result.matchPath) - // Support wildcard "*" to match any non-null value - const matches = - result.matchValue === '*' ? matchValue !== undefined && matchValue !== null : matchValue === result.matchValue - - if (matches) { - const content = jsonPathString(event, result.contentPath) - return { - isResult: true, - content: content ?? '', - raw: event, - } - } - - return { isResult: false } - } - - return { - parseLine, - parseResult, - } -} - -/** Output parser type */ -export type OutputParser = ReturnType diff --git a/src/headless/headless-session-manager.ts b/src/headless/headless-session-manager.ts deleted file mode 100644 index 73638ad..0000000 --- a/src/headless/headless-session-manager.ts +++ /dev/null @@ -1,590 +0,0 @@ -/** - * Session manager for headless CLI agents. 
- * - * @remarks - * Manages the lifecycle of CLI agent sessions including: - * - Process spawning and tracking - * - Stream mode (persistent process) vs iterative mode (new process per turn) - * - Output parsing and update emission - * - Session state management - * - * @packageDocumentation - */ - -import type { Subprocess } from 'bun' -import type { HeadlessAdapterConfig } from './headless.schemas.ts' -import { createHistoryBuilder, type HistoryBuilder } from './headless-history-builder.ts' -import { createOutputParser, type OutputParser, type ParsedUpdate } from './headless-output-parser.ts' - -// ============================================================================ -// Types -// ============================================================================ - -/** Session state */ -export type Session = { - /** Unique session identifier */ - id: string - /** Working directory for this session */ - cwd: string - /** Subprocess (stream mode only) */ - process?: Subprocess - /** History builder (iterative mode only) */ - history?: HistoryBuilder - /** Session ID from CLI (for resume, stream mode) */ - cliSessionId?: string - /** Whether the session is active */ - active: boolean - /** Turn count for this session */ - turnCount: number -} - -/** Process exit information for debugging */ -export type ProcessExitInfo = { - /** Exit code (null if killed by signal or timed out) */ - exitCode: number | null - /** Signal that killed the process (if any) */ - signal?: string - /** Whether the process was killed due to timeout */ - timedOut: boolean -} - -/** Update callback for emitting session updates */ -export type UpdateCallback = (update: ParsedUpdate) => void - -/** Prompt result with final output */ -export type PromptResult = { - /** Final output content */ - output: string - /** All updates collected during the prompt */ - updates: ParsedUpdate[] - /** Session ID from CLI (if available) */ - cliSessionId?: string - /** Process exit information */ - 
exitInfo?: ProcessExitInfo -} - -/** Session manager configuration */ -export type SessionManagerConfig = { - /** Headless adapter configuration */ - schema: HeadlessAdapterConfig - /** Default timeout for operations in ms (overrides schema timeout) */ - timeout?: number - /** Whether to show debug output (constructed commands, raw stdout) */ - verbose?: boolean - /** - * Debug mode - shows detailed output for troubleshooting. - * When enabled: - * - Raw CLI stdout/stderr is logged - * - JSONPath match attempts and results are shown - * - Process spawn/exit info is displayed - * - Timing for each stage is reported - */ - debug?: boolean -} - -// ============================================================================ -// Session Manager Factory -// ============================================================================ - -/** - * Creates a session manager for headless CLI agents. - * - * @remarks - * The session manager is the core orchestrator for CLI agent interaction: - * - * **Stream mode:** - * - Spawns one process per session - * - Keeps process alive across turns - * - Uses stdin/stdout for communication - * - Supports session resume via CLI flags - * - * **Iterative mode:** - * - Spawns a new process per turn - * - Accumulates history in prompts - * - No persistent process state - * - * @param config - Session manager configuration - * @returns Session manager with create, prompt, and cancel methods - */ -export const createSessionManager = (config: SessionManagerConfig) => { - const { schema, verbose = false, debug = false } = config - // Use schema timeout if available, otherwise default to 60000ms - const schemaTimeout = 'timeout' in schema ? (schema.timeout ?? 60000) : 60000 - const timeout = config.timeout ?? schemaTimeout - const sessions = new Map() - const outputParser = createOutputParser(schema) - - /** - * Debug logging helper - only logs when debug mode is enabled. 
- */ - const debugLog = (category: string, message: string, data?: unknown): void => { - if (debug) { - const timestamp = new Date().toISOString() - console.error(`[${timestamp}] [${category}] ${message}`) - if (data !== undefined) { - console.error(JSON.stringify(data, null, 2)) - } - } - } - - /** - * Creates a new session. - * - * @param cwd - Working directory for the session - * @returns Created session - */ - const create = async (cwd: string): Promise => { - const id = generateSessionId() - - const session: Session = { - id, - cwd, - active: true, - turnCount: 0, - } - - // Initialize mode-specific state - if (schema.sessionMode === 'iterative') { - // Normalize historyTemplate: v2 schemas can have object format, convert to string - let templateString: string | undefined - if (typeof schema.historyTemplate === 'object' && schema.historyTemplate !== null) { - // Use turnFormat from object-style template - templateString = schema.historyTemplate.turnFormat - } else { - templateString = schema.historyTemplate - } - session.history = createHistoryBuilder({ - template: templateString, - }) - } - - sessions.set(id, session) - return session - } - - /** - * Sends a prompt to a session and collects the response. - * - * @param sessionId - Session ID - * @param promptText - Prompt text to send - * @param onUpdate - Callback for streaming updates - * @returns Prompt result with output and updates - */ - const prompt = async (sessionId: string, promptText: string, onUpdate?: UpdateCallback): Promise => { - const session = sessions.get(sessionId) - if (!session) { - throw new Error(`Session not found: ${sessionId}`) - } - - if (!session.active) { - throw new Error(`Session is not active: ${sessionId}`) - } - - session.turnCount++ - - if (schema.sessionMode === 'stream') { - return promptStream(session, promptText, onUpdate) - } - - return promptIterative(session, promptText, onUpdate) - } - - /** - * Stream mode: send prompt via stdin to persistent process. 
- */ - const promptStream = async ( - session: Session, - promptText: string, - onUpdate?: UpdateCallback, - ): Promise => { - // Build command for first turn or if no process exists - if (!session.process || session.process.killed) { - const args = buildCommand(session, promptText) - - // Choose stdin mode based on schema configuration - const stdinMode = schema.prompt.stdin ? 'pipe' : 'ignore' - - session.process = Bun.spawn(args, { - cwd: session.cwd, - stdin: stdinMode, - stdout: 'pipe', - stderr: 'inherit', - }) - - // If using stdin, write the prompt and close stdin - // (stream mode spawns new process per turn, so stdin should close after writing) - if (schema.prompt.stdin && session.process) { - writePromptToStdin(session.process, promptText, true) - } - } else { - // Subsequent turns: spawn new process with resume flag - const args = buildCommand(session, promptText) - const stdinMode = schema.prompt.stdin ? 'pipe' : 'ignore' - - session.process = Bun.spawn(args, { - cwd: session.cwd, - stdin: stdinMode, - stdout: 'pipe', - stderr: 'inherit', - }) - - // If using stdin, write the prompt and close stdin - // (stream mode spawns new process per turn, so stdin should close after writing) - if (schema.prompt.stdin && session.process) { - writePromptToStdin(session.process, promptText, true) - } - } - - return collectOutput(session, outputParser, onUpdate, timeout, debugLog) - } - - /** - * Iterative mode: spawn new process per turn with history context. - */ - const promptIterative = async ( - session: Session, - promptText: string, - onUpdate?: UpdateCallback, - ): Promise => { - // Build full prompt with history - const fullPrompt = session.history?.buildPrompt(promptText) ?? promptText - - // Build and spawn command - const args = buildCommand(session, fullPrompt) - const stdinMode = schema.prompt.stdin ? 
'pipe' : 'ignore' - - session.process = Bun.spawn(args, { - cwd: session.cwd, - stdin: stdinMode, - stdout: 'pipe', - stderr: 'inherit', - }) - - // If using stdin, write the prompt and close stdin - // (iterative mode spawns new process per turn, so stdin should close after writing) - if (schema.prompt.stdin && session.process) { - writePromptToStdin(session.process, fullPrompt, true) - } - - const result = await collectOutput(session, outputParser, onUpdate, timeout, debugLog) - - // Store in history for next turn - session.history?.addTurn(promptText, result.output) - - // Clean up process - session.process = undefined - - return result - } - - /** - * Builds the command array for spawning the CLI. - */ - const buildCommand = (session: Session, promptText: string): string[] => { - const args = [...schema.command] - - // Add output format flags (only if non-empty) - if (schema.output.flag) { - args.push(schema.output.flag, schema.output.value) - } - - // Add auto-approve flags - if (schema.autoApprove) { - args.push(...schema.autoApprove) - } - - // Add cwd flag if specified - if (schema.cwdFlag) { - args.push(schema.cwdFlag, session.cwd) - } - - // Add resume flag if available (stream mode, after first turn) - if (schema.sessionMode === 'stream' && schema.resume && session.cliSessionId) { - args.push(schema.resume.flag, session.cliSessionId) - } - - // Add prompt flag and text (skip if using stdin) - if (!schema.prompt.stdin) { - if (schema.prompt.flag) { - args.push(schema.prompt.flag, promptText) - } else { - // Positional argument (no flag) - args.push(promptText) - } - } - - // Debug output: show constructed command - if (verbose || debug) { - const stdinNote = schema.prompt.stdin ? ' (+ stdin)' : '' - console.error(`[headless] Command: ${args.join(' ')}${stdinNote}`) - } - - return args - } - - /** - * Cancels an active session. 
- * - * @param sessionId - Session ID to cancel - */ - const cancel = (sessionId: string): void => { - const session = sessions.get(sessionId) - if (!session) return - - session.active = false - - if (session.process && !session.process.killed) { - session.process.kill() - } - } - - /** - * Gets a session by ID. - * - * @param sessionId - Session ID - * @returns Session or undefined - */ - const get = (sessionId: string): Session | undefined => { - return sessions.get(sessionId) - } - - /** - * Deletes a session. - * - * @param sessionId - Session ID - */ - const destroy = (sessionId: string): void => { - cancel(sessionId) - sessions.delete(sessionId) - } - - return { - create, - prompt, - cancel, - get, - destroy, - } -} - -// ============================================================================ -// Helper Functions -// ============================================================================ - -/** - * Generates a unique session ID. - * - * @remarks - * Uses crypto.randomUUID() for secure random generation instead of Math.random(). - */ -const generateSessionId = (): string => { - return `sess_${crypto.randomUUID()}` -} - -/** - * Writes a prompt to a process stdin stream. - * - * @remarks - * Uses Bun's FileSink API to write text to the process stdin. - * The FileSink type provides `write()` and `flush()` methods for - * efficient stream writing without async overhead. - * - * Type guard ensures stdin is a FileSink (not a file descriptor number) - * before attempting to write. This handles Bun's subprocess stdin types: - * - `'pipe'` → FileSink with write/flush methods - * - `'ignore'` → null (not writable) - * - number → file descriptor (not a FileSink) - * - * **Closing stdin:** When `closeAfterWrite` is true, the stdin stream is - * closed after writing. This is required for CLIs that read from stdin - * with `-` and wait for EOF before processing (e.g., Codex). For stream - * mode sessions where stdin stays open for subsequent prompts, pass false. 
- * - * @param process - Subprocess with stdin stream - * @param prompt - Prompt text to write - * @param closeAfterWrite - Whether to close stdin after writing (default: false) - * - * @internal - */ -const writePromptToStdin = (process: Subprocess, prompt: string, closeAfterWrite = false): void => { - if (process.stdin && typeof process.stdin !== 'number') { - process.stdin.write(`${prompt}\n`) - process.stdin.flush() - if (closeAfterWrite) { - process.stdin.end() - } - } -} - -/** - * Collects output from a running process. - * - * @param session - Active session - * @param parser - Output parser - * @param onUpdate - Update callback - * @param timeoutMs - Timeout in ms - * @param logDebug - Debug logging function - * @returns Collected output and updates - */ -const collectOutput = async ( - session: Session, - parser: OutputParser, - onUpdate: UpdateCallback | undefined, - timeoutMs: number, - logDebug: (category: string, message: string, data?: unknown) => void, -): Promise => { - const updates: ParsedUpdate[] = [] - let output = '' - let cliSessionId: string | undefined - const accumulatedMessages: string[] = [] - let timedOut = false - - const stdout = session.process?.stdout - if (!stdout || typeof stdout === 'number') { - throw new Error('No stdout available') - } - - const reader = stdout.getReader() - const decoder = new TextDecoder() - let buffer = '' - - // Track timeout with a timer ID so we can clear it - let timeoutId: Timer | undefined - - const timeoutPromise = new Promise<'timeout'>((resolve) => { - timeoutId = setTimeout(() => resolve('timeout'), timeoutMs) - }) - - logDebug('process', `Starting output collection with ${timeoutMs}ms timeout`) - - try { - const readLoop = async (): Promise<'complete'> => { - readLines: while (true) { - const { done, value } = await reader.read() - - if (done) { - logDebug('process', 'Process stdout closed') - break - } - - const chunk = decoder.decode(value, { stream: true }) - logDebug('raw', `Received 
${chunk.length} bytes`) - - buffer += chunk - - // Process complete lines - const lines = buffer.split('\n') - buffer = lines.pop() ?? '' - - for (const line of lines) { - if (!line.trim()) continue - - logDebug('line', `Processing line: ${line.slice(0, 100)}${line.length > 100 ? '...' : ''}`) - - // Parse as update first (so updates are emitted even for result lines) - const update = parser.parseLine(line) - if (update !== null) { - // Handle both single updates and arrays of updates (from wildcard matches) - const updatesToProcess = Array.isArray(update) ? update : [update] - - for (const singleUpdate of updatesToProcess) { - logDebug('parse', `Matched event: ${singleUpdate.type}`, { - title: singleUpdate.title, - status: singleUpdate.status, - content: singleUpdate.content?.slice(0, 50), - }) - - updates.push(singleUpdate) - onUpdate?.(singleUpdate) - - // Accumulate message content for fallback - if (singleUpdate.type === 'message' && singleUpdate.content) { - accumulatedMessages.push(singleUpdate.content) - } - - // Extract CLI session ID if available - if (!cliSessionId && singleUpdate.raw && typeof singleUpdate.raw === 'object') { - const raw = singleUpdate.raw as Record - if (typeof raw.session_id === 'string') { - cliSessionId = raw.session_id - session.cliSessionId = cliSessionId - logDebug('session', `Extracted CLI session ID: ${cliSessionId}`) - } - } - } - } else { - logDebug('parse', 'No matching event mapping for line') - } - - // Check for final result (after emitting update) - const resultCheck = parser.parseResult(line) - if (resultCheck.isResult) { - output = resultCheck.content - logDebug('result', `Found result: ${output.slice(0, 100)}${output.length > 100 ? '...' 
: ''}`) - break readLines // Exit both loops immediately on result - } - } - } - return 'complete' - } - - const raceResult = await Promise.race([readLoop(), timeoutPromise]) - - if (raceResult === 'timeout') { - timedOut = true - logDebug('timeout', `Process timed out after ${timeoutMs}ms`) - - // Kill the process on timeout - if (session.process && !session.process.killed) { - session.process.kill('SIGTERM') - logDebug('process', 'Sent SIGTERM to process') - } - } - } finally { - if (timeoutId) { - clearTimeout(timeoutId) - } - reader.releaseLock() - } - - // Fallback: if result contentPath didn't yield output, use accumulated messages - if (!output && accumulatedMessages.length > 0) { - output = accumulatedMessages.join('\n') - logDebug('fallback', `Using accumulated messages as output (${accumulatedMessages.length} messages)`) - } - - // Get exit info from process - let exitInfo: ProcessExitInfo | undefined - if (session.process) { - try { - // Wait for process to exit (with a short timeout to not block) - const exitCode = await Promise.race([ - session.process.exited, - new Promise((resolve) => setTimeout(() => resolve(null), 1000)), - ]) - - exitInfo = { - exitCode: exitCode, - timedOut, - signal: timedOut ? 'SIGTERM' : undefined, - } - - logDebug('exit', `Process exit info`, exitInfo) - } catch { - exitInfo = { - exitCode: null, - timedOut, - } - } - } - - return { - output, - updates, - cliSessionId, - exitInfo, - } -} - -/** Session manager type */ -export type SessionManager = ReturnType diff --git a/src/headless/headless.schemas.ts b/src/headless/headless.schemas.ts deleted file mode 100644 index 18f6145..0000000 --- a/src/headless/headless.schemas.ts +++ /dev/null @@ -1,321 +0,0 @@ -/** - * Zod schemas for headless adapter configuration. - * - * @remarks - * These schemas define how to interact with ANY headless CLI agent via a - * schema-driven approach. 
No hardcoded agent-specific logic - the schema - * defines everything: command, flags, output parsing rules. - * - * @packageDocumentation - */ - -import { z } from 'zod' - -// ============================================================================ -// Output Event Mapping Schema -// ============================================================================ - -/** - * Schema for matching CLI output to session update types. - * - * @remarks - * Uses JSONPath-like patterns to match events in CLI JSON output - * and map them to session update types. - */ -export const OutputEventMatchSchema = z.object({ - /** JSONPath to match event type in CLI output (e.g., "$.type") */ - path: z.string(), - /** Value to match at the path (e.g., "tool_use") */ - value: z.string(), -}) - -/** Output event match type */ -export type OutputEventMatch = z.infer - -/** - * Schema for extracting content from matched events. - * - * @remarks - * Known fields (`content`, `title`, `status`, `input`, `output`) are used by the - * output parser to populate `ParsedUpdate` properties. Additional string-valued - * fields are preserved during validation for forward compatibility but are not - * consumed by the parser. 
- * - * Paths can be: - * - JSONPath expressions (e.g., "$.message.text") - * - Literal strings in single quotes (e.g., "'pending'") - */ -export const OutputEventExtractSchema = z - .object({ - /** JSONPath to extract main content */ - content: z.string().optional(), - /** JSONPath to extract title (for tool calls) */ - title: z.string().optional(), - /** JSONPath to extract status (or literal like "'pending'") */ - status: z.string().optional(), - /** JSONPath to extract tool input arguments (e.g., "$.input") */ - input: z.string().optional(), - /** JSONPath to extract tool output/result content (e.g., "$.content") */ - output: z.string().optional(), - }) - .catchall(z.string()) - -/** Output event extract type */ -export type OutputEventExtract = z.infer - -/** - * Schema for mapping CLI output events to session update types. - * - * @remarks - * Each mapping specifies: - * 1. How to match events (match.path + match.value) - * 2. What session update type to emit (emitAs) - * 3. What content to extract (extract) - */ -export const OutputEventMappingSchema = z.object({ - /** Matching criteria for CLI output */ - match: OutputEventMatchSchema, - /** session update type to emit */ - emitAs: z.enum(['thought', 'tool_call', 'message', 'plan']), - /** Content extraction configuration */ - extract: OutputEventExtractSchema.optional(), -}) - -/** Output event mapping type */ -export type OutputEventMapping = z.infer - -// ============================================================================ -// Prompt Configuration Schema -// ============================================================================ - -/** - * Schema for how to pass prompts to the CLI. - * - * @remarks - * Three modes are supported: - * 1. **Flag-based**: `flag: "-p"` - Pass prompt via command-line flag - * 2. **Positional**: `flag: ""` - Pass prompt as positional argument - * 3. 
**Stdin**: `stdin: true` - Write prompt to stdin (command should include `-` or equivalent) - */ -export const PromptConfigSchema = z - .object({ - /** Flag to pass prompt (e.g., "-p", "--prompt"). Empty string for positional. */ - flag: z.string().optional(), - /** Use stdin to pass prompt instead of command args */ - stdin: z.boolean().optional(), - /** Format for stdin input in stream mode */ - stdinFormat: z.enum(['text', 'json']).optional(), - }) - .refine((data) => !(data.flag && data.stdin), { - message: "Cannot specify both 'flag' and 'stdin' modes - use either flag-based or stdin mode, not both", - }) - -/** Prompt configuration type */ -export type PromptConfig = z.infer - -// ============================================================================ -// Output Configuration Schema -// ============================================================================ - -/** - * Schema for output format configuration. - */ -export const OutputConfigSchema = z.object({ - /** Flag for output format (e.g., "--output-format") */ - flag: z.string(), - /** Value for output format (e.g., "stream-json") */ - value: z.string(), -}) - -/** Output configuration type */ -export type OutputConfig = z.infer - -// ============================================================================ -// Resume Configuration Schema -// ============================================================================ - -/** - * Schema for session resume support (stream mode). - */ -export const ResumeConfigSchema = z.object({ - /** Flag to resume session (e.g., "--resume") */ - flag: z.string(), - /** JSONPath to extract session ID from output */ - sessionIdPath: z.string(), -}) - -/** Resume configuration type */ -export type ResumeConfig = z.infer - -// ============================================================================ -// Result Configuration Schema -// ============================================================================ - -/** - * Schema for final result extraction. 
- */ -export const ResultConfigSchema = z.object({ - /** JSONPath to match result type (e.g., "$.type") */ - matchPath: z.string(), - /** Value indicating final result (e.g., "result") */ - matchValue: z.string(), - /** JSONPath to extract result content */ - contentPath: z.string(), -}) - -/** Result configuration type */ -export type ResultConfig = z.infer - -// ============================================================================ -// Passthrough Type Mapping Schema -// ============================================================================ - -/** - * Schema for passthrough type mapping. - * - * @remarks - * Used when outputMode is 'passthrough' to map agent's native type names - * to standard session update types. Useful for agents with well-structured - * output that doesn't need complex JSONPath parsing. - */ -export const PassthroughTypeMapSchema = z.object({ - /** JSON field that contains the event type (default: "type") */ - typeField: z.string().default('type'), - /** Mapping from agent type values to session update types */ - typeValues: z.record(z.string(), z.enum(['thought', 'tool_call', 'message', 'plan'])).optional(), -}) - -/** Passthrough type mapping type */ -export type PassthroughTypeMap = z.infer - -// ============================================================================ -// Main Adapter Schema -// ============================================================================ - -/** - * Schema for headless adapter configuration. 
- * - * @remarks - * This schema defines everything needed to interact with a headless CLI agent: - * - Command and flags to spawn - * - How to pass prompts - * - How to parse output (jsonpath or passthrough mode) - * - Session handling mode - * - * Supports two output parsing modes: - * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default) - * - 'passthrough': Direct type mapping for well-structured output - * - * Example (Claude): - * ```json - * { - * "version": 1, - * "name": "claude-headless", - * "command": ["claude"], - * "sessionMode": "stream", - * "timeout": 90000, - * "prompt": { "flag": "-p" }, - * "output": { "flag": "--output-format", "value": "stream-json" }, - * "outputEvents": [...] - * } - * ``` - */ -export const HeadlessAdapterSchema = z.object({ - /** Schema version */ - version: z.literal(1), - - /** Human-readable adapter name */ - name: z.string(), - - /** Base command to spawn (e.g., ["claude"], ["gemini"]) */ - command: z.array(z.string()), - - /** - * Session mode determines how multi-turn conversations work: - * - 'stream': Keep process alive, multi-turn via stdin - * - 'iterative': New process per turn, accumulate context in prompt - */ - sessionMode: z.enum(['stream', 'iterative']), - - /** Default timeout for this agent in milliseconds (can be overridden per-prompt) */ - timeout: z.number().optional(), - - /** How to pass the prompt */ - prompt: PromptConfigSchema, - - /** Output format configuration */ - output: OutputConfigSchema, - - /** Flags for auto-approval in headless mode (e.g., ["--allowedTools", "*"]) */ - autoApprove: z.array(z.string()).optional(), - - /** Session resume support (stream mode only) */ - resume: ResumeConfigSchema.optional(), - - /** Working directory flag (if CLI needs explicit --cwd) */ - cwdFlag: z.string().optional(), - - /** - * Output parsing mode: - * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default) - * - 'passthrough': Direct type mapping for 
well-structured output - */ - outputMode: z.enum(['jsonpath', 'passthrough']).default('jsonpath'), - - /** Output event mappings - how to parse CLI output into updates (jsonpath mode) */ - outputEvents: z.array(OutputEventMappingSchema).optional(), - - /** Type mapping for passthrough mode */ - passthroughTypeMap: PassthroughTypeMapSchema.optional(), - - /** Final result extraction configuration */ - result: ResultConfigSchema, - - /** - * Template for formatting conversation history (iterative mode only). - * - * @remarks - * Supports both string format (simple) and object format (advanced): - * - String: "User: {{input}}\nAssistant: {{output}}" - * - Object: { system: "...", turnFormat: "..." } - */ - historyTemplate: z - .union([ - z.string(), - z.object({ - /** System prefix for accumulated history */ - system: z.string().optional(), - /** Format for each turn: {{input}} and {{output}} placeholders */ - turnFormat: z.string(), - }), - ]) - .optional(), -}) - -/** Headless adapter configuration type */ -export type HeadlessAdapterConfig = z.infer - -// ============================================================================ -// Validation Helpers -// ============================================================================ - -/** - * Validates and parses a headless adapter configuration. - * - * @param config - Raw configuration object (e.g., from JSON file) - * @returns Validated HeadlessAdapterConfig - * @throws ZodError if validation fails - */ -export const parseHeadlessConfig = (config: unknown): HeadlessAdapterConfig => { - return HeadlessAdapterSchema.parse(config) -} - -/** - * Safely validates a headless adapter configuration. 
- * - * @param config - Raw configuration object - * @returns Result with success/failure and data or error - */ -export const safeParseHeadlessConfig = (config: unknown) => { - return HeadlessAdapterSchema.safeParse(config) -} diff --git a/src/headless/headless.types.ts b/src/headless/headless.types.ts deleted file mode 100644 index 95b0a80..0000000 --- a/src/headless/headless.types.ts +++ /dev/null @@ -1,19 +0,0 @@ -/** - * Type exports for headless adapter. - * - * @remarks - * Re-exports all types from the schemas module for external consumers. - * - * @packageDocumentation - */ - -export type { - HeadlessAdapterConfig, - OutputConfig, - OutputEventExtract, - OutputEventMapping, - OutputEventMatch, - PromptConfig, - ResultConfig, - ResumeConfig, -} from './headless.schemas.ts' diff --git a/src/headless/tests/fixtures/claude-headless.json b/src/headless/tests/fixtures/claude-headless.json deleted file mode 100644 index 632b72f..0000000 --- a/src/headless/tests/fixtures/claude-headless.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "version": 1, - "name": "claude-headless", - "command": ["claude"], - "sessionMode": "stream", - "prompt": { - "flag": "-p" - }, - "output": { - "flag": "--output-format", - "value": "stream-json" - }, - "autoApprove": ["--dangerously-skip-permissions", "--verbose"], - "resume": { - "flag": "--resume", - "sessionIdPath": "$.session_id" - }, - "outputEvents": [ - { - "match": { "path": "$.type", "value": "assistant" }, - "emitAs": "message", - "extract": { "content": "$.message.content[0].text" } - }, - { - "match": { "path": "$.type", "value": "tool_use" }, - "emitAs": "tool_call", - "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" } - }, - { - "match": { "path": "$.type", "value": "tool_result" }, - "emitAs": "tool_call", - "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" } - } - ], - "result": { - "matchPath": "$.type", - "matchValue": "result", - "contentPath": "$.result" - } -} diff 
--git a/src/headless/tests/fixtures/gemini-headless.json b/src/headless/tests/fixtures/gemini-headless.json deleted file mode 100644 index bd09dec..0000000 --- a/src/headless/tests/fixtures/gemini-headless.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "version": 1, - "name": "gemini-headless", - "command": ["gemini"], - "sessionMode": "iterative", - "prompt": { - "flag": "" - }, - "output": { - "flag": "--output-format", - "value": "stream-json" - }, - "autoApprove": ["--sandbox", "false"], - "outputEvents": [ - { - "match": { "path": "$.type", "value": "message" }, - "emitAs": "message", - "extract": { "content": "$.content" } - }, - { - "match": { "path": "$.type", "value": "tool_use" }, - "emitAs": "tool_call", - "extract": { "title": "$.tool_name", "status": "'pending'", "input": "$.args" } - }, - { - "match": { "path": "$.type", "value": "tool_result" }, - "emitAs": "tool_call", - "extract": { "title": "$.tool_name", "status": "'completed'", "output": "$.output" } - } - ], - "result": { - "matchPath": "$.type", - "matchValue": "result", - "contentPath": "$.content" - }, - "historyTemplate": "User: {{input}}\nAssistant: {{output}}" -} diff --git a/src/headless/tests/headless.spec.ts b/src/headless/tests/headless.spec.ts deleted file mode 100644 index 168d476..0000000 --- a/src/headless/tests/headless.spec.ts +++ /dev/null @@ -1,873 +0,0 @@ -/** - * Unit tests for headless adapter factory. 
- * - * @remarks - * Tests cover: - * - Schema validation with Zod - * - JSONPath extraction - * - Output parsing with event mappings - * - History building for iterative mode - */ - -import { describe, expect, test } from 'bun:test' -import { HeadlessAdapterSchema, parseHeadlessConfig, safeParseHeadlessConfig } from '../headless.schemas.ts' -import { createHistoryBuilder } from '../headless-history-builder.ts' -import { createOutputParser, jsonPath, jsonPathString } from '../headless-output-parser.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const validClaudeSchema = { - version: 1, - name: 'claude-headless', - command: ['claude'], - sessionMode: 'stream', - prompt: { flag: '-p' }, - output: { flag: '--output-format', value: 'stream-json' }, - autoApprove: ['--dangerously-skip-permissions'], - resume: { flag: '--resume', sessionIdPath: '$.session_id' }, - outputEvents: [ - { - match: { path: '$.type', value: 'assistant' }, - emitAs: 'message', - extract: { content: '$.message.text' }, - }, - { - match: { path: '$.type', value: 'tool_use' }, - emitAs: 'tool_call', - extract: { title: '$.name', status: "'pending'", input: '$.input' }, - }, - { - match: { path: '$.type', value: 'tool_result' }, - emitAs: 'tool_call', - extract: { title: '$.name', status: "'completed'", output: '$.content' }, - }, - ], - result: { - matchPath: '$.type', - matchValue: 'result', - contentPath: '$.result', - }, -} - -const validGeminiSchema = { - version: 1, - name: 'gemini-headless', - command: ['gemini'], - sessionMode: 'iterative', - prompt: { flag: '--prompt' }, - output: { flag: '--output-format', value: 'json' }, - outputEvents: [ - { - match: { path: '$.type', value: 'message' }, - emitAs: 'message', - extract: { content: '$.content' }, - }, - ], - result: { - matchPath: '$.type', - matchValue: 'result', - contentPath: '$.response', - }, - 
historyTemplate: 'User: {{input}}\nAssistant: {{output}}', -} - -// ============================================================================ -// Schema Validation Tests -// ============================================================================ - -describe('HeadlessAdapterSchema', () => { - describe('valid schemas', () => { - test('validates Claude headless schema', () => { - const result = HeadlessAdapterSchema.safeParse(validClaudeSchema) - expect(result.success).toBe(true) - }) - - test('validates Gemini headless schema', () => { - const result = HeadlessAdapterSchema.safeParse(validGeminiSchema) - expect(result.success).toBe(true) - }) - }) - - describe('validates schema files from disk', () => { - const fixturesDir = 'src/headless/tests/fixtures' - - test('validates claude-headless.json from disk', async () => { - const content = await Bun.file(`${fixturesDir}/claude-headless.json`).json() - const result = HeadlessAdapterSchema.safeParse(content) - expect(result.success).toBe(true) - }) - - test('validates gemini-headless.json from disk', async () => { - const content = await Bun.file(`${fixturesDir}/gemini-headless.json`).json() - const result = HeadlessAdapterSchema.safeParse(content) - expect(result.success).toBe(true) - }) - }) - - describe('extract input/output fields', () => { - test('validates schema with input and output in extract config', () => { - const schemaWithIO = { - ...validClaudeSchema, - outputEvents: [ - ...validClaudeSchema.outputEvents, - { - match: { path: '$.type', value: 'custom' }, - emitAs: 'tool_call', - extract: { title: '$.name', input: '$.args', output: '$.result' }, - }, - ], - } - const result = HeadlessAdapterSchema.safeParse(schemaWithIO) - expect(result.success).toBe(true) - }) - - test('preserves extra extract fields via catchall', () => { - const schemaWithExtras = { - ...validClaudeSchema, - outputEvents: [ - { - match: { path: '$.type', value: 'tool_use' }, - emitAs: 'tool_call', - extract: { - title: '$.name', 
- status: "'pending'", - input: '$.input', - toolName: '$.name', - mcpServer: '$.server', - }, - }, - ], - } - const result = HeadlessAdapterSchema.safeParse(schemaWithExtras) - expect(result.success).toBe(true) - if (result.success) { - const extract = result.data.outputEvents![0]!.extract! - expect(extract.title).toBe('$.name') - expect(extract.input).toBe('$.input') - // Catchall fields aren't in the inferred type — cast needed to access them - expect((extract as Record).toolName).toBe('$.name') - expect((extract as Record).mcpServer).toBe('$.server') - } - }) - - test('rejects non-string extra extract fields', () => { - const schemaWithBadExtras = { - ...validClaudeSchema, - outputEvents: [ - { - match: { path: '$.type', value: 'tool_use' }, - emitAs: 'tool_call', - extract: { title: '$.name', badField: 123 }, - }, - ], - } - const result = HeadlessAdapterSchema.safeParse(schemaWithBadExtras) - expect(result.success).toBe(false) - }) - }) - - describe('minimal valid schema', () => { - test('validates minimal required fields', () => { - const minimal = { - version: 1, - name: 'minimal', - command: ['agent'], - sessionMode: 'iterative', - prompt: {}, - output: { flag: '--format', value: 'json' }, - outputEvents: [], - result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' }, - } - const result = HeadlessAdapterSchema.safeParse(minimal) - expect(result.success).toBe(true) - }) - }) - - describe('stdin mode configuration', () => { - test('validates schema with stdin: true', () => { - const stdinSchema = { - version: 1, - name: 'stdin-agent', - command: ['agent', 'exec', '-'], - sessionMode: 'stream', - prompt: { stdin: true }, - output: { flag: '--format', value: 'json' }, - outputEvents: [], - result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' }, - } - const result = HeadlessAdapterSchema.safeParse(stdinSchema) - expect(result.success).toBe(true) - }) - - test('validates schema with stdin: false', () => { - const stdinSchema 
= { - version: 1, - name: 'stdin-agent', - command: ['agent'], - sessionMode: 'stream', - prompt: { stdin: false, flag: '-p' }, - output: { flag: '--format', value: 'json' }, - outputEvents: [], - result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' }, - } - const result = HeadlessAdapterSchema.safeParse(stdinSchema) - expect(result.success).toBe(true) - }) - - test('validates schema with positional prompt and - in command', () => { - const stdinSchema = { - version: 1, - name: 'codex-like', - command: ['codex', 'exec', '--json', '-'], - sessionMode: 'iterative', - prompt: { stdin: true }, - output: { flag: '', value: '' }, - outputEvents: [ - { - match: { path: '$.item.type', value: 'agent_message' }, - emitAs: 'message', - extract: { content: '$.item.text' }, - }, - ], - result: { matchPath: '$.type', matchValue: 'turn.completed', contentPath: '$.usage.output_tokens' }, - } - const result = HeadlessAdapterSchema.safeParse(stdinSchema) - expect(result.success).toBe(true) - }) - }) - - describe('invalid schemas', () => { - test('rejects missing version', () => { - const invalid = { ...validClaudeSchema, version: undefined } - const result = HeadlessAdapterSchema.safeParse(invalid) - expect(result.success).toBe(false) - }) - - test('rejects unsupported version', () => { - const invalid = { ...validClaudeSchema, version: 2 } - const result = HeadlessAdapterSchema.safeParse(invalid) - expect(result.success).toBe(false) - }) - - test('rejects invalid sessionMode', () => { - const invalid = { ...validClaudeSchema, sessionMode: 'batch' } - const result = HeadlessAdapterSchema.safeParse(invalid) - expect(result.success).toBe(false) - }) - - test('rejects missing command', () => { - const invalid = { ...validClaudeSchema, command: undefined } - const result = HeadlessAdapterSchema.safeParse(invalid) - expect(result.success).toBe(false) - }) - - test('rejects both flag and stdin specified', () => { - const invalid = { - ...validClaudeSchema, - prompt: { 
- flag: '-p', - stdin: true, - }, - } - const result = HeadlessAdapterSchema.safeParse(invalid) - expect(result.success).toBe(false) - // Type assertion after checking success is false - const error = (result as { success: false; error: { issues: Array<{ message: string }> } }).error - expect(error.issues.length).toBeGreaterThan(0) - expect(error.issues[0]!.message).toContain("Cannot specify both 'flag' and 'stdin' modes") - }) - - test('rejects invalid emitAs type', () => { - const invalid = { - ...validClaudeSchema, - outputEvents: [ - { - match: { path: '$.type', value: 'x' }, - emitAs: 'invalid_type', - }, - ], - } - const result = HeadlessAdapterSchema.safeParse(invalid) - expect(result.success).toBe(false) - }) - }) - - describe('parseHeadlessConfig', () => { - test('returns parsed config for valid input', () => { - const config = parseHeadlessConfig(validClaudeSchema) - expect(config.name).toBe('claude-headless') - expect(config.command).toEqual(['claude']) - expect(config.sessionMode).toBe('stream') - }) - - test('throws for invalid input', () => { - expect(() => parseHeadlessConfig({ version: 99 })).toThrow() - }) - }) - - describe('safeParseHeadlessConfig', () => { - test('returns success for valid input', () => { - const result = safeParseHeadlessConfig(validClaudeSchema) - expect(result.success).toBe(true) - if (result.success) { - expect(result.data.name).toBe('claude-headless') - } - }) - - test('returns failure for invalid input', () => { - const result = safeParseHeadlessConfig({ version: 99 }) - expect(result.success).toBe(false) - }) - }) -}) - -// ============================================================================ -// JSONPath Tests -// ============================================================================ - -describe('jsonPath', () => { - const testObj = { - type: 'message', - message: { - text: 'Hello world', - nested: { value: 42 }, - }, - array: [1, 2, 3], - } - - describe('basic extraction', () => { - test('extracts root 
field', () => { - expect(jsonPath(testObj, '$.type')).toBe('message') - }) - - test('extracts nested field', () => { - expect(jsonPath(testObj, '$.message.text')).toBe('Hello world') - }) - - test('extracts deeply nested field', () => { - expect(jsonPath(testObj, '$.message.nested.value')).toBe(42) - }) - - test('returns undefined for non-existent path', () => { - expect(jsonPath(testObj, '$.missing')).toBeUndefined() - }) - - test('returns undefined for non-existent nested path', () => { - expect(jsonPath(testObj, '$.message.missing.deep')).toBeUndefined() - }) - }) - - describe('literal strings', () => { - test('returns literal string value', () => { - expect(jsonPath(testObj, "'pending'")).toBe('pending') - }) - - test('returns empty literal string', () => { - expect(jsonPath(testObj, "''")).toBe('') - }) - - test('returns literal with spaces', () => { - expect(jsonPath(testObj, "'hello world'")).toBe('hello world') - }) - }) - - describe('edge cases', () => { - test('handles null input', () => { - expect(jsonPath(null, '$.type')).toBeUndefined() - }) - - test('handles undefined input', () => { - expect(jsonPath(undefined, '$.type')).toBeUndefined() - }) - - test('handles non-object input', () => { - expect(jsonPath('string', '$.type')).toBeUndefined() - }) - - test('handles invalid path format', () => { - expect(jsonPath(testObj, 'type')).toBeUndefined() - }) - }) -}) - -describe('jsonPathString', () => { - test('extracts string value', () => { - expect(jsonPathString({ text: 'hello' }, '$.text')).toBe('hello') - }) - - test('converts number to string', () => { - expect(jsonPathString({ num: 42 }, '$.num')).toBe('42') - }) - - test('returns undefined for missing path', () => { - expect(jsonPathString({ x: 1 }, '$.y')).toBeUndefined() - }) - - test('returns undefined for null value', () => { - expect(jsonPathString({ x: null }, '$.x')).toBeUndefined() - }) -}) - -// ============================================================================ -// Output Parser 
Tests -// ============================================================================ - -describe('createOutputParser', () => { - const config = parseHeadlessConfig(validClaudeSchema) - const parser = createOutputParser(config) - - describe('parseLine', () => { - test('maps assistant type to message', () => { - const line = JSON.stringify({ type: 'assistant', message: { text: 'Hello' } }) - const result = parser.parseLine(line) - expect(result).not.toBeNull() - // Handle both single result and array of results - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.type).toBe('message') - expect(singleResult?.content).toBe('Hello') - }) - - test('maps tool_use type to tool_call', () => { - const line = JSON.stringify({ type: 'tool_use', name: 'Read' }) - const result = parser.parseLine(line) - expect(result).not.toBeNull() - // Handle both single result and array of results - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.type).toBe('tool_call') - expect(singleResult?.title).toBe('Read') - expect(singleResult?.status).toBe('pending') - }) - - test('returns null for unmapped event types', () => { - const line = JSON.stringify({ type: 'unknown', data: 'test' }) - const result = parser.parseLine(line) - expect(result).toBeNull() - }) - - test('returns null for invalid JSON', () => { - const result = parser.parseLine('not valid json') - expect(result).toBeNull() - }) - - test('returns null for empty line', () => { - const result = parser.parseLine('') - expect(result).toBeNull() - }) - - test('preserves raw event in result', () => { - const event = { type: 'assistant', message: { text: 'Hi' } } - const line = JSON.stringify(event) - const result = parser.parseLine(line) - // Handle both single result and array of results - const singleResult = Array.isArray(result) ? 
result[0] : result - expect(singleResult?.raw).toEqual(event) - }) - - test('extracts input from tool_use event', () => { - const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' } }) - const result = parser.parseLine(line) - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.input).toEqual({ file_path: '/test.ts' }) - }) - - test('extracts output from tool_result event', () => { - const line = JSON.stringify({ type: 'tool_result', name: 'Read', content: 'file contents' }) - const result = parser.parseLine(line) - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.output).toBe('file contents') - }) - - test('sets timestamp on parsed updates', () => { - const before = Date.now() - const line = JSON.stringify({ type: 'assistant', message: { text: 'Hello' } }) - const result = parser.parseLine(line) - const after = Date.now() - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before) - expect(singleResult?.timestamp).toBeLessThanOrEqual(after) - }) - }) - - describe('parseLine with extra extract fields', () => { - test('extra extract fields do not break parser', () => { - const configWithExtras = parseHeadlessConfig({ - version: 1, - name: 'extras-test', - command: ['test'], - sessionMode: 'stream', - prompt: { flag: '-p' }, - output: { flag: '--output', value: 'json' }, - outputEvents: [ - { - match: { path: '$.type', value: 'tool_use' }, - emitAs: 'tool_call', - extract: { - title: '$.name', - status: "'pending'", - input: '$.input', - toolName: '$.name', - mcpServer: '$.server', - }, - }, - ], - result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' }, - }) - const extrasParser = createOutputParser(configWithExtras) - const line = JSON.stringify({ - type: 'tool_use', - name: 'WebSearch', - input: { query: 'test' }, - server: 'mcp-search', - }) - const result = 
extrasParser.parseLine(line) - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult).not.toBeNull() - expect(singleResult?.type).toBe('tool_call') - expect(singleResult?.title).toBe('WebSearch') - expect(singleResult?.input).toEqual({ query: 'test' }) - }) - }) - - describe('parseLine with array wildcards', () => { - const wildcardConfig = parseHeadlessConfig({ - version: 1, - name: 'wildcard-test', - command: ['test'], - sessionMode: 'stream', - prompt: { flag: '-p' }, - output: { flag: '--output', value: 'json' }, - outputEvents: [ - { - match: { path: '$.message.content[*].type', value: 'tool_use' }, - emitAs: 'tool_call', - extract: { title: '$.name', status: "'pending'" }, - }, - { - match: { path: '$.items[*]', value: '*' }, - emitAs: 'message', - extract: { content: '$.text' }, - }, - ], - result: { - matchPath: '$.type', - matchValue: 'result', - contentPath: '$.output', - }, - }) - const wildcardParser = createOutputParser(wildcardConfig) - - test('returns array of updates for matching array items', () => { - const line = JSON.stringify({ - message: { - content: [ - { type: 'tool_use', name: 'Read', input: {} }, - { type: 'text', value: 'Hello' }, - { type: 'tool_use', name: 'Write', input: {} }, - ], - }, - }) - const result = wildcardParser.parseLine(line) - expect(Array.isArray(result)).toBe(true) - if (Array.isArray(result)) { - expect(result).toHaveLength(2) - expect(result[0]!.type).toBe('tool_call') - expect(result[0]!.title).toBe('Read') - expect(result[0]!.status).toBe('pending') - expect(result[1]!.type).toBe('tool_call') - expect(result[1]!.title).toBe('Write') - expect(result[1]!.status).toBe('pending') - } - }) - - test('handles empty array gracefully', () => { - const line = JSON.stringify({ - message: { content: [] }, - }) - const result = wildcardParser.parseLine(line) - expect(result).toBeNull() - }) - - test('handles non-matching array items', () => { - const line = JSON.stringify({ - message: { - content: 
[ - { type: 'text', value: 'No tool use here' }, - { type: 'image', data: 'base64...' }, - ], - }, - }) - const result = wildcardParser.parseLine(line) - expect(result).toBeNull() - }) - - test('matches wildcard value for all non-null items', () => { - const line = JSON.stringify({ - items: [{ text: 'Item 1' }, { text: 'Item 2' }, { text: 'Item 3' }], - }) - const result = wildcardParser.parseLine(line) - expect(Array.isArray(result)).toBe(true) - if (Array.isArray(result)) { - expect(result).toHaveLength(3) - expect(result[0]!.content).toBe('Item 1') - expect(result[1]!.content).toBe('Item 2') - expect(result[2]!.content).toBe('Item 3') - } - }) - - test('handles mixed array content with type guards', () => { - const line = JSON.stringify({ - message: { - content: [ - { type: 'tool_use', name: 'Valid' }, - 'string-item', - { no_type_property: true }, - null, - { type: 'tool_use', name: 'AlsoValid' }, - ], - }, - }) - const result = wildcardParser.parseLine(line) - expect(Array.isArray(result)).toBe(true) - if (Array.isArray(result)) { - expect(result).toHaveLength(2) - expect(result[0]!.title).toBe('Valid') - expect(result[1]!.title).toBe('AlsoValid') - } - }) - }) - - describe('jsonPath with array wildcard', () => { - test('extracts array with [*] wildcard', () => { - const obj = { items: [{ id: 1 }, { id: 2 }] } - const result = jsonPath(obj, '$.items[*]') - expect(Array.isArray(result)).toBe(true) - if (Array.isArray(result)) { - expect(result).toHaveLength(2) - } - }) - - test('returns undefined for non-array at wildcard position', () => { - const obj = { items: 'not-an-array' } - const result = jsonPath(obj, '$.items[*]') - expect(result).toBeUndefined() - }) - - test('handles empty array', () => { - const obj = { items: [] } - const result = jsonPath(obj, '$.items[*]') - expect(result).toEqual([]) - }) - - test('handles nested path to array', () => { - const obj = { message: { content: [1, 2, 3] } } - const result = jsonPath(obj, '$.message.content[*]') - 
expect(result).toEqual([1, 2, 3]) - }) - - test('returns undefined when path before wildcard is invalid', () => { - const obj = { items: [1, 2, 3] } - const result = jsonPath(obj, '$.missing[*]') - expect(result).toBeUndefined() - }) - }) - - describe('parseResult', () => { - test('detects result event', () => { - const line = JSON.stringify({ type: 'result', result: 'Final answer' }) - const result = parser.parseResult(line) - expect(result.isResult).toBe(true) - if (result.isResult) { - expect(result.content).toBe('Final answer') - } - }) - - test('returns not-result for non-result events', () => { - const line = JSON.stringify({ type: 'assistant', message: { text: 'Hi' } }) - const result = parser.parseResult(line) - expect(result.isResult).toBe(false) - }) - - test('returns not-result for invalid JSON', () => { - const result = parser.parseResult('invalid') - expect(result.isResult).toBe(false) - }) - - test('handles missing content path', () => { - const line = JSON.stringify({ type: 'result' }) - const result = parser.parseResult(line) - expect(result.isResult).toBe(true) - if (result.isResult) { - expect(result.content).toBe('') - } - }) - }) -}) - -// ============================================================================ -// Passthrough Mode Tests -// ============================================================================ - -describe('passthrough mode', () => { - const passthroughConfig = parseHeadlessConfig({ - version: 1, - name: 'passthrough-test', - command: ['test-agent'], - sessionMode: 'stream', - prompt: { flag: '-p' }, - output: { flag: '--output', value: 'json' }, - outputMode: 'passthrough', - passthroughTypeMap: { - typeField: 'type', - typeValues: { tool_use: 'tool_call', tool_result: 'tool_call' }, - }, - result: { matchPath: '$.type', matchValue: 'result', contentPath: '$.content' }, - }) - const passthroughParser = createOutputParser(passthroughConfig) - - test('extracts input from tool_call event', () => { - const line = 
JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' }, status: 'pending' }) - const result = passthroughParser.parseLine(line) - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.type).toBe('tool_call') - expect(singleResult?.input).toEqual({ file_path: '/test.ts' }) - }) - - test('extracts output from tool_result event', () => { - const line = JSON.stringify({ type: 'tool_result', name: 'Read', output: 'file contents', status: 'completed' }) - const result = passthroughParser.parseLine(line) - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.type).toBe('tool_call') - expect(singleResult?.output).toBe('file contents') - }) - - test('preserves object input type', () => { - const line = JSON.stringify({ type: 'tool_use', name: 'Write', input: { path: '/a.ts', content: 'code' } }) - const result = passthroughParser.parseLine(line) - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.input).toEqual({ path: '/a.ts', content: 'code' }) - }) - - test('sets timestamp on passthrough updates', () => { - const before = Date.now() - const line = JSON.stringify({ type: 'message', content: 'Hello' }) - const result = passthroughParser.parseLine(line) - const after = Date.now() - const singleResult = Array.isArray(result) ? result[0] : result - expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before) - expect(singleResult?.timestamp).toBeLessThanOrEqual(after) - }) - - test('handles absent input/output fields gracefully', () => { - const line = JSON.stringify({ type: 'tool_use', name: 'Bash', status: 'pending' }) - const result = passthroughParser.parseLine(line) - const singleResult = Array.isArray(result) ? 
result[0] : result - expect(singleResult?.type).toBe('tool_call') - expect(singleResult?.input).toBeUndefined() - expect(singleResult?.output).toBeUndefined() - }) -}) - -// ============================================================================ -// History Builder Tests -// ============================================================================ - -describe('createHistoryBuilder', () => { - describe('basic operations', () => { - test('starts with empty history', () => { - const builder = createHistoryBuilder() - expect(builder.getLength()).toBe(0) - expect(builder.getHistory()).toEqual([]) - }) - - test('adds turns to history', () => { - const builder = createHistoryBuilder() - builder.addTurn('Hello', 'Hi there') - expect(builder.getLength()).toBe(1) - expect(builder.getHistory()).toEqual([{ input: 'Hello', output: 'Hi there' }]) - }) - - test('accumulates multiple turns', () => { - const builder = createHistoryBuilder() - builder.addTurn('Hello', 'Hi') - builder.addTurn('How are you?', 'Fine') - expect(builder.getLength()).toBe(2) - }) - - test('clears history', () => { - const builder = createHistoryBuilder() - builder.addTurn('Hello', 'Hi') - builder.clear() - expect(builder.getLength()).toBe(0) - }) - }) - - describe('formatHistory', () => { - test('uses default template', () => { - const builder = createHistoryBuilder() - builder.addTurn('Hello', 'Hi there') - const formatted = builder.formatHistory() - expect(formatted).toBe('User: Hello\nAssistant: Hi there') - }) - - test('uses custom template', () => { - const builder = createHistoryBuilder({ - template: 'Q: {{input}}\nA: {{output}}', - }) - builder.addTurn('Question', 'Answer') - const formatted = builder.formatHistory() - expect(formatted).toBe('Q: Question\nA: Answer') - }) - - test('separates multiple turns with double newline', () => { - const builder = createHistoryBuilder() - builder.addTurn('First', 'One') - builder.addTurn('Second', 'Two') - const formatted = builder.formatHistory() - 
expect(formatted).toBe('User: First\nAssistant: One\n\nUser: Second\nAssistant: Two') - }) - - test('returns empty string for no history', () => { - const builder = createHistoryBuilder() - expect(builder.formatHistory()).toBe('') - }) - }) - - describe('buildPrompt', () => { - test('returns just input for first turn', () => { - const builder = createHistoryBuilder() - const prompt = builder.buildPrompt('Hello') - expect(prompt).toBe('Hello') - }) - - test('includes history for subsequent turns', () => { - const builder = createHistoryBuilder() - builder.addTurn('Hello', 'Hi') - const prompt = builder.buildPrompt('Next question') - expect(prompt).toContain('User: Hello') - expect(prompt).toContain('Assistant: Hi') - expect(prompt).toContain('User: Next question') - }) - - test('builds complete context with multiple turns', () => { - const builder = createHistoryBuilder() - builder.addTurn('One', 'Reply one') - builder.addTurn('Two', 'Reply two') - const prompt = builder.buildPrompt('Three') - expect(prompt).toContain('User: One') - expect(prompt).toContain('User: Two') - expect(prompt).toContain('User: Three') - }) - }) - - describe('getHistory returns copy', () => { - test('modifying returned array does not affect internal state', () => { - const builder = createHistoryBuilder() - builder.addTurn('Hello', 'Hi') - const history = builder.getHistory() - history.push({ input: 'Fake', output: 'Fake' }) - expect(builder.getLength()).toBe(1) - }) - }) -}) diff --git a/src/integration_tests/claude.spec.ts b/src/integration_tests/claude.spec.ts deleted file mode 100644 index aacb491..0000000 --- a/src/integration_tests/claude.spec.ts +++ /dev/null @@ -1,157 +0,0 @@ -/** - * Integration tests for Claude Code headless adapter. - * - * @remarks - * Tests verify the headless session manager works correctly with Claude Code CLI - * using the schema-driven headless adapter approach. - * - * Run locally with API key: - * ```bash - * ANTHROPIC_API_KEY=sk-... 
bun test ./src/integration_tests/claude.spec.ts - * ``` - * - * Prerequisites: - * 1. Claude CLI installed (`curl -fsSL https://claude.ai/install.sh | bash`) - * 2. API key: `ANTHROPIC_API_KEY` environment variable - * - * These tests make real API calls and consume credits. - */ - -import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test' -import { join } from 'node:path' -import { parseHeadlessConfig } from '../headless/headless.schemas.ts' -import { createSessionManager } from '../headless/headless-session-manager.ts' - -// Long timeout for real agent interactions (2 minutes) -setDefaultTimeout(120000) - -// Use project root as cwd - agents discover MCP servers from config files -const PROJECT_ROOT = process.cwd() - -// Schema path for Claude headless adapter -const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/claude-headless.json') - -// Get API key from environment -const API_KEY = process.env.ANTHROPIC_API_KEY ?? '' - -// Skip all tests if no API key is available -const describeWithApiKey = API_KEY ? 
describe : describe.skip - -describeWithApiKey('Claude Code Integration', () => { - let sessionManager: ReturnType - let schemaConfig: ReturnType - - beforeAll(async () => { - // Load JSON from file, then parse with Zod schema - const schemaJson = await Bun.file(SCHEMA_PATH).json() - schemaConfig = parseHeadlessConfig(schemaJson) - - // Create session manager with the schema - sessionManager = createSessionManager({ - schema: schemaConfig, - timeout: 120000, - debug: false, - }) - }) - - afterAll(async () => { - // Cleanup handled automatically by session manager - }) - - test('creates session successfully', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - expect(session).toBeDefined() - expect(session.id).toBeDefined() - expect(typeof session.id).toBe('string') - expect(session.active).toBe(true) - expect(session.cwd).toBe(PROJECT_ROOT) - }) - - test('sends prompt and receives response', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - // Simple prompt that doesn't require tools - const result = await sessionManager.prompt(session.id, 'What is 2 + 2? 
Reply with just the number.') - - expect(result).toBeDefined() - expect(result.output).toBeDefined() - expect(result.output.length).toBeGreaterThan(0) - expect(result.updates).toBeInstanceOf(Array) - - // Should contain "4" somewhere in the response - expect(result.output).toMatch(/4/) - }) - - test('collects trajectory updates during execution', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - const collectedUpdates: unknown[] = [] - - const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => { - collectedUpdates.push(update) - }) - - expect(result.updates.length).toBeGreaterThan(0) - - // Should have at least one message update - const messageUpdates = result.updates.filter((u) => u.type === 'message') - expect(messageUpdates.length).toBeGreaterThan(0) - }) - - test('uses MCP server from project config', async () => { - // This test verifies that Claude discovers MCP servers from .mcp.json - // The bun-docs MCP server is configured at project root - const session = await sessionManager.create(PROJECT_ROOT) - - // Query the bun-docs MCP server (configured in .mcp.json) - const result = await sessionManager.prompt( - session.id, - 'Use the bun-docs MCP server to search for information about Bun.serve(). ' + - 'What are the key options for creating an HTTP server with Bun?', - ) - - // Response should contain Bun server-related information - expect(result.output.length).toBeGreaterThan(0) - // Should mention server/HTTP-related concepts from Bun docs - expect(result.output.toLowerCase()).toMatch(/serve|server|http|port|fetch|handler/) - }) - - test('multi-turn conversation maintains context (stream mode)', async () => { - // Multi-turn: multiple prompts to same session - const session = await sessionManager.create(PROJECT_ROOT) - - // Turn 1: Establish context - const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. 
Just confirm you have it.') - expect(turn1Result.output).toMatch(/42|forty.?two|remember/i) - - // Turn 2: Reference previous context - const turn2Result = await sessionManager.prompt( - session.id, - 'What number did I ask you to remember? Reply with just the number.', - ) - expect(turn2Result.output).toMatch(/42/) - }) - - test('receives valid trajectory updates', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - // Prompt that generates a response with trajectory updates - const result = await sessionManager.prompt( - session.id, - 'What programming language is this project written in? Look at the file extensions.', - ) - - // Result should have output - expect(result.output).toBeDefined() - expect(result.output.length).toBeGreaterThan(0) - - // Should have collected updates during execution - expect(result.updates).toBeInstanceOf(Array) - expect(result.updates.length).toBeGreaterThan(0) - - // All updates should have valid types - const validTypes = ['thought', 'tool_call', 'message', 'plan'] - const allValidTypes = result.updates.every((u) => validTypes.includes(u.type)) - expect(allValidTypes).toBe(true) - }) -}) diff --git a/src/integration_tests/gemini.spec.ts b/src/integration_tests/gemini.spec.ts deleted file mode 100644 index d95216f..0000000 --- a/src/integration_tests/gemini.spec.ts +++ /dev/null @@ -1,139 +0,0 @@ -/** - * Integration tests for Gemini CLI headless adapter. - * - * @remarks - * Tests verify the headless session manager works correctly with Gemini CLI - * using the schema-driven headless adapter approach. - * - * Run locally with API key: - * ```bash - * GEMINI_API_KEY=... bun test ./src/integration_tests/gemini.spec.ts - * ``` - * - * Prerequisites: - * 1. Gemini CLI installed (`npm install -g @google/gemini-cli`) - * 2. API key: `GEMINI_API_KEY` environment variable - * - * These tests make real API calls and consume credits. 
- */ - -import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test' -import { join } from 'node:path' -import { parseHeadlessConfig } from '../headless/headless.schemas.ts' -import { createSessionManager } from '../headless/headless-session-manager.ts' - -// Long timeout for real agent interactions (2 minutes) -setDefaultTimeout(120000) - -// Use project root as cwd - agents discover MCP servers from config files -const PROJECT_ROOT = process.cwd() - -// Schema path for Gemini headless adapter -const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/gemini-headless.json') - -// Get API key from environment -const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? '' - -// Skip all tests if no API key is available -const describeWithApiKey = GEMINI_API_KEY ? describe : describe.skip - -describeWithApiKey('Gemini CLI Integration', () => { - let sessionManager: ReturnType - let schemaConfig: ReturnType - - beforeAll(async () => { - // Load JSON from file, then parse with Zod schema - const schemaJson = await Bun.file(SCHEMA_PATH).json() - schemaConfig = parseHeadlessConfig(schemaJson) - - // Create session manager with the schema - sessionManager = createSessionManager({ - schema: schemaConfig, - timeout: 120000, - debug: false, - }) - }) - - afterAll(async () => { - // Cleanup handled automatically by session manager - }) - - test('creates session successfully', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - expect(session).toBeDefined() - expect(session.id).toBeDefined() - expect(typeof session.id).toBe('string') - expect(session.active).toBe(true) - expect(session.cwd).toBe(PROJECT_ROOT) - }) - - test('sends prompt and receives response', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - // Simple prompt that doesn't require tools - const result = await sessionManager.prompt(session.id, 'What is 2 + 2? 
Reply with just the number.') - - expect(result).toBeDefined() - expect(result.output).toBeDefined() - expect(result.output.length).toBeGreaterThan(0) - expect(result.updates).toBeInstanceOf(Array) - - // Should contain "4" somewhere in the response - expect(result.output).toMatch(/4/) - }) - - test('collects trajectory updates during execution', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - const collectedUpdates: unknown[] = [] - - const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => { - collectedUpdates.push(update) - }) - - expect(result.updates.length).toBeGreaterThan(0) - - // Should have at least one message update - const messageUpdates = result.updates.filter((u) => u.type === 'message') - expect(messageUpdates.length).toBeGreaterThan(0) - }) - - test('multi-turn conversation maintains context (iterative mode)', async () => { - // Multi-turn via headless adapter in iterative mode (history accumulation) - const session = await sessionManager.create(PROJECT_ROOT) - - // Turn 1: Establish context - const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.') - expect(turn1Result.output).toMatch(/42|forty.?two|remember/i) - - // Turn 2: Reference previous context - const turn2Result = await sessionManager.prompt( - session.id, - 'What number did I ask you to remember? Reply with just the number.', - ) - expect(turn2Result.output).toMatch(/42/) - }) - - test('handles simple math question correctly', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - const result = await sessionManager.prompt(session.id, 'Calculate 15 * 7. 
Reply with just the number.') - - // Gemini CLI may include formatting variations (newlines, spaces) - // Strip whitespace to verify the correct answer is present - expect(result.output.replace(/\s/g, '')).toContain('105') - }) - - test('processes longer response without timeout', async () => { - const session = await sessionManager.create(PROJECT_ROOT) - - const result = await sessionManager.prompt( - session.id, - 'List 5 programming languages and one key feature of each. Be brief.', - ) - - expect(result.output.length).toBeGreaterThan(50) - // Should mention at least some programming languages - expect(result.output.toLowerCase()).toMatch(/python|javascript|java|rust|go|typescript|c\+\+|ruby/) - }) -}) diff --git a/src/pipeline.ts b/src/pipeline.ts deleted file mode 100644 index 7c3c1bd..0000000 --- a/src/pipeline.ts +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Pipeline commands re-export. - * - * @remarks - * Public API for pipeline commands. Import from here for external use. - * - * @packageDocumentation - */ - -export { - // Types - type CompareConfig, - type ComparisonGrader, - type ComparisonGraderInput, - type ComparisonGraderResult, - type ComparisonRanking, - type ComparisonResult, - // Commands - compare, - type ExtractConfig, - type ExtractedResult, - extract, - type FormatConfig, - type FormatStyle, - format, - type GradeConfig, - type GradedResult, - grade, - type LabeledRun, - type RawOutput, - type RunConfig, - type RunMode, - run, -} from './pipeline/pipeline.ts' diff --git a/src/pipeline/compare-format-detection.ts b/src/pipeline/compare-format-detection.ts deleted file mode 100644 index edbc0b6..0000000 --- a/src/pipeline/compare-format-detection.ts +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Format detection for compare command. - * - * @remarks - * Auto-detects whether input files contain CaptureResult or TrialResult data - * by inspecting the first line of the JSONL file. 
- * - * Detection logic: - * - TrialResult: has `trials` array and `k` number - * - CaptureResult: has `trajectory` array and `timing` object - * - * @packageDocumentation - */ - -/** Detected input format for compare command */ -export type CompareInputFormat = 'capture' | 'trials' - -/** - * Detect input format from JSONL file. - * - * @remarks - * Reads the first non-empty line of the file and checks for - * discriminating fields to determine the format. - * - * @param path - Path to JSONL file - * @returns Detected format ('capture' or 'trials') - * @throws Error if file is empty or format cannot be detected - * - * @public - */ -export const detectInputFormat = async (path: string): Promise => { - const file = Bun.file(path) - const text = await file.text() - const firstLine = text.split('\n').find((line) => line.trim()) - - if (!firstLine) { - throw new Error(`Empty file: ${path}`) - } - - let parsed: unknown - try { - parsed = JSON.parse(firstLine) - } catch { - throw new Error(`Invalid JSON in first line of: ${path}`) - } - - if (typeof parsed !== 'object' || parsed === null) { - throw new Error(`Expected object in first line of: ${path}`) - } - - const obj = parsed as Record - - // TrialResult has `trials` array and `k` number - if ('trials' in obj && Array.isArray(obj.trials) && 'k' in obj && typeof obj.k === 'number') { - return 'trials' - } - - // CaptureResult has `trajectory` array and `timing` object - if ('trajectory' in obj && Array.isArray(obj.trajectory) && 'timing' in obj && typeof obj.timing === 'object') { - return 'capture' - } - - throw new Error( - `Unable to detect format for: ${path}. ` + - `Expected either TrialResult (with trials/k fields) or CaptureResult (with trajectory/timing fields).`, - ) -} - -/** - * Validate that all files have the same format. 
- * - * @param paths - Paths to JSONL files - * @returns Detected format (all files must match) - * @throws Error if files have different formats - * - * @public - */ -export const detectAndValidateFormat = async (paths: string[]): Promise => { - const firstPath = paths[0] - if (!firstPath) { - throw new Error('No files provided for format detection') - } - - const format = await detectInputFormat(firstPath) - - for (let i = 1; i < paths.length; i++) { - const path = paths[i] - if (!path) continue - - const otherFormat = await detectInputFormat(path) - if (otherFormat !== format) { - throw new Error( - `Format mismatch: ${firstPath} is ${format}, but ${path} is ${otherFormat}. ` + - `All files must have the same format.`, - ) - } - } - - return format -} diff --git a/src/pipeline/compare-trials.ts b/src/pipeline/compare-trials.ts deleted file mode 100644 index d476c7e..0000000 --- a/src/pipeline/compare-trials.ts +++ /dev/null @@ -1,800 +0,0 @@ -/** - * Pipeline compare command for trials data. - * - * @remarks - * Compares multiple runs of TrialResult data, analyzing capability (passAtK), - * reliability (passExpK), and flakiness metrics. - * - * Outputs a TrialsComparisonReport JSON (not JSONL) containing aggregate - * statistics across all dimensions plus head-to-head comparisons. 
- * - * Built-in strategies: - * - `weighted`: Configurable weights for capability, reliability, consistency (default) - * - `statistical`: Bootstrap sampling for confidence intervals on passAtK - * - * @packageDocumentation - */ - -import { logProgress, writeOutput } from '../core.ts' -import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts' -import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts' -import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts' -import type { - PairwiseComparison, - TrialResult, - TrialsCapabilityMetrics, - TrialsComparisonMeta, - TrialsComparisonReport, - TrialsFlakinessMetrics, - TrialsPerformanceMetrics, - TrialsPromptComparison, - TrialsQualityMetrics, - TrialsReliabilityMetrics, -} from '../schemas.ts' -import { TrialResultSchema } from '../schemas.ts' -import { computeLatencyStats, percentile } from './compare-utils.ts' -import type { - ComparisonGraderResult, - LabeledRun, - TrialsComparisonGrader, - TrialsComparisonGraderInput, - TrialsComparisonRunData, -} from './pipeline.types.ts' - -/** Comparison strategy type for trials */ -export type TrialsCompareStrategy = 'weighted' | 'statistical' | 'custom' - -/** Extended compare config for trials */ -export type TrialsCompareConfig = { - /** Labeled runs to compare */ - runs: LabeledRun[] - /** Comparison strategy (default: weighted) */ - strategy?: TrialsCompareStrategy - /** Path to custom grader (required if strategy is 'custom') */ - graderPath?: string - /** Output file path */ - outputPath?: string - /** Show progress to stderr */ - progress?: boolean - /** Output format (default: json) */ - format?: 'json' | 'markdown' -} - -/** - * Stream trial results from a JSONL file. 
- * - * @param path - Path to the trials.jsonl file - * @yields Parsed and validated trial results - */ -async function* streamTrialResults(path: string): AsyncGenerator { - const file = Bun.file(path) - const text = await file.text() - const lines = text.split('\n') - - for (let i = 0; i < lines.length; i++) { - const line = lines[i]?.trim() - if (!line) continue - - try { - yield TrialResultSchema.parse(JSON.parse(line)) - } catch (error) { - throw new Error(`Invalid trial result at line ${i + 1}: ${error instanceof Error ? error.message : error}`) - } - } -} - -/** - * Build an indexed map of trial results by ID. - * - * @param path - Path to the trials.jsonl file - * @returns Map of result ID to TrialResult - */ -export const buildTrialsIndex = async (path: string): Promise> => { - const index = new Map() - - for await (const result of streamTrialResults(path)) { - index.set(result.id, result) - } - - return index -} - -/** - * Load trials comparison grader from file. - * - * @param path - Path to grader module - * @returns Loaded trials comparison grader function - * @throws Error if module cannot be loaded or doesn't export a grader function - */ -const loadTrialsComparisonGrader = async (path: string): Promise => { - let module: Record - try { - module = (await import(path)) as Record - } catch (error) { - throw new Error(`Failed to load grader from '${path}': ${error instanceof Error ? error.message : error}`) - } - - if (typeof module.grade === 'function') { - return module.grade as TrialsComparisonGrader - } - if (typeof module.default === 'function') { - return module.default as TrialsComparisonGrader - } - if (typeof module.compare === 'function') { - return module.compare as TrialsComparisonGrader - } - - throw new Error(`Trials comparison grader must export 'grade', 'compare', or 'default' function`) -} - -/** - * Get grader function based on strategy. 
- * - * @param strategy - Comparison strategy - * @param graderPath - Path to custom grader (for 'custom' strategy) - * @returns Trials comparison grader function - */ -const getTrialsGrader = async ( - strategy: TrialsCompareStrategy, - graderPath?: string, -): Promise => { - switch (strategy) { - case 'weighted': - return weightedGrade - case 'statistical': - return statisticalGrade - case 'custom': - if (!graderPath) { - throw new Error('Custom strategy requires --grader path') - } - return loadTrialsComparisonGrader(graderPath) - } -} - -/** - * Compute capability metrics from trial results. - * - * @param results - Array of trial results - * @returns Capability metrics (passAtK statistics) - */ -const computeCapabilityMetrics = (results: TrialResult[]): TrialsCapabilityMetrics => { - const passAtKValues = results.map((r) => r.passAtK ?? 0) - - if (passAtKValues.length === 0) { - return { avgPassAtK: 0, medianPassAtK: 0, p25PassAtK: 0, p75PassAtK: 0 } - } - - const sorted = [...passAtKValues].sort((a, b) => a - b) - const sum = passAtKValues.reduce((a, b) => a + b, 0) - - return { - avgPassAtK: sum / passAtKValues.length, - medianPassAtK: percentile(sorted, 0.5), - p25PassAtK: percentile(sorted, 0.25), - p75PassAtK: percentile(sorted, 0.75), - } -} - -/** - * Compute reliability metrics from trial results. - * - * @param results - Array of trial results - * @returns Reliability metrics (passExpK statistics) - */ -const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMetrics => { - const passExpKValues = results.map((r) => r.passExpK ?? 
0) - - if (passExpKValues.length === 0) { - return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 } - } - - const sorted = [...passExpKValues].sort((a, b) => a - b) - const sum = passExpKValues.reduce((a, b) => a + b, 0) - - return { - type: 'trial', - avgPassExpK: sum / passExpKValues.length, - medianPassExpK: percentile(sorted, 0.5), - p25PassExpK: percentile(sorted, 0.25), - p75PassExpK: percentile(sorted, 0.75), - } -} - -/** - * Compute flakiness metrics from trial results. - * - * @param results - Array of trial results - * @param maxTopFlaky - Maximum number of top flaky prompts to include - * @returns Flakiness metrics - */ -const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 10): TrialsFlakinessMetrics => { - const flakinessData = results.map((r) => ({ - id: r.id, - flakiness: Math.max(0, (r.passAtK ?? 0) - (r.passExpK ?? 0)), - })) - - if (flakinessData.length === 0) { - return { avgFlakiness: 0, medianFlakiness: 0, flakyPromptCount: 0, topFlakyPrompts: [] } - } - - const flakinessValues = flakinessData.map((d) => d.flakiness) - const sorted = [...flakinessValues].sort((a, b) => a - b) - const sum = flakinessValues.reduce((a, b) => a + b, 0) - - // Sort by flakiness descending to get top flaky prompts - const topFlaky = [...flakinessData] - .filter((d) => d.flakiness > 0) - .sort((a, b) => b.flakiness - a.flakiness) - .slice(0, maxTopFlaky) - - return { - avgFlakiness: sum / flakinessValues.length, - medianFlakiness: percentile(sorted, 0.5), - flakyPromptCount: flakinessData.filter((d) => d.flakiness > 0).length, - topFlakyPrompts: topFlaky, - } -} - -/** Result from quality metrics computation, including raw scores for CI reuse */ -type QualityComputeResult = { - metrics: TrialsQualityMetrics - rawScores: number[] -} - -/** - * Compute quality metrics from trial results. - * - * @remarks - * Flattens all trial scores across all prompts into a single distribution. 
- * Returns undefined if no scores are present (no grader was used). - * Returns raw scores alongside metrics to avoid re-traversal for CI computation. - * - * @param results - Array of trial results - * @returns Quality metrics with raw scores, or undefined if no scores - */ -const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResult | undefined => { - const rawScores = results.flatMap((r) => r.trials.filter((t) => t.score !== undefined).map((t) => t.score as number)) - - if (rawScores.length === 0) return undefined - - const sorted = [...rawScores].sort((a, b) => a - b) - const sum = rawScores.reduce((a, b) => a + b, 0) - - return { - metrics: { - type: 'trial', - avgScore: sum / rawScores.length, - medianScore: percentile(sorted, 0.5), - p25Score: percentile(sorted, 0.25), - p75Score: percentile(sorted, 0.75), - }, - rawScores, - } -} - -/** Result from performance metrics computation, including raw durations for CI reuse */ -type PerformanceComputeResult = { - metrics: TrialsPerformanceMetrics - rawDurations: number[] -} - -/** - * Compute performance metrics from trial results. - * - * @remarks - * Flattens all trial durations across all prompts into latency statistics. - * Always returns a value since TrialEntry.duration is required. - * Returns raw durations alongside metrics to avoid re-traversal for CI computation. - * - * @param results - Array of trial results - * @returns Performance metrics with raw durations - */ -const computeTrialsPerformanceMetrics = (results: TrialResult[]): PerformanceComputeResult => { - const rawDurations = results.flatMap((r) => r.trials.map((t) => t.duration)) - - return { - metrics: { - latency: computeLatencyStats(rawDurations), - totalDuration: rawDurations.reduce((a, b) => a + b, 0), - }, - rawDurations, - } -} - -/** - * Execute trials comparison and generate aggregate report. 
- * - * @param config - Trials compare configuration - * @returns Trials comparison report - */ -export const runTrialsCompare = async (config: TrialsCompareConfig): Promise => { - const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config - - if (runs.length < 2) { - throw new Error('At least 2 runs required for comparison') - } - - // Get grader based on strategy - const grader = await getTrialsGrader(strategy, graderPath) - - const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy - logProgress(`Comparing ${runs.length} trials runs with strategy: ${strategyLabel}`, progress) - for (const run of runs) { - logProgress(` - ${run.label}: ${run.path}`, progress) - } - - // Load all runs using indexed streaming - const runResults: Record> = {} - for (const run of runs) { - logProgress(`Loading ${run.label}...`, progress) - runResults[run.label] = await buildTrialsIndex(run.path) - } - - // Build set of all prompt IDs across runs - const promptIds = new Set() - for (const resultsMap of Object.values(runResults)) { - for (const id of resultsMap.keys()) { - promptIds.add(id) - } - } - - logProgress(`Comparing ${promptIds.size} prompts...`, progress) - - // Per-prompt comparison results - const promptComparisons: TrialsPromptComparison[] = [] - const perPromptGraderResults: { id: string; result: ComparisonGraderResult }[] = [] - - // Track k value (should be consistent across all results) - let trialsPerPrompt = 0 - - for (const promptId of promptIds) { - logProgress(` ${promptId}`, progress) - - // Build comparison input - const runsData: TrialsComparisonGraderInput['runs'] = {} - let input: string | string[] = '' - let hint: string | undefined - - for (const [label, resultsMap] of Object.entries(runResults)) { - const result = resultsMap.get(promptId) - if (result) { - const runData: TrialsComparisonRunData = { - passRate: result.passRate, - passAtK: result.passAtK, - passExpK: result.passExpK, - k: 
result.k, - trials: result.trials, - } - runsData[label] = runData - - // Track k value - if (trialsPerPrompt === 0) { - trialsPerPrompt = result.k - } - - // Use first found input/hint as the reference - if (!input) { - input = result.input - hint = result.hint - } - } - } - - // Skip if not present in at least 2 runs - if (Object.keys(runsData).length < 2) { - logProgress(` Skipped (only in ${Object.keys(runsData).length} run)`, progress) - continue - } - - // Apply comparison grader - const graderInput: TrialsComparisonGraderInput = { - id: promptId, - input, - hint, - runs: runsData, - } - - const graderResult = await grader(graderInput) - perPromptGraderResults.push({ id: promptId, result: graderResult }) - - // Build prompt comparison for head-to-head - const passAtK: Record = {} - const passExpK: Record = {} - const flakiness: Record = {} - - for (const [label, data] of Object.entries(runsData)) { - passAtK[label] = data.passAtK ?? 0 - passExpK[label] = data.passExpK ?? 0 - flakiness[label] = Math.max(0, (data.passAtK ?? 0) - (data.passExpK ?? 0)) - } - - // Determine winners - const labels = Object.keys(runsData) - let capabilityWinner: string | null = null - let reliabilityWinner: string | null = null - - // Capability winner: highest passAtK - const sortedByCapability = [...labels].sort((a, b) => (passAtK[b] ?? 0) - (passAtK[a] ?? 0)) - if (sortedByCapability.length >= 2) { - const first = sortedByCapability[0] - const second = sortedByCapability[1] - if (first && second && (passAtK[first] ?? 0) > (passAtK[second] ?? 0)) { - capabilityWinner = first - } - } - - // Reliability winner: highest passExpK - const sortedByReliability = [...labels].sort((a, b) => (passExpK[b] ?? 0) - (passExpK[a] ?? 0)) - if (sortedByReliability.length >= 2) { - const first = sortedByReliability[0] - const second = sortedByReliability[1] - if (first && second && (passExpK[first] ?? 0) > (passExpK[second] ?? 
0)) { - reliabilityWinner = first - } - } - - promptComparisons.push({ - id: promptId, - capabilityWinner, - reliabilityWinner, - passAtK, - passExpK, - flakiness, - }) - - // Log winner - const winner = graderResult.rankings.find((r) => r.rank === 1) - if (winner) { - logProgress(` Overall winner: ${winner.run} (${winner.score.toFixed(3)})`, progress) - } - } - - // Compute aggregate metrics per run - const runLabels = runs.map((r) => r.label) - - const capability: Record = {} - const reliability: Record = {} - const flakiness: Record = {} - const quality: Record = {} - const performance: Record = {} - const rawScoresByRun: Record = {} - const rawDurationsByRun: Record = {} - - let hasQuality = false - - for (const label of runLabels) { - const resultsMap = runResults[label] ?? new Map() - const results = [...resultsMap.values()] - - capability[label] = computeCapabilityMetrics(results) - reliability[label] = computeReliabilityMetrics(results) - flakiness[label] = computeFlakinessMetrics(results) - - const perfResult = computeTrialsPerformanceMetrics(results) - performance[label] = perfResult.metrics - rawDurationsByRun[label] = perfResult.rawDurations - - const qualityResult = computeTrialsQualityMetrics(results) - if (qualityResult) { - quality[label] = qualityResult.metrics - rawScoresByRun[label] = qualityResult.rawScores - hasQuality = true - } - } - - // Compute confidence intervals when using statistical strategy - if (strategy === 'statistical') { - const bootstrapConfig = getBootstrapConfigFromEnv() - - for (const label of runLabels) { - const resultsMap = runResults[label] ?? new Map() - const resultsArr = [...resultsMap.values()] - const passAtKValues = resultsArr.map((r) => r.passAtK ?? 0) - const passExpKValues = resultsArr.map((r) => r.passExpK ?? 
0) - - // Capability CIs - const capabilityMetrics = capability[label] - if (capabilityMetrics) { - capabilityMetrics.confidenceIntervals = { - avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci, - } - } - - // Reliability CIs - const reliabilityMetrics = reliability[label] - if (reliabilityMetrics) { - reliabilityMetrics.confidenceIntervals = { - avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci, - } - } - - // Quality CIs (only when scores present) - const qualityMetrics = quality[label] - const scores = rawScoresByRun[label] - if (qualityMetrics && scores && scores.length > 0) { - qualityMetrics.confidenceIntervals = { - avgScore: bootstrap(scores, bootstrapConfig).ci, - } - } - - // Performance CIs - const performanceMetrics = performance[label] - const durations = rawDurationsByRun[label] - if (performanceMetrics && durations && durations.length > 0) { - performanceMetrics.confidenceIntervals = { - latencyMean: bootstrap(durations, bootstrapConfig).ci, - } - } - } - } - - // Compute pairwise comparisons - const capabilityPairwise: PairwiseComparison[] = [] - const reliabilityPairwise: PairwiseComparison[] = [] - const overallPairwise: PairwiseComparison[] = [] - - for (let i = 0; i < runLabels.length; i++) { - for (let j = i + 1; j < runLabels.length; j++) { - const runA = runLabels[i] - const runB = runLabels[j] - - if (!runA || !runB) continue - - // Capability pairwise - let capAWins = 0 - let capBWins = 0 - let capTies = 0 - - // Reliability pairwise - let relAWins = 0 - let relBWins = 0 - let relTies = 0 - - // Overall pairwise (from grader results) - let overallAWins = 0 - let overallBWins = 0 - let overallTies = 0 - - for (const pc of promptComparisons) { - // Capability - if (pc.capabilityWinner === runA) capAWins++ - else if (pc.capabilityWinner === runB) capBWins++ - else capTies++ - - // Reliability - if (pc.reliabilityWinner === runA) relAWins++ - else if (pc.reliabilityWinner === runB) relBWins++ - else relTies++ - } - - // Overall 
from grader results - for (const { result } of perPromptGraderResults) { - const winner = result.rankings.find((r) => r.rank === 1) - if (winner?.run === runA) overallAWins++ - else if (winner?.run === runB) overallBWins++ - else overallTies++ - } - - capabilityPairwise.push({ runA, runB, aWins: capAWins, bWins: capBWins, ties: capTies }) - reliabilityPairwise.push({ runA, runB, aWins: relAWins, bWins: relBWins, ties: relTies }) - overallPairwise.push({ runA, runB, aWins: overallAWins, bWins: overallBWins, ties: overallTies }) - } - } - - // Build meta - const meta: TrialsComparisonMeta = { - generatedAt: new Date().toISOString(), - runs: runLabels, - promptCount: promptIds.size, - trialsPerPrompt, - inputFormat: 'trials', - } - - // Assemble report - const report: TrialsComparisonReport = { - meta, - capability, - reliability, - flakiness, - quality: hasQuality ? quality : undefined, - performance, - headToHead: { - capability: capabilityPairwise, - reliability: reliabilityPairwise, - overall: overallPairwise, - }, - perPrompt: promptComparisons, - } - - // Output - if (format === 'markdown') { - const markdown = formatTrialsReportAsMarkdown(report) - await writeOutput(markdown, outputPath, false) - } else { - await writeOutput(JSON.stringify(report, null, 2), outputPath, false) - } - - // Summary statistics - logProgress('', progress) - logProgress('=== Summary ===', progress) - - for (const [label, cap] of Object.entries(capability)) { - const rel = reliability[label] - const flak = flakiness[label] - const perf = performance[label] - const qual = quality[label] - const qualStr = qual ? ` avgScore=${qual.avgScore.toFixed(3)}` : '' - const perfStr = perf ? 
` latencyP50=${perf.latency.p50.toFixed(0)}ms` : '' - logProgress( - ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}${qualStr}${perfStr}`, - progress, - ) - } - - logProgress('', progress) - logProgress('Overall wins:', progress) - for (const pw of overallPairwise) { - logProgress(` ${pw.runA} vs ${pw.runB}: ${pw.aWins}-${pw.bWins}-${pw.ties}`, progress) - } - - logProgress('Done!', progress) - - return report -} - -/** - * Format trials comparison report as markdown. - * - * @param report - Trials comparison report - * @returns Markdown string - */ -const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string => { - const lines: string[] = [] - - lines.push('# Trials Comparison Report') - lines.push('') - lines.push(`Generated: ${report.meta.generatedAt}`) - lines.push(`Runs: ${report.meta.runs.join(', ')}`) - lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`) - lines.push('') - - // Check if any run has confidence intervals (statistical strategy was used) - const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals) - - // Capability table - lines.push('## Capability (passAtK)') - lines.push('') - if (hasCIs) { - lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |') - lines.push('|-----|-----|--------|--------|-----|-----|') - for (const [label, c] of Object.entries(report.capability)) { - const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK) - lines.push( - `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`, - ) - } - } else { - lines.push('| Run | Avg | Median | P25 | P75 |') - lines.push('|-----|-----|--------|-----|-----|') - for (const [label, c] of Object.entries(report.capability)) { - lines.push( - `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | 
${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`, - ) - } - } - lines.push('') - - // Reliability table - lines.push('## Reliability (passExpK)') - lines.push('') - if (hasCIs) { - lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |') - lines.push('|-----|-----|--------|--------|-----|-----|') - for (const [label, r] of Object.entries(report.reliability)) { - const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK) - lines.push( - `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`, - ) - } - } else { - lines.push('| Run | Avg | Median | P25 | P75 |') - lines.push('|-----|-----|--------|-----|-----|') - for (const [label, r] of Object.entries(report.reliability)) { - lines.push( - `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`, - ) - } - } - lines.push('') - - // Flakiness table - lines.push('## Flakiness') - lines.push('') - lines.push('| Run | Avg | Median | Flaky Prompts |') - lines.push('|-----|-----|--------|---------------|') - for (const [label, f] of Object.entries(report.flakiness)) { - lines.push(`| ${label} | ${f.avgFlakiness.toFixed(3)} | ${f.medianFlakiness.toFixed(3)} | ${f.flakyPromptCount} |`) - } - lines.push('') - - // Quality table (only when scores present) - if (report.quality && Object.keys(report.quality).length > 0) { - const hasQualityCIs = Object.values(report.quality).some((q) => q.confidenceIntervals) - - lines.push('## Quality (Scores)') - lines.push('') - if (hasQualityCIs) { - lines.push('| Run | Avg Score | 95% CI | Median | P25 | P75 |') - lines.push('|-----|-----------|--------|--------|-----|-----|') - for (const [label, q] of Object.entries(report.quality)) { - const avgCI = formatCI(q.confidenceIntervals?.avgScore) - lines.push( - `| ${label} | ${q.avgScore.toFixed(3)} | ${avgCI} | ${q.medianScore.toFixed(3)} | 
${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`, - ) - } - } else { - lines.push('| Run | Avg Score | Median | P25 | P75 |') - lines.push('|-----|-----------|--------|-----|-----|') - for (const [label, q] of Object.entries(report.quality)) { - lines.push( - `| ${label} | ${q.avgScore.toFixed(3)} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`, - ) - } - } - lines.push('') - } - - // Performance table (always present) - const hasPerfCIs = Object.values(report.performance).some((p) => p.confidenceIntervals) - - lines.push('## Performance (Latency)') - lines.push('') - if (hasPerfCIs) { - lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI | Total (ms) |') - lines.push('|-----|----------|----------|----------|-----------|--------|------------|') - for (const [label, p] of Object.entries(report.performance)) { - const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0) - lines.push( - `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} | ${p.totalDuration.toFixed(0)} |`, - ) - } - } else { - lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | Total (ms) |') - lines.push('|-----|----------|----------|----------|-----------|------------|') - for (const [label, p] of Object.entries(report.performance)) { - lines.push( - `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${p.totalDuration.toFixed(0)} |`, - ) - } - } - lines.push('') - - // Head-to-head - lines.push('## Head-to-Head') - lines.push('') - lines.push('### By Capability') - lines.push('| Matchup | A Wins | B Wins | Ties |') - lines.push('|---------|--------|--------|------|') - for (const p of report.headToHead.capability) { - lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`) - } - lines.push('') - - lines.push('### By 
Reliability') - lines.push('| Matchup | A Wins | B Wins | Ties |') - lines.push('|---------|--------|--------|------|') - for (const p of report.headToHead.reliability) { - lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`) - } - lines.push('') - - lines.push('### Overall (Weighted)') - lines.push('| Matchup | A Wins | B Wins | Ties |') - lines.push('|---------|--------|--------|------|') - for (const p of report.headToHead.overall) { - lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`) - } - lines.push('') - - return lines.join('\n') -} diff --git a/src/pipeline/compare-utils.ts b/src/pipeline/compare-utils.ts deleted file mode 100644 index 81bea33..0000000 --- a/src/pipeline/compare-utils.ts +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Shared utility functions for comparison modules. - * - * @remarks - * Extracted from compare.ts and compare-trials.ts to avoid duplication. - * Contains statistical helpers used by both CaptureResult and TrialResult comparisons. - * - * @packageDocumentation - */ - -import type { LatencyStats, ScoreDistribution } from '../schemas.ts' - -/** - * Compute percentile from sorted array using nearest rank method. - * - * @remarks - * Uses floor indexing (nearest rank method). For an array of length N, - * returns the element at index `floor(N * p)`, clamped to the last element. - * This does not interpolate between ranks. - * - * @param sorted - Sorted array of numbers - * @param p - Percentile (0-1) - * @returns Value at percentile - * - * @public - */ -export const percentile = (sorted: number[], p: number): number => { - if (sorted.length === 0) return 0 - const idx = Math.floor(sorted.length * p) - return sorted[Math.min(idx, sorted.length - 1)] ?? 0 -} - -/** - * Compute latency statistics from array of durations. 
- * - * @param durations - Array of durations in milliseconds - * @returns Latency statistics - * - * @public - */ -export const computeLatencyStats = (durations: number[]): LatencyStats => { - if (durations.length === 0) { - return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 } - } - - const sorted = [...durations].sort((a, b) => a - b) - const sum = sorted.reduce((a, b) => a + b, 0) - - return { - p50: percentile(sorted, 0.5), - p90: percentile(sorted, 0.9), - p99: percentile(sorted, 0.99), - mean: sum / sorted.length, - min: sorted[0] ?? 0, - max: sorted[sorted.length - 1] ?? 0, - } -} - -/** - * Compute score distribution histogram. - * - * @param scores - Array of scores (0-1) - * @returns Score distribution histogram - * - * @public - */ -export const computeScoreDistribution = (scores: number[]): ScoreDistribution => { - const dist: ScoreDistribution = { - '0.0-0.2': 0, - '0.2-0.4': 0, - '0.4-0.6': 0, - '0.6-0.8': 0, - '0.8-1.0': 0, - } - - for (const score of scores) { - if (score < 0.2) dist['0.0-0.2']++ - else if (score < 0.4) dist['0.2-0.4']++ - else if (score < 0.6) dist['0.4-0.6']++ - else if (score < 0.8) dist['0.6-0.8']++ - else dist['0.8-1.0']++ - } - - return dist -} diff --git a/src/pipeline/compare.ts b/src/pipeline/compare.ts deleted file mode 100644 index 0d8c1f2..0000000 --- a/src/pipeline/compare.ts +++ /dev/null @@ -1,818 +0,0 @@ -/** - * Pipeline compare command - compare multiple runs of the same prompts. - * - * @remarks - * Compares results from different configurations (agents, MCP servers, models) - * using either built-in strategies or a user-provided comparison grader. - * - * Outputs a holistic ComparisonReport JSON (not JSONL) containing aggregate - * statistics across quality, performance, reliability, and head-to-head metrics. 
- * - * Terminology: "runs" (not "agents") because comparisons can be: - * - Same agent, different MCP servers - * - Same agent, different skills enabled - * - Same agent, different system prompts - * - Same agent, different model versions - * - Different agents entirely - * - * Built-in strategies: - * - `weighted`: Configurable weights for quality, latency, reliability (default) - * - `statistical`: Bootstrap sampling for confidence intervals - * - * @packageDocumentation - */ - -import { basename, extname } from 'node:path' -import { parseArgs } from 'node:util' -import { buildResultsIndex, logProgress, writeOutput } from '../core.ts' -import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts' -import { grade as statisticalGrade } from '../graders/compare-statistical.ts' -import { grade as weightedGrade } from '../graders/compare-weighted.ts' -import type { - CaptureResult, - ComparisonMeta, - ComparisonReport, - HeadToHead, - PairwiseComparison, - PerformanceMetrics, - PromptComparison, - QualityMetrics, - ReliabilityMetrics, - TrajectoryInfo, - TrajectoryRichness, -} from '../schemas.ts' -import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts' -import { runTrialsCompare } from './compare-trials.ts' -import { computeLatencyStats, computeScoreDistribution } from './compare-utils.ts' -import type { - CompareConfig, - ComparisonGrader, - ComparisonGraderInput, - ComparisonResult, - LabeledRun, -} from './pipeline.types.ts' - -/** Comparison strategy type */ -export type CompareStrategy = 'weighted' | 'statistical' | 'custom' - -/** Extended compare config with strategy support */ -export type ExtendedCompareConfig = Omit & { - /** Comparison strategy (default: weighted) */ - strategy?: CompareStrategy - /** Path to custom grader (required if strategy is 'custom') */ - graderPath?: string - /** Output format (default: json) */ - format?: 'json' | 'markdown' -} - -/** - * Load comparison grader 
from file. - * - * @remarks - * Similar to loadGrader but expects ComparisonGrader interface. - * - * @param path - Path to grader module - * @returns Loaded comparison grader function - */ -const loadComparisonGrader = async (path: string): Promise => { - const module = await import(path) - - if (typeof module.grade === 'function') { - return module.grade as ComparisonGrader - } - if (typeof module.default === 'function') { - return module.default as ComparisonGrader - } - if (typeof module.compare === 'function') { - return module.compare as ComparisonGrader - } - - throw new Error(`Comparison grader must export 'grade', 'compare', or 'default' function`) -} - -/** - * Derive label from file path. - * - * @param path - File path - * @returns Label derived from filename without extension - */ -const labelFromPath = (path: string): string => { - const base = basename(path) - const ext = extname(base) - return base.slice(0, -ext.length) -} - -/** - * Parse labeled run argument. - * - * @remarks - * Supports formats: - * - "path.jsonl" - label derived from filename - * - "label:path.jsonl" - explicit label - * - * @param arg - Run argument string - * @returns Labeled run object - */ -const parseLabeledRun = (arg: string): LabeledRun => { - const colonIndex = arg.indexOf(':') - - // Check if this looks like a label:path format (not a Windows drive letter) - if (colonIndex > 0 && colonIndex !== 1) { - return { - label: arg.slice(0, colonIndex), - path: arg.slice(colonIndex + 1), - } - } - - return { - label: labelFromPath(arg), - path: arg, - } -} - -/** - * Validate that all run files exist. 
- * - * @param runs - Labeled runs to validate - * @throws Error if any file doesn't exist - */ -const validateRunFiles = async (runs: LabeledRun[]): Promise => { - const missing: string[] = [] - - for (const run of runs) { - const exists = await Bun.file(run.path).exists() - if (!exists) { - missing.push(`${run.label}: ${run.path}`) - } - } - - if (missing.length > 0) { - throw new Error(`Result file(s) not found:\n ${missing.join('\n ')}`) - } -} - -/** - * Infer output format from file extension. - * - * @param outputPath - Output file path - * @param explicitFormat - Explicitly provided format (takes precedence) - * @returns Inferred format - */ -const inferFormat = (outputPath: string | undefined, explicitFormat: string | undefined): 'json' | 'markdown' => { - // Explicit format takes precedence - if (explicitFormat === 'json' || explicitFormat === 'markdown') { - return explicitFormat - } - - // Infer from file extension - if (outputPath) { - const ext = extname(outputPath).toLowerCase() - if (ext === '.md' || ext === '.markdown') { - return 'markdown' - } - } - - return 'json' -} - -/** - * Get grader function based on strategy. - * - * @param strategy - Comparison strategy - * @param graderPath - Path to custom grader (for 'custom' strategy) - * @returns Comparison grader function - */ -const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promise => { - switch (strategy) { - case 'weighted': - return weightedGrade - case 'statistical': - return statisticalGrade - case 'custom': - if (!graderPath) { - throw new Error('Custom strategy requires --grader path') - } - return loadComparisonGrader(graderPath) - } -} - -/** - * Detect trajectory richness from capture results. 
- * - * @param results - Array of capture results - * @returns Most common trajectory richness level - */ -const detectTrajectoryRichness = (results: CaptureResult[]): TrajectoryRichness => { - // Check metadata first - for (const r of results) { - const richness = r.metadata?.trajectoryRichness - if (richness === 'full' || richness === 'minimal' || richness === 'messages-only') { - return richness as TrajectoryRichness - } - } - - // Infer from trajectory content - for (const r of results) { - const hasThought = r.trajectory.some((s) => s.type === 'thought') - const hasToolCall = r.trajectory.some((s) => s.type === 'tool_call') - if (hasThought || hasToolCall) return 'full' - } - - // Check if we have any trajectory at all - const hasTrajectory = results.some((r) => r.trajectory.length > 0) - return hasTrajectory ? 'messages-only' : 'minimal' -} - -/** - * Execute pipeline compare and generate aggregate report. - * - * @param config - Extended compare configuration - * @returns Comparison report - */ -export const runCompare = async (config: ExtendedCompareConfig): Promise => { - const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config - - if (runs.length < 2) { - throw new Error('At least 2 runs required for comparison') - } - - // Get grader based on strategy - const grader = await getGrader(strategy, graderPath) - - const strategyLabel = strategy === 'custom' ? 
`custom: ${graderPath}` : strategy - logProgress(`Comparing ${runs.length} runs with strategy: ${strategyLabel}`, progress) - for (const run of runs) { - logProgress(` - ${run.label}: ${run.path}`, progress) - } - - // Load all runs using indexed streaming (memory-efficient for large files) - // Uses Map instead of arrays for O(1) lookups - const runResults: Record> = {} - for (const run of runs) { - logProgress(`Loading ${run.label}...`, progress) - runResults[run.label] = await buildResultsIndex(run.path) - } - - // Build set of all prompt IDs across runs - const promptIds = new Set() - for (const resultsMap of Object.values(runResults)) { - for (const id of resultsMap.keys()) { - promptIds.add(id) - } - } - - logProgress(`Comparing ${promptIds.size} prompts...`, progress) - - // Per-prompt comparison results - const perPromptResults: ComparisonResult[] = [] - const promptComparisons: PromptComparison[] = [] - - for (const promptId of promptIds) { - logProgress(` ${promptId}`, progress) - - // Build comparison input - const runsData: ComparisonGraderInput['runs'] = {} - let input: string | string[] = '' - let hint: string | undefined - let metadata: Record | undefined - - for (const [label, resultsMap] of Object.entries(runResults)) { - const result = resultsMap.get(promptId) - if (result) { - runsData[label] = { - output: result.output, - trajectory: result.trajectory, - // Include additional fields for graders that need them - ...(result.score && { score: result.score }), - ...(result.timing && { duration: result.timing.total }), - ...(result.toolErrors !== undefined && { toolErrors: result.toolErrors }), - } - // Use first found input/hint/metadata as the reference - if (!input) { - input = result.input - hint = result.hint - metadata = result.metadata - } - } - } - - // Skip if not present in at least 2 runs - if (Object.keys(runsData).length < 2) { - logProgress(` Skipped (only in ${Object.keys(runsData).length} run)`, progress) - continue - } - - // Apply 
comparison grader - const graderInput: ComparisonGraderInput = { - id: promptId, - input, - hint, - metadata, - runs: runsData, - } - - const graderResult = await grader(graderInput) - - const comparisonResult: ComparisonResult = { - id: promptId, - input, - hint, - rankings: graderResult.rankings, - reasoning: graderResult.reasoning, - } - - perPromptResults.push(comparisonResult) - - // Build prompt comparison for head-to-head - const winner = graderResult.rankings.find((r) => r.rank === 1) - const scores: Record = {} - const latencies: Record = {} - const hadErrors: Record = {} - - for (const ranking of graderResult.rankings) { - scores[ranking.run] = ranking.score - } - - for (const [label, data] of Object.entries(runsData)) { - latencies[label] = data.duration ?? 0 - hadErrors[label] = data.toolErrors ?? false - } - - promptComparisons.push({ - id: promptId, - winner: winner?.run ?? null, - scores, - latencies, - hadErrors, - }) - - // Log winner - if (winner) { - logProgress(` Winner: ${winner.run} (${winner.score.toFixed(2)})`, progress) - } - } - - // Compute aggregate metrics - const runLabels = runs.map((r) => r.label) - - // Quality metrics (iterate over Map values) - const quality: Record = {} - for (const label of runLabels) { - const resultsMap = runResults[label] ?? new Map() - const results = [...resultsMap.values()] - const scores = results.map((r) => r.score?.score ?? 0) - const passes = results.filter((r) => r.score?.pass === true).length - const fails = results.length - passes - - quality[label] = { - type: 'run', - avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0, - passRate: results.length > 0 ? passes / results.length : 0, - passCount: passes, - failCount: fails, - scoreDistribution: computeScoreDistribution(scores), - } - } - - // Performance metrics - const performance: Record = {} - for (const label of runLabels) { - const resultsMap = runResults[label] ?? 
new Map() - const results = [...resultsMap.values()] - const durations = results.map((r) => r.timing?.total ?? 0) - const firstResponses = results.map((r) => r.timing?.firstResponse).filter((v): v is number => v !== undefined) - - performance[label] = { - latency: computeLatencyStats(durations), - firstResponse: firstResponses.length > 0 ? computeLatencyStats(firstResponses) : undefined, - totalDuration: durations.reduce((a, b) => a + b, 0), - } - } - - // Reliability metrics - const reliability: Record = {} - for (const label of runLabels) { - const resultsMap = runResults[label] ?? new Map() - const results = [...resultsMap.values()] - const toolErrorCount = results.filter((r) => r.toolErrors === true).length - const timeoutCount = results.filter((r) => - r.errors?.some((e: string) => e.toLowerCase().includes('timeout')), - ).length - const completedCount = results.filter((r) => r.output && !r.errors?.length).length - - reliability[label] = { - type: 'run', - toolErrors: toolErrorCount, - toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0, - timeouts: timeoutCount, - timeoutRate: results.length > 0 ? timeoutCount / results.length : 0, - completionRate: results.length > 0 ? completedCount / results.length : 1, - } - } - - // Compute confidence intervals when using statistical strategy - if (strategy === 'statistical') { - const bootstrapConfig = getBootstrapConfigFromEnv() - - for (const label of runLabels) { - const resultsMap = runResults[label] ?? new Map() - const results = [...resultsMap.values()] - const scores = results.map((r) => r.score?.score ?? 0) - const passes = results.map((r) => (r.score?.pass === true ? 1 : 0)) - const latencies = results.map((r) => r.timing?.total ?? 
0) - - // Quality CIs - const qualityMetrics = quality[label] - if (qualityMetrics) { - qualityMetrics.confidenceIntervals = { - avgScore: bootstrap(scores, bootstrapConfig).ci, - passRate: bootstrap(passes, bootstrapConfig).ci, - } - } - - // Performance CIs - const performanceMetrics = performance[label] - if (performanceMetrics) { - performanceMetrics.confidenceIntervals = { - latencyMean: bootstrap(latencies, bootstrapConfig).ci, - } - } - } - } - - // Trajectory info - const trajectoryInfo: Record = {} - for (const label of runLabels) { - const resultsMap = runResults[label] ?? new Map() - const results = [...resultsMap.values()] - const stepCounts = results.map((r) => r.trajectory?.length ?? 0) - const avgStepCount = stepCounts.length > 0 ? stepCounts.reduce((a, b) => a + b, 0) / stepCounts.length : 0 - - trajectoryInfo[label] = { - richness: detectTrajectoryRichness(results), - avgStepCount, - } - } - - // Pairwise comparisons - const pairwise: PairwiseComparison[] = [] - for (let i = 0; i < runLabels.length; i++) { - for (let j = i + 1; j < runLabels.length; j++) { - const runA = runLabels[i] - const runB = runLabels[j] - - // Skip if labels are undefined (shouldn't happen but TypeScript requires check) - if (!runA || !runB) continue - - let aWins = 0 - let bWins = 0 - let ties = 0 - - for (const pc of promptComparisons) { - if (pc.winner === runA) aWins++ - else if (pc.winner === runB) bWins++ - else ties++ - } - - pairwise.push({ runA, runB, aWins, bWins, ties }) - } - } - - // Head-to-head - const headToHead: HeadToHead = { - prompts: promptComparisons, - pairwise, - } - - // Count prompts where all runs are present - const promptsWithAllRuns = promptComparisons.filter((pc) => Object.keys(pc.scores).length === runLabels.length).length - - // Build meta - const meta: ComparisonMeta = { - generatedAt: new Date().toISOString(), - runs: runLabels, - promptCount: promptIds.size, - promptsWithAllRuns, - } - - // Assemble report - const report: ComparisonReport 
= { - meta, - quality, - performance, - reliability, - trajectoryInfo, - headToHead, - } - - // Output - if (format === 'markdown') { - const markdown = formatReportAsMarkdown(report) - await writeOutput(markdown, outputPath, false) - } else { - await writeOutput(JSON.stringify(report, null, 2), outputPath, false) - } - - // Summary statistics - logProgress('', progress) - logProgress('=== Summary ===', progress) - - const winCounts: Record = {} - for (const label of runLabels) { - winCounts[label] = 0 - } - - for (const pc of promptComparisons) { - if (pc.winner && pc.winner in winCounts) { - const current = winCounts[pc.winner] ?? 0 - winCounts[pc.winner] = current + 1 - } - } - - for (const [label, wins] of Object.entries(winCounts)) { - const pct = promptComparisons.length > 0 ? ((wins / promptComparisons.length) * 100).toFixed(1) : '0.0' - logProgress(` ${label}: ${wins} wins (${pct}%)`, progress) - } - - logProgress('Done!', progress) - - return report -} - -/** - * Format comparison report as markdown. 
- * - * @param report - Comparison report - * @returns Markdown string - */ -const formatReportAsMarkdown = (report: ComparisonReport): string => { - const lines: string[] = [] - - lines.push('# Comparison Report') - lines.push('') - lines.push(`Generated: ${report.meta.generatedAt}`) - lines.push(`Runs: ${report.meta.runs.join(', ')}`) - lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`) - lines.push('') - - // Check if any run has confidence intervals (statistical strategy was used) - const hasCIs = Object.values(report.quality).some((q) => q.confidenceIntervals) - - // Quality table - lines.push('## Quality') - lines.push('') - if (hasCIs) { - lines.push('| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |') - lines.push('|-----|-----------|--------|-----------|--------|------|------|') - for (const [label, q] of Object.entries(report.quality)) { - const avgScoreCI = formatCI(q.confidenceIntervals?.avgScore) - const passRateCI = formatCI(q.confidenceIntervals?.passRate) - lines.push( - `| ${label} | ${q.avgScore.toFixed(3)} | ${avgScoreCI} | ${(q.passRate * 100).toFixed(1)}% | ${passRateCI} | ${q.passCount} | ${q.failCount} |`, - ) - } - } else { - lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |') - lines.push('|-----|-----------|-----------|------|------|') - for (const [label, q] of Object.entries(report.quality)) { - lines.push( - `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`, - ) - } - } - lines.push('') - - // Performance table - lines.push('## Performance') - lines.push('') - if (hasCIs) { - lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |') - lines.push('|-----|----------|----------|----------|-----------|--------|') - for (const [label, p] of Object.entries(report.performance)) { - const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0) - lines.push( - `| ${label} | 
${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} |`, - ) - } - } else { - lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |') - lines.push('|-----|----------|----------|----------|-----------|') - for (const [label, p] of Object.entries(report.performance)) { - lines.push( - `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`, - ) - } - } - lines.push('') - - // Reliability table - lines.push('## Reliability') - lines.push('') - lines.push('| Run | Tool Errors | Error Rate | Completion Rate |') - lines.push('|-----|-------------|------------|-----------------|') - for (const [label, r] of Object.entries(report.reliability)) { - lines.push( - `| ${label} | ${r.toolErrors} | ${(r.toolErrorRate * 100).toFixed(1)}% | ${(r.completionRate * 100).toFixed(1)}% |`, - ) - } - lines.push('') - - // Pairwise wins - lines.push('## Head-to-Head') - lines.push('') - lines.push('| Matchup | Wins | Wins | Ties |') - lines.push('|---------|------|------|------|') - for (const p of report.headToHead.pairwise) { - lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`) - } - lines.push('') - - return lines.join('\n') -} - -/** - * Pipeline compare command CLI handler. 
- * - * @param args - Command line arguments (after 'compare') - */ -export const compare = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - run: { type: 'string', multiple: true }, - grader: { type: 'string', short: 'g' }, - strategy: { type: 'string', short: 's' }, - output: { type: 'string', short: 'o' }, - format: { type: 'string', short: 'f' }, - 'input-format': { type: 'string' }, - progress: { type: 'boolean', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness compare [files...] [options] - -Compare multiple runs of the same prompts and generate aggregate report. -Supports both CaptureResult (single-run) and TrialResult (multi-run reliability) formats. - -Arguments: - files... Result files to compare (positional, unlimited) - -Options: - --run Labeled run format: "label:path.jsonl" (alternative to positional) - -s, --strategy Comparison strategy: weighted (default), statistical, or custom - -g, --grader Path to custom grader (required if strategy=custom) - -o, --output Output file (default: stdout) - -f, --format Output format: json (default) or markdown - --input-format Input format: auto (default), capture, or trials - --progress Show progress to stderr - -h, --help Show this help message - -Input Formats: - auto Auto-detect from file content (default) - capture CaptureResult format (trajectory/timing fields) - trials TrialResult format (trials/k fields) for pass@k analysis - -Built-in Strategies: - For CaptureResult (capture format): - weighted Configurable weights for quality, latency, reliability - Env vars: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY - statistical Bootstrap sampling for confidence intervals - Env var: COMPARE_BOOTSTRAP_ITERATIONS - - For TrialResult (trials format): - weighted Configurable weights for capability, reliability, consistency - Env vars: 
COMPARE_CAPABILITY, COMPARE_RELIABILITY, COMPARE_CONSISTENCY - statistical Bootstrap sampling for passAtK confidence intervals - Env var: COMPARE_BOOTSTRAP_ITERATIONS - -Custom Grader: - Must export 'grade' or 'compare' function with signature: - CaptureResult: (params: ComparisonGraderInput) => Promise - TrialResult: (params: TrialsComparisonGraderInput) => Promise - -Examples: - # Default: auto-detect format, weighted strategy, JSON output - agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json - - # Explicit trials format for pass@k comparison - agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json - - # Trials comparison with custom weights - COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \\ - agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json - - # Statistical significance strategy - agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json - - # Markdown report - agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md - - # Custom grader - agent-eval-harness compare run1.jsonl run2.jsonl \\ - --strategy custom --grader ./my-llm-judge.ts -o comparison.json - - # With explicit labels - agent-eval-harness compare \\ - --run "with-bun-mcp:results-bun.jsonl" \\ - --run "vanilla:results-vanilla.jsonl" \\ - -o comparison.json -`) - return - } - - // Collect runs from positional args and --run flags - const runs: LabeledRun[] = [] - - // Positional arguments (file paths) - for (const arg of positionals) { - runs.push(parseLabeledRun(arg)) - } - - // --run flags - if (values.run) { - for (const arg of values.run) { - runs.push(parseLabeledRun(arg)) - } - } - - if (runs.length < 2) { - console.error('Error: At least 2 result files required for comparison') - console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl') - process.exit(1) - } - - // Validate that all run files exist (early error for 
better UX) - try { - await validateRunFiles(runs) - } catch (error) { - console.error(`Error: ${error instanceof Error ? error.message : error}`) - process.exit(1) - } - - // Validate strategy - const strategy = (values.strategy as CompareStrategy) ?? 'weighted' - if (!['weighted', 'statistical', 'custom'].includes(strategy)) { - console.error(`Error: Invalid strategy '${strategy}'. Use: weighted, statistical, or custom`) - process.exit(1) - } - - if (strategy === 'custom' && !values.grader) { - console.error('Error: --grader is required when using --strategy custom') - process.exit(1) - } - - // Validate output format (explicit format takes precedence, otherwise infer from extension) - const format = inferFormat(values.output, values.format) - if (values.format && !['json', 'markdown'].includes(values.format)) { - console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`) - process.exit(1) - } - - // Validate input format - const inputFormatArg = values['input-format'] - if (inputFormatArg && !['auto', 'capture', 'trials'].includes(inputFormatArg)) { - console.error(`Error: Invalid input-format '${inputFormatArg}'. Use: auto, capture, or trials`) - process.exit(1) - } - - // Detect or use specified input format - let inputFormat: CompareInputFormat - try { - if (inputFormatArg === 'capture') { - inputFormat = 'capture' - } else if (inputFormatArg === 'trials') { - inputFormat = 'trials' - } else { - // Auto-detect from file content - inputFormat = await detectAndValidateFormat(runs.map((r) => r.path)) - } - } catch (error) { - console.error(`Error: ${error instanceof Error ? 
error.message : error}`) - process.exit(1) - } - - // Route to appropriate comparison function based on input format - if (inputFormat === 'trials') { - await runTrialsCompare({ - runs, - strategy, - graderPath: values.grader, - outputPath: values.output, - progress: values.progress, - format, - }) - } else { - await runCompare({ - runs, - strategy, - graderPath: values.grader, - outputPath: values.output, - progress: values.progress, - format, - }) - } -} diff --git a/src/pipeline/extract.ts b/src/pipeline/extract.ts deleted file mode 100644 index 69f7464..0000000 --- a/src/pipeline/extract.ts +++ /dev/null @@ -1,241 +0,0 @@ -/** - * Pipeline extract command - parse raw output into trajectories. - * - * @remarks - * Converts RawOutput from `run` command into ExtractedResult with - * parsed trajectory and final output. Uses the same schema-driven - * parsing as the capture command. - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadJsonl, logProgress, writeOutput } from '../core.ts' -import { parseHeadlessConfig } from '../headless/headless.schemas.ts' -import { createOutputParser } from '../headless/headless-output-parser.ts' -import type { TrajectoryStep } from '../schemas.ts' -import type { ExtractedResult, RawOutput } from './pipeline.types.ts' - -/** - * Extract trajectory from raw output using schema parser. - * - * @param rawOutput - Raw output from run command - * @param parser - Output parser created from schema - * @returns Extracted result with trajectory - */ -const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType): ExtractedResult => { - const trajectory: TrajectoryStep[] = [] - let finalOutput = '' - let toolErrors = false - - // Parse each raw line - for (const line of rawOutput.rawLines) { - // Try to parse as trajectory update - const parsed = parser.parseLine(line) - if (parsed) { - const updates = Array.isArray(parsed) ? 
parsed : [parsed] - for (const update of updates) { - const timestamp = Date.now() - rawOutput.timing.start - - if (update.type === 'thought') { - trajectory.push({ - type: 'thought', - content: update.content ?? '', - timestamp, - }) - } else if (update.type === 'message') { - trajectory.push({ - type: 'message', - content: update.content ?? '', - timestamp, - }) - } else if (update.type === 'tool_call') { - trajectory.push({ - type: 'tool_call', - name: update.title ?? 'unknown', - status: update.status ?? 'pending', - timestamp, - }) - if (update.status === 'failed') { - toolErrors = true - } - } else if (update.type === 'plan') { - trajectory.push({ - type: 'plan', - entries: [], - timestamp, - }) - } - } - } - - // Try to parse as result - const result = parser.parseResult(line) - if (result.isResult) { - finalOutput = result.content - } - } - - // If no explicit result, extract from messages - if (!finalOutput) { - finalOutput = trajectory - .filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message') - .map((step) => step.content) - .join('\n') - } - - return { - id: rawOutput.id, - input: rawOutput.input, - hint: rawOutput.hint, - output: finalOutput, - trajectory, - toolErrors: toolErrors || !!rawOutput.error, - metadata: rawOutput.metadata, - timing: rawOutput.timing, - ...(rawOutput.error && { error: rawOutput.error }), - } -} - -/** - * Execute pipeline extract with configuration. 
- * - * @param schemaPath - Path to headless adapter schema - * @param rawOutputs - Raw outputs from run command - * @param outputPath - Optional output file path - * @param progress - Show progress to stderr - */ -export const runExtract = async ( - schemaPath: string, - rawOutputs: RawOutput[], - outputPath?: string, - progress = false, -): Promise => { - // Load and validate schema - const schemaFile = Bun.file(schemaPath) - if (!(await schemaFile.exists())) { - throw new Error(`Schema file not found: ${schemaPath}`) - } - - const rawSchema = await schemaFile.json() - const schema = parseHeadlessConfig(rawSchema) - const parser = createOutputParser(schema) - - logProgress(`Extracting with schema: ${schema.name}`, progress) - - let isFirstOutput = true - - // Clear output file if specified - if (outputPath) { - await Bun.write(outputPath, '') - } - - for (let i = 0; i < rawOutputs.length; i++) { - const rawOutput = rawOutputs[i] - if (!rawOutput) continue - - logProgress(`[${i + 1}/${rawOutputs.length}] ${rawOutput.id}`, progress) - - const extracted = extractFromRaw(rawOutput, parser) - - await writeOutput(JSON.stringify(extracted), outputPath, !isFirstOutput) - isFirstOutput = false - } - - logProgress('Done!', progress) -} - -/** - * Read raw outputs from stdin. - * - * @returns Array of parsed raw outputs or null if stdin is empty - */ -const readStdinRawOutputs = async (): Promise => { - if (process.stdin.isTTY) { - return null - } - - const chunks: Buffer[] = [] - for await (const chunk of process.stdin) { - chunks.push(chunk) - } - - const content = Buffer.concat(chunks).toString('utf-8').trim() - if (!content) return null - - return content - .split('\n') - .filter(Boolean) - .map((line) => JSON.parse(line) as RawOutput) -} - -/** - * Pipeline extract command CLI handler. 
- * - * @param args - Command line arguments (after 'extract') - */ -export const extract = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - schema: { type: 'string', short: 's' }, - output: { type: 'string', short: 'o' }, - progress: { type: 'boolean', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness extract [raw.jsonl] --schema [options] - -Parse raw output into trajectories and final output. - -Arguments: - raw.jsonl Input file from 'run' command (or pipe from stdin) - -Options: - -s, --schema Path to headless adapter schema (required) - -o, --output Output file (default: stdout) - --progress Show progress to stderr - -h, --help Show this help message - -Examples: - # From file - agent-eval-harness extract raw.jsonl --schema claude.json -o extracted.jsonl - - # Piped from run - agent-eval-harness run prompts.jsonl -s claude.json | agent-eval-harness extract -s claude.json - - # Full pipeline - cat prompts.jsonl | \\ - agent-eval-harness run -s claude.json | \\ - agent-eval-harness extract -s claude.json | \\ - agent-eval-harness grade --grader ./grader.ts -`) - return - } - - if (!values.schema) { - console.error('Error: --schema is required') - process.exit(1) - } - - // Load raw outputs from file or stdin - const inputPath = positionals[0] - let rawOutputs: RawOutput[] - - if (inputPath) { - rawOutputs = await loadJsonl(inputPath) - } else { - const stdinOutputs = await readStdinRawOutputs() - if (!stdinOutputs || stdinOutputs.length === 0) { - console.error('Error: No raw output provided (use file argument or pipe to stdin)') - process.exit(1) - } - rawOutputs = stdinOutputs - } - - await runExtract(values.schema, rawOutputs, values.output, values.progress) -} diff --git a/src/pipeline/format.ts b/src/pipeline/format.ts deleted file mode 100644 index 54cdb6c..0000000 --- 
a/src/pipeline/format.ts +++ /dev/null @@ -1,291 +0,0 @@ -/** - * Pipeline format command - convert results to different output formats. - * - * @remarks - * Transforms graded or extracted results into various formats: - * - jsonl: Pass-through JSONL (default) - * - markdown: Human-readable report - * - csv: Comma-separated values for spreadsheets - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadJsonl, logProgress, writeOutput } from '../core.ts' -import type { CaptureResult } from '../schemas.ts' -import type { ExtractedResult, FormatStyle, GradedResult } from './pipeline.types.ts' - -/** Union of all formattable result types */ -type FormattableResult = ExtractedResult | GradedResult | CaptureResult - -/** - * Check if result has a score (graded). - */ -const isGraded = ( - result: FormattableResult, -): result is GradedResult | (CaptureResult & { score: NonNullable }) => { - return 'score' in result && result.score !== undefined -} - -/** - * Format results as markdown report. 
- * - * @param results - Results to format - * @returns Markdown string - */ -const formatMarkdown = (results: FormattableResult[]): string => { - const lines: string[] = [ - '# Evaluation Results', - '', - `Generated: ${new Date().toISOString()}`, - `Total: ${results.length} test cases`, - '', - ] - - // Summary statistics if graded - const gradedResults = results.filter(isGraded) - if (gradedResults.length > 0) { - const passed = gradedResults.filter((r) => r.score.pass).length - const avgScore = gradedResults.reduce((sum, r) => sum + r.score.score, 0) / gradedResults.length - - lines.push('## Summary') - lines.push('') - lines.push( - `- **Pass rate**: ${passed}/${gradedResults.length} (${((passed / gradedResults.length) * 100).toFixed(1)}%)`, - ) - lines.push(`- **Average score**: ${avgScore.toFixed(3)}`) - lines.push('') - } - - lines.push('## Results') - lines.push('') - - for (const result of results) { - const input = Array.isArray(result.input) ? result.input.join(' → ') : result.input - const inputPreview = input.length > 100 ? `${input.slice(0, 100)}...` : input - - lines.push(`### ${result.id}`) - lines.push('') - lines.push(`**Input**: ${inputPreview}`) - lines.push('') - - if (result.hint) { - lines.push(`**Hint**: ${result.hint}`) - lines.push('') - } - - const outputPreview = result.output.length > 500 ? `${result.output.slice(0, 500)}...` : result.output - lines.push(`**Output**:`) - lines.push('```') - lines.push(outputPreview) - lines.push('```') - lines.push('') - - if (isGraded(result)) { - const icon = result.score.pass ? '✅' : '❌' - lines.push(`**Score**: ${icon} ${result.score.score.toFixed(3)} (${result.score.pass ? 
'PASS' : 'FAIL'})`) - if (result.score.reasoning) { - lines.push(`**Reasoning**: ${result.score.reasoning}`) - } - lines.push('') - } - - if (result.toolErrors) { - lines.push('⚠️ **Tool errors detected**') - lines.push('') - } - - if ('error' in result && result.error) { - lines.push(`❌ **Error**: ${result.error}`) - lines.push('') - } - - lines.push('---') - lines.push('') - } - - return lines.join('\n') -} - -/** - * Format results as CSV. - * - * @param results - Results to format - * @returns CSV string - */ -const formatCsv = (results: FormattableResult[]): string => { - const lines: string[] = [] - - // Header - const hasScores = results.some(isGraded) - const headers = ['id', 'input', 'hint', 'output', 'tool_errors', 'duration_ms'] - if (hasScores) { - headers.push('pass', 'score', 'reasoning') - } - lines.push(headers.join(',')) - - // Data rows - for (const result of results) { - const input = Array.isArray(result.input) ? result.input.join(' | ') : result.input - const escapeCsv = (str: string) => `"${str.replace(/"/g, '""').replace(/\n/g, '\\n')}"` - - const row = [ - escapeCsv(result.id), - escapeCsv(input), - escapeCsv(result.hint ?? ''), - escapeCsv(result.output), - result.toolErrors ? 'true' : 'false', - String(result.timing.total), - ] - - if (hasScores) { - if (isGraded(result)) { - row.push( - result.score.pass ? 'true' : 'false', - result.score.score.toFixed(3), - escapeCsv(result.score.reasoning ?? ''), - ) - } else { - row.push('', '', '') - } - } - - lines.push(row.join(',')) - } - - return lines.join('\n') -} - -/** - * Execute pipeline format with configuration. 
- * - * @param style - Output format style - * @param results - Results to format - * @param outputPath - Optional output file path - * @param progress - Show progress to stderr - */ -export const runFormat = async ( - style: FormatStyle, - results: FormattableResult[], - outputPath?: string, - progress = false, -): Promise => { - logProgress(`Formatting ${results.length} results as ${style}`, progress) - - let output: string - - switch (style) { - case 'jsonl': - // Pass-through as JSONL - output = results.map((r) => JSON.stringify(r)).join('\n') - break - - case 'markdown': - output = formatMarkdown(results) - break - - case 'csv': - output = formatCsv(results) - break - } - - await writeOutput(output, outputPath, false) - logProgress('Done!', progress) -} - -/** - * Read results from stdin. - * - * @returns Array of parsed results or null if stdin is empty - */ -const readStdinResults = async (): Promise => { - if (process.stdin.isTTY) { - return null - } - - const chunks: Buffer[] = [] - for await (const chunk of process.stdin) { - chunks.push(chunk) - } - - const content = Buffer.concat(chunks).toString('utf-8').trim() - if (!content) return null - - return content - .split('\n') - .filter(Boolean) - .map((line) => JSON.parse(line) as FormattableResult) -} - -/** - * Pipeline format command CLI handler. - * - * @param args - Command line arguments (after 'format') - */ -export const format = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - style: { type: 'string', short: 'f', default: 'jsonl' }, - output: { type: 'string', short: 'o' }, - progress: { type: 'boolean', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness format [results.jsonl] [options] - -Convert results to different output formats. 
- -Arguments: - results.jsonl Input file (or pipe from stdin) - -Options: - -f, --style Output format: jsonl, markdown, csv (default: jsonl) - -o, --output Output file (default: stdout) - --progress Show progress to stderr - -h, --help Show this help message - -Examples: - # Convert to markdown report - agent-eval-harness format graded.jsonl --style markdown -o report.md - - # Piped from grade - agent-eval-harness grade extracted.jsonl -g ./grader.ts | agent-eval-harness format -f csv - - # Full pipeline to markdown - cat prompts.jsonl | \\ - agent-eval-harness run -s claude.json | \\ - agent-eval-harness extract -s claude.json | \\ - agent-eval-harness grade -g ./grader.ts | \\ - agent-eval-harness format -f markdown > report.md -`) - return - } - - const style = values.style as FormatStyle - if (!['jsonl', 'markdown', 'csv'].includes(style)) { - console.error(`Error: Invalid format style '${style}'. Must be: jsonl, markdown, csv`) - process.exit(1) - } - - // Load results from file or stdin - const inputPath = positionals[0] - let results: FormattableResult[] - - if (inputPath) { - results = await loadJsonl(inputPath) - } else { - const stdinResults = await readStdinResults() - if (!stdinResults || stdinResults.length === 0) { - console.error('Error: No results provided (use file argument or pipe to stdin)') - process.exit(1) - } - results = stdinResults - } - - await runFormat(style, results, values.output, values.progress) -} diff --git a/src/pipeline/grade.ts b/src/pipeline/grade.ts deleted file mode 100644 index 16ad736..0000000 --- a/src/pipeline/grade.ts +++ /dev/null @@ -1,175 +0,0 @@ -/** - * Pipeline grade command - apply grader to extracted results. - * - * @remarks - * Takes ExtractedResult from `extract` command and adds grader scores. - * Uses the same grader loading mechanism as the capture command. 
- * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadJsonl, logProgress, writeOutput } from '../core.ts' -import { loadGrader } from '../schemas/grader-loader.ts' -import type { ExtractedResult, GradedResult } from './pipeline.types.ts' - -/** - * Execute pipeline grade with configuration. - * - * @param graderPath - Path to grader module or executable - * @param extractedResults - Extracted results from extract command - * @param outputPath - Optional output file path - * @param progress - Show progress to stderr - */ -export const runGrade = async ( - graderPath: string, - extractedResults: ExtractedResult[], - outputPath?: string, - progress = false, -): Promise => { - // Load grader - const grader = await loadGrader(graderPath) - - logProgress(`Grading with: ${graderPath}`, progress) - - let isFirstOutput = true - - // Clear output file if specified - if (outputPath) { - await Bun.write(outputPath, '') - } - - for (let i = 0; i < extractedResults.length; i++) { - const extracted = extractedResults[i] - if (!extracted) continue - - logProgress(`[${i + 1}/${extractedResults.length}] ${extracted.id}`, progress) - - // Apply grader - const score = await grader({ - input: extracted.input, - output: extracted.output, - hint: extracted.hint, - trajectory: extracted.trajectory, - metadata: extracted.metadata, - cwd: extracted.cwd, - }) - - const graded: GradedResult = { - ...extracted, - score, - } - - // Merge outcome from grader if present - if (score.outcome) { - graded.outcome = score.outcome - } - - const icon = score.pass ? '✓' : '✗' - logProgress(` ${icon} score=${score.score.toFixed(2)}`, progress) - - await writeOutput(JSON.stringify(graded), outputPath, !isFirstOutput) - isFirstOutput = false - } - - logProgress('Done!', progress) -} - -/** - * Read extracted results from stdin. 
- * - * @returns Array of parsed extracted results or null if stdin is empty - */ -const readStdinExtracted = async (): Promise => { - if (process.stdin.isTTY) { - return null - } - - const chunks: Buffer[] = [] - for await (const chunk of process.stdin) { - chunks.push(chunk) - } - - const content = Buffer.concat(chunks).toString('utf-8').trim() - if (!content) return null - - return content - .split('\n') - .filter(Boolean) - .map((line) => JSON.parse(line) as ExtractedResult) -} - -/** - * Pipeline grade command CLI handler. - * - * @param args - Command line arguments (after 'grade') - */ -export const grade = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - grader: { type: 'string', short: 'g' }, - output: { type: 'string', short: 'o' }, - progress: { type: 'boolean', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness grade [extracted.jsonl] --grader [options] - -Apply grader to extracted results. - -Arguments: - extracted.jsonl Input file from 'extract' command (or pipe from stdin) - -Options: - -g, --grader Path to grader (.ts/.js module or executable script) (required) - -o, --output Output file (default: stdout) - --progress Show progress to stderr - -h, --help Show this help message - -Graders: - TS/JS modules must export a 'grade' function. - Executable scripts (Python, etc.) use stdin/stdout JSON protocol. 
- -Examples: - # From file - agent-eval-harness grade extracted.jsonl --grader ./grader.ts -o graded.jsonl - - # Piped from extract - agent-eval-harness extract raw.jsonl -s claude.json | agent-eval-harness grade -g ./grader.ts - - # Full pipeline - cat prompts.jsonl | \\ - agent-eval-harness run -s claude.json | \\ - agent-eval-harness extract -s claude.json | \\ - agent-eval-harness grade -g ./grader.ts > results.jsonl -`) - return - } - - if (!values.grader) { - console.error('Error: --grader is required') - process.exit(1) - } - - // Load extracted results from file or stdin - const inputPath = positionals[0] - let extractedResults: ExtractedResult[] - - if (inputPath) { - extractedResults = await loadJsonl(inputPath) - } else { - const stdinResults = await readStdinExtracted() - if (!stdinResults || stdinResults.length === 0) { - console.error('Error: No extracted results provided (use file argument or pipe to stdin)') - process.exit(1) - } - extractedResults = stdinResults - } - - await runGrade(values.grader, extractedResults, values.output, values.progress) -} diff --git a/src/pipeline/pipeline.ts b/src/pipeline/pipeline.ts deleted file mode 100644 index 6d607a8..0000000 --- a/src/pipeline/pipeline.ts +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Pipeline commands for Unix-style composable evaluation. - * - * @remarks - * Re-exports pipeline commands and types. 
- * - * Commands: - * - run: Execute prompts and output raw results - * - extract: Parse raw output into trajectories - * - grade: Apply grader to extracted results - * - format: Convert results to different output formats - * - compare: Compare multiple runs of the same prompts - * - * @packageDocumentation - */ - -// Commands -export { type CompareStrategy, compare, type ExtendedCompareConfig, runCompare } from './compare.ts' -export { extract } from './extract.ts' -export { format } from './format.ts' -export { grade } from './grade.ts' -// Types -export type { - CompareConfig, - ComparisonGrader, - ComparisonGraderInput, - ComparisonGraderResult, - ComparisonRanking, - ComparisonResult, - ComparisonRunData, - ExtractConfig, - ExtractedResult, - FormatConfig, - FormatStyle, - GradeConfig, - GradedResult, - LabeledRun, - RawOutput, - RunConfig, - RunMode, -} from './pipeline.types.ts' -export { run } from './run.ts' diff --git a/src/pipeline/pipeline.types.ts b/src/pipeline/pipeline.types.ts deleted file mode 100644 index d56a824..0000000 --- a/src/pipeline/pipeline.types.ts +++ /dev/null @@ -1,325 +0,0 @@ -/** - * Type definitions for pipeline commands. - * - * @remarks - * These types define the data flow between pipeline stages: - * run → extract → grade → format - * - * Each stage transforms the data, enabling Unix-style piping. - * - * @packageDocumentation - */ - -import type { GraderResult, TrajectoryStep, TrialEntry } from '../schemas.ts' - -/** - * Raw output from the `run` command. - * - * @remarks - * Captures the raw agent output before trajectory extraction. - * Used when piping `run` output to `extract`. 
- */ -export type RawOutput = { - /** Test case identifier */ - id: string - /** Original prompt input (string for single turn, array for multi-turn) */ - input: string | string[] - /** Grader context hint */ - hint?: string - /** Optional metadata from original prompt */ - metadata?: Record - /** Raw output lines from the agent (JSON strings) */ - rawLines: string[] - /** Timing metadata */ - timing: { - start: number - end: number - total: number - } - /** Error message if execution failed */ - error?: string -} - -/** - * Extracted result from the `extract` command. - * - * @remarks - * Converts raw output lines into structured trajectory and output. - * Ready for grading or formatting. - */ -export type ExtractedResult = { - /** Test case identifier */ - id: string - /** Original prompt input */ - input: string | string[] - /** Grader context hint */ - hint?: string - /** Final agent output (extracted from trajectory) */ - output: string - /** Parsed trajectory steps */ - trajectory: TrajectoryStep[] - /** Whether tool errors were detected */ - toolErrors: boolean - /** Optional metadata from original prompt */ - metadata?: Record - /** Working directory path (optional, for git-based grading) */ - cwd?: string - /** Timing metadata */ - timing: { - start: number - end: number - total: number - } - /** Error message if extraction failed */ - error?: string -} - -/** - * Graded result from the `grade` command. - * - * @remarks - * Adds grader score to extracted result. - * Outcome field is merged from grader result if present. - */ -export type GradedResult = ExtractedResult & { - /** Grader score */ - score: GraderResult - /** Outcome data from grader (if grader returned outcome) */ - outcome?: Record -} - -/** - * Run mode for the pipeline run command. 
- * - * @remarks - * - `schema`: Use headless adapter with schema file - * - `simple`: Use Bun shell with placeholder substitution - * - `shell`: Use Bun shell with PROMPT env variable - */ -export type RunMode = 'schema' | 'simple' | 'shell' - -/** - * Configuration for pipeline run command. - */ -export type RunConfig = { - /** Run mode */ - mode: RunMode - /** Path to schema file (for 'schema' mode) */ - schemaPath?: string - /** Command template (for 'simple' mode) - {} is replaced with prompt */ - simpleCommand?: string - /** Shell template (for 'shell' mode) - $PROMPT env var is available */ - shellTemplate?: string - /** Working directory */ - cwd?: string - /** Timeout per prompt in milliseconds */ - timeout?: number - /** Show progress to stderr */ - progress?: boolean -} - -/** - * Configuration for pipeline extract command. - */ -export type ExtractConfig = { - /** Path to schema file for output parsing */ - schemaPath: string - /** Show progress to stderr */ - progress?: boolean -} - -/** - * Configuration for pipeline grade command. - */ -export type GradeConfig = { - /** Path to grader module or executable */ - graderPath: string - /** Show progress to stderr */ - progress?: boolean -} - -/** - * Output format for pipeline format command. - */ -export type FormatStyle = 'jsonl' | 'markdown' | 'csv' - -/** - * Configuration for pipeline format command. - */ -export type FormatConfig = { - /** Output format style */ - style: FormatStyle - /** Show progress to stderr */ - progress?: boolean -} - -/** - * Labeled run for comparison. - * - * @remarks - * Associates a results file with a human-readable label - * for the compare command output. - */ -export type LabeledRun = { - /** Human-readable label (derived from filename or explicit) */ - label: string - /** Path to results JSONL file */ - path: string -} - -/** - * Run data provided to comparison graders. 
- * - * @remarks - * Extended run data includes optional fields that built-in graders use: - * - `score`: Grader result if the run was previously graded - * - `duration`: Total duration from timing - * - `toolErrors`: Whether tool errors occurred - */ -export type ComparisonRunData = { - /** Final agent output */ - output: string - /** Execution trajectory (optional, varies by adapter) */ - trajectory?: TrajectoryStep[] - /** Grader score (if run was graded) */ - score?: GraderResult - /** Total duration in milliseconds */ - duration?: number - /** Whether tool errors occurred */ - toolErrors?: boolean -} - -/** - * Input to comparison grader function. - * - * @remarks - * Provides all runs' results for a single prompt ID - * so the grader can compare and rank them. - */ -export type ComparisonGraderInput = { - /** Test case identifier */ - id: string - /** Original prompt input */ - input: string | string[] - /** Grader context hint */ - hint?: string - /** Optional metadata from original prompt */ - metadata?: Record - /** Results keyed by run label */ - runs: Record -} - -/** - * Single ranking entry in comparison result. - */ -export type ComparisonRanking = { - /** Run label */ - run: string - /** Rank position (1 = best) */ - rank: number - /** Numeric score */ - score: number -} - -/** - * Result from comparison grader function. - * - * @remarks - * Rankings should be ordered from best to worst. - */ -export type ComparisonGraderResult = { - /** Rankings from best to worst */ - rankings: ComparisonRanking[] - /** Optional reasoning for the rankings */ - reasoning?: string -} - -/** - * Comparison grader function type. - * - * @remarks - * User-provided graders implement this interface to compare - * multiple runs of the same prompt. - */ -export type ComparisonGrader = (params: ComparisonGraderInput) => Promise - -/** - * Configuration for pipeline compare command. 
- */ -export type CompareConfig = { - /** Labeled runs to compare */ - runs: LabeledRun[] - /** Path to comparison grader */ - graderPath: string - /** Output file path */ - outputPath?: string - /** Show progress to stderr */ - progress?: boolean -} - -/** - * Comparison result for a single prompt. - */ -export type ComparisonResult = { - /** Test case identifier */ - id: string - /** Original prompt input */ - input: string | string[] - /** Grader context hint */ - hint?: string - /** Rankings from comparison grader */ - rankings: ComparisonRanking[] - /** Optional reasoning */ - reasoning?: string -} - -// ============================================================================ -// Trials Comparison Types -// ============================================================================ - -/** - * Run data for trials comparison. - * - * @remarks - * Contains the trials-specific metrics (passAtK, passExpK) plus - * the individual trial entries for deeper analysis. - */ -export type TrialsComparisonRunData = { - /** Simple pass rate: passes / k */ - passRate?: number - /** pass@k: probability of at least one pass in k samples */ - passAtK?: number - /** pass^k: probability of all k samples passing */ - passExpK?: number - /** Number of trials (k) */ - k: number - /** Individual trial results */ - trials: TrialEntry[] -} - -/** - * Input to trials comparison grader function. - * - * @remarks - * Provides all runs' trial results for a single prompt ID - * so the grader can compare capability and reliability. - */ -export type TrialsComparisonGraderInput = { - /** Test case identifier */ - id: string - /** Original prompt input */ - input: string | string[] - /** Grader context hint */ - hint?: string - /** Results keyed by run label */ - runs: Record -} - -/** - * Trials comparison grader function type. - * - * @remarks - * User-provided graders implement this interface to compare - * multiple runs of the same prompt using trials data. 
- */ -export type TrialsComparisonGrader = (params: TrialsComparisonGraderInput) => Promise diff --git a/src/pipeline/run.ts b/src/pipeline/run.ts deleted file mode 100644 index 7b5a84a..0000000 --- a/src/pipeline/run.ts +++ /dev/null @@ -1,414 +0,0 @@ -/** - * Pipeline run command - execute prompts and output raw results. - * - * @remarks - * Supports three modes: - * - `schema`: Use headless adapter with schema file (full trajectory capture) - * - `simple`: Use Bun shell with `{}` placeholder for prompt - * - `shell`: Use Bun shell with `$PROMPT` environment variable - * - * Output is RawOutput JSONL suitable for piping to `extract`. - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { loadPrompts, logProgress, writeOutput } from '../core.ts' -import { parseHeadlessConfig } from '../headless/headless.schemas.ts' -import { createSessionManager } from '../headless/headless-session-manager.ts' -import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts' -import type { RawOutput, RunConfig } from './pipeline.types.ts' - -/** - * Execute a single prompt in simple mode. - * - * @remarks - * Replaces `{}` placeholder in command with the prompt text. - * Uses Bun shell for execution. 
- * - * @param prompt - Prompt text to execute - * @param command - Command template with `{}` placeholder - * @param timeout - Execution timeout in milliseconds - * @returns Object with output lines and optional stderr error - */ -const runSimple = async ( - prompt: string, - command: string, - timeout: number, -): Promise<{ lines: string[]; error?: string }> => { - const escapedPrompt = prompt.replace(/'/g, "'\\''") - const finalCmd = command.replace('{}', `'${escapedPrompt}'`) - - const proc = Bun.spawn(['sh', '-c', finalCmd], { - stdout: 'pipe', - stderr: 'pipe', - }) - - const timeoutId = setTimeout(() => proc.kill(), timeout) - - try { - const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()]) - clearTimeout(timeoutId) - const lines = stdout.trim().split('\n').filter(Boolean) - return stderr.trim() ? { lines, error: stderr.trim() } : { lines } - } catch (err) { - clearTimeout(timeoutId) - return { lines: [], error: err instanceof Error ? err.message : String(err) } - } -} - -/** - * Execute a single prompt in shell mode. - * - * @remarks - * Sets PROMPT environment variable and executes shell template. - * - * @param prompt - Prompt text to execute - * @param template - Shell command template - * @param timeout - Execution timeout in milliseconds - * @returns Object with output lines and optional stderr error - */ -const runShell = async ( - prompt: string, - template: string, - timeout: number, -): Promise<{ lines: string[]; error?: string }> => { - const proc = Bun.spawn(['sh', '-c', template], { - stdout: 'pipe', - stderr: 'pipe', - env: { ...process.env, PROMPT: prompt }, - }) - - const timeoutId = setTimeout(() => proc.kill(), timeout) - - try { - const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()]) - clearTimeout(timeoutId) - const lines = stdout.trim().split('\n').filter(Boolean) - return stderr.trim() ? 
{ lines, error: stderr.trim() } : { lines } - } catch (err) { - clearTimeout(timeoutId) - return { lines: [], error: err instanceof Error ? err.message : String(err) } - } -} - -/** - * Execute pipeline run with configuration object. - * - * @remarks - * Processes prompts from stdin (if available) or from a file, - * executing each and outputting RawOutput JSONL. - * - * @param config - Run configuration - * @param prompts - Array of prompts to execute - * @param outputPath - Optional output file path - */ -export const runPipeline = async ( - config: RunConfig, - prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record }>, - outputPath?: string, -): Promise => { - const { - mode, - schemaPath, - simpleCommand, - shellTemplate, - cwd, - timeout = DEFAULT_HARNESS_TIMEOUT, - progress = false, - } = config - - const workingDir = cwd ?? process.cwd() - let isFirstOutput = true - - // Clear output file if specified - if (outputPath) { - await Bun.write(outputPath, '') - } - - if (mode === 'schema') { - // Schema mode: use headless adapter - if (!schemaPath) { - throw new Error('Schema path required for schema mode') - } - - const schemaFile = Bun.file(schemaPath) - if (!(await schemaFile.exists())) { - throw new Error(`Schema file not found: ${schemaPath}`) - } - - const rawSchema = await schemaFile.json() - const schema = parseHeadlessConfig(rawSchema) - - const sessions = createSessionManager({ - schema, - timeout, - verbose: progress, - }) - - logProgress(`Schema mode: ${schema.name}`, progress) - - for (let i = 0; i < prompts.length; i++) { - const promptCase = prompts[i] - if (!promptCase) continue - - logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress) - - const startTime = Date.now() - const rawLines: string[] = [] - let error: string | undefined - - try { - const session = await sessions.create(workingDir) - const inputs = Array.isArray(promptCase.input) ? 
promptCase.input : [promptCase.input] - - for (const turnInput of inputs) { - const result = await sessions.prompt(session.id, turnInput) - // Collect raw JSON lines from updates - for (const update of result.updates) { - rawLines.push(JSON.stringify(update.raw)) - } - } - - sessions.destroy(session.id) - } catch (err) { - error = err instanceof Error ? err.message : String(err) - } - - const endTime = Date.now() - - const output: RawOutput = { - id: promptCase.id, - input: promptCase.input, - hint: promptCase.hint, - metadata: promptCase.metadata, - rawLines, - timing: { - start: startTime, - end: endTime, - total: endTime - startTime, - }, - ...(error && { error }), - } - - await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput) - isFirstOutput = false - } - } else if (mode === 'simple') { - // Simple mode: placeholder substitution - if (!simpleCommand) { - throw new Error('Command required for simple mode') - } - - logProgress(`Simple mode: ${simpleCommand}`, progress) - - for (let i = 0; i < prompts.length; i++) { - const promptCase = prompts[i] - if (!promptCase) continue - - logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress) - - const startTime = Date.now() - const inputs = Array.isArray(promptCase.input) ? 
promptCase.input : [promptCase.input] - const allLines: string[] = [] - const errors: string[] = [] - - for (const input of inputs) { - const result = await runSimple(input, simpleCommand, timeout) - allLines.push(...result.lines) - if (result.error) errors.push(result.error) - } - - const endTime = Date.now() - - const output: RawOutput = { - id: promptCase.id, - input: promptCase.input, - hint: promptCase.hint, - metadata: promptCase.metadata, - rawLines: allLines, - timing: { - start: startTime, - end: endTime, - total: endTime - startTime, - }, - ...(errors.length > 0 && { error: errors.join('\n') }), - } - - await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput) - isFirstOutput = false - } - } else if (mode === 'shell') { - // Shell mode: PROMPT env variable - if (!shellTemplate) { - throw new Error('Shell template required for shell mode') - } - - logProgress(`Shell mode: ${shellTemplate}`, progress) - - for (let i = 0; i < prompts.length; i++) { - const promptCase = prompts[i] - if (!promptCase) continue - - logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress) - - const startTime = Date.now() - const inputs = Array.isArray(promptCase.input) ? 
promptCase.input : [promptCase.input] - const allLines: string[] = [] - const errors: string[] = [] - - for (const input of inputs) { - const result = await runShell(input, shellTemplate, timeout) - allLines.push(...result.lines) - if (result.error) errors.push(result.error) - } - - const endTime = Date.now() - - const output: RawOutput = { - id: promptCase.id, - input: promptCase.input, - hint: promptCase.hint, - metadata: promptCase.metadata, - rawLines: allLines, - timing: { - start: startTime, - end: endTime, - total: endTime - startTime, - }, - ...(errors.length > 0 && { error: errors.join('\n') }), - } - - await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput) - isFirstOutput = false - } - } - - logProgress('Done!', progress) -} - -/** - * Read prompts from stdin if available. - * - * @returns Array of parsed prompts or null if stdin is empty - */ -const readStdinPrompts = async (): Promise | null> => { - // Check if stdin has data (not a TTY) - if (process.stdin.isTTY) { - return null - } - - const chunks: Buffer[] = [] - for await (const chunk of process.stdin) { - chunks.push(chunk) - } - - const content = Buffer.concat(chunks).toString('utf-8').trim() - if (!content) return null - - return content - .split('\n') - .filter(Boolean) - .map((line) => JSON.parse(line)) -} - -/** - * Pipeline run command CLI handler. 
- * - * @param args - Command line arguments (after 'run') - */ -export const run = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - schema: { type: 'string', short: 's' }, - simple: { type: 'string' }, - shell: { type: 'string' }, - output: { type: 'string', short: 'o' }, - cwd: { type: 'string', short: 'c' }, - timeout: { type: 'string', short: 't' }, - progress: { type: 'boolean', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness run [prompts.jsonl] [options] - -Execute prompts and output raw results for pipeline processing. - -Arguments: - prompts.jsonl Input file (or pipe from stdin) - -Modes (choose one): - -s, --schema Path to headless adapter schema (recommended) - --simple Command template with {} placeholder - --shell Shell template with $PROMPT env variable - -Options: - -o, --output Output file (default: stdout) - -c, --cwd Working directory for agent - -t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT}) - --progress Show progress to stderr - -h, --help Show this help message - -Examples: - # Schema mode (recommended) - agent-eval-harness run prompts.jsonl --schema claude.json | agent-eval-harness extract - - # Simple mode with placeholder - agent-eval-harness run prompts.jsonl --simple "claude -p {} --output-format stream-json" - - # Shell mode with env variable - agent-eval-harness run prompts.jsonl --shell 'claude -p "$PROMPT" --output-format stream-json' - - # Pipe from stdin - cat prompts.jsonl | agent-eval-harness run --schema claude.json -`) - return - } - - // Determine mode - let mode: 'schema' | 'simple' | 'shell' - if (values.schema) { - mode = 'schema' - } else if (values.simple) { - mode = 'simple' - } else if (values.shell) { - mode = 'shell' - } else { - console.error('Error: Must specify --schema, --simple, or --shell mode') - process.exit(1) - } - - 
// Load prompts from file or stdin - const promptsPath = positionals[0] - let prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record }> - - if (promptsPath) { - prompts = await loadPrompts(promptsPath) - } else { - const stdinPrompts = await readStdinPrompts() - if (!stdinPrompts || stdinPrompts.length === 0) { - console.error('Error: No prompts provided (use file argument or pipe to stdin)') - process.exit(1) - } - prompts = stdinPrompts - } - - await runPipeline( - { - mode, - schemaPath: values.schema, - simpleCommand: values.simple, - shellTemplate: values.shell, - cwd: values.cwd, - timeout: values.timeout ? Number.parseInt(values.timeout, 10) : undefined, - progress: values.progress, - }, - prompts, - values.output, - ) -} diff --git a/src/pipeline/tests/compare-format-detection.spec.ts b/src/pipeline/tests/compare-format-detection.spec.ts deleted file mode 100644 index 4a958aa..0000000 --- a/src/pipeline/tests/compare-format-detection.spec.ts +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Unit tests for compare format detection. - * - * @remarks - * Tests for auto-detecting CaptureResult vs TrialResult format. 
- * - * @packageDocumentation - */ - -import { afterAll, beforeAll, describe, expect, test } from 'bun:test' -import { detectAndValidateFormat, detectInputFormat } from '../compare-format-detection.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const CAPTURE_RESULT = JSON.stringify({ - id: 'test-001', - input: 'Hello', - output: 'Hi there', - trajectory: [{ type: 'message', content: 'Hi', timestamp: 1234567890 }], - timing: { start: 1234567890, end: 1234567891, total: 1, sessionCreation: 0 }, - metadata: {}, - toolErrors: false, -}) - -const TRIAL_RESULT = JSON.stringify({ - id: 'test-001', - input: 'Hello', - k: 3, - passRate: 0.67, - passAtK: 0.9, - passExpK: 0.3, - trials: [ - { trialNum: 1, output: 'Hi', trajectory: [], duration: 100, pass: true, score: 1.0 }, - { trialNum: 2, output: 'Hello', trajectory: [], duration: 120, pass: true, score: 0.8 }, - { trialNum: 3, output: 'Error', trajectory: [], duration: 150, pass: false, score: 0.2 }, - ], -}) - -const tempDir = `${import.meta.dir}/.test-tmp/format-detection` - -beforeAll(async () => { - await Bun.$`mkdir -p ${tempDir}` -}) - -afterAll(async () => { - await Bun.$`rm -rf ${tempDir}` -}) - -// ============================================================================ -// detectInputFormat Tests -// ============================================================================ - -describe('detectInputFormat', () => { - test('detects CaptureResult format', async () => { - const path = `${tempDir}/capture.jsonl` - await Bun.write(path, `${CAPTURE_RESULT}\n`) - - const format = await detectInputFormat(path) - - expect(format).toBe('capture') - }) - - test('detects TrialResult format', async () => { - const path = `${tempDir}/trial.jsonl` - await Bun.write(path, `${TRIAL_RESULT}\n`) - - const format = await detectInputFormat(path) - - expect(format).toBe('trials') - }) - - 
test('throws on empty file', async () => { - const path = `${tempDir}/empty.jsonl` - await Bun.write(path, '') - - await expect(detectInputFormat(path)).rejects.toThrow('Empty file') - }) - - test('throws on invalid JSON', async () => { - const path = `${tempDir}/invalid.jsonl` - await Bun.write(path, 'not json\n') - - await expect(detectInputFormat(path)).rejects.toThrow('Invalid JSON') - }) - - test('throws on unrecognized format', async () => { - const path = `${tempDir}/unknown.jsonl` - await Bun.write(path, `${JSON.stringify({ id: 'test', foo: 'bar' })}\n`) - - await expect(detectInputFormat(path)).rejects.toThrow('Unable to detect format') - }) - - test('ignores empty lines and uses first non-empty line', async () => { - const path = `${tempDir}/with-empty.jsonl` - await Bun.write(path, `\n\n${CAPTURE_RESULT}\n`) - - const format = await detectInputFormat(path) - - expect(format).toBe('capture') - }) -}) - -// ============================================================================ -// detectAndValidateFormat Tests -// ============================================================================ - -describe('detectAndValidateFormat', () => { - test('validates all files have same format', async () => { - const path1 = `${tempDir}/capture1.jsonl` - const path2 = `${tempDir}/capture2.jsonl` - await Bun.write(path1, `${CAPTURE_RESULT}\n`) - await Bun.write(path2, `${CAPTURE_RESULT}\n`) - - const format = await detectAndValidateFormat([path1, path2]) - - expect(format).toBe('capture') - }) - - test('throws on format mismatch', async () => { - const capturePath = `${tempDir}/capture-mixed.jsonl` - const trialPath = `${tempDir}/trial-mixed.jsonl` - await Bun.write(capturePath, `${CAPTURE_RESULT}\n`) - await Bun.write(trialPath, `${TRIAL_RESULT}\n`) - - await expect(detectAndValidateFormat([capturePath, trialPath])).rejects.toThrow('Format mismatch') - }) - - test('throws on empty file list', async () => { - await 
expect(detectAndValidateFormat([])).rejects.toThrow('No files provided') - }) - - test('works with single file', async () => { - const path = `${tempDir}/single-trial.jsonl` - await Bun.write(path, `${TRIAL_RESULT}\n`) - - const format = await detectAndValidateFormat([path]) - - expect(format).toBe('trials') - }) -}) diff --git a/src/pipeline/tests/compare-statistical.spec.ts b/src/pipeline/tests/compare-statistical.spec.ts deleted file mode 100644 index 8daf9a8..0000000 --- a/src/pipeline/tests/compare-statistical.spec.ts +++ /dev/null @@ -1,289 +0,0 @@ -/** - * Integration tests for compare command statistical strategy. - * - * @remarks - * Tests verify confidence interval computation for the statistical strategy - * in the compare command with CaptureResult format. - * - * @packageDocumentation - */ - -import { afterAll, beforeAll, describe, expect, test } from 'bun:test' -import type { CaptureResult } from '../../schemas.ts' -import { runCompare } from '../compare.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const createCaptureResult = (id: string, score: number, pass: boolean, duration: number = 1000): CaptureResult => ({ - id, - input: `Prompt for ${id}`, - output: `Output for ${id}`, - trajectory: [{ type: 'message', content: `Output for ${id}`, timestamp: Date.now() }], - metadata: {}, - timing: { - start: Date.now(), - end: Date.now() + duration, - sessionCreation: 100, - total: duration, - }, - toolErrors: false, - score: { - pass, - score, - reasoning: pass ? 
'Passed' : 'Failed', - }, -}) - -const tempDir = `${import.meta.dir}/.test-tmp/compare-statistical` - -beforeAll(async () => { - await Bun.$`mkdir -p ${tempDir}` -}) - -afterAll(async () => { - await Bun.$`rm -rf ${tempDir}` -}) - -// ============================================================================ -// Statistical Strategy CI Tests -// ============================================================================ - -describe('runCompare statistical strategy', () => { - test('computes confidence intervals for quality metrics', async () => { - const run1Path = `${tempDir}/ci-qual-run1.jsonl` - const run2Path = `${tempDir}/ci-qual-run2.jsonl` - - // Create multiple prompts with varying scores for meaningful CI computation - const results1 = [ - createCaptureResult('p1', 0.9, true, 1000), - createCaptureResult('p2', 0.85, true, 1100), - createCaptureResult('p3', 0.95, true, 900), - createCaptureResult('p4', 0.8, true, 1200), - ] - const results2 = [ - createCaptureResult('p1', 0.6, false, 2000), - createCaptureResult('p2', 0.5, false, 2100), - createCaptureResult('p3', 0.7, true, 1900), - createCaptureResult('p4', 0.55, false, 2200), - ] - - await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n')) - await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n')) - - const report = await runCompare({ - runs: [ - { label: 'high', path: run1Path }, - { label: 'low', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Verify confidence intervals are computed for quality - const highQuality = report.quality.high - expect(highQuality).toBeDefined() - expect(highQuality?.confidenceIntervals).toBeDefined() - expect(highQuality?.confidenceIntervals?.avgScore).toBeDefined() - expect(highQuality?.confidenceIntervals?.passRate).toBeDefined() - - // avgScore CI should be a tuple [lower, upper] - const avgScoreCI = highQuality?.confidenceIntervals?.avgScore - expect(avgScoreCI).toHaveLength(2) - 
expect(avgScoreCI?.[0]).toBeLessThanOrEqual(avgScoreCI?.[1] ?? 0) - - // CI should contain the average (within reasonable bounds) - expect(avgScoreCI?.[0]).toBeLessThanOrEqual(highQuality?.avgScore ?? 0) - expect(avgScoreCI?.[1]).toBeGreaterThanOrEqual(highQuality?.avgScore ?? 1) - - // passRate CI should also be valid - const passRateCI = highQuality?.confidenceIntervals?.passRate - expect(passRateCI).toHaveLength(2) - expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0) - - // Verify reliability metrics include type discriminator - expect(report.reliability.high?.type).toBe('run') - expect(report.reliability.low?.type).toBe('run') - - // Verify quality metrics include type discriminator - expect(report.quality.high?.type).toBe('run') - expect(report.quality.low?.type).toBe('run') - }) - - test('computes confidence intervals for performance metrics', async () => { - const run1Path = `${tempDir}/ci-perf-run1.jsonl` - const run2Path = `${tempDir}/ci-perf-run2.jsonl` - - // Create results with varying latencies - const results1 = [ - createCaptureResult('p1', 0.9, true, 1000), - createCaptureResult('p2', 0.85, true, 1100), - createCaptureResult('p3', 0.95, true, 900), - createCaptureResult('p4', 0.8, true, 1050), - ] - const results2 = [ - createCaptureResult('p1', 0.7, true, 2000), - createCaptureResult('p2', 0.65, true, 2200), - createCaptureResult('p3', 0.75, true, 1800), - createCaptureResult('p4', 0.6, true, 2100), - ] - - await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n')) - await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n')) - - const report = await runCompare({ - runs: [ - { label: 'fast', path: run1Path }, - { label: 'slow', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Verify confidence intervals are computed for performance - const fastPerf = report.performance.fast - expect(fastPerf).toBeDefined() - expect(fastPerf?.confidenceIntervals).toBeDefined() - 
expect(fastPerf?.confidenceIntervals?.latencyMean).toBeDefined() - - // latencyMean CI should be a tuple [lower, upper] - const latencyCI = fastPerf?.confidenceIntervals?.latencyMean - expect(latencyCI).toHaveLength(2) - expect(latencyCI?.[0]).toBeLessThanOrEqual(latencyCI?.[1] ?? 0) - - // Fast run should have lower latency CI than slow run - const slowPerf = report.performance.slow - const slowLatencyCI = slowPerf?.confidenceIntervals?.latencyMean - expect(latencyCI?.[1]).toBeLessThan(slowLatencyCI?.[0] ?? 0) - }) - - test('weighted strategy does not compute confidence intervals', async () => { - const run1Path = `${tempDir}/no-ci-run1.jsonl` - const run2Path = `${tempDir}/no-ci-run2.jsonl` - - const results1 = [createCaptureResult('p1', 0.9, true), createCaptureResult('p2', 0.85, true)] - const results2 = [createCaptureResult('p1', 0.6, false), createCaptureResult('p2', 0.5, false)] - - await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n')) - await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n')) - - const report = await runCompare({ - runs: [ - { label: 'run1', path: run1Path }, - { label: 'run2', path: run2Path }, - ], - strategy: 'weighted', // Default strategy - progress: false, - }) - - // Confidence intervals should NOT be present for weighted strategy - const quality = report.quality.run1 - expect(quality?.confidenceIntervals).toBeUndefined() - - const perf = report.performance.run1 - expect(perf?.confidenceIntervals).toBeUndefined() - }) - - test('statistical strategy includes CIs in markdown output', async () => { - const run1Path = `${tempDir}/ci-md-run1.jsonl` - const run2Path = `${tempDir}/ci-md-run2.jsonl` - const outputPath = `${tempDir}/ci-report.md` - - const results1 = [createCaptureResult('p1', 0.9, true, 1000), createCaptureResult('p2', 0.85, true, 1100)] - const results2 = [createCaptureResult('p1', 0.6, false, 2000), createCaptureResult('p2', 0.5, false, 2100)] - - await Bun.write(run1Path, 
results1.map((r) => JSON.stringify(r)).join('\n')) - await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n')) - - await runCompare({ - runs: [ - { label: 'agent1', path: run1Path }, - { label: 'agent2', path: run2Path }, - ], - strategy: 'statistical', - outputPath, - format: 'markdown', - progress: false, - }) - - const content = await Bun.file(outputPath).text() - - // Markdown should include 95% CI column headers - expect(content).toContain('95% CI') - // Should contain CI values in bracket format [lower, upper] - expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/) - }) - - test('handles single sample gracefully with degenerate CI', async () => { - const run1Path = `${tempDir}/single-run1.jsonl` - const run2Path = `${tempDir}/single-run2.jsonl` - - // Single sample per run - const result1 = createCaptureResult('p1', 0.9, true) - const result2 = createCaptureResult('p1', 0.5, false) - - await Bun.write(run1Path, JSON.stringify(result1)) - await Bun.write(run2Path, JSON.stringify(result2)) - - const report = await runCompare({ - runs: [ - { label: 'single1', path: run1Path }, - { label: 'single2', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Should still compute CIs (they will be degenerate for single sample) - const quality = report.quality.single1 - expect(quality?.confidenceIntervals).toBeDefined() - expect(quality?.confidenceIntervals?.avgScore).toBeDefined() - - // For single sample, CI should collapse to the value - const ci = quality?.confidenceIntervals?.avgScore - expect(ci?.[0]).toBeCloseTo(ci?.[1] ?? 0, 2) - expect(ci?.[0]).toBeCloseTo(quality?.avgScore ?? 
0, 2) - }) - - test('JSON output includes confidence intervals structure', async () => { - const run1Path = `${tempDir}/json-ci-run1.jsonl` - const run2Path = `${tempDir}/json-ci-run2.jsonl` - const outputPath = `${tempDir}/ci-report.json` - - const results1 = [ - createCaptureResult('p1', 0.9, true), - createCaptureResult('p2', 0.85, true), - createCaptureResult('p3', 0.95, true), - ] - const results2 = [ - createCaptureResult('p1', 0.6, false), - createCaptureResult('p2', 0.5, false), - createCaptureResult('p3', 0.7, true), - ] - - await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n')) - await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n')) - - await runCompare({ - runs: [ - { label: 'high', path: run1Path }, - { label: 'low', path: run2Path }, - ], - strategy: 'statistical', - outputPath, - format: 'json', - progress: false, - }) - - const content = await Bun.file(outputPath).text() - const parsed = JSON.parse(content) - - // Verify JSON structure includes confidenceIntervals - expect(parsed.quality.high.confidenceIntervals).toBeDefined() - expect(parsed.quality.high.confidenceIntervals.avgScore).toBeInstanceOf(Array) - expect(parsed.quality.high.confidenceIntervals.avgScore.length).toBe(2) - expect(parsed.performance.high.confidenceIntervals).toBeDefined() - expect(parsed.performance.high.confidenceIntervals.latencyMean).toBeInstanceOf(Array) - }) -}) diff --git a/src/pipeline/tests/compare-trials.spec.ts b/src/pipeline/tests/compare-trials.spec.ts deleted file mode 100644 index 9064344..0000000 --- a/src/pipeline/tests/compare-trials.spec.ts +++ /dev/null @@ -1,592 +0,0 @@ -/** - * Unit tests for trials comparison module. - * - * @remarks - * Tests for runTrialsCompare and supporting functions. 
- * - * @packageDocumentation - */ - -import { afterAll, beforeAll, describe, expect, test } from 'bun:test' -import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts' - -// ============================================================================ -// Test Fixtures -// ============================================================================ - -const createTrialResult = ( - id: string, - passAtK: number, - passExpK: number, - k: number = 3, - includeScores: boolean = true, -) => ({ - id, - input: `Prompt for ${id}`, - k, - ...(includeScores && { passRate: passAtK, passAtK, passExpK }), - trials: Array.from({ length: k }, (_, i) => ({ - trialNum: i + 1, - output: `Output ${i + 1}`, - trajectory: [], - duration: 100 + i * 10, - ...(includeScores && { pass: Math.random() < passAtK, score: passAtK }), - })), -}) - -const tempDir = `${import.meta.dir}/.test-tmp/compare-trials` - -beforeAll(async () => { - await Bun.$`mkdir -p ${tempDir}` -}) - -afterAll(async () => { - await Bun.$`rm -rf ${tempDir}` -}) - -// ============================================================================ -// buildTrialsIndex Tests -// ============================================================================ - -describe('buildTrialsIndex', () => { - test('builds index from JSONL file', async () => { - const path = `${tempDir}/trials-index.jsonl` - const trial1 = createTrialResult('test-001', 0.9, 0.3) - const trial2 = createTrialResult('test-002', 0.8, 0.6) - await Bun.write(path, [JSON.stringify(trial1), JSON.stringify(trial2)].join('\n')) - - const index = await buildTrialsIndex(path) - - expect(index.size).toBe(2) - expect(index.get('test-001')?.passAtK).toBe(0.9) - expect(index.get('test-002')?.passExpK).toBe(0.6) - }) - - test('handles empty file', async () => { - const path = `${tempDir}/empty-trials.jsonl` - await Bun.write(path, '') - - const index = await buildTrialsIndex(path) - - expect(index.size).toBe(0) - }) - - test('throws on invalid JSON', async () 
=> { - const path = `${tempDir}/invalid-trials.jsonl` - await Bun.write(path, 'not json\n') - - await expect(buildTrialsIndex(path)).rejects.toThrow() - }) -}) - -// ============================================================================ -// runTrialsCompare Tests -// ============================================================================ - -describe('runTrialsCompare', () => { - test('compares two trial runs and produces report', async () => { - const run1Path = `${tempDir}/run1.jsonl` - const run2Path = `${tempDir}/run2.jsonl` - - const trial1a = createTrialResult('test-001', 0.9, 0.7) - const trial1b = createTrialResult('test-002', 0.8, 0.5) - const trial2a = createTrialResult('test-001', 0.95, 0.9) - const trial2b = createTrialResult('test-002', 0.6, 0.4) - - await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n')) - await Bun.write(run2Path, [JSON.stringify(trial2a), JSON.stringify(trial2b)].join('\n')) - - const outputPath = `${tempDir}/comparison.json` - const report = await runTrialsCompare({ - runs: [ - { label: 'baseline', path: run1Path }, - { label: 'variant', path: run2Path }, - ], - outputPath, - progress: false, - }) - - expect(report.meta.inputFormat).toBe('trials') - expect(report.meta.runs).toEqual(['baseline', 'variant']) - expect(report.meta.promptCount).toBe(2) - expect(report.capability).toBeDefined() - expect(report.reliability).toBeDefined() - expect(report.reliability.baseline?.type).toBe('trial') - expect(report.reliability.variant?.type).toBe('trial') - expect(report.flakiness).toBeDefined() - expect(report.headToHead.capability.length).toBeGreaterThan(0) - - // Verify output file was written - const outputExists = await Bun.file(outputPath).exists() - expect(outputExists).toBe(true) - }) - - test('throws with fewer than 2 runs', async () => { - const run1Path = `${tempDir}/single-run.jsonl` - await Bun.write(run1Path, JSON.stringify(createTrialResult('test-001', 0.9, 0.7))) - - await expect( - 
runTrialsCompare({ - runs: [{ label: 'only', path: run1Path }], - progress: false, - }), - ).rejects.toThrow('At least 2 runs required') - }) - - test('skips prompts only in one run', async () => { - const run1Path = `${tempDir}/partial1.jsonl` - const run2Path = `${tempDir}/partial2.jsonl` - - // Only run1 has test-001 - const trial1a = createTrialResult('test-001', 0.9, 0.7) - // Both have test-002 - const trial1b = createTrialResult('test-002', 0.8, 0.5) - const trial2b = createTrialResult('test-002', 0.6, 0.4) - - await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n')) - await Bun.write(run2Path, JSON.stringify(trial2b)) - - const report = await runTrialsCompare({ - runs: [ - { label: 'run1', path: run1Path }, - { label: 'run2', path: run2Path }, - ], - progress: false, - }) - - // Only test-002 should be compared (both runs have it) - expect(report.headToHead.overall.length).toBeGreaterThan(0) - // Per-prompt should only have test-002 - const perPromptIds = report.perPrompt?.map((p) => p.id) ?? 
[] - expect(perPromptIds).toContain('test-002') - expect(perPromptIds).not.toContain('test-001') - }) - - test('generates markdown output when format is markdown', async () => { - const run1Path = `${tempDir}/md-run1.jsonl` - const run2Path = `${tempDir}/md-run2.jsonl` - const outputPath = `${tempDir}/report.md` - - const trial1 = createTrialResult('test-001', 0.9, 0.7) - const trial2 = createTrialResult('test-001', 0.8, 0.6) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - await runTrialsCompare({ - runs: [ - { label: 'agent1', path: run1Path }, - { label: 'agent2', path: run2Path }, - ], - outputPath, - format: 'markdown', - progress: false, - }) - - const content = await Bun.file(outputPath).text() - expect(content).toContain('# Trials Comparison Report') - expect(content).toContain('## Capability') - expect(content).toContain('## Reliability') - expect(content).toContain('## Flakiness') - expect(content).toContain('agent1') - expect(content).toContain('agent2') - }) - - test('uses statistical strategy when specified', async () => { - const run1Path = `${tempDir}/stat-run1.jsonl` - const run2Path = `${tempDir}/stat-run2.jsonl` - - const trial1 = createTrialResult('test-001', 0.9, 0.7) - const trial2 = createTrialResult('test-001', 0.5, 0.3) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - const report = await runTrialsCompare({ - runs: [ - { label: 'better', path: run1Path }, - { label: 'worse', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Report should be generated without error - expect(report.meta.runs).toEqual(['better', 'worse']) - }) - - test('statistical strategy computes confidence intervals for capability metrics', async () => { - const run1Path = `${tempDir}/ci-cap-run1.jsonl` - const run2Path = `${tempDir}/ci-cap-run2.jsonl` - - // Create multiple prompts for meaningful CI computation - const 
trials1 = [ - createTrialResult('p1', 0.9, 0.8), - createTrialResult('p2', 0.85, 0.7), - createTrialResult('p3', 0.95, 0.9), - ] - const trials2 = [ - createTrialResult('p1', 0.6, 0.4), - createTrialResult('p2', 0.5, 0.3), - createTrialResult('p3', 0.7, 0.5), - ] - - await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n')) - await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n')) - - const report = await runTrialsCompare({ - runs: [ - { label: 'high', path: run1Path }, - { label: 'low', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Verify confidence intervals are computed for capability - const highCap = report.capability.high - expect(highCap).toBeDefined() - expect(highCap?.confidenceIntervals).toBeDefined() - expect(highCap?.confidenceIntervals?.avgPassAtK).toBeDefined() - - // CI should be a tuple [lower, upper] - const ci = highCap?.confidenceIntervals?.avgPassAtK - expect(ci).toHaveLength(2) - expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0) - - // CI should contain the average (within reasonable bounds) - expect(ci?.[0]).toBeLessThanOrEqual(highCap?.avgPassAtK ?? 0) - expect(ci?.[1]).toBeGreaterThanOrEqual(highCap?.avgPassAtK ?? 
1) - }) - - test('statistical strategy computes confidence intervals for reliability metrics', async () => { - const run1Path = `${tempDir}/ci-rel-run1.jsonl` - const run2Path = `${tempDir}/ci-rel-run2.jsonl` - - const trials1 = [ - createTrialResult('p1', 0.9, 0.85), - createTrialResult('p2', 0.8, 0.75), - createTrialResult('p3', 0.85, 0.8), - ] - const trials2 = [ - createTrialResult('p1', 0.7, 0.3), - createTrialResult('p2', 0.6, 0.2), - createTrialResult('p3', 0.65, 0.25), - ] - - await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n')) - await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n')) - - const report = await runTrialsCompare({ - runs: [ - { label: 'reliable', path: run1Path }, - { label: 'flaky', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Verify confidence intervals are computed for reliability - const reliableRel = report.reliability.reliable - expect(reliableRel).toBeDefined() - expect(reliableRel?.type).toBe('trial') - expect(reliableRel?.confidenceIntervals).toBeDefined() - expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined() - - // CI should be a tuple [lower, upper] - const ci = reliableRel?.confidenceIntervals?.avgPassExpK - expect(ci).toHaveLength(2) - expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 
0) - }) - - test('weighted strategy does not compute confidence intervals', async () => { - const run1Path = `${tempDir}/no-ci-run1.jsonl` - const run2Path = `${tempDir}/no-ci-run2.jsonl` - - const trial1 = createTrialResult('test-001', 0.9, 0.7) - const trial2 = createTrialResult('test-001', 0.5, 0.3) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - const report = await runTrialsCompare({ - runs: [ - { label: 'run1', path: run1Path }, - { label: 'run2', path: run2Path }, - ], - strategy: 'weighted', // Default strategy - progress: false, - }) - - // Confidence intervals should NOT be present for weighted strategy - const cap = report.capability.run1 - expect(cap?.confidenceIntervals).toBeUndefined() - - const rel = report.reliability.run1 - expect(rel?.confidenceIntervals).toBeUndefined() - }) - - test('statistical strategy includes CIs in markdown output', async () => { - const run1Path = `${tempDir}/ci-md-run1.jsonl` - const run2Path = `${tempDir}/ci-md-run2.jsonl` - const outputPath = `${tempDir}/ci-report.md` - - const trials1 = [createTrialResult('p1', 0.9, 0.8), createTrialResult('p2', 0.85, 0.75)] - const trials2 = [createTrialResult('p1', 0.6, 0.4), createTrialResult('p2', 0.5, 0.3)] - - await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n')) - await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n')) - - await runTrialsCompare({ - runs: [ - { label: 'agent1', path: run1Path }, - { label: 'agent2', path: run2Path }, - ], - strategy: 'statistical', - outputPath, - format: 'markdown', - progress: false, - }) - - const content = await Bun.file(outputPath).text() - - // Markdown should include 95% CI column headers - expect(content).toContain('95% CI') - // Should contain CI values in bracket format [lower, upper] - expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/) - }) - - test('computes correct capability metrics', async () => { - const run1Path = 
`${tempDir}/cap-run1.jsonl` - - // Create 3 prompts with known passAtK values - const trials = [ - createTrialResult('p1', 1.0, 0.8), // passAtK = 1.0 - createTrialResult('p2', 0.5, 0.3), // passAtK = 0.5 - createTrialResult('p3', 0.8, 0.6), // passAtK = 0.8 - ] - // Average passAtK = (1.0 + 0.5 + 0.8) / 3 = 0.767 - // Sorted: 0.5, 0.8, 1.0 -> median = 0.8 - - await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n')) - - const run2Path = `${tempDir}/cap-run2.jsonl` - await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n')) - - const report = await runTrialsCompare({ - runs: [ - { label: 'test', path: run1Path }, - { label: 'test2', path: run2Path }, - ], - progress: false, - }) - - const cap = report.capability.test - expect(cap).toBeDefined() - // Average should be approximately 0.767 - expect(cap?.avgPassAtK).toBeCloseTo(0.767, 2) - // Median of [0.5, 0.8, 1.0] = 0.8 - expect(cap?.medianPassAtK).toBeCloseTo(0.8, 2) - }) - - test('identifies flaky prompts correctly', async () => { - const run1Path = `${tempDir}/flaky-run1.jsonl` - - // Create prompts with varying flakiness - const trials = [ - createTrialResult('consistent', 0.9, 0.9), // flakiness = 0 - createTrialResult('flaky', 0.9, 0.1), // flakiness = 0.8 - createTrialResult('moderate', 0.7, 0.5), // flakiness = 0.2 - ] - - await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n')) - - const run2Path = `${tempDir}/flaky-run2.jsonl` - await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n')) - - const report = await runTrialsCompare({ - runs: [ - { label: 'test', path: run1Path }, - { label: 'test2', path: run2Path }, - ], - progress: false, - }) - - const flak = report.flakiness.test - expect(flak).toBeDefined() - // 2 prompts have non-zero flakiness - expect(flak?.flakyPromptCount).toBe(2) - // Top flaky should include 'flaky' prompt - const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? 
[] - expect(topFlakyIds).toContain('flaky') - }) - - test('includes performance metrics with latency stats', async () => { - const run1Path = `${tempDir}/perf-run1.jsonl` - const run2Path = `${tempDir}/perf-run2.jsonl` - - const trial1 = createTrialResult('test-001', 0.9, 0.7) - const trial2 = createTrialResult('test-001', 0.8, 0.6) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - const report = await runTrialsCompare({ - runs: [ - { label: 'run1', path: run1Path }, - { label: 'run2', path: run2Path }, - ], - progress: false, - }) - - // Performance should always be present - expect(report.performance).toBeDefined() - expect(report.performance.run1).toBeDefined() - expect(report.performance.run2).toBeDefined() - - const perf = report.performance.run1 - expect(perf?.latency).toBeDefined() - expect(perf?.latency.p50).toBeGreaterThan(0) - expect(perf?.latency.mean).toBeGreaterThan(0) - expect(perf?.latency.min).toBeGreaterThan(0) - expect(perf?.latency.max).toBeGreaterThan(0) - expect(perf?.totalDuration).toBeGreaterThan(0) - }) - - test('includes quality metrics when scores are present', async () => { - const run1Path = `${tempDir}/qual-run1.jsonl` - const run2Path = `${tempDir}/qual-run2.jsonl` - - // createTrialResult always includes score fields - const trial1 = createTrialResult('test-001', 0.9, 0.7) - const trial2 = createTrialResult('test-001', 0.8, 0.6) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - const report = await runTrialsCompare({ - runs: [ - { label: 'run1', path: run1Path }, - { label: 'run2', path: run2Path }, - ], - progress: false, - }) - - // Quality should be present since trials have scores - expect(report.quality).toBeDefined() - expect(report.quality?.run1).toBeDefined() - - const qual = report.quality?.run1 - expect(qual?.type).toBe('trial') - expect(qual?.avgScore).toBeGreaterThan(0) - 
expect(qual?.medianScore).toBeGreaterThan(0) - expect(qual?.p25Score).toBeDefined() - expect(qual?.p75Score).toBeDefined() - }) - - test('omits quality metrics when scores are absent', async () => { - const run1Path = `${tempDir}/noqual-run1.jsonl` - const run2Path = `${tempDir}/noqual-run2.jsonl` - - // Create trials without scores (includeScores=false) - const trial1 = createTrialResult('test-001', 0, 0, 3, false) - const trial2 = createTrialResult('test-001', 0, 0, 3, false) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - const report = await runTrialsCompare({ - runs: [ - { label: 'run1', path: run1Path }, - { label: 'run2', path: run2Path }, - ], - progress: false, - }) - - // Quality should NOT be present since no trials have scores - expect(report.quality).toBeUndefined() - - // Performance should still be present - expect(report.performance).toBeDefined() - expect(report.performance.run1?.latency.mean).toBeGreaterThan(0) - }) - - test('statistical strategy computes CIs for quality and performance', async () => { - const run1Path = `${tempDir}/ci-qp-run1.jsonl` - const run2Path = `${tempDir}/ci-qp-run2.jsonl` - - const trials1 = [ - createTrialResult('p1', 0.9, 0.8), - createTrialResult('p2', 0.85, 0.7), - createTrialResult('p3', 0.95, 0.9), - ] - const trials2 = [ - createTrialResult('p1', 0.6, 0.4), - createTrialResult('p2', 0.5, 0.3), - createTrialResult('p3', 0.7, 0.5), - ] - - await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n')) - await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n')) - - const report = await runTrialsCompare({ - runs: [ - { label: 'high', path: run1Path }, - { label: 'low', path: run2Path }, - ], - strategy: 'statistical', - progress: false, - }) - - // Quality CIs - const highQual = report.quality?.high - expect(highQual).toBeDefined() - expect(highQual?.confidenceIntervals).toBeDefined() - 
expect(highQual?.confidenceIntervals?.avgScore).toBeDefined() - - const qualCI = highQual?.confidenceIntervals?.avgScore - expect(qualCI).toHaveLength(2) - expect(qualCI?.[0]).toBeLessThanOrEqual(qualCI?.[1] ?? 0) - - // Performance CIs - const highPerf = report.performance.high - expect(highPerf).toBeDefined() - expect(highPerf?.confidenceIntervals).toBeDefined() - expect(highPerf?.confidenceIntervals?.latencyMean).toBeDefined() - - const perfCI = highPerf?.confidenceIntervals?.latencyMean - expect(perfCI).toHaveLength(2) - expect(perfCI?.[0]).toBeLessThanOrEqual(perfCI?.[1] ?? 0) - }) - - test('markdown output includes quality and performance tables', async () => { - const run1Path = `${tempDir}/md-qp-run1.jsonl` - const run2Path = `${tempDir}/md-qp-run2.jsonl` - const outputPath = `${tempDir}/qp-report.md` - - const trial1 = createTrialResult('test-001', 0.9, 0.7) - const trial2 = createTrialResult('test-001', 0.8, 0.6) - - await Bun.write(run1Path, JSON.stringify(trial1)) - await Bun.write(run2Path, JSON.stringify(trial2)) - - await runTrialsCompare({ - runs: [ - { label: 'agent1', path: run1Path }, - { label: 'agent2', path: run2Path }, - ], - outputPath, - format: 'markdown', - progress: false, - }) - - const content = await Bun.file(outputPath).text() - - // Should contain quality and performance sections - expect(content).toContain('## Quality (Scores)') - expect(content).toContain('## Performance (Latency)') - expect(content).toContain('Avg Score') - expect(content).toContain('P50 (ms)') - expect(content).toContain('Mean (ms)') - }) -}) diff --git a/src/pipeline/tests/compare-utils.spec.ts b/src/pipeline/tests/compare-utils.spec.ts deleted file mode 100644 index 31cf933..0000000 --- a/src/pipeline/tests/compare-utils.spec.ts +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Unit tests for compare-utils shared utilities. - * - * @remarks - * Tests for percentile, computeLatencyStats, and computeScoreDistribution. 
- * - * @packageDocumentation - */ - -import { describe, expect, test } from 'bun:test' -import { computeLatencyStats, computeScoreDistribution, percentile } from '../compare-utils.ts' - -// ============================================================================ -// percentile Tests -// ============================================================================ - -describe('percentile', () => { - test('computes correct percentile values', () => { - const sorted = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] - - expect(percentile(sorted, 0.5)).toBe(60) - expect(percentile(sorted, 0.25)).toBe(30) - expect(percentile(sorted, 0.75)).toBe(80) - expect(percentile(sorted, 0.9)).toBe(100) - }) - - test('returns 0 for empty array', () => { - expect(percentile([], 0.5)).toBe(0) - }) - - test('handles single-element array', () => { - expect(percentile([42], 0.5)).toBe(42) - expect(percentile([42], 0.0)).toBe(42) - expect(percentile([42], 1.0)).toBe(42) - }) - - test('handles p=0 and p=1 boundary values', () => { - const sorted = [10, 20, 30] - - expect(percentile(sorted, 0)).toBe(10) - expect(percentile(sorted, 1)).toBe(30) - }) -}) - -// ============================================================================ -// computeLatencyStats Tests -// ============================================================================ - -describe('computeLatencyStats', () => { - test('returns correct stats for typical durations', () => { - const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000] - const stats = computeLatencyStats(durations) - - expect(stats.min).toBe(100) - expect(stats.max).toBe(1000) - expect(stats.mean).toBe(550) - expect(stats.p50).toBe(600) - expect(stats.p90).toBe(1000) - }) - - test('returns zeros for empty array', () => { - const stats = computeLatencyStats([]) - - expect(stats.p50).toBe(0) - expect(stats.p90).toBe(0) - expect(stats.p99).toBe(0) - expect(stats.mean).toBe(0) - expect(stats.min).toBe(0) - expect(stats.max).toBe(0) - }) - - 
test('handles single-element array', () => { - const stats = computeLatencyStats([42]) - - expect(stats.p50).toBe(42) - expect(stats.p90).toBe(42) - expect(stats.mean).toBe(42) - expect(stats.min).toBe(42) - expect(stats.max).toBe(42) - }) - - test('sorts unsorted input', () => { - const stats = computeLatencyStats([500, 100, 300, 200, 400]) - - expect(stats.min).toBe(100) - expect(stats.max).toBe(500) - expect(stats.mean).toBe(300) - }) -}) - -// ============================================================================ -// computeScoreDistribution Tests -// ============================================================================ - -describe('computeScoreDistribution', () => { - test('distributes scores into correct buckets', () => { - const scores = [0.1, 0.3, 0.5, 0.7, 0.9] - const dist = computeScoreDistribution(scores) - - expect(dist['0.0-0.2']).toBe(1) - expect(dist['0.2-0.4']).toBe(1) - expect(dist['0.4-0.6']).toBe(1) - expect(dist['0.6-0.8']).toBe(1) - expect(dist['0.8-1.0']).toBe(1) - }) - - test('handles empty scores array', () => { - const dist = computeScoreDistribution([]) - - expect(dist['0.0-0.2']).toBe(0) - expect(dist['0.2-0.4']).toBe(0) - expect(dist['0.4-0.6']).toBe(0) - expect(dist['0.6-0.8']).toBe(0) - expect(dist['0.8-1.0']).toBe(0) - }) - - test('handles boundary values correctly', () => { - // 0.0 → first bucket, 0.2 → second bucket (not first), 1.0 → last bucket - const scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] - const dist = computeScoreDistribution(scores) - - expect(dist['0.0-0.2']).toBe(1) // 0.0 - expect(dist['0.2-0.4']).toBe(1) // 0.2 - expect(dist['0.4-0.6']).toBe(1) // 0.4 - expect(dist['0.6-0.8']).toBe(1) // 0.6 - expect(dist['0.8-1.0']).toBe(2) // 0.8, 1.0 - }) -}) diff --git a/src/pipeline/tests/pipeline.spec.ts b/src/pipeline/tests/pipeline.spec.ts deleted file mode 100644 index 2a1fb30..0000000 --- a/src/pipeline/tests/pipeline.spec.ts +++ /dev/null @@ -1,356 +0,0 @@ -/** - * Unit tests for pipeline commands. 
- * - * @remarks - * Tests for the Unix-style pipeline commands: - * - format: formatMarkdown, formatCsv helpers - * - compare: parseLabeledRun helper - * - type validation - * - * @packageDocumentation - */ - -import { describe, expect, test } from 'bun:test' -import type { - ComparisonGraderInput, - ComparisonGraderResult, - ExtractedResult, - FormatStyle, - GradedResult, - LabeledRun, - RawOutput, -} from '../pipeline.types.ts' - -// ============================================================================ -// Type Validation Tests -// ============================================================================ - -describe('RawOutput type', () => { - test('accepts valid raw output', () => { - const raw: RawOutput = { - id: 'test-001', - input: 'What is 2+2?', - rawLines: ['{"type":"message","content":"4"}'], - timing: { - start: 1000, - end: 2000, - total: 1000, - }, - } - expect(raw.id).toBe('test-001') - expect(raw.timing.total).toBe(1000) - }) - - test('accepts array input for multi-turn', () => { - const raw: RawOutput = { - id: 'multi-001', - input: ['Hello', 'How are you?'], - rawLines: [], - timing: { start: 0, end: 100, total: 100 }, - } - expect(Array.isArray(raw.input)).toBe(true) - expect((raw.input as string[]).length).toBe(2) - }) - - test('accepts optional hint', () => { - const raw: RawOutput = { - id: 'hint-001', - input: 'Calculate something', - hint: 'Expected: numeric answer', - rawLines: [], - timing: { start: 0, end: 0, total: 0 }, - } - expect(raw.hint).toBe('Expected: numeric answer') - }) - - test('accepts optional error', () => { - const raw: RawOutput = { - id: 'error-001', - input: 'fail test', - rawLines: [], - timing: { start: 0, end: 100, total: 100 }, - error: 'Timeout exceeded', - } - expect(raw.error).toBe('Timeout exceeded') - }) -}) - -describe('ExtractedResult type', () => { - test('accepts valid extracted result', () => { - const extracted: ExtractedResult = { - id: 'test-001', - input: 'What is 2+2?', - output: '4', - 
trajectory: [ - { - type: 'message', - content: '4', - timestamp: 100, - }, - ], - toolErrors: false, - timing: { start: 0, end: 100, total: 100 }, - } - expect(extracted.output).toBe('4') - expect(extracted.trajectory.length).toBe(1) - expect(extracted.toolErrors).toBe(false) - }) - - test('accepts thought and tool_call steps', () => { - const extracted: ExtractedResult = { - id: 'complex-001', - input: 'Create a file', - output: 'Done', - trajectory: [ - { type: 'thought', content: 'I need to create a file', timestamp: 50 }, - { - type: 'tool_call', - name: 'Write', - input: { path: '/tmp/test.txt', content: 'hello' }, - status: 'completed', - timestamp: 200, - }, - { type: 'message', content: 'Done', timestamp: 250 }, - ], - toolErrors: false, - timing: { start: 0, end: 300, total: 300 }, - } - expect(extracted.trajectory.length).toBe(3) - expect(extracted.trajectory[1]?.type).toBe('tool_call') - }) -}) - -describe('GradedResult type', () => { - test('extends ExtractedResult with score', () => { - const graded: GradedResult = { - id: 'graded-001', - input: 'What is 2+2?', - output: '4', - trajectory: [], - toolErrors: false, - timing: { start: 0, end: 100, total: 100 }, - score: { - pass: true, - score: 1.0, - reasoning: 'Correct answer', - }, - } - expect(graded.score.pass).toBe(true) - expect(graded.score.score).toBe(1.0) - expect(graded.score.reasoning).toBe('Correct answer') - }) - - test('accepts failing score', () => { - const graded: GradedResult = { - id: 'fail-001', - input: 'What is 2+2?', - output: '5', - trajectory: [], - toolErrors: false, - timing: { start: 0, end: 100, total: 100 }, - score: { - pass: false, - score: 0.0, - reasoning: 'Incorrect answer', - }, - } - expect(graded.score.pass).toBe(false) - expect(graded.score.score).toBe(0.0) - }) -}) - -describe('FormatStyle type', () => { - test('accepts valid format styles', () => { - const styles: FormatStyle[] = ['jsonl', 'markdown', 'csv'] - expect(styles).toContain('jsonl') - 
expect(styles).toContain('markdown') - expect(styles).toContain('csv') - }) -}) - -describe('LabeledRun type', () => { - test('accepts label and path', () => { - const run: LabeledRun = { - label: 'baseline', - path: './results/baseline.jsonl', - } - expect(run.label).toBe('baseline') - expect(run.path).toBe('./results/baseline.jsonl') - }) -}) - -describe('ComparisonGraderInput type', () => { - test('accepts multiple runs', () => { - const input: ComparisonGraderInput = { - id: 'compare-001', - input: 'What is 2+2?', - runs: { - baseline: { output: '4' }, - experiment: { output: 'Four', trajectory: [] }, - }, - } - expect(Object.keys(input.runs).length).toBe(2) - expect(input.runs.baseline?.output).toBe('4') - expect(input.runs.experiment?.trajectory).toEqual([]) - }) -}) - -describe('ComparisonGraderResult type', () => { - test('accepts rankings with reasoning', () => { - const result: ComparisonGraderResult = { - rankings: [ - { run: 'baseline', rank: 1, score: 0.95 }, - { run: 'experiment', rank: 2, score: 0.8 }, - ], - reasoning: 'Baseline was more concise', - } - expect(result.rankings.length).toBe(2) - expect(result.rankings[0]?.rank).toBe(1) - expect(result.reasoning).toBeDefined() - }) -}) - -// ============================================================================ -// Helper Function Tests (via import) -// ============================================================================ - -// Note: Some helper functions are not exported from the modules. -// These tests verify the type contracts that the helpers must satisfy. 
- -describe('pipeline data flow', () => { - test('RawOutput can flow to ExtractedResult', () => { - const raw: RawOutput = { - id: 'flow-001', - input: 'test', - hint: 'expected: something', - rawLines: ['{"type":"message","content":"result"}'], - timing: { start: 0, end: 100, total: 100 }, - } - - // Simulate extraction - const extracted: ExtractedResult = { - id: raw.id, - input: raw.input, - hint: raw.hint, - output: 'result', - trajectory: [{ type: 'message', content: 'result', timestamp: 100 }], - toolErrors: false, - timing: raw.timing, - } - - expect(extracted.id).toBe(raw.id) - expect(extracted.input).toBe(raw.input) - expect(extracted.hint).toBe(raw.hint) - }) - - test('ExtractedResult can flow to GradedResult', () => { - const extracted: ExtractedResult = { - id: 'grade-flow-001', - input: 'test', - output: 'result', - trajectory: [], - toolErrors: false, - timing: { start: 0, end: 100, total: 100 }, - } - - // Simulate grading - const graded: GradedResult = { - ...extracted, - score: { pass: true, score: 1.0 }, - } - - expect(graded.id).toBe(extracted.id) - expect(graded.score.pass).toBe(true) - }) -}) - -describe('comparison data structures', () => { - test('LabeledRun derived from filename', () => { - // Simulate parseLabeledRun behavior - const path = '/path/to/results-baseline.jsonl' - const basename = path.split('/').pop() ?? 
'' - const label = basename.replace('.jsonl', '') - - const run: LabeledRun = { label, path } - expect(run.label).toBe('results-baseline') - }) - - test('LabeledRun with explicit label', () => { - // Simulate explicit label:path format - const arg = 'my-baseline:/path/to/results.jsonl' - const colonIdx = arg.indexOf(':') - const label = arg.slice(0, colonIdx) - const path = arg.slice(colonIdx + 1) - - const run: LabeledRun = { label, path } - expect(run.label).toBe('my-baseline') - expect(run.path).toBe('/path/to/results.jsonl') - }) - - test('comparison aggregates results by prompt ID', () => { - const results1 = [ - { id: 'p1', output: 'a' }, - { id: 'p2', output: 'b' }, - ] - const results2 = [ - { id: 'p1', output: 'x' }, - { id: 'p2', output: 'y' }, - ] - - // Simulate comparison aggregation - const promptIds = new Set([...results1.map((r) => r.id), ...results2.map((r) => r.id)]) - expect(promptIds.size).toBe(2) - - const comparisonInput: ComparisonGraderInput = { - id: 'p1', - input: 'test prompt', - runs: { - run1: { output: results1.find((r) => r.id === 'p1')?.output ?? '' }, - run2: { output: results2.find((r) => r.id === 'p1')?.output ?? 
'' }, - }, - } - expect(comparisonInput.runs.run1?.output).toBe('a') - expect(comparisonInput.runs.run2?.output).toBe('x') - }) -}) - -describe('format style contracts', () => { - test('markdown format includes summary when graded', () => { - // Verify the type contract for markdown formatting - const gradedResults: GradedResult[] = [ - { - id: 't1', - input: 'a', - output: 'x', - trajectory: [], - toolErrors: false, - timing: { start: 0, end: 100, total: 100 }, - score: { pass: true, score: 1.0 }, - }, - { - id: 't2', - input: 'b', - output: 'y', - trajectory: [], - toolErrors: false, - timing: { start: 0, end: 100, total: 100 }, - score: { pass: false, score: 0.5 }, - }, - ] - - const passed = gradedResults.filter((r) => r.score.pass).length - const total = gradedResults.length - const passRate = passed / total - - expect(passRate).toBe(0.5) - }) - - test('csv format escapes special characters', () => { - // Test CSV escaping contract - const escapeCsv = (str: string) => `"${str.replace(/"/g, '""').replace(/\n/g, '\\n')}"` - - expect(escapeCsv('hello')).toBe('"hello"') - expect(escapeCsv('say "hello"')).toBe('"say ""hello"""') - expect(escapeCsv('line1\nline2')).toBe('"line1\\nline2"') - }) -}) diff --git a/src/schemas.ts b/src/schemas.ts deleted file mode 100644 index deac478..0000000 --- a/src/schemas.ts +++ /dev/null @@ -1,134 +0,0 @@ -/** - * Schemas and types for agent evaluation harness. - * - * @remarks - * Re-exports all Zod schemas and inferred types for capture results, - * trajectories, grader results, and CLI data structures. 
- * - * @packageDocumentation - */ - -// Constants -export { - DEFAULT_CALIBRATION_SAMPLE_SIZE, - DEFAULT_HARNESS_TIMEOUT, - DEFAULT_TRIAL_COUNT, - HEAD_LINES, - MAX_CONTENT_LENGTH, - TAIL_LINES, -} from './schemas/constants.ts' -// Grader loader -export { loadGrader, loadGraderOrExit } from './schemas/grader-loader.ts' -// Core session types -// JSON-RPC types (MCP compatibility) -// MCP server configuration -// Prompt and grading -// Trajectory types -// Timing and richness -// Result types -export { - type BalanceAnalysis, - BalanceAnalysisSchema, - type CalibrationSample, - CalibrationSampleSchema, - type CaptureResult, - CaptureResultSchema, - type CategoryDistribution, - CategoryDistributionSchema, - // Comparison report types - type ComparisonMeta, - ComparisonMetaSchema, - type ComparisonReport, - ComparisonReportSchema, - EnvVariableSchema, - type Grader, - type GraderResult, - GraderResultSchema, - type HeadToHead, - HeadToHeadSchema, - HttpHeaderSchema, - type IndexedStep, - type JsonRpcError, - type JsonRpcErrorResponse, - JsonRpcErrorResponseSchema, - JsonRpcErrorSchema, - type JsonRpcMessage, - JsonRpcMessageSchema, - type JsonRpcNotification, - JsonRpcNotificationSchema, - type JsonRpcRequest, - JsonRpcRequestSchema, - type JsonRpcResponse, - JsonRpcResponseSchema, - type JsonRpcSuccessResponse, - JsonRpcSuccessResponseSchema, - type LatencyStats, - LatencyStatsSchema, - type McpServerConfig, - McpServerHttpSchema, - McpServerSchema, - McpServerStdioSchema, - MessageStepSchema, - type PairwiseComparison, - PairwiseComparisonSchema, - type PerformanceMetrics, - PerformanceMetricsSchema, - PlanStepSchema, - type PromptCase, - PromptCaseSchema, - type PromptComparison, - PromptComparisonSchema, - type QualityMetrics, - QualityMetricsSchema, - type ReliabilityMetrics, - ReliabilityMetricsSchema, - type ScoreDistribution, - ScoreDistributionSchema, - type Session, - SessionSchema, - type SummaryResult, - SummaryResultSchema, - ThoughtStepSchema, - type 
Timing, - TimingSchema, - ToolCallStepSchema, - type ToolInput, - ToolInputSchema, - type TrajectoryInfo, - TrajectoryInfoSchema, - type TrajectoryRichness, - TrajectoryRichnessSchema, - type TrajectoryStep, - TrajectoryStepSchema, - type TrialEntry, - TrialEntrySchema, - type TrialResult, - TrialResultSchema, - // Trials comparison report types - type TrialsCapabilityMetrics, - TrialsCapabilityMetricsSchema, - type TrialsComparisonMeta, - TrialsComparisonMetaSchema, - type TrialsComparisonReport, - TrialsComparisonReportSchema, - type TrialsFlakinessMetrics, - TrialsFlakinessMetricsSchema, - type TrialsPerformanceConfidenceIntervals, - TrialsPerformanceConfidenceIntervalsSchema, - type TrialsPerformanceMetrics, - TrialsPerformanceMetricsSchema, - type TrialsPromptComparison, - TrialsPromptComparisonSchema, - type TrialsQualityConfidenceIntervals, - TrialsQualityConfidenceIntervalsSchema, - type TrialsQualityMetrics, - TrialsQualityMetricsSchema, - type TrialsReliabilityMetrics, - TrialsReliabilityMetricsSchema, - type ValidationResult, - ValidationResultSchema, -} from './schemas/schemas.ts' - -// Schemas CLI -export type { SchemasConfig } from './schemas/schemas-cli.ts' -export { runSchemas, schemasCli } from './schemas/schemas-cli.ts' diff --git a/src/schemas/constants.ts b/src/schemas/constants.ts deleted file mode 100644 index e7dbd2f..0000000 --- a/src/schemas/constants.ts +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Constants for harness and JSON-RPC protocol operations. 
- * - * @remarks - * Contains all constant values used across the implementation: - * - JSON-RPC method names and protocol version - * - JSON-RPC error codes - * - Harness defaults (timeouts, preview limits) - * - * @packageDocumentation - */ - -// ============================================================================ -// JSON-RPC Protocol Methods -// ============================================================================ - -/** JSON-RPC method names for headless adapter protocol */ -export const PROTOCOL_METHODS = { - // Lifecycle - INITIALIZE: 'initialize', - SHUTDOWN: 'shutdown', - - // Sessions - CREATE_SESSION: 'session/new', - LOAD_SESSION: 'session/load', - PROMPT: 'session/prompt', - CANCEL: 'session/cancel', - UPDATE: 'session/update', - REQUEST_PERMISSION: 'session/request_permission', - SET_MODEL: 'session/set_model', - - // Protocol-level - CANCEL_REQUEST: '$/cancel_request', -} as const - -// ============================================================================ -// Protocol Version -// ============================================================================ - -/** Current protocol version */ -export const PROTOCOL_VERSION = 1 as const - -// ============================================================================ -// JSON-RPC Error Codes -// ============================================================================ - -/** Standard JSON-RPC error codes */ -export const JSON_RPC_ERRORS = { - PARSE_ERROR: -32700, - INVALID_REQUEST: -32600, - METHOD_NOT_FOUND: -32601, - INVALID_PARAMS: -32602, - INTERNAL_ERROR: -32603, - REQUEST_CANCELLED: -32800, -} as const - -// ============================================================================ -// Client Defaults -// ============================================================================ - -/** Default client name for protocol handshake */ -export const DEFAULT_CLIENT_NAME = 'plaited-eval-harness' - -/** Default timeout for protocol operations in milliseconds */ -export const 
DEFAULT_PROTOCOL_TIMEOUT = 30000 - -/** Default polling interval for streaming updates in milliseconds */ -export const DEFAULT_POLLING_INTERVAL = 50 - -// ============================================================================ -// Harness Preview Configuration -// ============================================================================ - -/** Number of lines to show at the head of content previews */ -export const HEAD_LINES = 8 - -/** Number of lines to show at the tail of content previews */ -export const TAIL_LINES = 4 - -/** Maximum content length before applying head/tail preview */ -export const MAX_CONTENT_LENGTH = 500 - -// ============================================================================ -// Harness Defaults -// ============================================================================ - -/** Default timeout for prompt evaluation in milliseconds */ -export const DEFAULT_HARNESS_TIMEOUT = 60000 - -/** Default number of trials for pass@k analysis */ -export const DEFAULT_TRIAL_COUNT = 5 - -/** Default sample size for calibration */ -export const DEFAULT_CALIBRATION_SAMPLE_SIZE = 10 diff --git a/src/schemas/grader-loader.ts b/src/schemas/grader-loader.ts deleted file mode 100644 index e2b4175..0000000 --- a/src/schemas/grader-loader.ts +++ /dev/null @@ -1,203 +0,0 @@ -/** - * Polyglot grader loader module. - * - * @remarks - * Supports loading graders from: - * - TypeScript/JavaScript modules (import as ES module) - * - Executable scripts (Python, Ruby, shell, etc. 
via subprocess) - * - * Executable graders use stdin/stdout JSON protocol: - * - Input: `{"input": "...", "output": "...", "expected": "...", "trajectory": [...]}` - * - Output: `{"pass": true, "score": 1.0, "reasoning": "..."}` - * - * @packageDocumentation - */ - -import { resolvePath } from '../core.ts' -import type { Grader, TrajectoryStep } from './schemas.ts' -import { GraderResultSchema } from './schemas.ts' - -// ============================================================================ -// Constants -// ============================================================================ - -/** File extensions that are imported as ES modules */ -const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs'] - -// ============================================================================ -// Helpers -// ============================================================================ - -/** Check if a file path is a JavaScript/TypeScript module */ -const isJsModule = (path: string): boolean => JS_EXTENSIONS.some((ext) => path.endsWith(ext)) - -// ============================================================================ -// Executable Grader -// ============================================================================ - -/** - * Input format for executable graders (stdin JSON). - * - * @remarks - * The metadata field contains arbitrary key-value pairs from the original - * prompt JSONL (e.g., category, difficulty, tags). Use this to implement - * category-specific grading logic or filter calibration samples. - * The cwd field provides the working directory path for git-based outcome detection. - */ -type ExecGraderInput = { - input: string | string[] - output: string - hint?: string - trajectory?: TrajectoryStep[] - metadata?: Record - cwd?: string -} - -/** - * Create a grader function that executes an external script. - * - * @remarks - * The script receives JSON on stdin and must output JSON on stdout. - * Non-zero exit codes are treated as errors. 
- * - * @param execPath - Absolute path to the executable script - * @returns Grader function - */ -const createExecGrader = (execPath: string): Grader => { - return async (params) => { - const input: ExecGraderInput = { - input: params.input, - output: params.output, - hint: params.hint, - trajectory: params.trajectory, - metadata: params.metadata, - cwd: params.cwd, - } - - const inputJson = JSON.stringify(input) - - const proc = Bun.spawn([execPath], { - stdin: new TextEncoder().encode(inputJson), - stdout: 'pipe', - stderr: 'pipe', - }) - - const [stdout, stderr, exitCode] = await Promise.all([ - new Response(proc.stdout).text(), - new Response(proc.stderr).text(), - proc.exited, - ]) - - if (exitCode !== 0) { - throw new Error(`Grader exited with code ${exitCode}: ${stderr.trim() || 'No error output'}`) - } - - const trimmedStdout = stdout.trim() - if (!trimmedStdout) { - throw new Error('Grader produced no output') - } - - let parsed: unknown - try { - parsed = JSON.parse(trimmedStdout) - } catch { - throw new Error(`Grader output is not valid JSON: ${trimmedStdout.slice(0, 100)}`) - } - - const result = GraderResultSchema.safeParse(parsed) - if (!result.success) { - throw new Error(`Invalid grader result: ${result.error.message}`) - } - - return result.data - } -} - -// ============================================================================ -// Module Grader -// ============================================================================ - -/** - * Load a grader from a JavaScript/TypeScript module. - * - * @remarks - * The module must export a `grade` function matching the `Grader` type. 
- * - * @param modulePath - Absolute path to the module - * @returns Grader function - */ -const loadModuleGrader = async (modulePath: string): Promise => { - const graderModule = await import(modulePath) - - if (typeof graderModule.grade !== 'function') { - throw new Error(`Grader module must export a 'grade' function`) - } - - return graderModule.grade as Grader -} - -// ============================================================================ -// Public API -// ============================================================================ - -/** - * Load a grader from a file path. - * - * @remarks - * Detection logic: - * - `.ts`, `.js`, `.mjs`, `.cjs` → Import as ES module - * - Everything else → Execute as subprocess - * - * @param graderPath - Path to the grader (relative or absolute) - * @returns Grader function - * @throws Error if grader not found or invalid - * - * @example - * ```typescript - * // TypeScript grader - * const grader = await loadGrader('./grader.ts') - * - * // Python grader - * const grader = await loadGrader('./grader.py') - * - * // Any executable - * const grader = await loadGrader('./my-grader') - * ``` - */ -/** - * Load a grader from a file path, exiting on failure. - * - * @remarks - * CLI-friendly wrapper around `loadGrader` that prints the error to stderr - * and calls `process.exit(1)` on failure. Eliminates the duplicated - * try/catch pattern across CLI handlers. - * - * @param graderPath - Path to the grader (relative or absolute) - * @returns Grader function (never returns on failure) - * - * @public - */ -export const loadGraderOrExit = async (graderPath: string): Promise => { - try { - return await loadGrader(graderPath) - } catch (error) { - console.error(`Error: ${error instanceof Error ? 
error.message : error}`) - process.exit(1) - } -} - -export const loadGrader = async (graderPath: string): Promise => { - const resolvedPath = resolvePath(graderPath) - - // Check file exists - const file = Bun.file(resolvedPath) - if (!(await file.exists())) { - throw new Error(`Grader not found: ${resolvedPath}`) - } - - if (isJsModule(resolvedPath)) { - return loadModuleGrader(resolvedPath) - } - - return createExecGrader(resolvedPath) -} diff --git a/src/schemas/schemas-cli.ts b/src/schemas/schemas-cli.ts deleted file mode 100644 index 7b3e66a..0000000 --- a/src/schemas/schemas-cli.ts +++ /dev/null @@ -1,227 +0,0 @@ -/** - * Schemas command - export JSON schemas for non-TypeScript users. - * - * @remarks - * Uses Zod 4's native `z.toJSONSchema()` to generate JSON Schema from - * the harness schemas. Useful for validation in other languages/tools. - * - * @packageDocumentation - */ - -import { parseArgs } from 'node:util' -import { z } from 'zod' -import { resolvePath } from '../core.ts' -import * as schemas from './schemas.ts' - -// ============================================================================ -// Schema Registry -// ============================================================================ - -/** Available schemas for export */ -const SCHEMA_REGISTRY: Record = { - PromptCase: schemas.PromptCaseSchema, - GraderResult: schemas.GraderResultSchema, - TrajectoryStep: schemas.TrajectoryStepSchema, - CaptureResult: schemas.CaptureResultSchema, - SummaryResult: schemas.SummaryResultSchema, - TrialEntry: schemas.TrialEntrySchema, - TrialResult: schemas.TrialResultSchema, - CalibrationSample: schemas.CalibrationSampleSchema, - BalanceAnalysis: schemas.BalanceAnalysisSchema, - ValidationResult: schemas.ValidationResultSchema, - McpServerConfig: schemas.McpServerSchema, - Session: schemas.SessionSchema, - JsonRpcRequest: schemas.JsonRpcRequestSchema, - JsonRpcResponse: schemas.JsonRpcResponseSchema, - JsonRpcError: schemas.JsonRpcErrorSchema, -} - -// 
============================================================================ -// Types -// ============================================================================ - -/** Configuration for schemas command */ -export type SchemasConfig = { - /** Specific schema name to export (undefined = all) */ - schemaName?: string - /** Output file path */ - outputPath?: string - /** Output as JSON (vs list) */ - json?: boolean - /** Split into separate files */ - split?: boolean - /** List available schemas */ - list?: boolean -} - -// ============================================================================ -// Helpers -// ============================================================================ - -/** Generate JSON Schema from Zod schema */ -const toJsonSchema = (schema: z.ZodSchema, name: string): object => { - try { - // Zod 4's native JSON Schema generation - const jsonSchema = z.toJSONSchema(schema) - return { - $schema: 'https://json-schema.org/draft/2020-12/schema', - title: name, - ...jsonSchema, - } - } catch (error) { - // Fallback for schemas that can't be converted - return { - $schema: 'https://json-schema.org/draft/2020-12/schema', - title: name, - description: `Schema for ${name} (auto-generation failed: ${error instanceof Error ? error.message : 'unknown error'})`, - } - } -} - -// ============================================================================ -// Schemas Implementation -// ============================================================================ - -/** - * Execute schemas command with configuration object. 
- * - * @param config - Schemas configuration - * @returns Generated JSON schemas - */ -export const runSchemas = async (config: SchemasConfig): Promise | string[]> => { - const { schemaName, outputPath, json = false, split = false, list = false } = config - - // List mode - if (list) { - const names = Object.keys(SCHEMA_REGISTRY) - console.log('Available schemas:') - for (const name of names) { - console.log(` - ${name}`) - } - return names - } - - // Single schema mode - if (schemaName) { - const schema = SCHEMA_REGISTRY[schemaName] - if (!schema) { - console.error(`Error: Unknown schema '${schemaName}'`) - console.error(`Available: ${Object.keys(SCHEMA_REGISTRY).join(', ')}`) - process.exit(1) - } - - const jsonSchema = toJsonSchema(schema, schemaName) - const output = JSON.stringify(jsonSchema, null, 2) - - if (outputPath) { - await Bun.write(resolvePath(outputPath), output) - } else { - console.log(output) - } - - return { [schemaName]: jsonSchema } - } - - // All schemas mode - const allSchemas: Record = {} - - for (const [name, schema] of Object.entries(SCHEMA_REGISTRY)) { - allSchemas[name] = toJsonSchema(schema, name) - } - - if (split && outputPath) { - // Create directory and write separate files - const dir = resolvePath(outputPath) - await Bun.$`mkdir -p ${dir}` - - for (const [name, jsonSchema] of Object.entries(allSchemas)) { - const filePath = `${dir}/${name}.json` - await Bun.write(filePath, JSON.stringify(jsonSchema, null, 2)) - } - - console.error(`Wrote ${Object.keys(allSchemas).length} schema files to ${dir}/`) - } else if (json) { - const output = JSON.stringify(allSchemas, null, 2) - - if (outputPath) { - await Bun.write(resolvePath(outputPath), output) - } else { - console.log(output) - } - } else { - // Default: list schemas - console.log('Available schemas (use --json to export):') - for (const name of Object.keys(allSchemas)) { - console.log(` - ${name}`) - } - } - - return allSchemas -} - -// 
============================================================================ -// CLI Entry Point -// ============================================================================ - -/** - * Schemas command CLI handler. - * - * @param args - Command line arguments (after 'schemas') - */ -export const schemasCli = async (args: string[]): Promise => { - const { values, positionals } = parseArgs({ - args, - options: { - output: { type: 'string', short: 'o' }, - json: { type: 'boolean', short: 'j', default: false }, - split: { type: 'boolean', short: 's', default: false }, - list: { type: 'boolean', short: 'l', default: false }, - help: { type: 'boolean', short: 'h' }, - }, - allowPositionals: true, - }) - - if (values.help) { - console.log(` -Usage: agent-eval-harness schemas [schema-name] [options] - -Arguments: - schema-name Specific schema to export (optional) - -Options: - -o, --output Output file or directory (with --split) - -j, --json Export as JSON (default: list names) - -s, --split Split into separate files (requires --output dir) - -l, --list List available schemas - -h, --help Show this help message - -Available Schemas: - PromptCase, GraderResult, TrajectoryStep, CaptureResult, SummaryResult, - TrialEntry, TrialResult, CalibrationSample, BalanceAnalysis, ValidationResult, - McpServerConfig, Session, JsonRpcRequest, JsonRpcResponse, JsonRpcError - -Examples: - # List available schemas - agent-eval-harness schemas --list - - # Export all schemas as single JSON file - agent-eval-harness schemas --json -o schemas.json - - # Export specific schema - agent-eval-harness schemas CaptureResult --json - agent-eval-harness schemas TrialResult --json -o trial-schema.json - - # Export all schemas as separate files - agent-eval-harness schemas --json --split -o schemas/ -`) - return - } - - await runSchemas({ - schemaName: positionals[0], - outputPath: values.output, - json: values.json ?? false, - split: values.split ?? false, - list: values.list ?? 
false, - }) -} diff --git a/src/schemas/schemas.ts b/src/schemas/schemas.ts deleted file mode 100644 index e084159..0000000 --- a/src/schemas/schemas.ts +++ /dev/null @@ -1,1073 +0,0 @@ -/** - * Unified Zod schemas and types for the agent eval harness. - * - * @remarks - * This module follows a schema-first approach where Zod schemas are the - * single source of truth. TypeScript types are derived using `z.infer<>`. - * - * **Exports:** - * - Harness schemas: PromptCaseSchema, GraderResultSchema, CaptureResultSchema, etc. - * - JSON-RPC schemas: JsonRpcRequestSchema, JsonRpcResponseSchema, etc. (for headless adapter) - * - All inferred types via `z.infer<>` - * - * **JSON Schema generation (Zod 4):** - * ```typescript - * import { z } from 'zod' - * import { CaptureResultSchema } from '@plaited/agent-eval-harness/schemas' - * const jsonSchema = z.toJSONSchema(CaptureResultSchema) - * ``` - * - * @packageDocumentation - */ - -import { z } from 'zod' - -// ============================================================================ -// Session Types -// ============================================================================ - -/** - * Session schema for session creation responses. - */ -export const SessionSchema = z.object({ - id: z.string(), - _meta: z.record(z.string(), z.unknown()).nullish(), -}) - -/** Session object returned from session creation */ -export type Session = z.infer - -// ============================================================================ -// JSON-RPC 2.0 Schemas (for headless adapter) -// ============================================================================ - -/** JSON-RPC version literal */ -const JsonRpcVersionSchema = z.literal('2.0') - -/** Request/response identifier */ -const RequestIdSchema = z.union([z.string(), z.number()]) - -/** - * JSON-RPC 2.0 error object schema. 
- * - * @remarks - * Standard error codes: - * - `-32700`: Parse error - * - `-32600`: Invalid request - * - `-32601`: Method not found - * - `-32602`: Invalid params - * - `-32603`: Internal error - */ -export const JsonRpcErrorSchema = z.object({ - code: z.number(), - message: z.string(), - data: z.unknown().optional(), -}) - -/** JSON-RPC 2.0 error object */ -export type JsonRpcError = z.infer - -/** JSON-RPC 2.0 request schema */ -export const JsonRpcRequestSchema = z.object({ - jsonrpc: JsonRpcVersionSchema, - id: RequestIdSchema, - method: z.string(), - params: z.unknown().optional(), -}) - -/** JSON-RPC 2.0 request structure */ -export type JsonRpcRequest = Omit, 'params'> & { - params?: T -} - -/** JSON-RPC 2.0 notification schema (no id, no response expected) */ -export const JsonRpcNotificationSchema = z.object({ - jsonrpc: JsonRpcVersionSchema, - method: z.string(), - params: z.unknown().optional(), -}) - -/** JSON-RPC 2.0 notification structure (no id, no response expected) */ -export type JsonRpcNotification = Omit, 'params'> & { - params?: T -} - -/** JSON-RPC 2.0 success response schema */ -export const JsonRpcSuccessResponseSchema = z.object({ - jsonrpc: JsonRpcVersionSchema, - id: RequestIdSchema, - result: z.unknown(), -}) - -/** JSON-RPC 2.0 success response */ -export type JsonRpcSuccessResponse = Omit, 'result'> & { - result: T -} - -/** JSON-RPC 2.0 error response schema */ -export const JsonRpcErrorResponseSchema = z.object({ - jsonrpc: JsonRpcVersionSchema, - id: z.union([RequestIdSchema, z.null()]), - error: JsonRpcErrorSchema, -}) - -/** JSON-RPC 2.0 error response */ -export type JsonRpcErrorResponse = z.infer - -/** Union of all JSON-RPC response types */ -export const JsonRpcResponseSchema = z.union([JsonRpcSuccessResponseSchema, JsonRpcErrorResponseSchema]) - -/** Union of all JSON-RPC response types */ -export type JsonRpcResponse = JsonRpcSuccessResponse | JsonRpcErrorResponse - -/** - * Union of all JSON-RPC message types. 
- * - * @remarks - * Use `safeParse` at transport boundaries for runtime validation. - */ -export const JsonRpcMessageSchema = z.union([JsonRpcRequestSchema, JsonRpcNotificationSchema, JsonRpcResponseSchema]) - -/** Union of all JSON-RPC message types */ -export type JsonRpcMessage = JsonRpcRequest | JsonRpcNotification | JsonRpcResponse - -// ============================================================================ -// MCP Server Configuration Schemas -// ============================================================================ - -/** Environment variable configuration */ -export const EnvVariableSchema = z.object({ - name: z.string(), - value: z.string(), -}) - -/** HTTP header configuration */ -export const HttpHeaderSchema = z.object({ - name: z.string(), - value: z.string(), -}) - -/** MCP server stdio transport configuration */ -export const McpServerStdioSchema = z.object({ - type: z.literal('stdio').optional(), - name: z.string(), - command: z.string(), - args: z.array(z.string()), - env: z.array(EnvVariableSchema), -}) - -/** MCP server HTTP transport configuration */ -export const McpServerHttpSchema = z.object({ - type: z.literal('http'), - name: z.string(), - url: z.string(), - headers: z.array(HttpHeaderSchema), -}) - -/** MCP server configuration (stdio or HTTP) */ -export const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema]) - -/** MCP server configuration type */ -export type McpServerConfig = z.infer - -// ============================================================================ -// Harness Input Schemas -// ============================================================================ - -/** - * Prompt case schema for evaluation inputs. - * - * @remarks - * Each line in a prompts.jsonl file should match this schema. 
- * - Single turn: `input: "Hello"` - one prompt, one session - * - Multi-turn: `input: ["Hello", "How are you?", "Goodbye"]` - sequential turns in one session - */ -export const PromptCaseSchema = z.object({ - /** Unique identifier for the test case */ - id: z.string(), - /** Prompt text(s) - string for single turn, array for multi-turn conversation */ - input: z.union([z.string(), z.array(z.string())]), - /** Optional grader context hint (not a strict expected match) */ - hint: z.string().optional(), - /** Optional reference solution for validation */ - reference: z.string().optional(), - /** Optional metadata for categorization and analysis */ - metadata: z.record(z.string(), z.unknown()).optional(), - /** Optional per-case timeout override in milliseconds */ - timeout: z.number().optional(), -}) - -/** Prompt case type */ -export type PromptCase = z.infer - -// ============================================================================ -// Grader Schemas -// ============================================================================ - -/** - * Grader result schema. - * - * @remarks - * Result returned by user-provided grader functions. - * - `outcome`: Optional structured outcome data detected by the grader - */ -export const GraderResultSchema = z.object({ - /** Whether the output passes the evaluation criteria */ - pass: z.boolean(), - /** Numeric score from 0.0 to 1.0 */ - score: z.number().min(0).max(1), - /** Optional explanation for the score */ - reasoning: z.string().optional(), - /** Optional outcome data (e.g., files created, tests passed) */ - outcome: z.record(z.string(), z.unknown()).optional(), -}) - -/** Grader result type */ -export type GraderResult = z.infer - -/** - * Grader function type. - * - * @remarks - * User-provided graders implement this interface to score agent outputs. 
- * - `input` is the original prompt (string or array for multi-turn) - * - `hint` provides grader context (renamed from `expected`) - * - `metadata` contains arbitrary key-value pairs from the original prompt JSONL - * - `cwd` is the working directory path (optional, enables git-based outcome detection) - */ -export type Grader = (params: { - input: string | string[] - output: string - hint?: string - trajectory?: TrajectoryStep[] - metadata?: Record - cwd?: string -}) => Promise - -// ============================================================================ -// Trajectory Schemas -// ============================================================================ - -/** Tool input schema for extracting file paths and content */ -export const ToolInputSchema = z - .object({ - file_path: z.string().optional(), - path: z.string().optional(), - content: z.string().optional(), - new_string: z.string().optional(), - }) - .passthrough() - -/** Tool input type */ -export type ToolInput = z.infer - -/** Thought trajectory step */ -export const ThoughtStepSchema = z.object({ - type: z.literal('thought'), - content: z.string(), - timestamp: z.number(), - stepId: z.string().optional(), -}) - -/** Message trajectory step */ -export const MessageStepSchema = z.object({ - type: z.literal('message'), - content: z.string(), - timestamp: z.number(), - stepId: z.string().optional(), -}) - -/** Tool call trajectory step */ -export const ToolCallStepSchema = z.object({ - type: z.literal('tool_call'), - name: z.string(), - status: z.string(), - input: z.unknown().optional(), - output: z.unknown().optional(), - duration: z.number().optional(), - timestamp: z.number(), - stepId: z.string().optional(), -}) - -/** Plan trajectory step */ -export const PlanStepSchema = z.object({ - type: z.literal('plan'), - entries: z.array(z.unknown()), - timestamp: z.number(), - stepId: z.string().optional(), -}) - -/** - * Trajectory step schema (discriminated union). 
- * - * @remarks - * Represents a single step in the agent's execution trajectory. - */ -export const TrajectoryStepSchema = z.discriminatedUnion('type', [ - ThoughtStepSchema, - MessageStepSchema, - ToolCallStepSchema, - PlanStepSchema, -]) - -/** Trajectory step type */ -export type TrajectoryStep = z.infer - -/** Indexed trajectory step with unique ID for correlation */ -export type IndexedStep = TrajectoryStep & { stepId: string } - -// ============================================================================ -// Capture Result Schemas -// ============================================================================ - -/** - * Timing information for a capture result. - * - * @remarks - * Captures both absolute timestamps and derived durations for analysis: - * - `sessionCreation`: Time to initialize session (agent startup overhead) - * - `total`: End-to-end duration including all turns - * - `firstResponse`: Latency to first agent output (optional) - * - * Token counts are adapter-dependent and only present if the adapter - * exposes usage information (e.g., Claude Code includes them, others may not). - * - * @public - */ -export const TimingSchema = z.object({ - /** Epoch timestamp when capture started */ - start: z.number(), - /** Epoch timestamp when capture ended */ - end: z.number(), - /** Time to first response (ms from start) */ - firstResponse: z.number().optional(), - /** Time to create session (ms) - measures agent initialization overhead */ - sessionCreation: z.number(), - /** Total duration (end - start) in milliseconds */ - total: z.number(), - /** Input tokens consumed (if available from headless adapter) */ - inputTokens: z.number().optional(), - /** Output tokens generated (if available from headless adapter) */ - outputTokens: z.number().optional(), -}) - -/** - * Timing information type inferred from TimingSchema. 
- * - * @public - */ -export type Timing = z.infer - -/** - * Trajectory richness level indicating the depth of captured agent activity. - * - * @remarks - * Different adapters provide varying levels of detail: - * - `full`: Thoughts, tool calls, plans (e.g., Claude Code adapter) - * - `minimal`: Basic output only (e.g., Droid adapter) - * - `messages-only`: Messages without internal reasoning - */ -export const TrajectoryRichnessSchema = z.enum(['full', 'minimal', 'messages-only']) - -/** Trajectory richness type */ -export type TrajectoryRichness = z.infer - -/** - * Capture result schema. - * - * @remarks - * Full trajectory output from the `capture` command. - * - `input` can be string (single turn) or string[] (multi-turn) - * - `hint` provides grader context (renamed from `expected`) - * - `toolErrors` replaces misleading `status: 'passed'|'failed'` - * - `outcome` is merged from grader result if grader returns outcome data - * Real pass/fail determination comes from your grader. - */ -export const CaptureResultSchema = z.object({ - /** Test case identifier */ - id: z.string(), - /** Original prompt input (string for single turn, array for multi-turn) */ - input: z.union([z.string(), z.array(z.string())]), - /** Final agent output */ - output: z.string(), - /** Grader context hint (renamed from expected) */ - hint: z.string().optional(), - /** Full execution trajectory */ - trajectory: z.array(TrajectoryStepSchema), - /** Metadata including category, agent info, trajectoryRichness, turnCount */ - metadata: z.record(z.string(), z.unknown()), - /** Timing information */ - timing: TimingSchema, - /** Whether any tool calls failed */ - toolErrors: z.boolean(), - /** Error messages (if any) */ - errors: z.array(z.string()).optional(), - /** Grader score (if grader was provided) */ - score: GraderResultSchema.optional(), - /** Outcome data from grader (if grader provided and returned outcome) */ - outcome: z.record(z.string(), z.unknown()).optional(), -}) - -/** 
Capture result type */ -export type CaptureResult = z.infer - -// ============================================================================ -// Summary Result Schemas -// ============================================================================ - -/** - * Summary result schema. - * - * @remarks - * Compact view derived from full capture results via the `summarize` command. - */ -export const SummaryResultSchema = z.object({ - /** Test case identifier */ - id: z.string(), - /** Original prompt input */ - input: z.string(), - /** Final agent output */ - output: z.string(), - /** List of tool names called */ - toolCalls: z.array(z.string()), - /** Duration in milliseconds */ - duration: z.number(), -}) - -/** Summary result type */ -export type SummaryResult = z.infer - -// ============================================================================ -// Trial Result Schemas -// ============================================================================ - -/** Single trial within a trial run */ -export const TrialEntrySchema = z.object({ - /** Trial number (1-indexed) */ - trialNum: z.number(), - /** Agent output for this trial */ - output: z.string(), - /** Full trajectory for this trial */ - trajectory: z.array(TrajectoryStepSchema), - /** Duration in milliseconds */ - duration: z.number(), - /** Pass/fail (if grader provided) */ - pass: z.boolean().optional(), - /** Numeric score (if grader provided) */ - score: z.number().optional(), - /** Grader reasoning (if grader provided) */ - reasoning: z.string().optional(), - /** Outcome data from grader (if grader provided and returned outcome) */ - outcome: z.record(z.string(), z.unknown()).optional(), -}) - -/** Trial entry type */ -export type TrialEntry = z.infer - -/** - * Trial result schema. - * - * @remarks - * Output from the `trials` command for pass@k/pass^k analysis. - * Metrics (passRate, passAtK, passExpK) are only present when a grader is provided. 
- */ -export const TrialResultSchema = z.object({ - /** Test case identifier */ - id: z.string(), - /** Original prompt input (string for single turn, array for multi-turn) */ - input: z.union([z.string(), z.array(z.string())]), - /** Grader context hint (renamed from expected) */ - hint: z.string().optional(), - /** Number of trials (k) */ - k: z.number(), - /** Simple pass rate: passes / k (with grader only) */ - passRate: z.number().optional(), - /** pass@k: probability of at least one pass in k samples (with grader only) */ - passAtK: z.number().optional(), - /** pass^k: probability of all k samples passing (with grader only) */ - passExpK: z.number().optional(), - /** Individual trial results */ - trials: z.array(TrialEntrySchema), - /** Metadata including agent info, workspaceDir, and custom fields */ - metadata: z.record(z.string(), z.unknown()).optional(), -}) - -/** Trial result type */ -export type TrialResult = z.infer - -// ============================================================================ -// Calibration Schemas -// ============================================================================ - -/** Calibration sample for grader review */ -export const CalibrationSampleSchema = z.object({ - /** Test case identifier */ - id: z.string(), - /** Original prompt input (string for single turn, array for multi-turn) */ - input: z.union([z.string(), z.array(z.string())]), - /** Agent output */ - output: z.string(), - /** Grader context hint (renamed from expected) */ - hint: z.string().optional(), - /** Original grader score */ - originalScore: GraderResultSchema, - /** Re-scored result (if different grader provided) */ - rescoredResult: GraderResultSchema.optional(), - /** Key trajectory snippets */ - trajectorySnippet: z.array(TrajectoryStepSchema), -}) - -/** Calibration sample type */ -export type CalibrationSample = z.infer - -// ============================================================================ -// Balance Analysis Schemas -// 
============================================================================ - -/** Category distribution in test set */ -export const CategoryDistributionSchema = z.object({ - /** Category name */ - name: z.string(), - /** Number of test cases */ - count: z.number(), - /** Percentage of total */ - percentage: z.number(), -}) - -/** Category distribution type */ -export type CategoryDistribution = z.infer - -/** Balance analysis result */ -export const BalanceAnalysisSchema = z.object({ - /** Total number of test cases */ - totalCases: z.number(), - /** Distribution by category */ - categories: z.array(CategoryDistributionSchema), - /** Categories that may need more test cases */ - underrepresented: z.array(z.string()), - /** Suggested improvements */ - suggestions: z.array(z.string()), -}) - -/** Balance analysis type */ -export type BalanceAnalysis = z.infer - -// ============================================================================ -// Validation Reference Schemas -// ============================================================================ - -/** Validation result for a reference solution */ -export const ValidationResultSchema = z.object({ - /** Test case identifier */ - id: z.string(), - /** Reference solution provided */ - reference: z.string(), - /** Whether reference passes the grader */ - passes: z.boolean(), - /** Grader result */ - graderResult: GraderResultSchema, -}) - -/** Validation result type */ -export type ValidationResult = z.infer - -// ============================================================================ -// Comparison Report Schemas -// ============================================================================ - -/** - * Confidence interval schema as [lower, upper] bounds. - * - * @remarks - * Used for bootstrap-computed confidence intervals when strategy=statistical. 
- */ -export const ConfidenceIntervalSchema = z.tuple([z.number(), z.number()]) - -/** Confidence interval type */ -export type ConfidenceInterval = z.infer - -/** - * Score distribution histogram for quality analysis. - * - * @remarks - * Buckets divide the 0-1 score range into 5 equal bins. - */ -export const ScoreDistributionSchema = z.object({ - '0.0-0.2': z.number(), - '0.2-0.4': z.number(), - '0.4-0.6': z.number(), - '0.6-0.8': z.number(), - '0.8-1.0': z.number(), -}) - -/** Score distribution type */ -export type ScoreDistribution = z.infer - -/** - * Confidence intervals for quality metrics. - */ -export const QualityConfidenceIntervalsSchema = z.object({ - /** CI for avgScore */ - avgScore: ConfidenceIntervalSchema.optional(), - /** CI for passRate */ - passRate: ConfidenceIntervalSchema.optional(), -}) - -/** Quality confidence intervals type */ -export type QualityConfidenceIntervals = z.infer - -/** - * Quality metrics for a single run in comparison. - */ -export const QualityMetricsSchema = z.object({ - /** Discriminator for run-level quality metrics */ - type: z.literal('run'), - /** Mean grader score (0-1) */ - avgScore: z.number(), - /** Percentage of pass=true results */ - passRate: z.number(), - /** Count of passing results */ - passCount: z.number(), - /** Count of failing results */ - failCount: z.number(), - /** Score distribution histogram */ - scoreDistribution: ScoreDistributionSchema, - /** Confidence intervals (only with strategy=statistical) */ - confidenceIntervals: QualityConfidenceIntervalsSchema.optional(), -}) - -/** Quality metrics type */ -export type QualityMetrics = z.infer - -/** - * Latency statistics for performance analysis. 
- */ -export const LatencyStatsSchema = z.object({ - /** 50th percentile (median) in milliseconds */ - p50: z.number(), - /** 90th percentile in milliseconds */ - p90: z.number(), - /** 99th percentile in milliseconds */ - p99: z.number(), - /** Mean latency in milliseconds */ - mean: z.number(), - /** Minimum latency in milliseconds */ - min: z.number(), - /** Maximum latency in milliseconds */ - max: z.number(), -}) - -/** Latency stats type */ -export type LatencyStats = z.infer - -/** - * Confidence intervals for performance metrics. - */ -export const PerformanceConfidenceIntervalsSchema = z.object({ - /** CI for latency mean */ - latencyMean: ConfidenceIntervalSchema.optional(), -}) - -/** Performance confidence intervals type */ -export type PerformanceConfidenceIntervals = z.infer - -/** - * Performance metrics for a single run in comparison. - */ -export const PerformanceMetricsSchema = z.object({ - /** End-to-end latency statistics */ - latency: LatencyStatsSchema, - /** Time to first response statistics (optional, not all adapters support) */ - firstResponse: LatencyStatsSchema.optional(), - /** Sum of all run durations in milliseconds */ - totalDuration: z.number(), - /** Confidence intervals (only with strategy=statistical) */ - confidenceIntervals: PerformanceConfidenceIntervalsSchema.optional(), -}) - -/** Performance metrics type */ -export type PerformanceMetrics = z.infer - -/** - * Reliability metrics for a single run in comparison. 
- */ -export const ReliabilityMetricsSchema = z.object({ - /** Discriminator for run-based reliability metrics */ - type: z.literal('run'), - /** Count of runs with toolErrors=true */ - toolErrors: z.number(), - /** Percentage of runs with tool errors */ - toolErrorRate: z.number(), - /** Count of runs that hit timeout */ - timeouts: z.number(), - /** Percentage of runs that hit timeout */ - timeoutRate: z.number(), - /** Percentage of runs that completed successfully */ - completionRate: z.number(), -}) - -/** Reliability metrics type */ -export type ReliabilityMetrics = z.infer - -/** - * Trajectory info for a single run in comparison. - */ -export const TrajectoryInfoSchema = z.object({ - /** Trajectory richness level */ - richness: TrajectoryRichnessSchema, - /** Average trajectory steps per run */ - avgStepCount: z.number(), -}) - -/** Trajectory info type */ -export type TrajectoryInfo = z.infer - -/** - * Per-prompt comparison entry for head-to-head drill-down. - */ -export const PromptComparisonSchema = z.object({ - /** Prompt identifier */ - id: z.string(), - /** Run label of the winner, or null if tie */ - winner: z.string().nullable(), - /** Scores by run label */ - scores: z.record(z.string(), z.number()), - /** Latencies by run label in milliseconds */ - latencies: z.record(z.string(), z.number()), - /** Whether each run had errors */ - hadErrors: z.record(z.string(), z.boolean()), -}) - -/** Prompt comparison type */ -export type PromptComparison = z.infer - -/** - * Pairwise win/loss/tie statistics between two runs. 
- */ -export const PairwiseComparisonSchema = z.object({ - /** First run label */ - runA: z.string(), - /** Second run label */ - runB: z.string(), - /** Number of prompts where A won */ - aWins: z.number(), - /** Number of prompts where B won */ - bWins: z.number(), - /** Number of prompts where A and B tied */ - ties: z.number(), -}) - -/** Pairwise comparison type */ -export type PairwiseComparison = z.infer - -/** - * Head-to-head comparison section. - */ -export const HeadToHeadSchema = z.object({ - /** Per-prompt breakdown for drill-down */ - prompts: z.array(PromptComparisonSchema), - /** Pairwise win rates between runs */ - pairwise: z.array(PairwiseComparisonSchema), -}) - -/** Head-to-head type */ -export type HeadToHead = z.infer - -/** - * Metadata for the comparison report. - */ -export const ComparisonMetaSchema = z.object({ - /** ISO timestamp when report was generated */ - generatedAt: z.string(), - /** Run labels included in comparison */ - runs: z.array(z.string()), - /** Total prompts compared */ - promptCount: z.number(), - /** Prompts where all runs completed */ - promptsWithAllRuns: z.number(), -}) - -/** Comparison meta type */ -export type ComparisonMeta = z.infer - -/** - * Holistic comparison report schema. - * - * @remarks - * Aggregates comparison output across all dimensions: - * - Quality: pass rates, scores, distributions - * - Performance: latency percentiles - * - Reliability: error rates, completion rates - * - Head-to-head: per-prompt winners, pairwise stats - * - * Note: Tool usage analysis is NOT included because adapter formats vary. - * Different adapters provide different `trajectoryRichness` levels and - * the `tool_call.name` field often contains tool use IDs rather than - * human-readable names. 
- */ -export const ComparisonReportSchema = z.object({ - /** Report metadata */ - meta: ComparisonMetaSchema, - /** Quality metrics by run label */ - quality: z.record(z.string(), QualityMetricsSchema), - /** Performance metrics by run label */ - performance: z.record(z.string(), PerformanceMetricsSchema), - /** Reliability metrics by run label */ - reliability: z.record(z.string(), ReliabilityMetricsSchema), - /** Trajectory info by run label */ - trajectoryInfo: z.record(z.string(), TrajectoryInfoSchema), - /** Head-to-head comparison details */ - headToHead: HeadToHeadSchema, -}) - -/** Comparison report type */ -export type ComparisonReport = z.infer - -// ============================================================================ -// Trials Comparison Report Schemas -// ============================================================================ - -/** - * Confidence intervals for trials capability metrics. - */ -export const TrialsCapabilityConfidenceIntervalsSchema = z.object({ - /** CI for avgPassAtK */ - avgPassAtK: ConfidenceIntervalSchema.optional(), -}) - -/** Trials capability confidence intervals type */ -export type TrialsCapabilityConfidenceIntervals = z.infer - -/** - * Capability metrics for trials comparison (passAtK-based). - * - * @remarks - * Measures whether the agent CAN solve the task (at least once in K tries). - * Higher passAtK means the agent has the capability to solve the task. 
- */ -export const TrialsCapabilityMetricsSchema = z.object({ - /** Average passAtK across all prompts */ - avgPassAtK: z.number(), - /** Median passAtK */ - medianPassAtK: z.number(), - /** 25th percentile passAtK */ - p25PassAtK: z.number(), - /** 75th percentile passAtK */ - p75PassAtK: z.number(), - /** Confidence intervals (only with strategy=statistical) */ - confidenceIntervals: TrialsCapabilityConfidenceIntervalsSchema.optional(), -}) - -/** Trials capability metrics type */ -export type TrialsCapabilityMetrics = z.infer - -/** - * Confidence intervals for trials reliability metrics. - */ -export const TrialsReliabilityConfidenceIntervalsSchema = z.object({ - /** CI for avgPassExpK */ - avgPassExpK: ConfidenceIntervalSchema.optional(), -}) - -/** Trials reliability confidence intervals type */ -export type TrialsReliabilityConfidenceIntervals = z.infer - -/** - * Reliability metrics for trials comparison (passExpK-based). - * - * @remarks - * Measures whether the agent CONSISTENTLY solves the task (all K tries). - * Higher passExpK means the agent reliably solves the task every time. - */ -export const TrialsReliabilityMetricsSchema = z.object({ - /** Discriminator for trial-based reliability metrics */ - type: z.literal('trial'), - /** Average passExpK across all prompts */ - avgPassExpK: z.number(), - /** Median passExpK */ - medianPassExpK: z.number(), - /** 25th percentile passExpK */ - p25PassExpK: z.number(), - /** 75th percentile passExpK */ - p75PassExpK: z.number(), - /** Confidence intervals (only with strategy=statistical) */ - confidenceIntervals: TrialsReliabilityConfidenceIntervalsSchema.optional(), -}) - -/** Trials reliability metrics type */ -export type TrialsReliabilityMetrics = z.infer - -/** - * Flakiness metrics for trials comparison. - * - * @remarks - * Flakiness = passAtK - passExpK, measuring the gap between capability and reliability. - * A high flakiness score means the agent can sometimes solve the task but not consistently. 
- */ -export const TrialsFlakinessMetricsSchema = z.object({ - /** Average flakiness across all prompts */ - avgFlakiness: z.number(), - /** Median flakiness */ - medianFlakiness: z.number(), - /** Number of prompts with non-zero flakiness */ - flakyPromptCount: z.number(), - /** Top flaky prompts by flakiness score */ - topFlakyPrompts: z.array( - z.object({ - /** Prompt identifier */ - id: z.string(), - /** Flakiness score (passAtK - passExpK) */ - flakiness: z.number(), - }), - ), -}) - -/** Trials flakiness metrics type */ -export type TrialsFlakinessMetrics = z.infer - -/** - * Confidence intervals for trials quality metrics. - */ -export const TrialsQualityConfidenceIntervalsSchema = z.object({ - /** CI for avgScore */ - avgScore: ConfidenceIntervalSchema.optional(), -}) - -/** Trials quality confidence intervals type */ -export type TrialsQualityConfidenceIntervals = z.infer - -/** - * Quality metrics for trials comparison (score-based). - * - * @remarks - * Aggregates grader scores across all trials for each prompt. - * Only present when a grader was used during trials capture. - */ -export const TrialsQualityMetricsSchema = z.object({ - /** Discriminator for trial-level quality metrics */ - type: z.literal('trial'), - /** Average score across all trials */ - avgScore: z.number(), - /** Median score */ - medianScore: z.number(), - /** 25th percentile score */ - p25Score: z.number(), - /** 75th percentile score */ - p75Score: z.number(), - /** Confidence intervals (only with strategy=statistical) */ - confidenceIntervals: TrialsQualityConfidenceIntervalsSchema.optional(), -}) - -/** Trials quality metrics type */ -export type TrialsQualityMetrics = z.infer - -/** - * Confidence intervals for trials performance metrics. 
- */ -export const TrialsPerformanceConfidenceIntervalsSchema = z.object({ - /** CI for latency mean */ - latencyMean: ConfidenceIntervalSchema.optional(), -}) - -/** Trials performance confidence intervals type */ -export type TrialsPerformanceConfidenceIntervals = z.infer - -/** - * Performance metrics for trials comparison (latency-based). - * - * @remarks - * Aggregates trial durations across all prompts. - * Always present since TrialEntry.duration is required. - */ -export const TrialsPerformanceMetricsSchema = z.object({ - /** End-to-end latency statistics across all trials */ - latency: LatencyStatsSchema, - /** Sum of all trial durations in milliseconds */ - totalDuration: z.number(), - /** Confidence intervals (only with strategy=statistical) */ - confidenceIntervals: TrialsPerformanceConfidenceIntervalsSchema.optional(), -}) - -/** Trials performance metrics type */ -export type TrialsPerformanceMetrics = z.infer - -/** - * Per-prompt metrics for trials comparison drill-down. - */ -export const TrialsPromptComparisonSchema = z.object({ - /** Prompt identifier */ - id: z.string(), - /** Run label of the capability winner, or null if tie */ - capabilityWinner: z.string().nullable(), - /** Run label of the reliability winner, or null if tie */ - reliabilityWinner: z.string().nullable(), - /** passAtK by run label */ - passAtK: z.record(z.string(), z.number()), - /** passExpK by run label */ - passExpK: z.record(z.string(), z.number()), - /** Flakiness by run label */ - flakiness: z.record(z.string(), z.number()), -}) - -/** Trials prompt comparison type */ -export type TrialsPromptComparison = z.infer - -/** - * Metadata for trials comparison report. 
- */ -export const TrialsComparisonMetaSchema = z.object({ - /** ISO timestamp when report was generated */ - generatedAt: z.string(), - /** Run labels included in comparison */ - runs: z.array(z.string()), - /** Total prompts compared */ - promptCount: z.number(), - /** Number of trials per prompt (k value) */ - trialsPerPrompt: z.number(), - /** Input format indicator */ - inputFormat: z.literal('trials'), -}) - -/** Trials comparison meta type */ -export type TrialsComparisonMeta = z.infer - -/** - * Trials comparison report schema. - * - * @remarks - * Aggregates trials comparison output across capability, reliability, and flakiness dimensions. - * Used when comparing TrialResult JSONL files instead of CaptureResult files. - * - * Key metrics: - * - Capability: passAtK - can the agent solve this at least once? - * - Reliability: passExpK - does the agent solve this consistently? - * - Flakiness: passAtK - passExpK - how inconsistent is the agent? - */ -export const TrialsComparisonReportSchema = z.object({ - /** Report metadata */ - meta: TrialsComparisonMetaSchema, - /** Capability metrics by run label */ - capability: z.record(z.string(), TrialsCapabilityMetricsSchema), - /** Reliability metrics by run label */ - reliability: z.record(z.string(), TrialsReliabilityMetricsSchema), - /** Flakiness metrics by run label */ - flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema), - /** Quality metrics by run label (only when grader scores are present) */ - quality: z.record(z.string(), TrialsQualityMetricsSchema).optional(), - /** Performance metrics by run label (always present, uses trial.duration) */ - performance: z.record(z.string(), TrialsPerformanceMetricsSchema), - /** Head-to-head comparison details */ - headToHead: z.object({ - /** Pairwise wins by capability */ - capability: z.array(PairwiseComparisonSchema), - /** Pairwise wins by reliability */ - reliability: z.array(PairwiseComparisonSchema), - /** Pairwise wins by overall weighted score */ - 
overall: z.array(PairwiseComparisonSchema), - }), - /** Per-prompt breakdown for drill-down (optional, can be large) */ - perPrompt: z.array(TrialsPromptComparisonSchema).optional(), -}) - -/** Trials comparison report type */ -export type TrialsComparisonReport = z.infer diff --git a/src/schemas/tests/constants.spec.ts b/src/schemas/tests/constants.spec.ts deleted file mode 100644 index d140314..0000000 --- a/src/schemas/tests/constants.spec.ts +++ /dev/null @@ -1,121 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import { - DEFAULT_CALIBRATION_SAMPLE_SIZE, - DEFAULT_CLIENT_NAME, - DEFAULT_HARNESS_TIMEOUT, - DEFAULT_POLLING_INTERVAL, - DEFAULT_PROTOCOL_TIMEOUT, - DEFAULT_TRIAL_COUNT, - HEAD_LINES, - JSON_RPC_ERRORS, - MAX_CONTENT_LENGTH, - PROTOCOL_METHODS, - PROTOCOL_VERSION, - TAIL_LINES, -} from '../constants.ts' - -// ============================================================================ -// JSON-RPC Protocol Constants -// ============================================================================ - -describe('PROTOCOL_METHODS', () => { - test('contains all required lifecycle methods', () => { - expect(PROTOCOL_METHODS.INITIALIZE).toBe('initialize') - expect(PROTOCOL_METHODS.SHUTDOWN).toBe('shutdown') - }) - - test('contains all required session methods', () => { - expect(PROTOCOL_METHODS.CREATE_SESSION).toBe('session/new') - expect(PROTOCOL_METHODS.LOAD_SESSION).toBe('session/load') - expect(PROTOCOL_METHODS.PROMPT).toBe('session/prompt') - expect(PROTOCOL_METHODS.CANCEL).toBe('session/cancel') - expect(PROTOCOL_METHODS.UPDATE).toBe('session/update') - expect(PROTOCOL_METHODS.REQUEST_PERMISSION).toBe('session/request_permission') - expect(PROTOCOL_METHODS.SET_MODEL).toBe('session/set_model') - }) - - test('contains protocol-level methods', () => { - expect(PROTOCOL_METHODS.CANCEL_REQUEST).toBe('$/cancel_request') - }) -}) - -describe('PROTOCOL_VERSION', () => { - test('is version 1', () => { - expect(PROTOCOL_VERSION).toBe(1) - }) -}) - 
-// ============================================================================ -// JSON-RPC Error Codes -// ============================================================================ - -describe('JSON_RPC_ERRORS', () => { - test('contains standard JSON-RPC error codes', () => { - expect(JSON_RPC_ERRORS.PARSE_ERROR).toBe(-32700) - expect(JSON_RPC_ERRORS.INVALID_REQUEST).toBe(-32600) - expect(JSON_RPC_ERRORS.METHOD_NOT_FOUND).toBe(-32601) - expect(JSON_RPC_ERRORS.INVALID_PARAMS).toBe(-32602) - expect(JSON_RPC_ERRORS.INTERNAL_ERROR).toBe(-32603) - }) - - test('contains extension error codes', () => { - expect(JSON_RPC_ERRORS.REQUEST_CANCELLED).toBe(-32800) - }) -}) - -// ============================================================================ -// Client Defaults -// ============================================================================ - -describe('Client defaults', () => { - test('DEFAULT_CLIENT_NAME is set', () => { - expect(DEFAULT_CLIENT_NAME).toBe('plaited-eval-harness') - }) - - test('DEFAULT_PROTOCOL_TIMEOUT is 30 seconds', () => { - expect(DEFAULT_PROTOCOL_TIMEOUT).toBe(30000) - }) - - test('DEFAULT_POLLING_INTERVAL is 50ms', () => { - expect(DEFAULT_POLLING_INTERVAL).toBe(50) - }) -}) - -// ============================================================================ -// Harness Preview Configuration -// ============================================================================ - -describe('Preview configuration', () => { - test('HEAD_LINES is positive', () => { - expect(HEAD_LINES).toBeGreaterThan(0) - expect(HEAD_LINES).toBe(8) - }) - - test('TAIL_LINES is positive', () => { - expect(TAIL_LINES).toBeGreaterThan(0) - expect(TAIL_LINES).toBe(4) - }) - - test('MAX_CONTENT_LENGTH is reasonable', () => { - expect(MAX_CONTENT_LENGTH).toBeGreaterThan(0) - expect(MAX_CONTENT_LENGTH).toBe(500) - }) -}) - -// ============================================================================ -// Harness Defaults -// 
============================================================================ - -describe('Harness defaults', () => { - test('DEFAULT_HARNESS_TIMEOUT is 60 seconds', () => { - expect(DEFAULT_HARNESS_TIMEOUT).toBe(60000) - }) - - test('DEFAULT_TRIAL_COUNT is 5', () => { - expect(DEFAULT_TRIAL_COUNT).toBe(5) - }) - - test('DEFAULT_CALIBRATION_SAMPLE_SIZE is 10', () => { - expect(DEFAULT_CALIBRATION_SAMPLE_SIZE).toBe(10) - }) -}) diff --git a/src/schemas/tests/fixtures/grader-bad-module.ts b/src/schemas/tests/fixtures/grader-bad-module.ts deleted file mode 100644 index 42516e3..0000000 --- a/src/schemas/tests/fixtures/grader-bad-module.ts +++ /dev/null @@ -1,5 +0,0 @@ -/** - * Test fixture: Invalid TypeScript grader (no 'grade' export). - */ - -export const evaluate = () => ({ pass: true, score: 1.0 }) diff --git a/src/schemas/tests/fixtures/grader-exec-fail.py b/src/schemas/tests/fixtures/grader-exec-fail.py deleted file mode 100755 index 34571e7..0000000 --- a/src/schemas/tests/fixtures/grader-exec-fail.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python3 -""" -Test fixture: Python grader that exits with non-zero code. -""" - -import sys - -sys.stderr.write("Intentional failure") -sys.exit(1) diff --git a/src/schemas/tests/fixtures/grader-exec-invalid.py b/src/schemas/tests/fixtures/grader-exec-invalid.py deleted file mode 100755 index bc36395..0000000 --- a/src/schemas/tests/fixtures/grader-exec-invalid.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 -""" -Test fixture: Python grader that outputs invalid JSON. -""" - -print("not valid json") diff --git a/src/schemas/tests/fixtures/grader-exec.py b/src/schemas/tests/fixtures/grader-exec.py deleted file mode 100755 index 5d51dcc..0000000 --- a/src/schemas/tests/fixtures/grader-exec.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -""" -Test fixture: Python grader script using stdin/stdout JSON protocol. 
-""" - -import json -import sys - -def main(): - data = json.load(sys.stdin) - - output = data.get("output", "").lower() - hint = (data.get("hint") or "").lower() - - if hint: - pass_result = hint in output - else: - pass_result = True - - result = { - "pass": pass_result, - "score": 1.0 if pass_result else 0.0, - "reasoning": "Contains expected" if pass_result else "Missing expected" - } - - print(json.dumps(result)) - -if __name__ == "__main__": - main() diff --git a/src/schemas/tests/fixtures/grader-git.ts b/src/schemas/tests/fixtures/grader-git.ts deleted file mode 100644 index d8a7df7..0000000 --- a/src/schemas/tests/fixtures/grader-git.ts +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Test fixture: Git-based grader that detects file changes. - * - * @remarks - * This grader uses git to detect environmental outcomes instead of just - * checking output text. It demonstrates the "grade outcomes, not paths" principle. - * - * SECURITY NOTE: This fixture validates the cwd parameter to prevent command injection. - * When implementing your own git-based graders, always validate paths from untrusted sources. - * The cwd parameter should only come from trusted sources (process.cwd(), CLI flags, etc.). - */ - -import { resolve } from 'node:path' -import type { Grader } from '../../schemas.ts' - -/** - * Validates that a path is safe to use in shell commands. - * - * @remarks - * Rejects paths containing shell metacharacters or suspicious patterns - * that could be used for command injection. 
- * - * @param path - The path to validate - * @returns True if path appears safe, false otherwise - */ -const isValidPath = (path: string): boolean => { - // Reject paths with shell metacharacters that could enable command injection - const dangerousChars = /[;&|`$(){}[\]<>'"\\]/ - if (dangerousChars.test(path)) { - return false - } - - // Reject paths with suspicious patterns - if (path.includes('..') || path.startsWith('-')) { - return false - } - - return true -} - -export const grade: Grader = async ({ output: _output, hint, cwd }) => { - // If no cwd provided, fall back to hint-based grading - if (!cwd) { - return { - pass: false, - score: 0, - reasoning: 'No working directory provided', - } - } - - // SECURITY: Validate cwd to prevent command injection - if (!isValidPath(cwd)) { - return { - pass: false, - score: 0, - reasoning: 'Invalid working directory path (contains suspicious characters)', - } - } - - // Normalize path to prevent directory traversal - const safeCwd = resolve(cwd) - - // Check if we're in a git repo - const isGit = await Bun.$`git -C ${safeCwd} rev-parse --git-dir 2>/dev/null`.nothrow() - - if (isGit.exitCode !== 0) { - return { - pass: false, - score: 0, - reasoning: 'Not a git repository', - } - } - - // Detect what files were created/modified using git - // Note: This detects untracked (??) and modified (M) files. - // Staged (A), renamed (R), deleted (D) files are not included in this example. - const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text() - - const filesCreated = status - .split('\n') - .filter((line) => line.startsWith('??')) // ?? 
= untracked files - .map((line) => line.slice(3).trim()) - .filter(Boolean) - - const filesModified = status - .split('\n') - .filter((line) => line.startsWith(' M') || line.startsWith('M ')) // M = modified - .map((line) => line.slice(3).trim()) - .filter(Boolean) - - const hasChanges = filesCreated.length > 0 || filesModified.length > 0 - - // If hint is provided, check if any changed file matches the hint - let matchesHint = true - if (hint) { - const allChangedFiles = [...filesCreated, ...filesModified] - matchesHint = allChangedFiles.some((file) => file.toLowerCase().includes(hint.toLowerCase())) - } - - const pass = hasChanges && matchesHint - - return { - pass, - score: pass ? 1.0 : hasChanges ? 0.5 : 0.0, - reasoning: pass - ? `Files changed: ${[...filesCreated, ...filesModified].join(', ')}` - : hasChanges - ? 'File changes do not match hint' - : 'No file changes detected', - outcome: { - filesCreated, - filesModified, - type: 'git_status_check', - }, - } -} diff --git a/src/schemas/tests/fixtures/grader-module.ts b/src/schemas/tests/fixtures/grader-module.ts deleted file mode 100644 index a871167..0000000 --- a/src/schemas/tests/fixtures/grader-module.ts +++ /dev/null @@ -1,14 +0,0 @@ -/** - * Test fixture: TypeScript grader module. - */ - -import type { Grader } from '../../schemas.ts' - -export const grade: Grader = async ({ input: _input, output, hint }) => { - const pass = hint ? output.toLowerCase().includes(hint.toLowerCase()) : true - return { - pass, - score: pass ? 1.0 : 0.0, - reasoning: pass ? 'Contains expected text' : 'Missing expected text', - } -} diff --git a/src/schemas/tests/grader-git.spec.ts b/src/schemas/tests/grader-git.spec.ts deleted file mode 100644 index 580da90..0000000 --- a/src/schemas/tests/grader-git.spec.ts +++ /dev/null @@ -1,222 +0,0 @@ -/** - * Tests for git-based grader fixture. - * - * @remarks - * Verifies that graders can use git to detect environmental outcomes - * and return structured outcome data. 
- */ - -import { afterEach, beforeEach, describe, expect, test } from 'bun:test' -import { mkdtemp, rm } from 'node:fs/promises' -import { tmpdir } from 'node:os' -import { join } from 'node:path' -import type { Grader } from '../schemas.ts' - -describe('Git-based grader', () => { - let tempDir: string - let grader: Grader - - beforeEach(async () => { - // Create temporary directory - tempDir = await mkdtemp(join(tmpdir(), 'git-grader-test-')) - - // Initialize git repo - await Bun.$`git -C ${tempDir} init`.quiet() - await Bun.$`git -C ${tempDir} config user.email "test@test.com"`.quiet() - await Bun.$`git -C ${tempDir} config user.name "Test User"`.quiet() - - // Load the git-based grader - const module = await import('./fixtures/grader-git.ts') - grader = module.grade - }) - - afterEach(async () => { - // Clean up temporary directory - await rm(tempDir, { recursive: true, force: true }) - }) - - test('detects newly created files', async () => { - // Create a new file (untracked) - await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => ') - - const result = await grader({ - input: 'Create a button component', - output: 'I created Button.tsx', - hint: 'button', - cwd: tempDir, - }) - - expect(result.pass).toBe(true) - expect(result.score).toBe(1.0) - expect(result.reasoning).toContain('button.tsx') - expect(result.outcome).toBeDefined() - expect(result.outcome?.filesCreated).toEqual(['button.tsx']) - expect(result.outcome?.type).toBe('git_status_check') - }) - - test('detects modified files', async () => { - // Create and commit a file - await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 1 }') - await Bun.$`git -C ${tempDir} add config.ts`.quiet() - await Bun.$`git -C ${tempDir} commit -m "Initial commit"`.quiet() - - // Modify the file - await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 2 }') - - const result = await grader({ - input: 'Update config value', - output: 'I updated the config', - 
hint: 'config', - cwd: tempDir, - }) - - expect(result.pass).toBe(true) - expect(result.score).toBe(1.0) - expect(result.reasoning).toContain('config.ts') - expect(result.outcome).toBeDefined() - expect(result.outcome?.filesModified).toEqual(['config.ts']) - expect(result.outcome?.type).toBe('git_status_check') - }) - - test('fails when no changes detected', async () => { - // No files created or modified - const result = await grader({ - input: 'Create a button component', - output: 'I created a button component', - cwd: tempDir, - }) - - expect(result.pass).toBe(false) - expect(result.score).toBe(0) - expect(result.reasoning).toContain('No file changes detected') - expect(result.outcome).toBeDefined() - expect(result.outcome?.filesCreated).toEqual([]) - expect(result.outcome?.filesModified).toEqual([]) - }) - - test('partial score when changes do not match hint', async () => { - // Create a file that does not match the hint - await Bun.write(join(tempDir, 'unrelated.ts'), 'export const foo = 1') - - const result = await grader({ - input: 'Create a button component', - output: 'I created something', - hint: 'button', - cwd: tempDir, - }) - - expect(result.pass).toBe(false) - expect(result.score).toBe(0.5) // Has changes but doesn't match hint - expect(result.reasoning).toContain('do not match hint') - expect(result.outcome?.filesCreated).toEqual(['unrelated.ts']) - }) - - test('handles missing cwd parameter', async () => { - const result = await grader({ - input: 'Create a button component', - output: 'I created a button', - hint: 'button', - // cwd not provided - }) - - expect(result.pass).toBe(false) - expect(result.score).toBe(0) - expect(result.reasoning).toBe('No working directory provided') - }) - - test('handles non-git directory', async () => { - // Create a non-git temp directory - const nonGitDir = await mkdtemp(join(tmpdir(), 'non-git-test-')) - - try { - const result = await grader({ - input: 'Create a button component', - output: 'I created a button', 
- cwd: nonGitDir, - }) - - expect(result.pass).toBe(false) - expect(result.score).toBe(0) - expect(result.reasoning).toBe('Not a git repository') - } finally { - await rm(nonGitDir, { recursive: true, force: true }) - } - }) - - test('works without hint parameter', async () => { - // Create a file - await Bun.write(join(tempDir, 'any-file.ts'), 'export const x = 1') - - const result = await grader({ - input: 'Create a file', - output: 'I created a file', - cwd: tempDir, - // hint not provided - }) - - expect(result.pass).toBe(true) - expect(result.score).toBe(1.0) - expect(result.reasoning).toContain('any-file.ts') - expect(result.outcome?.filesCreated).toEqual(['any-file.ts']) - }) - - test('returns structured outcome for downstream analysis', async () => { - // Create multiple files - await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => "} -``` - -### Output Format - -```jsonl -{"id":"test-001","input":"Create a button component","reference":"export const Button = () => ","pass":true,"score":1.0,"reasoning":"Contains hint content"} -``` - -## Balance Command - -Analyze test set coverage to ensure balanced evaluation. - -```bash -# Analyze prompt distribution -bunx @plaited/agent-eval-harness balance prompts.jsonl -o balance.json - -# Pretty print -bunx @plaited/agent-eval-harness balance prompts.jsonl | jq . -``` - -### Why Use This? - -An eval with only "make X work" misses "don't break Y". 
Balance analysis shows: - -- **Category distribution** (from `metadata.category`) -- **Positive/negative case ratio** -- **Coverage gaps** - -### Output Format - -```json -{ - "totalCases": 50, - "categories": [ - { "name": "ui", "count": 20, "percentage": 40 }, - { "name": "logic", "count": 15, "percentage": 30 }, - { "name": "api", "count": 10, "percentage": 20 }, - { "name": "edge-case", "count": 5, "percentage": 10 } - ], - "underrepresented": ["edge-case"], - "suggestions": ["Consider adding more test cases for: edge-case"] -} -``` - -### Balanced Eval Design - -Include both positive and negative cases: - -| Type | Example | Purpose | -|------|---------|---------| -| Positive | "Add a login button" | Agent should succeed | -| Negative | "Add a button without breaking tests" | Agent should not break things | -| Edge case | "Handle empty input gracefully" | Agent should be robust | - -See [eval-concepts.md](references/eval-concepts.md#test-set-balance) for more on balanced test sets. - -## Pipeline Workflow - -The pipeline commands enable Unix-style composition for flexible evaluation workflows. - -### Full Pipeline Example - -```bash -# Execute → Extract → Grade → Format in one pipeline -cat prompts.jsonl | \ - bunx @plaited/agent-eval-harness run -s claude.json | \ - bunx @plaited/agent-eval-harness extract -s claude.json | \ - bunx @plaited/agent-eval-harness grade -g ./grader.ts | \ - bunx @plaited/agent-eval-harness format -f markdown > report.md -``` - -### Run Command - -Execute prompts and output raw results. 
Three modes available: - -```bash -# Schema mode (recommended) -bunx @plaited/agent-eval-harness run prompts.jsonl --schema claude.json - -# Simple mode: {} placeholder substitution -bunx @plaited/agent-eval-harness run prompts.jsonl --simple "claude -p {} --output-format stream-json" - -# Shell mode: $PROMPT environment variable -bunx @plaited/agent-eval-harness run prompts.jsonl --shell 'claude -p "$PROMPT" --output-format stream-json' -``` - -> **⚠️ Security Warning:** The `--simple` and `--shell` modes execute prompts via shell commands. Prompts are escaped but **do not use untrusted prompt content** with these modes. Malicious prompt text could potentially escape the quoting and execute arbitrary commands. Use `--schema` mode (headless adapter) for untrusted inputs. - -### Extract Command - -Parse raw output into structured trajectories: - -```bash -# From file -bunx @plaited/agent-eval-harness extract raw.jsonl --schema claude.json -o extracted.jsonl - -# Piped from run -bunx @plaited/agent-eval-harness run prompts.jsonl -s claude.json | \ - bunx @plaited/agent-eval-harness extract -s claude.json -``` - -### Grade Command - -Apply grader to extracted results: - -```bash -bunx @plaited/agent-eval-harness grade extracted.jsonl --grader ./grader.ts -o graded.jsonl -``` - -### Format Command - -Convert results to different output formats: - -```bash -# Markdown report -bunx @plaited/agent-eval-harness format results.jsonl --style markdown -o report.md - -# CSV for spreadsheets -bunx @plaited/agent-eval-harness format results.jsonl --style csv -o results.csv - -# JSONL (pass-through, default) -bunx @plaited/agent-eval-harness format results.jsonl --style jsonl -``` - -### Compare Command - -Compare multiple runs of the same prompts. Supports both **CaptureResult** (single-run) and **TrialResult** (multi-run reliability) formats with auto-detection. 
- -```bash -# Default: auto-detect format, weighted strategy, JSON output -bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json - -# Statistical significance strategy -bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json - -# Custom weights via environment variables (CaptureResult) -COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \ - bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json - -# Markdown report format -bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md - -# Custom grader (LLM-as-Judge) -bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl \ - --strategy custom --grader ./my-llm-judge.ts -o comparison.json - -# With explicit labels -bunx @plaited/agent-eval-harness compare \ - --run "with-mcp:results-mcp.jsonl" \ - --run "vanilla:results-vanilla.jsonl" \ - -o comparison.json -``` - -**Use cases for compare:** -- Same agent, different MCP servers -- Same agent, different skills enabled -- Same agent, different model versions -- Different agents entirely - -### Trials Comparison (pass@k Analysis) - -Compare TrialResult files for reliability analysis: - -```bash -# Auto-detect trials format -bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json - -# Explicit format (skip auto-detection) -bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json - -# Custom weights for trials comparison -COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \ - bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json -``` - -**Trials metrics:** - -| Metric | Description | Formula | -|--------|-------------|---------| -| **Capability** (passAtK) | Can solve at least once in K tries | `1 - (1-p)^k` | -| **Reliability** (passExpK) | Solves consistently every time | `p^k` 
| -| **Flakiness** | Gap between capability and reliability | `passAtK - passExpK` | -| **Quality** (scores) | Aggregate grader scores across trials | avg/median/p25/p75 (only with grader) | -| **Performance** (latency) | Aggregate trial durations | p50/p90/p99/mean/min/max (always present) | - -### Built-in Comparison Strategies - -**For CaptureResult (single-run):** - -| Strategy | Description | Env Vars | -|----------|-------------|----------| -| `weighted` (default) | Quality, latency, reliability | `COMPARE_QUALITY`, `COMPARE_LATENCY`, `COMPARE_RELIABILITY` | -| `statistical` | Bootstrap for confidence intervals | `COMPARE_BOOTSTRAP_ITERATIONS` | -| `custom` | Your own grader | `--grader path` | - -**For TrialResult (multi-run):** - -| Strategy | Description | Env Vars | -|----------|-------------|----------| -| `weighted` (default) | Capability, reliability, consistency | `COMPARE_CAPABILITY`, `COMPARE_RELIABILITY`, `COMPARE_CONSISTENCY` | -| `statistical` | Bootstrap passAtK confidence intervals | `COMPARE_BOOTSTRAP_ITERATIONS` | -| `custom` | Your own grader | `--grader path` | - -### Comparison Report Output - -**CaptureResult format** outputs `ComparisonReport`: - -```json -{ - "meta": { "generatedAt": "...", "runs": ["baseline", "variant"], "promptCount": 100 }, - "quality": { "baseline": { "avgScore": 0.85, "passRate": 0.82 }, "variant": { ... } }, - "performance": { "baseline": { "latency": { "p50": 1200, "p90": 3400 } }, ... }, - "reliability": { "baseline": { "type": "run", "toolErrors": 5, "completionRate": 0.99 }, ... 
}, - "headToHead": { "pairwise": [{ "runA": "baseline", "runB": "variant", "aWins": 35, "bWins": 55 }] } -} -``` - -With `--strategy statistical`, quality and performance metrics include 95% confidence intervals: - -```json -{ - "quality": { - "baseline": { - "avgScore": 0.85, - "passRate": 0.82, - "confidenceIntervals": { - "avgScore": [0.82, 0.88], - "passRate": [0.79, 0.85] - } - } - }, - "performance": { - "baseline": { - "latency": { "p50": 1200, "mean": 1350 }, - "confidenceIntervals": { - "latencyMean": [1280, 1420] - } - } - } -} -``` - -**TrialResult format** outputs `TrialsComparisonReport`: - -```json -{ - "meta": { "generatedAt": "...", "runs": ["claude", "gemini"], "promptCount": 50, "trialsPerPrompt": 5, "inputFormat": "trials" }, - "capability": { "claude": { "avgPassAtK": 0.92, "medianPassAtK": 0.95 }, "gemini": { "..." : "..." } }, - "reliability": { "claude": { "type": "trial", "avgPassExpK": 0.78, "medianPassExpK": 0.82 }, "gemini": { "..." : "..." } }, - "flakiness": { "claude": { "avgFlakiness": 0.14, "flakyPromptCount": 12 }, "gemini": { "..." : "..." } }, - "quality": { "claude": { "avgScore": 0.85, "medianScore": 0.90, "p25Score": 0.75, "p75Score": 0.95 }, "gemini": { "..." : "..." } }, - "performance": { "claude": { "latency": { "p50": 1200, "p90": 3400, "p99": 5100, "mean": 1500, "min": 800, "max": 5200 }, "totalDuration": 375000 }, "gemini": { "..." : "..." 
} }, - "headToHead": { - "capability": [{ "runA": "claude", "runB": "gemini", "aWins": 28, "bWins": 18, "ties": 4 }], - "reliability": ["..."], - "overall": ["..."] - } -} -``` - -**Notes:** -- `quality` is only present when a grader was used (trials have `score` fields) -- `performance` is always present (every trial has `duration`) - -With `--strategy statistical`, capability, reliability, quality, and performance metrics include 95% confidence intervals: - -```json -{ - "capability": { - "claude": { - "avgPassAtK": 0.92, - "confidenceIntervals": { "avgPassAtK": [0.88, 0.95] } - } - }, - "reliability": { - "claude": { - "type": "trial", - "avgPassExpK": 0.78, - "confidenceIntervals": { "avgPassExpK": [0.72, 0.84] } - } - }, - "quality": { - "claude": { - "avgScore": 0.85, - "confidenceIntervals": { "avgScore": [0.82, 0.88] } - } - }, - "performance": { - "claude": { - "latency": { "mean": 1500 }, - "confidenceIntervals": { "latencyMean": [1380, 1620] } - } - } -} -``` - -See [comparison-graders.md](references/comparison-graders.md) for complete comparison grader documentation including LLM-as-Judge patterns. 
- -### Comparison Grader Interface - -**CaptureResult grader:** - -```typescript -import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline' - -export const grade: ComparisonGrader = async ({ id, input, hint, runs }) => { - // runs is Record - return { - rankings: [ - { run: 'with-mcp', rank: 1, score: 0.9 }, - { run: 'vanilla', rank: 2, score: 0.7 }, - ], - reasoning: 'MCP run produced more accurate output' - } -} -``` - -**TrialResult grader:** - -```typescript -import type { TrialsComparisonGrader } from '@plaited/agent-eval-harness/pipeline' - -export const grade: TrialsComparisonGrader = async ({ id, input, hint, runs }) => { - // runs is Record - // Each trial in trials has: { duration, score?, pass?, output, trajectory } - return { - rankings: [ - { run: 'claude', rank: 1, score: 0.92 }, - { run: 'gemini', rank: 2, score: 0.85 }, - ], - reasoning: 'Claude has higher reliability with lower flakiness' - } -} -``` - -### Pipeline Workflow Diagram - -```mermaid -flowchart LR - Prompts["prompts.jsonl"] --> Run["run"] - Schema["headless schema"] --> Run - Run --> Raw["raw.jsonl"] - Raw --> Extract["extract"] - Schema --> Extract - Extract --> Extracted["extracted.jsonl"] - Extracted --> Grade["grade"] - Grader["grader.ts"] --> Grade - Grade --> Graded["graded.jsonl"] - Graded --> Format["format"] - Format --> Output["report.md / .csv / .jsonl"] - - Graded --> Compare["compare"] - Results2["other runs..."] --> Compare - CompareGrader["compare-grader.ts"] --> Compare - Compare --> Comparison["comparison.jsonl"] -``` - -## Schemas Command - -Export JSON schemas for non-TypeScript tools. 
- -```bash -# List available schemas -bunx @plaited/agent-eval-harness schemas - -# Export all schemas as JSON -bunx @plaited/agent-eval-harness schemas --json -o schemas.json - -# Export specific schema -bunx @plaited/agent-eval-harness schemas CaptureResult --json -bunx @plaited/agent-eval-harness schemas TrialResult --json -bunx @plaited/agent-eval-harness schemas GraderResult --json -``` - -### Available Schemas - -| Schema | Description | -|--------|-------------| -| `CaptureResult` | Single capture output (id, input, output, trajectory, timing) | -| `TrialResult` | Multi-run trial output (includes passAtK, passExpK) | -| `GraderResult` | Grader return value (pass, score, reasoning) | -| `PromptInput` | Input prompt format | -| `TrajectoryStep` | Single step in trajectory array | -| `SummaryResult` | Compact summary format | - -### Usage in Other Languages - -Export schemas for validation in Python, Go, etc.: - -```bash -# Export all schemas -bunx @plaited/agent-eval-harness schemas --json -o schemas.json - -# Use in Python with jsonschema -python -c " -import json -from jsonschema import validate - -with open('schemas.json') as f: - schemas = json.load(f) - -with open('results.jsonl') as f: - for line in f: - result = json.loads(line) - validate(result, schemas['CaptureResult']) - print(f'{result[\"id\"]}: valid') -" -``` - -## Grader Interface - -Graders provide semantic pass/fail scoring for captured trajectories. The harness supports graders written in **any language**. 
- -### Git-Based Grading (Recommended for Coding Tasks) - -**Grade outcomes, not paths.** Use the optional `cwd` parameter to detect environmental changes with git: - -```typescript -// git-grader.ts -import type { Grader } from '@plaited/agent-eval-harness/schemas' - -export const grade: Grader = async ({ output, hint, cwd }) => { - if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' } - - // Detect file changes - const status = await Bun.$`git -C ${cwd} status --porcelain`.text() - const filesCreated = status - .split('\n') - .filter(line => line.startsWith('??')) - .map(line => line.slice(3).trim()) - - // Verify tests pass - const testResult = await Bun.$`cd ${cwd} && bun test`.nothrow() - - return { - pass: filesCreated.length > 0 && testResult.exitCode === 0, - score: testResult.exitCode === 0 ? 1 : 0, - reasoning: `Files: ${filesCreated.join(', ')}. Tests: ${testResult.exitCode === 0 ? 'pass' : 'fail'}`, - outcome: { // Optional: structured data for analysis - filesCreated, - testsPassed: testResult.exitCode === 0, - type: 'file_creation_with_tests' - } - } -} -``` - -See [inline-graders.md](references/inline-graders.md#git-based-outcome-grading) for comprehensive git-based grading patterns. - -### Output-Based Grading (General Purpose) - -```typescript -// my-grader.ts -import type { Grader } from '@plaited/agent-eval-harness/schemas' - -export const grade: Grader = async ({ input, output, hint, trajectory }) => { - const pass = output.toLowerCase().includes(hint?.toLowerCase() ?? '') - return { - pass, - score: pass ? 1 : 0, - reasoning: pass ? 'Contains hint content' : 'Missing hint content' - } -} -``` - -**Note:** `input` can be `string` (single turn) or `string[]` (multi-turn). The `hint` field provides grader context (renamed from `expected`). 
- -### Python/Executable Graders - -Any executable can be a grader using stdin/stdout JSON protocol: - -```python -#!/usr/bin/env python3 -import json, sys - -data = json.load(sys.stdin) -output = data.get("output", "").lower() -hint = (data.get("hint") or "").lower() - -pass_result = hint in output if hint else True -print(json.dumps({ - "pass": pass_result, - "score": 1.0 if pass_result else 0.0, - "reasoning": "Contains hint" if pass_result else "Missing hint" -})) -``` - -```bash -chmod +x ./grader.py -bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grader.py -o results.jsonl -``` - -See [inline-graders.md](references/inline-graders.md) for complete grader documentation including LLM-as-Judge patterns. - -## Input Format - -Each line in `prompts.jsonl`: - -```jsonl -{"id":"test-001","input":"Create a button","hint":"should contain "} -``` - -### Validation Workflow - -```bash -# Check that reference solutions pass your grader -agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl - -# If references fail, your grader or task is broken -cat validation.jsonl | jq 'select(.pass == false)' -``` - -### Why This Matters - -If your reference solution fails your own grader: -- The task definition is ambiguous -- The grader is too strict -- The hint is wrong - -Fix the eval before evaluating the agent. - -## Test Set Balance - -### The Problem - -An eval with only "make X work" misses "don't break Y". 
- -**Unbalanced:** -- 50 prompts: "Add feature X" -- 0 prompts: "Don't break existing feature Y" - -### Using the Balance Command - -```bash -agent-eval-harness balance prompts.jsonl -o balance.json -``` - -Analyzes: -- Category distribution (from `metadata.category`) -- Positive/negative case ratio -- Coverage gaps - -### Balanced Eval Design - -Include both positive and negative cases: - -| Type | Example | Purpose | -|------|---------|---------| -| Positive | "Add a login button" | Agent should succeed | -| Negative | "Add a button without breaking tests" | Agent should not break things | -| Edge case | "Handle empty input gracefully" | Agent should be robust | - -## Summary - -| Concept | Command | Key Insight | -|---------|---------|-------------| -| Non-determinism | `trials` | Run multiple times to measure reliability | -| pass@k | `trials -k N` | Capability: can agent do this? | -| pass^k | `trials -k N` | Regression: will it always work? | -| Calibration | `calibrate` | Validate grader, not just agent | -| Reference validation | `validate-refs` | Prove tasks are solvable | -| Balance | `balance` | Cover positive + negative cases | diff --git a/.agents/skills/agent-eval-harness/references/inline-graders.md b/.agents/skills/agent-eval-harness/references/inline-graders.md deleted file mode 100644 index f504a90..0000000 --- a/.agents/skills/agent-eval-harness/references/inline-graders.md +++ /dev/null @@ -1,711 +0,0 @@ -# Inline Graders - -Inline graders score individual agent outputs in isolation. Each input/output pair is graded independently, producing a pass/fail result with a score. 
- -## Grader Interface - -```typescript -import type { Grader } from '@plaited/agent-eval-harness/schemas' - -type GraderInput = { - input: string | string[] // Original prompt(s) - output: string // Agent output - hint?: string // Grader context/expectation - trajectory?: TrajectoryStep[] // Execution trace - metadata?: Record // Optional metadata from prompt - cwd?: string // Working directory (for git-based grading) -} - -type GraderResult = { - pass: boolean // Did it pass? - score: number // 0.0 to 1.0 - reasoning?: string // Explanation - outcome?: Record // Structured outcome data (merged onto result) -} -``` - -## Building an Inline Grader - -### Step 1: Export the Schema - -Get the JSON Schema for validation in any language: - -```bash -agent-eval-harness schemas GraderResult --json -o grader-result.json -``` - -### Step 2: Create the Grader - -**TypeScript (recommended):** - -```typescript -// my-grader.ts -import type { Grader } from '@plaited/agent-eval-harness/schemas' - -export const grade: Grader = async ({ input, output, hint, trajectory, metadata }) => { - // Your scoring logic here - const pass = evaluateOutput(output, hint) - - return { - pass, - score: pass ? 1.0 : 0.0, - reasoning: pass ? 
'Meets criteria' : 'Does not meet criteria' - } -} - -const evaluateOutput = (output: string, hint?: string): boolean => { - if (!hint) return true - return output.toLowerCase().includes(hint.toLowerCase()) -} -``` - -**Python:** - -```python -#!/usr/bin/env python3 -import json -import sys - -data = json.load(sys.stdin) -output = data.get("output", "").lower() -hint = (data.get("hint") or "").lower() - -pass_result = hint in output if hint else True - -print(json.dumps({ - "pass": pass_result, - "score": 1.0 if pass_result else 0.0, - "reasoning": "Contains hint" if pass_result else "Missing hint" -})) -``` - -### Step 3: Use the Grader - -```bash -# With capture command -agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./my-grader.ts -o results.jsonl - -# With grade command (pipeline) -agent-eval-harness grade extracted.jsonl --grader ./my-grader.ts -o graded.jsonl - -# With trials command -agent-eval-harness trials prompts.jsonl --schema ./claude.json -k 5 --grader ./my-grader.ts -o trials.jsonl -``` - -## Git-Based Outcome Grading - -The most powerful grading pattern for coding agents: use Git to detect actual environmental changes, not just check the agent's output text. - -### Why Git-Based Grading? - -**Grade outcomes, not paths.** Anthropic's eval framework emphasizes grading final environmental state, not procedural steps. 
Git provides the perfect oracle: - -- **Universal** - Works in any git repo, any language -- **Precise** - Shows exactly what files changed -- **Zero config** - No complex outcome schemas needed -- **Debuggable** - `git diff` shows what happened - -### The `cwd` Parameter - -Graders receive an optional `cwd` parameter - the working directory where the agent executed: - -```typescript -export const grade: Grader = async ({ input, output, hint, trajectory, metadata, cwd }) => { - // cwd is the session's working directory - // Use it to run git commands and detect outcomes -} -``` - -### The `outcome` Field - -Graders can return an optional `outcome` field with structured data about what changed: - -```typescript -return { - pass: true, - score: 1.0, - reasoning: 'Created Button.tsx with valid syntax', - outcome: { // ← Optional: structured outcome data - filesCreated: ['src/components/Button.tsx'], - validSyntax: true, - type: 'file_creation' - } -} -``` - -The harness merges this `outcome` onto the capture result, making it available for downstream analysis. - -### Pattern 1: File Creation - -**Task:** "Create a button component" - -```typescript -import type { Grader } from '@plaited/agent-eval-harness/schemas' - -export const grade: Grader = async ({ output, hint, cwd }) => { - if (!cwd) { - return { pass: false, score: 0, reasoning: 'No working directory provided' } - } - - // Detect what files were created using git - const status = await Bun.$`git -C ${cwd} status --porcelain`.text() - - const filesCreated = status - .split('\n') - .filter(line => line.startsWith('??')) // ?? 
= untracked files - .map(line => line.slice(3)) - - const buttonFileCreated = filesCreated.some(f => - f.toLowerCase().includes('button') - ) - - // Check if file has valid syntax - let validSyntax = true - if (buttonFileCreated) { - const tscCheck = await Bun.$`cd ${cwd} && npx tsc --noEmit`.nothrow() - validSyntax = tscCheck.exitCode === 0 - } - - return { - pass: buttonFileCreated && validSyntax, - score: (buttonFileCreated ? 0.5 : 0) + (validSyntax ? 0.5 : 0), - reasoning: `Files created: ${filesCreated.join(', ')}. Valid syntax: ${validSyntax}`, - outcome: { // ← Structured outcome for analysis - filesCreated, - validSyntax, - type: 'file_creation' - } - } -} -``` - -### Pattern 2: Test Fixing - -**Task:** "Fix the failing tests in auth.spec.ts" - -```typescript -export const grade: Grader = async ({ output, cwd }) => { - if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' } - - // Run tests to verify they pass - const testResult = await Bun.$`cd ${cwd} && bun test`.nothrow() - const testsPassed = testResult.exitCode === 0 - - // Check what files were modified - const diff = await Bun.$`git -C ${cwd} diff --name-only`.text() - const filesModified = diff.split('\n').filter(Boolean) - - return { - pass: testsPassed, - score: testsPassed ? 1 : 0, - reasoning: testsPassed - ? `Tests passed. 
Modified: ${filesModified.join(', ')}` - : `Tests failed: ${testResult.stderr.toString().slice(0, 200)}`, - outcome: { - testsPassed, - exitCode: testResult.exitCode, - filesModified, - type: 'test_execution' - } - } -} -``` - -### Pattern 3: Non-Breaking Changes - -**Task:** "Refactor the authentication flow without breaking existing tests" - -```typescript -export const grade: Grader = async ({ output, cwd }) => { - if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' } - - // Verify tests still pass - const testResult = await Bun.$`cd ${cwd} && bun test`.nothrow() - const testsPassed = testResult.exitCode === 0 - - // Check what files changed - const diff = await Bun.$`git -C ${cwd} diff --name-only`.text() - const changedFiles = diff.split('\n').filter(Boolean) - - // Define critical files that shouldn't be touched - const touchedCriticalFiles = changedFiles.some(f => - f.includes('package.json') || - f.includes('tsconfig.json') || - f.includes('.env') - ) - - return { - pass: testsPassed && !touchedCriticalFiles, - score: testsPassed ? (touchedCriticalFiles ? 0.5 : 1) : 0, - reasoning: `Tests: ${testsPassed ? 'pass' : 'fail'}. Changed: ${changedFiles.join(', ')}. 
Critical files touched: ${touchedCriticalFiles}`, - outcome: { - testsPassed, - filesModified: changedFiles, - touchedCriticalFiles, - type: 'refactoring_safety' - } - } -} -``` - -### Pattern 4: Code Quality - -**Task:** "Add TypeScript types to the API functions" - -```typescript -export const grade: Grader = async ({ output, cwd }) => { - if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' } - - // Check type errors before - const beforeErrors = await Bun.$`cd ${cwd} && git stash && npx tsc --noEmit 2>&1 | grep -c "error TS" || echo 0`.text() - await Bun.$`cd ${cwd} && git stash pop`.nothrow() - - // Check type errors after - const afterErrors = await Bun.$`cd ${cwd} && npx tsc --noEmit 2>&1 | grep -c "error TS" || echo 0`.text() - - const errorsBefore = parseInt(beforeErrors.trim()) - const errorsAfter = parseInt(afterErrors.trim()) - const improved = errorsAfter < errorsBefore - - return { - pass: improved, - score: Math.max(0, (errorsBefore - errorsAfter) / errorsBefore), - reasoning: `Type errors: ${errorsBefore} → ${errorsAfter}`, - outcome: { - errorsBefore, - errorsAfter, - improved, - type: 'code_quality' - } - } -} -``` - -### Pattern 5: Build Success - -**Task:** "Fix the build errors" - -```typescript -export const grade: Grader = async ({ output, cwd }) => { - if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' } - - // Try to build - const buildResult = await Bun.$`cd ${cwd} && bun run build`.nothrow() - const buildSucceeded = buildResult.exitCode === 0 - - // Check what was changed - const diff = await Bun.$`git -C ${cwd} diff --stat`.text() - - return { - pass: buildSucceeded, - score: buildSucceeded ? 1 : 0, - reasoning: buildSucceeded - ? `Build succeeded. 
Changes:\n${diff}` - : `Build failed: ${buildResult.stderr.toString().slice(0, 300)}`, - outcome: { - buildSucceeded, - exitCode: buildResult.exitCode, - diffStat: diff, - type: 'build_verification' - } - } -} -``` - -### Fallback for Non-Git Repos - -Always check if git is available before using git commands: - -```typescript -export const grade: Grader = async ({ output, hint, cwd }) => { - // Check if we're in a git repo - if (cwd) { - const isGit = await Bun.$`git -C ${cwd} rev-parse --git-dir 2>/dev/null`.nothrow() - - if (isGit.exitCode === 0) { - // Use git-based grading - const status = await Bun.$`git -C ${cwd} status --porcelain`.text() - // ... git-based logic - } - } - - // Fall back to output-based grading - const pass = hint ? output.toLowerCase().includes(hint.toLowerCase()) : true - return { - pass, - score: pass ? 1 : 0, - reasoning: cwd ? 'Git not available, using output matching' : 'No cwd provided' - } -} -``` - -### Best Practices for Git-Based Grading - -1. **Always check for `cwd`** - It's an optional parameter -2. **Validate paths for security** - See security notes below -3. **Use `.nothrow()`** - Don't let failed commands crash the grader -4. **Grade outcomes, not paths** - Check if tests pass, not which tools were used -5. **Return structured outcomes** - Makes downstream analysis easier -6. **Keep repos clean** - Run evals in clean working directories (`git status` should be clean) -7. **Include reasoning** - Explain what git detected and why it passed/failed -8. **Handle non-git gracefully** - Provide fallback logic for non-git environments - -### Security Considerations - -**IMPORTANT:** When using the `cwd` parameter in shell commands, validate paths to prevent command injection. 
- -```typescript -import { resolve } from 'node:path' - -const isValidPath = (path: string): boolean => { - // Reject paths with shell metacharacters - const dangerousChars = /[;&|`$(){}[\]<>'"\\]/ - if (dangerousChars.test(path)) { - return false - } - - // Reject directory traversal and option injection - if (path.includes('..') || path.startsWith('-')) { - return false - } - - return true -} - -export const grade: Grader = async ({ cwd }) => { - if (!cwd || !isValidPath(cwd)) { - return { pass: false, score: 0, reasoning: 'Invalid path' } - } - - // Normalize path to prevent traversal - const safeCwd = resolve(cwd) - - // Now safe to use in shell commands - const result = await Bun.$`git -C ${safeCwd} status --porcelain`.text() - // ... -} -``` - -**Trust boundary:** The `cwd` parameter typically comes from trusted sources (`process.cwd()`, CLI `--cwd` flag). If accepting paths from untrusted sources (e.g., JSONL metadata), always validate before using in shell commands. - -### Git Status Detection Scope - -The examples above detect: -- **Untracked files** (`??`) - New files not yet staged -- **Modified files** (`M` or ` M`) - Changed tracked files - -Not included in basic examples: -- **Staged files** (`A`) - Files added to index -- **Renamed files** (`R`) - Files moved/renamed -- **Deleted files** (`D`) - Files removed -- **Copied files** (`C`) - Files duplicated - -For comprehensive detection, parse all `git status --porcelain` codes. See `git status --help` for complete format specification. - -### Performance Note - -**Git-based grading has higher latency than output-based grading** because each grader invocation spawns multiple git processes (typically 2-3 per evaluation). For large evaluation batches: - -- Output-based grading: ~1-5ms per evaluation -- Git-based grading: ~50-200ms per evaluation (depending on repo size) - -Use git-based grading when environmental outcomes matter more than speed. 
For high-throughput scenarios, consider batching or caching strategies. - -### Outcome Field Benefits - -When graders return the `outcome` field, it's merged onto the capture result: - -```jsonl -{ - "id": "create-button", - "input": "Create a button component", - "output": "I created Button.tsx with a reusable button component.", - "trajectory": [...], - "pass": true, - "score": 1, - "reasoning": "Files created: src/Button.tsx. Valid syntax: true", - "outcome": { - "filesCreated": ["src/Button.tsx"], - "validSyntax": true, - "type": "file_creation" - } -} -``` - -This enables powerful downstream analysis: - -```bash -# Find all test-fixing tasks -cat results.jsonl | jq 'select(.outcome.type == "test_execution")' - -# Calculate test pass rate -cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length' - -# Identify refactoring tasks that touched critical files -cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)' -``` - -## Grading Patterns - -### Hint-Based Matching - -Simple pattern for checking if output contains expected content: - -```typescript -export const grade: Grader = async ({ output, hint }) => { - if (!hint) { - return { pass: true, score: 1.0, reasoning: 'No hint provided' } - } - - const contains = output.toLowerCase().includes(hint.toLowerCase()) - return { - pass: contains, - score: contains ? 1.0 : 0.0, - reasoning: contains ? 
'Output contains hint' : 'Output missing hint' - } -} -``` - -### Multi-Criteria Scoring - -Score based on multiple independent criteria: - -```typescript -export const grade: Grader = async ({ output, hint, trajectory }) => { - let score = 0 - const reasons: string[] = [] - - // Criterion 1: Contains hint - if (hint && output.toLowerCase().includes(hint.toLowerCase())) { - score += 0.4 - reasons.push('Contains expected content') - } - - // Criterion 2: No tool errors - const hasErrors = trajectory?.some(s => - s.type === 'tool_call' && s.status === 'error' - ) - if (!hasErrors) { - score += 0.3 - reasons.push('No tool errors') - } - - // Criterion 3: Efficient execution - const toolCount = trajectory?.filter(s => s.type === 'tool_call').length ?? 0 - if (toolCount <= 5) { - score += 0.3 - reasons.push(`Efficient (${toolCount} tools)`) - } - - return { - pass: score >= 0.7, - score, - reasoning: reasons.join('; ') || 'Failed all criteria' - } -} -``` - -### Metadata-Based Grading - -Use metadata for category-specific scoring logic: - -```typescript -export const grade: Grader = async ({ output, hint, metadata }) => { - const category = (metadata?.category as string) ?? 'general' - const difficulty = (metadata?.difficulty as string) ?? 'medium' - - // Apply different criteria based on category - if (category === 'code') { - // Code tasks require syntax validation - const hasCodeBlock = /```[\s\S]*?```/.test(output) - if (!hasCodeBlock) { - return { pass: false, score: 0.0, reasoning: 'Code category requires code block' } - } - } else if (category === 'web-search') { - // Web search tasks require sources - const hasSources = /source:/i.test(output) || /https?:\/\//.test(output) - if (!hasSources) { - return { pass: false, score: 0.5, reasoning: 'Web search should cite sources' } - } - } - - // Adjust score threshold by difficulty - const baseScore = hint ? (output.toLowerCase().includes(hint.toLowerCase()) ? 
1.0 : 0.0) : 1.0 - const threshold = difficulty === 'hard' ? 0.9 : difficulty === 'easy' ? 0.6 : 0.7 - - return { - pass: baseScore >= threshold, - score: baseScore, - reasoning: `Category: ${category}, Difficulty: ${difficulty}` - } -} -``` - -### Trajectory-Based Grading - -Analyze the execution path, not just the output: - -```typescript -export const grade: Grader = async ({ trajectory }) => { - const toolCalls = trajectory?.filter(s => s.type === 'tool_call') ?? [] - - // Check for required tool usage - const usedWrite = toolCalls.some(t => t.name === 'Write') - const usedRead = toolCalls.some(t => t.name === 'Read') - - if (!usedWrite || !usedRead) { - return { - pass: false, - score: 0.0, - reasoning: `Missing required tools: ${!usedWrite ? 'Write' : ''} ${!usedRead ? 'Read' : ''}` - } - } - - return { - pass: true, - score: 1.0, - reasoning: 'Used required Read and Write tools' - } -} -``` - -### LLM-as-Judge - -Use an LLM for semantic evaluation: - -```typescript -import Anthropic from '@anthropic-ai/sdk' -import type { Grader } from '@plaited/agent-eval-harness/schemas' - -const client = new Anthropic() - -export const grade: Grader = async ({ input, output, hint }) => { - const response = await client.messages.create({ - model: 'claude-sonnet-4-20250514', - max_tokens: 256, - messages: [{ - role: 'user', - content: `Evaluate this agent output. - -Task: ${Array.isArray(input) ? input.join(' → ') : input} -${hint ? `Expected: ${hint}` : ''} - -Agent output: -${output} - -Did the agent correctly complete the task? Respond as JSON only: -{"pass": true/false, "score": 0.0-1.0, "reasoning": "brief explanation"}` - }] - }) - - const text = response.content[0]?.type === 'text' ? response.content[0].text : '' - - try { - return JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? 
'{}') - } catch { - return { pass: false, score: 0, reasoning: 'Failed to parse LLM response' } - } -} -``` - -## Detection Logic - -The harness determines grader type by file extension: - -| Extension | Treatment | -|-----------|-----------| -| `.ts`, `.js`, `.mjs`, `.cjs` | Import as ES module | -| Everything else (`.py`, `.sh`, etc.) | Execute as subprocess | - -## Executable Protocol - -For non-JavaScript graders, use stdin/stdout JSON: - -**Input (stdin):** -```json -{ - "input": "Find the CEO of Anthropic", - "output": "The CEO of Anthropic is Dario Amodei.", - "hint": "Dario Amodei", - "trajectory": [...], - "metadata": {"category": "web-search", "difficulty": "easy"}, - "cwd": "/path/to/working/directory" -} -``` - -**Output (stdout):** -```json -{ - "pass": true, - "score": 1.0, - "reasoning": "Output contains expected name", - "outcome": { - "method": "semantic_match", - "confidence": 0.95 - } -} -``` - -**Exit codes:** -- `0` = Success (result parsed from stdout) -- Non-zero = Error (stderr used for error message) - -## Testing Graders - -Test independently before using with the harness: - -```bash -# TypeScript -echo '{"input":"test","output":"hello world","hint":"world"}' | bun run ./my-grader.ts - -# Python -echo '{"input":"test","output":"hello world","hint":"world"}' | ./grader.py - -# Shell -echo '{"input":"test","output":"hello world","hint":"world"}' | ./grader.sh -``` - -## Commands That Support Inline Graders - -| Command | Flag | Purpose | -|---------|------|---------| -| `capture` | `--grader` | Add score to each result | -| `trials` | `--grader` | Compute pass@k, pass^k metrics | -| `grade` | `--grader` | Score extracted results (pipeline) | -| `calibrate` | `--grader` | Re-score samples with different grader | -| `validate-refs` | `--grader` | Check reference solutions | - -## Best Practices - -### Grade Outcomes, Not Paths - -**✅ Do: Grade final environmental state** -- Did the tests pass? -- Was the file created with valid syntax? 
-- Is the answer semantically correct? -- Does the build succeed? - -**❌ Don't: Grade procedural steps** -- Don't require specific tool usage ("must use WebSearch") -- Don't enforce reasoning patterns ("must think step-by-step") -- Don't mandate particular approaches ("must read file before editing") - -**Why this matters:** Agents should be free to find novel solutions. If the outcome is correct, the path doesn't matter. Use git-based grading to check environmental changes, not trajectory inspection to enforce procedures. - -### Other Best Practices - -1. **Grade in isolation** - Each input/output should be scored independently -2. **Deterministic scoring** - Same input should always produce same score -3. **Always return valid JSON** - Use `JSON.stringify()` or `json.dumps()` -4. **Handle missing fields** - `hint`, `trajectory`, and `cwd` may be undefined -5. **Include reasoning** - Helps debug failures during calibration -6. **Test independently** - Validate grader before running full eval -7. **Keep graders simple** - Complex logic is hard to debug and calibrate -8. **Use git for outcomes** - Let git detect file changes instead of parsing output text -9. **Return structured outcomes** - Makes downstream analysis and aggregation easier - -## Related Documentation - -- [comparison-graders.md](comparison-graders.md) - Multi-run comparison graders -- [calibration.md](calibration.md) - Grader calibration workflow -- [eval-concepts.md](eval-concepts.md) - pass@k, pass^k metrics diff --git a/.agents/skills/agent-eval-harness/references/output-formats.md b/.agents/skills/agent-eval-harness/references/output-formats.md deleted file mode 100644 index 325b907..0000000 --- a/.agents/skills/agent-eval-harness/references/output-formats.md +++ /dev/null @@ -1,261 +0,0 @@ -# Output Formats - -The harness uses a "capture once, derive many views" approach. The `capture` command produces full trajectory JSONL, and derived views are created with separate commands. 
- -## Capture Output (Full Trajectory) - -The `capture` command always outputs full trajectory JSONL: - -```bash -agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json -o results.jsonl -``` - -### Schema - -```typescript -type CaptureResult = { - id: string // Prompt identifier - input: string | string[] // Single prompt or multi-turn conversation - output: string // Final agent response - hint?: string // Grader context (if provided in prompt) - trajectory: TrajectoryStep[] // Full execution trajectory - metadata: Record // Prompt metadata - timing: { - start: number // Unix timestamp (ms) - end: number // Unix timestamp (ms) - firstResponse?: number // Time to first response (ms) - sessionCreation: number // Time to create session (ms) - total: number // Total duration (end - start, ms) - inputTokens?: number // Input tokens consumed (if available) - outputTokens?: number // Output tokens generated (if available) - } - toolErrors: boolean // Whether any tool calls failed - errors?: string[] // Error messages (if any) - score?: GraderResult // Grader score (if grader was provided) -} - -type TrajectoryStep = - | { type: 'thought'; content: string; timestamp: number; stepId?: string } - | { type: 'message'; content: string; timestamp: number; stepId?: string } - | { - type: 'tool_call' - name: string // Tool title - status: string // pending, in_progress, completed, failed - input?: unknown // Raw input parameters - output?: unknown // Raw output - duration?: number // Execution time (ms) - timestamp: number - stepId?: string - } - | { type: 'plan'; entries: unknown[]; timestamp: number; stepId?: string } - -type GraderResult = { - pass: boolean - score: number // 0.0 to 1.0 - reasoning?: string -} -``` - -### Example Output - -```jsonl -{"id":"test-001","input":"Create a primary button","output":"I created the button in src/button.tsx","trajectory":[{"type":"thought","content":"I'll create a styled button 
template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"..."},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false} -``` - -## Summary Format - -The `summarize` command derives compact JSONL from full trajectory: - -```bash -agent-eval-harness summarize results.jsonl -o summary.jsonl -``` - -### Schema - -```typescript -type SummaryResult = { - id: string // Prompt identifier - input: string // Original prompt text - output: string // Final agent response - toolCalls: string[] // List of tool names used - duration: number // Total execution time (ms) -} -``` - -### Example Output - -```jsonl -{"id":"test-001","input":"Create a primary button","output":"I created the button in src/button.tsx","toolCalls":["Write"],"duration":1234} -{"id":"test-002","input":"Fix the TypeScript error","output":"I fixed the type error...","toolCalls":["Read","Edit"],"duration":2567} -``` - -### Analysis with jq - -```bash -# Calculate average duration -cat summary.jsonl | jq -s 'map(.duration) | add / length' - -# Count tool usage -cat summary.jsonl | jq -s 'map(.toolCalls) | flatten | group_by(.) | map({tool: .[0], count: length})' - -# Filter by output content -cat summary.jsonl | jq 'select(.output | contains("error"))' -``` - -## Markdown Format - -The `summarize` command can also produce markdown for LLM-as-judge workflows: - -```bash -agent-eval-harness summarize results.jsonl --markdown -o results.md -``` - -### Structure - -```markdown -## Evaluation Record: <id> - -**Input:** <input> - -**Trajectory:** -1. [THOUGHT] <content> [->stepId] -2. [TOOL:<name>] -> <status> (<duration>ms) [->stepId] - File: <path> (<N> chars) - ``` - <first 8 lines> - - // ... N lines omitted ... - - <last 4 lines> - ``` -3. 
[PLAN] [->stepId] -4. [MESSAGE] <content> [->stepId] - -**Output:** <output> -**Metadata:** category=ui, agent=claude-headless, ... -**Tool Errors:** false -**Duration:** <duration>ms - ---- -``` - -**Step ID Format:** `<id>-step-<N>` (e.g., `test-001-step-2`) - -**Truncation Rules:** -- Thought/message content: First 100 characters -- Output: First 200 characters -- Code preview: Head (8 lines) + tail (4 lines) for files > 12 lines - -## Trials Output - -The `trials` command produces per-prompt trial results: - -```bash -agent-eval-harness trials prompts.jsonl --schema ./claude-headless.json -k 5 --grader ./grader.ts -o trials.jsonl -``` - -### Schema - -```typescript -type TrialResult = { - id: string // Prompt identifier - input: string // Original prompt text - hint?: string // Grader context (if provided) - k: number // Number of trials - passRate?: number // passes / k (with grader only) - passAtK?: number // 1 - (1-passRate)^k (with grader only) - passExpK?: number // passRate^k (with grader only) - trials: TrialEntry[] // Individual trial results -} - -type TrialEntry = { - trialNum: number // Trial number (1-indexed) - output: string // Agent output for this trial - trajectory: TrajectoryStep[] // Full trajectory for this trial - duration: number // Duration in milliseconds - pass?: boolean // Pass/fail (if grader provided) - score?: number // Numeric score (if grader provided) - reasoning?: string // Grader reasoning (if grader provided) -} -``` - -### Example (Without Grader) - -```jsonl -{"id":"search-001","input":"Find the CEO of Anthropic","k":5,"trials":[{"trialNum":1,"output":"Dario Amodei...","trajectory":[...],"duration":1234},{"trialNum":2,"output":"The CEO is Dario...","trajectory":[...],"duration":1100},...]} -``` - -### Example (With Grader) - -```jsonl -{"id":"search-001","input":"Find the CEO of Anthropic","k":5,"passRate":0.8,"passAtK":0.9997,"passExpK":0.3277,"trials":[{"trialNum":1,"output":"Dario Amodei...","pass":true,"score":1.0,"duration":1234},{"trialNum":2,"output":"I 
don't know...","pass":false,"score":0.0,"reasoning":"Missing hint content","duration":1100},...]} -``` - -## Step-Level Retrieval Pattern - -For step-specific analysis, use the step IDs in the trajectory: - -```typescript -// Load results -const results = (await Bun.file('results.jsonl').text()) - .trim() - .split('\n') - .map(line => JSON.parse(line)) - -// Build step index -const stepIndex = new Map() -for (const result of results) { - for (const step of result.trajectory) { - stepIndex.set(step.stepId, step) - } -} - -// Retrieve specific step by ID -const stepId = 'test-001-step-2' // From markdown [->stepId] -const fullStep = stepIndex.get(stepId) -console.log(fullStep.input) // Complete tool input -``` - -## toolErrors Field - -The `toolErrors` field indicates whether any tool calls failed during execution: - -| `toolErrors` | Meaning | -|--------------|---------| -| `false` | All tool calls completed successfully | -| `true` | One or more tool calls had `status: 'failed'` | - -**Note:** `toolErrors` only indicates tool-level failures. For semantic pass/fail (did the agent accomplish the task?), use a grader: - -```bash -agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json --grader ./grader.ts -o results.jsonl -``` - -## Input Format - -All commands accept the same JSONL input: - -```jsonl -{"id":"test-001","input":"Create a primary button","hint":"should contain